be/src/exprs/function/function_regexp.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <glog/logging.h> |
19 | | #include <re2/re2.h> |
20 | | #include <re2/stringpiece.h> |
21 | | #include <stddef.h> |
22 | | |
23 | | #include <boost/regex.hpp> |
24 | | #include <memory> |
25 | | #include <string> |
26 | | #include <string_view> |
27 | | #include <type_traits> |
28 | | #include <utility> |
29 | | #include <vector> |
30 | | |
31 | | #include "common/status.h" |
32 | | #include "core/block/block.h" |
33 | | #include "core/block/column_numbers.h" |
34 | | #include "core/block/column_with_type_and_name.h" |
35 | | #include "core/column/column.h" |
36 | | #include "core/column/column_const.h" |
37 | | #include "core/column/column_nullable.h" |
38 | | #include "core/column/column_string.h" |
39 | | #include "core/column/column_vector.h" |
40 | | #include "core/data_type/data_type.h" |
41 | | #include "core/data_type/data_type_nullable.h" |
42 | | #include "core/data_type/data_type_number.h" |
43 | | #include "core/data_type/data_type_string.h" |
44 | | #include "core/string_ref.h" |
45 | | #include "core/types.h" |
46 | | #include "exec/common/stringop_substring.h" |
47 | | #include "exprs/aggregate/aggregate_function.h" |
48 | | #include "exprs/function/function.h" |
49 | | #include "exprs/function/simple_function_factory.h" |
50 | | #include "exprs/function_context.h" |
51 | | #include "exprs/string_functions.h" |
52 | | |
53 | | namespace doris { |
54 | | |
55 | | // Helper structure to hold either RE2 or Boost.Regex |
56 | | struct RegexpExtractEngine { |
57 | | std::unique_ptr<re2::RE2> re2_regex; |
58 | | std::unique_ptr<boost::regex> boost_regex; |
59 | | |
60 | 18 | bool is_boost() const { return boost_regex != nullptr; } |
61 | 313 | bool is_re2() const { return re2_regex != nullptr; } |
62 | | |
63 | | // Try to compile with RE2 first, fallback to Boost.Regex if RE2 fails |
64 | | static bool compile(const StringRef& pattern, std::string* error_str, |
65 | 264 | RegexpExtractEngine& engine, bool enable_extended_regex) { |
66 | 264 | re2::RE2::Options options; |
67 | 264 | options.set_log_errors(false); // avoid RE2 printing to stderr; we handle errors ourselves |
68 | 264 | options.set_dot_nl(true); // make '.' match '\n' by default, consistent with REGEXP/LIKE |
69 | 264 | engine.re2_regex = |
70 | 264 | std::make_unique<re2::RE2>(re2::StringPiece(pattern.data, pattern.size), options); |
71 | | |
72 | 264 | if (engine.re2_regex->ok()) { |
73 | 243 | return true; |
74 | 243 | } else if (!enable_extended_regex) { |
75 | 3 | *error_str = fmt::format( |
76 | 3 | "Invalid regex pattern: {}. Error: {}. If you need advanced regex features, " |
77 | 3 | "try setting enable_extended_regex=true", |
78 | 3 | std::string(pattern.data, pattern.size), engine.re2_regex->error()); |
79 | 3 | return false; |
80 | 3 | } |
81 | | |
82 | | // RE2 failed, try Boost.Regex for advanced features like zero-width assertions |
83 | 18 | engine.re2_regex.reset(); |
84 | 18 | try { |
85 | 18 | boost::regex::flag_type flags = boost::regex::normal; |
86 | 18 | engine.boost_regex = std::make_unique<boost::regex>(pattern.data, |
87 | 18 | pattern.data + pattern.size, flags); |
88 | 18 | return true; |
89 | 18 | } catch (const boost::regex_error& e) { |
90 | 0 | if (error_str) { |
91 | 0 | *error_str = fmt::format("Invalid regex pattern: {}. Error: {}", |
92 | 0 | std::string(pattern.data, pattern.size), e.what()); |
93 | 0 | } |
94 | 0 | return false; |
95 | 0 | } |
96 | 18 | } |
97 | | |
98 | | // Get number of capturing groups |
99 | 231 | int number_of_capturing_groups() const { |
100 | 231 | if (is_re2()) { |
101 | 222 | return re2_regex->NumberOfCapturingGroups(); |
102 | 222 | } else if (is_boost()) { |
103 | 9 | return static_cast<int>(boost_regex->mark_count()); |
104 | 9 | } |
105 | 0 | return 0; |
106 | 231 | } |
107 | | |
108 | | // Match function for extraction |
109 | 52 | bool match_and_extract(const char* data, size_t size, int index, std::string& result) const { |
110 | 52 | if (is_re2()) { |
111 | 47 | int max_matches = 1 + re2_regex->NumberOfCapturingGroups(); |
112 | 47 | if (index >= max_matches) { |
113 | 0 | return false; |
114 | 0 | } |
115 | 47 | std::vector<re2::StringPiece> matches(max_matches); |
116 | 47 | bool success = re2_regex->Match(re2::StringPiece(data, size), 0, size, |
117 | 47 | re2::RE2::UNANCHORED, matches.data(), max_matches); |
118 | 47 | if (success && index < matches.size()) { |
119 | 34 | const re2::StringPiece& match = matches[index]; |
120 | 34 | result.assign(match.data(), match.size()); |
121 | 34 | return true; |
122 | 34 | } |
123 | 13 | return false; |
124 | 47 | } else if (is_boost()) { |
125 | 5 | boost::cmatch matches; |
126 | 5 | bool success = boost::regex_search(data, data + size, matches, *boost_regex); |
127 | 5 | if (success && index < matches.size()) { |
128 | 5 | result = matches[index].str(); |
129 | 5 | return true; |
130 | 5 | } |
131 | 0 | return false; |
132 | 5 | } |
133 | 0 | return false; |
134 | 52 | } |
135 | | |
136 | | // Match all occurrences and extract the first capturing group |
137 | | void match_all_and_extract(const char* data, size_t size, |
138 | 30 | std::vector<std::string>& results) const { |
139 | 30 | if (is_re2()) { |
140 | 26 | int max_matches = 1 + re2_regex->NumberOfCapturingGroups(); |
141 | 26 | if (max_matches < 2) { |
142 | 0 | return; // No capturing groups |
143 | 0 | } |
144 | | |
145 | 26 | size_t pos = 0; |
146 | 67 | while (pos < size) { |
147 | 55 | const char* str_pos = data + pos; |
148 | 55 | size_t str_size = size - pos; |
149 | 55 | std::vector<re2::StringPiece> matches(max_matches); |
150 | 55 | bool success = re2_regex->Match(re2::StringPiece(str_pos, str_size), 0, str_size, |
151 | 55 | re2::RE2::UNANCHORED, matches.data(), max_matches); |
152 | 55 | if (!success) { |
153 | 14 | break; |
154 | 14 | } |
155 | 41 | if (matches[0].empty()) { |
156 | 11 | pos += 1; |
157 | 11 | continue; |
158 | 11 | } |
159 | | // Extract first capturing group |
160 | 30 | if (matches.size() > 1 && !matches[1].empty()) { |
161 | 30 | results.emplace_back(matches[1].data(), matches[1].size()); |
162 | 30 | } |
163 | | // Move position forward |
164 | 30 | auto offset = std::string(str_pos, str_size) |
165 | 30 | .find(std::string(matches[0].data(), matches[0].size())); |
166 | 30 | pos += offset + matches[0].size(); |
167 | 30 | } |
168 | 26 | } else if (is_boost()) { |
169 | 4 | const char* search_start = data; |
170 | 4 | const char* search_end = data + size; |
171 | 4 | boost::match_results<const char*> matches; |
172 | | |
173 | 13 | while (boost::regex_search(search_start, search_end, matches, *boost_regex)) { |
174 | 9 | if (matches.size() > 1 && matches[1].matched) { |
175 | 9 | results.emplace_back(matches[1].str()); |
176 | 9 | } |
177 | 9 | if (matches[0].length() == 0) { |
178 | 0 | if (search_start == search_end) { |
179 | 0 | break; |
180 | 0 | } |
181 | 0 | search_start += 1; |
182 | 9 | } else { |
183 | 9 | search_start = matches[0].second; |
184 | 9 | } |
185 | 9 | } |
186 | 4 | } |
187 | 30 | } |
188 | | }; |
189 | | |
190 | | struct RegexpCountImpl { |
191 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
192 | 15 | size_t input_rows_count, ColumnInt32::Container& result_data) { |
193 | 15 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
194 | 15 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
195 | 48 | for (int i = 0; i < input_rows_count; ++i) { |
196 | 33 | result_data[i] = _execute_inner_loop(context, str_col, pattern_col, i); |
197 | 33 | } |
198 | 15 | } |
199 | | static int _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
200 | 33 | const ColumnString* pattern_col, const size_t index_now) { |
201 | 33 | re2::RE2* re = reinterpret_cast<re2::RE2*>( |
202 | 33 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
203 | 33 | std::unique_ptr<re2::RE2> scoped_re; |
204 | 33 | if (re == nullptr) { |
205 | 12 | std::string error_str; |
206 | 12 | DCHECK(pattern_col); |
207 | 12 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, false)); |
208 | 12 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), StringRef(), |
209 | 12 | scoped_re); |
210 | 12 | if (!st) { |
211 | 0 | context->add_warning(error_str.c_str()); |
212 | 0 | throw Exception(Status::InvalidArgument(error_str)); |
213 | 0 | return 0; |
214 | 0 | } |
215 | 12 | re = scoped_re.get(); |
216 | 12 | } |
217 | | |
218 | 33 | const auto& str = str_col->get_data_at(index_now); |
219 | 33 | int count = 0; |
220 | 33 | size_t pos = 0; |
221 | 101 | while (pos < str.size) { |
222 | 87 | auto str_pos = str.data + pos; |
223 | 87 | auto str_size = str.size - pos; |
224 | 87 | re2::StringPiece str_sp_current = re2::StringPiece(str_pos, str_size); |
225 | 87 | re2::StringPiece match; |
226 | | |
227 | 87 | bool success = re->Match(str_sp_current, 0, str_size, re2::RE2::UNANCHORED, &match, 1); |
228 | 87 | if (!success) { |
229 | 19 | break; |
230 | 19 | } |
231 | 68 | if (match.empty()) { |
232 | 4 | pos += 1; |
233 | 4 | continue; |
234 | 4 | } |
235 | 64 | count++; |
236 | 64 | size_t match_start = match.data() - str_sp_current.data(); |
237 | 64 | pos += match_start + match.size(); |
238 | 64 | } |
239 | | |
240 | 33 | return count; |
241 | 33 | } |
242 | | }; |
243 | | |
244 | | class FunctionRegexpCount : public IFunction { |
245 | | public: |
246 | | static constexpr auto name = "regexp_count"; |
247 | | |
248 | 24 | static FunctionPtr create() { return std::make_shared<FunctionRegexpCount>(); } |
249 | | |
250 | 1 | String get_name() const override { return name; } |
251 | | |
252 | 15 | size_t get_number_of_arguments() const override { return 2; } |
253 | | |
254 | 15 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
255 | 15 | return std::make_shared<DataTypeInt32>(); |
256 | 15 | } |
257 | | |
258 | 67 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
259 | 67 | if (scope == FunctionContext::THREAD_LOCAL) { |
260 | 52 | if (context->is_col_constant(1)) { |
261 | 39 | DCHECK(!context->get_function_state(scope)); |
262 | 39 | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
263 | 39 | const auto& pattern = pattern_col->get_data_at(0); |
264 | 39 | if (pattern.size == 0) { |
265 | 4 | return Status::OK(); |
266 | 4 | } |
267 | | |
268 | 35 | std::string error_str; |
269 | 35 | std::unique_ptr<re2::RE2> scoped_re; |
270 | 35 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
271 | 35 | StringRef(), scoped_re); |
272 | 35 | if (!st) { |
273 | 0 | context->set_error(error_str.c_str()); |
274 | 0 | return Status::InvalidArgument(error_str); |
275 | 0 | } |
276 | 35 | std::shared_ptr<re2::RE2> re(scoped_re.release()); |
277 | 35 | context->set_function_state(scope, re); |
278 | 35 | } |
279 | 52 | } |
280 | 63 | return Status::OK(); |
281 | 67 | } |
282 | | |
283 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
284 | 15 | uint32_t result, size_t input_rows_count) const override { |
285 | 15 | auto result_data_column = ColumnInt32::create(input_rows_count); |
286 | 15 | auto& result_data = result_data_column->get_data(); |
287 | | |
288 | 15 | ColumnPtr argument_columns[2]; |
289 | | |
290 | 15 | argument_columns[0] = block.get_by_position(arguments[0]).column; |
291 | 15 | argument_columns[1] = block.get_by_position(arguments[1]).column; |
292 | 15 | RegexpCountImpl::execute_impl(context, argument_columns, input_rows_count, result_data); |
293 | | |
294 | 15 | block.get_by_position(result).column = std::move(result_data_column); |
295 | 15 | return Status::OK(); |
296 | 15 | } |
297 | | }; |
298 | | |
299 | | struct ThreeParamTypes { |
300 | 16 | static DataTypes get_variadic_argument_types() { |
301 | 16 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
302 | 16 | std::make_shared<DataTypeString>()}; |
303 | 16 | } |
304 | | }; |
305 | | |
306 | | struct FourParamTypes { |
307 | 16 | static DataTypes get_variadic_argument_types() { |
308 | 16 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
309 | 16 | std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()}; |
310 | 16 | } |
311 | | }; |
312 | | |
313 | | // The FunctionRegexpReplace template implements regexp_replace and regexp_replace_one |
314 | | template <typename Impl, typename ParamTypes> |
315 | | class FunctionRegexpReplace : public IFunction { |
316 | | public: |
317 | | static constexpr auto name = Impl::name; |
318 | | |
319 | 92 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); }_ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE6createEv Line | Count | Source | 319 | 33 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
_ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE6createEv Line | Count | Source | 319 | 17 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE6createEv Line | Count | Source | 319 | 24 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE6createEv Line | Count | Source | 319 | 18 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
|
320 | | |
321 | 0 | String get_name() const override { return name; }Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE8get_nameB5cxx11Ev |
322 | | |
323 | 0 | size_t get_number_of_arguments() const override { |
324 | 0 | return get_variadic_argument_types_impl().size(); |
325 | 0 | } Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE23get_number_of_argumentsEv |
326 | | |
327 | 60 | bool is_variadic() const override { return true; }_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE11is_variadicEv Line | Count | Source | 327 | 25 | bool is_variadic() const override { return true; } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE11is_variadicEv Line | Count | Source | 327 | 9 | bool is_variadic() const override { return true; } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE11is_variadicEv Line | Count | Source | 327 | 16 | bool is_variadic() const override { return true; } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE11is_variadicEv Line | Count | Source | 327 | 10 | bool is_variadic() const override { return true; } |
|
328 | | |
329 | 56 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
330 | 56 | return make_nullable(std::make_shared<DataTypeString>()); |
331 | 56 | } _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 329 | 24 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 330 | 24 | return make_nullable(std::make_shared<DataTypeString>()); | 331 | 24 | } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 329 | 8 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 330 | 8 | return make_nullable(std::make_shared<DataTypeString>()); | 331 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 329 | 15 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 330 | 15 | return make_nullable(std::make_shared<DataTypeString>()); | 331 | 15 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 329 | 9 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 330 | 9 | return make_nullable(std::make_shared<DataTypeString>()); | 331 | 9 | } |
|
332 | | |
333 | 32 | DataTypes get_variadic_argument_types_impl() const override { |
334 | 32 | return ParamTypes::get_variadic_argument_types(); |
335 | 32 | } _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 333 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 334 | 8 | return ParamTypes::get_variadic_argument_types(); | 335 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 333 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 334 | 8 | return ParamTypes::get_variadic_argument_types(); | 335 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 333 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 334 | 8 | return ParamTypes::get_variadic_argument_types(); | 335 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 333 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 334 | 8 | return ParamTypes::get_variadic_argument_types(); | 335 | 8 | } |
|
336 | | |
337 | 371 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
338 | 371 | if (scope == FunctionContext::THREAD_LOCAL) { |
339 | 315 | if (context->is_col_constant(1)) { |
340 | 157 | DCHECK(!context->get_function_state(scope)); |
341 | 157 | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
342 | 157 | const auto& pattern = pattern_col->get_data_at(0); |
343 | 157 | if (pattern.size == 0) { |
344 | 6 | return Status::OK(); |
345 | 6 | } |
346 | | |
347 | 151 | std::string error_str; |
348 | 151 | std::unique_ptr<re2::RE2> scoped_re; |
349 | 151 | StringRef options_value; |
350 | 151 | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { |
351 | 102 | DCHECK_EQ(context->get_num_args(), 4); |
352 | 102 | DCHECK(context->is_col_constant(3)); |
353 | 102 | const auto options_col = context->get_constant_col(3)->column_ptr; |
354 | 102 | options_value = options_col->get_data_at(0); |
355 | 102 | } |
356 | | |
357 | 151 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
358 | 151 | options_value, scoped_re); |
359 | 151 | if (!st) { |
360 | 0 | context->set_error(error_str.c_str()); |
361 | 0 | return Status::InvalidArgument(error_str); |
362 | 0 | } |
363 | 151 | std::shared_ptr<re2::RE2> re(scoped_re.release()); |
364 | 151 | context->set_function_state(scope, re); |
365 | 151 | } |
366 | 315 | } |
367 | 365 | return Status::OK(); |
368 | 371 | } _ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 337 | 89 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 338 | 89 | if (scope == FunctionContext::THREAD_LOCAL) { | 339 | 65 | if (context->is_col_constant(1)) { | 340 | 41 | DCHECK(!context->get_function_state(scope)); | 341 | 41 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 342 | 41 | const auto& pattern = pattern_col->get_data_at(0); | 343 | 41 | if (pattern.size == 0) { | 344 | 4 | return Status::OK(); | 345 | 4 | } | 346 | | | 347 | 37 | std::string error_str; | 348 | 37 | std::unique_ptr<re2::RE2> scoped_re; | 349 | 37 | StringRef options_value; | 350 | | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 351 | | DCHECK_EQ(context->get_num_args(), 4); | 352 | | DCHECK(context->is_col_constant(3)); | 353 | | const auto options_col = context->get_constant_col(3)->column_ptr; | 354 | | options_value = options_col->get_data_at(0); | 355 | | } | 356 | | | 357 | 37 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 358 | 37 | options_value, scoped_re); | 359 | 37 | if (!st) { | 360 | 0 | context->set_error(error_str.c_str()); | 361 | 0 | return Status::InvalidArgument(error_str); | 362 | 0 | } | 363 | 37 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 364 | 37 | context->set_function_state(scope, re); | 365 | 37 | } | 366 | 65 | } | 367 | 85 | return Status::OK(); | 368 | 89 | } |
_ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 337 | 106 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 338 | 106 | if (scope == FunctionContext::THREAD_LOCAL) { | 339 | 98 | if (context->is_col_constant(1)) { | 340 | 50 | DCHECK(!context->get_function_state(scope)); | 341 | 50 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 342 | 50 | const auto& pattern = pattern_col->get_data_at(0); | 343 | 50 | if (pattern.size == 0) { | 344 | 0 | return Status::OK(); | 345 | 0 | } | 346 | | | 347 | 50 | std::string error_str; | 348 | 50 | std::unique_ptr<re2::RE2> scoped_re; | 349 | 50 | StringRef options_value; | 350 | 50 | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 351 | 50 | DCHECK_EQ(context->get_num_args(), 4); | 352 | 50 | DCHECK(context->is_col_constant(3)); | 353 | 50 | const auto options_col = context->get_constant_col(3)->column_ptr; | 354 | 50 | options_value = options_col->get_data_at(0); | 355 | 50 | } | 356 | | | 357 | 50 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 358 | 50 | options_value, scoped_re); | 359 | 50 | if (!st) { | 360 | 0 | context->set_error(error_str.c_str()); | 361 | 0 | return Status::InvalidArgument(error_str); | 362 | 0 | } | 363 | 50 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 364 | 50 | context->set_function_state(scope, re); | 365 | 50 | } | 366 | 98 | } | 367 | 106 | return Status::OK(); | 368 | 106 | } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 337 | 53 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 338 | 53 | if (scope == FunctionContext::THREAD_LOCAL) { | 339 | 38 | if (context->is_col_constant(1)) { | 340 | 14 | DCHECK(!context->get_function_state(scope)); | 341 | 14 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 342 | 14 | const auto& pattern = pattern_col->get_data_at(0); | 343 | 14 | if (pattern.size == 0) { | 344 | 2 | return Status::OK(); | 345 | 2 | } | 346 | | | 347 | 12 | std::string error_str; | 348 | 12 | std::unique_ptr<re2::RE2> scoped_re; | 349 | 12 | StringRef options_value; | 350 | | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 351 | | DCHECK_EQ(context->get_num_args(), 4); | 352 | | DCHECK(context->is_col_constant(3)); | 353 | | const auto options_col = context->get_constant_col(3)->column_ptr; | 354 | | options_value = options_col->get_data_at(0); | 355 | | } | 356 | | | 357 | 12 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 358 | 12 | options_value, scoped_re); | 359 | 12 | if (!st) { | 360 | 0 | context->set_error(error_str.c_str()); | 361 | 0 | return Status::InvalidArgument(error_str); | 362 | 0 | } | 363 | 12 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 364 | 12 | context->set_function_state(scope, re); | 365 | 12 | } | 366 | 38 | } | 367 | 51 | return Status::OK(); | 368 | 53 | } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 337 | 123 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 338 | 123 | if (scope == FunctionContext::THREAD_LOCAL) { | 339 | 114 | if (context->is_col_constant(1)) { | 340 | 52 | DCHECK(!context->get_function_state(scope)); | 341 | 52 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 342 | 52 | const auto& pattern = pattern_col->get_data_at(0); | 343 | 52 | if (pattern.size == 0) { | 344 | 0 | return Status::OK(); | 345 | 0 | } | 346 | | | 347 | 52 | std::string error_str; | 348 | 52 | std::unique_ptr<re2::RE2> scoped_re; | 349 | 52 | StringRef options_value; | 350 | 52 | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 351 | 52 | DCHECK_EQ(context->get_num_args(), 4); | 352 | 52 | DCHECK(context->is_col_constant(3)); | 353 | 52 | const auto options_col = context->get_constant_col(3)->column_ptr; | 354 | 52 | options_value = options_col->get_data_at(0); | 355 | 52 | } | 356 | | | 357 | 52 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 358 | 52 | options_value, scoped_re); | 359 | 52 | if (!st) { | 360 | 0 | context->set_error(error_str.c_str()); | 361 | 0 | return Status::InvalidArgument(error_str); | 362 | 0 | } | 363 | 52 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 364 | 52 | context->set_function_state(scope, re); | 365 | 52 | } | 366 | 114 | } | 367 | 123 | return Status::OK(); | 368 | 123 | } |
|
369 | | |
370 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
371 | 71 | uint32_t result, size_t input_rows_count) const override { |
372 | 71 | size_t argument_size = arguments.size(); |
373 | | |
374 | 71 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); |
375 | 71 | auto result_data_column = ColumnString::create(); |
376 | 71 | auto& result_data = result_data_column->get_chars(); |
377 | 71 | auto& result_offset = result_data_column->get_offsets(); |
378 | 71 | result_offset.resize(input_rows_count); |
379 | | |
380 | 71 | bool col_const[3]; |
381 | 71 | ColumnPtr argument_columns[3]; |
382 | 284 | for (int i = 0; i < 3; ++i) { |
383 | 213 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); |
384 | 213 | } |
385 | 71 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( |
386 | 6 | *block.get_by_position(arguments[0]).column) |
387 | 6 | .convert_to_full_column() |
388 | 71 | : block.get_by_position(arguments[0]).column; |
389 | | |
390 | 71 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); |
391 | | |
392 | 71 | StringRef options_value; |
393 | 71 | if (col_const[1] && col_const[2]) { |
394 | 3 | Impl::execute_impl_const_args(context, argument_columns, options_value, |
395 | 3 | input_rows_count, result_data, result_offset, |
396 | 3 | result_null_map->get_data()); |
397 | 68 | } else { |
398 | | // the options have check in FE, so is always const, and get idx of 0 |
399 | 68 | if (argument_size == 4) { |
400 | 15 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); |
401 | 15 | } |
402 | 68 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, |
403 | 68 | result_data, result_offset, result_null_map->get_data()); |
404 | 68 | } |
405 | | |
406 | 71 | block.get_by_position(result).column = |
407 | 71 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); |
408 | 71 | return Status::OK(); |
409 | 71 | } _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 371 | 30 | uint32_t result, size_t input_rows_count) const override { | 372 | 30 | size_t argument_size = arguments.size(); | 373 | | | 374 | 30 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 375 | 30 | auto result_data_column = ColumnString::create(); | 376 | 30 | auto& result_data = result_data_column->get_chars(); | 377 | 30 | auto& result_offset = result_data_column->get_offsets(); | 378 | 30 | result_offset.resize(input_rows_count); | 379 | | | 380 | 30 | bool col_const[3]; | 381 | 30 | ColumnPtr argument_columns[3]; | 382 | 120 | for (int i = 0; i < 3; ++i) { | 383 | 90 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 384 | 90 | } | 385 | 30 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 386 | 0 | *block.get_by_position(arguments[0]).column) | 387 | 0 | .convert_to_full_column() | 388 | 30 | : block.get_by_position(arguments[0]).column; | 389 | | | 390 | 30 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 391 | | | 392 | 30 | StringRef options_value; | 393 | 30 | if (col_const[1] && col_const[2]) { | 394 | 1 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 395 | 1 | input_rows_count, result_data, result_offset, | 396 | 1 | result_null_map->get_data()); | 397 | 29 | } else { | 398 | | // the options have check in FE, so is always const, and get idx of 0 | 399 | 29 | if (argument_size == 4) { | 400 | 0 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 401 | 0 | } | 402 | 29 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 403 | 29 | result_data, result_offset, result_null_map->get_data()); | 404 | 29 | } | 405 | | | 406 | 30 | block.get_by_position(result).column 
= | 407 | 30 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 408 | 30 | return Status::OK(); | 409 | 30 | } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 371 | 8 | uint32_t result, size_t input_rows_count) const override { | 372 | 8 | size_t argument_size = arguments.size(); | 373 | | | 374 | 8 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 375 | 8 | auto result_data_column = ColumnString::create(); | 376 | 8 | auto& result_data = result_data_column->get_chars(); | 377 | 8 | auto& result_offset = result_data_column->get_offsets(); | 378 | 8 | result_offset.resize(input_rows_count); | 379 | | | 380 | 8 | bool col_const[3]; | 381 | 8 | ColumnPtr argument_columns[3]; | 382 | 32 | for (int i = 0; i < 3; ++i) { | 383 | 24 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 384 | 24 | } | 385 | 8 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 386 | 3 | *block.get_by_position(arguments[0]).column) | 387 | 3 | .convert_to_full_column() | 388 | 8 | : block.get_by_position(arguments[0]).column; | 389 | | | 390 | 8 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 391 | | | 392 | 8 | StringRef options_value; | 393 | 8 | if (col_const[1] && col_const[2]) { | 394 | 1 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 395 | 1 | input_rows_count, result_data, result_offset, | 396 | 1 | result_null_map->get_data()); | 397 | 7 | } else { | 398 | | // the options have check in FE, so is always const, and get idx of 0 | 399 | 7 | if (argument_size == 4) { | 400 | 7 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 401 | 7 | } | 402 | 7 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 403 | 7 | result_data, result_offset, result_null_map->get_data()); | 404 | 7 | } | 405 | | | 406 | 8 | block.get_by_position(result).column = | 407 | 8 | 
ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 408 | 8 | return Status::OK(); | 409 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 371 | 24 | uint32_t result, size_t input_rows_count) const override { | 372 | 24 | size_t argument_size = arguments.size(); | 373 | | | 374 | 24 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 375 | 24 | auto result_data_column = ColumnString::create(); | 376 | 24 | auto& result_data = result_data_column->get_chars(); | 377 | 24 | auto& result_offset = result_data_column->get_offsets(); | 378 | 24 | result_offset.resize(input_rows_count); | 379 | | | 380 | 24 | bool col_const[3]; | 381 | 24 | ColumnPtr argument_columns[3]; | 382 | 96 | for (int i = 0; i < 3; ++i) { | 383 | 72 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 384 | 72 | } | 385 | 24 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 386 | 0 | *block.get_by_position(arguments[0]).column) | 387 | 0 | .convert_to_full_column() | 388 | 24 | : block.get_by_position(arguments[0]).column; | 389 | | | 390 | 24 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 391 | | | 392 | 24 | StringRef options_value; | 393 | 24 | if (col_const[1] && col_const[2]) { | 394 | 0 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 395 | 0 | input_rows_count, result_data, result_offset, | 396 | 0 | result_null_map->get_data()); | 397 | 24 | } else { | 398 | | // the options have check in FE, so is always const, and get idx of 0 | 399 | 24 | if (argument_size == 4) { | 400 | 0 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 401 | 0 | } | 402 | 24 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 403 | 24 | result_data, result_offset, result_null_map->get_data()); | 404 | 24 | } | 405 | | | 406 | 24 | block.get_by_position(result).column = | 407 | 
24 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 408 | 24 | return Status::OK(); | 409 | 24 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 371 | 9 | uint32_t result, size_t input_rows_count) const override { | 372 | 9 | size_t argument_size = arguments.size(); | 373 | | | 374 | 9 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 375 | 9 | auto result_data_column = ColumnString::create(); | 376 | 9 | auto& result_data = result_data_column->get_chars(); | 377 | 9 | auto& result_offset = result_data_column->get_offsets(); | 378 | 9 | result_offset.resize(input_rows_count); | 379 | | | 380 | 9 | bool col_const[3]; | 381 | 9 | ColumnPtr argument_columns[3]; | 382 | 36 | for (int i = 0; i < 3; ++i) { | 383 | 27 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 384 | 27 | } | 385 | 9 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 386 | 3 | *block.get_by_position(arguments[0]).column) | 387 | 3 | .convert_to_full_column() | 388 | 9 | : block.get_by_position(arguments[0]).column; | 389 | | | 390 | 9 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 391 | | | 392 | 9 | StringRef options_value; | 393 | 9 | if (col_const[1] && col_const[2]) { | 394 | 1 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 395 | 1 | input_rows_count, result_data, result_offset, | 396 | 1 | result_null_map->get_data()); | 397 | 8 | } else { | 398 | | // the options have check in FE, so is always const, and get idx of 0 | 399 | 8 | if (argument_size == 4) { | 400 | 8 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 401 | 8 | } | 402 | 8 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 403 | 8 | result_data, result_offset, result_null_map->get_data()); | 404 | 8 | } | 405 | | | 406 | 9 | block.get_by_position(result).column = | 407 | 9 | 
ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 408 | 9 | return Status::OK(); | 409 | 9 | } |
|
410 | | }; |
411 | | |
412 | | struct RegexpReplaceImpl { |
413 | | static constexpr auto name = "regexp_replace"; |
414 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
415 | | const StringRef& options_value, size_t input_rows_count, |
416 | | ColumnString::Chars& result_data, ColumnString::Offsets& result_offset, |
417 | 36 | NullMap& null_map) { |
418 | 36 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
419 | 36 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
420 | 36 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get()); |
421 | | |
422 | 126 | for (size_t i = 0; i < input_rows_count; ++i) { |
423 | 90 | if (null_map[i]) { |
424 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
425 | 0 | continue; |
426 | 0 | } |
427 | 90 | _execute_inner_loop<false>(context, str_col, pattern_col, replace_col, options_value, |
428 | 90 | result_data, result_offset, null_map, i); |
429 | 90 | } |
430 | 36 | } |
431 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], |
432 | | const StringRef& options_value, size_t input_rows_count, |
433 | | ColumnString::Chars& result_data, |
434 | 2 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
435 | 2 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
436 | 2 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
437 | 2 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get()); |
438 | | |
439 | 12 | for (size_t i = 0; i < input_rows_count; ++i) { |
440 | 10 | if (null_map[i]) { |
441 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
442 | 0 | continue; |
443 | 0 | } |
444 | 10 | _execute_inner_loop<true>(context, str_col, pattern_col, replace_col, options_value, |
445 | 10 | result_data, result_offset, null_map, i); |
446 | 10 | } |
447 | 2 | } |
448 | | template <bool Const> |
449 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
450 | | const ColumnString* pattern_col, |
451 | | const ColumnString* replace_col, const StringRef& options_value, |
452 | | ColumnString::Chars& result_data, |
453 | | ColumnString::Offsets& result_offset, NullMap& null_map, |
454 | 100 | const size_t index_now) { |
455 | 100 | re2::RE2* re = reinterpret_cast<re2::RE2*>( |
456 | 100 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
457 | 100 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr |
458 | 100 | if (re == nullptr) { |
459 | 67 | std::string error_str; |
460 | 67 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); |
461 | 67 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
462 | 67 | options_value, scoped_re); |
463 | 67 | if (!st) { |
464 | 0 | context->add_warning(error_str.c_str()); |
465 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); |
466 | 0 | return; |
467 | 0 | } |
468 | 67 | re = scoped_re.get(); |
469 | 67 | } |
470 | | |
471 | 100 | re2::StringPiece replace_str = re2::StringPiece( |
472 | 100 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); |
473 | | |
474 | 100 | std::string result_str(str_col->get_data_at(index_now).to_string()); |
475 | 100 | re2::RE2::GlobalReplace(&result_str, *re, replace_str); |
476 | 100 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); |
477 | 100 | } _ZN5doris17RegexpReplaceImpl19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 454 | 10 | const size_t index_now) { | 455 | 10 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 456 | 10 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 457 | 10 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 458 | 10 | if (re == nullptr) { | 459 | 0 | std::string error_str; | 460 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 461 | 0 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 462 | 0 | options_value, scoped_re); | 463 | 0 | if (!st) { | 464 | 0 | context->add_warning(error_str.c_str()); | 465 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 466 | 0 | return; | 467 | 0 | } | 468 | 0 | re = scoped_re.get(); | 469 | 0 | } | 470 | | | 471 | 10 | re2::StringPiece replace_str = re2::StringPiece( | 472 | 10 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 473 | | | 474 | 10 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 475 | 10 | re2::RE2::GlobalReplace(&result_str, *re, replace_str); | 476 | 10 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 477 | 10 | } |
_ZN5doris17RegexpReplaceImpl19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 454 | 90 | const size_t index_now) { | 455 | 90 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 456 | 90 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 457 | 90 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 458 | 90 | if (re == nullptr) { | 459 | 67 | std::string error_str; | 460 | 67 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 461 | 67 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 462 | 67 | options_value, scoped_re); | 463 | 67 | if (!st) { | 464 | 0 | context->add_warning(error_str.c_str()); | 465 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 466 | 0 | return; | 467 | 0 | } | 468 | 67 | re = scoped_re.get(); | 469 | 67 | } | 470 | | | 471 | 90 | re2::StringPiece replace_str = re2::StringPiece( | 472 | 90 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 473 | | | 474 | 90 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 475 | 90 | re2::RE2::GlobalReplace(&result_str, *re, replace_str); | 476 | 90 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 477 | 90 | } |
|
478 | | }; |
479 | | |
480 | | struct RegexpReplaceOneImpl { |
481 | | static constexpr auto name = "regexp_replace_one"; |
482 | | |
483 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
484 | | const StringRef& options_value, size_t input_rows_count, |
485 | | ColumnString::Chars& result_data, ColumnString::Offsets& result_offset, |
486 | 32 | NullMap& null_map) { |
487 | 32 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
488 | 32 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
489 | 32 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get()); |
490 | | // 3 args |
491 | 122 | for (size_t i = 0; i < input_rows_count; ++i) { |
492 | 90 | if (null_map[i]) { |
493 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
494 | 0 | continue; |
495 | 0 | } |
496 | 90 | _execute_inner_loop<false>(context, str_col, pattern_col, replace_col, options_value, |
497 | 90 | result_data, result_offset, null_map, i); |
498 | 90 | } |
499 | 32 | } |
500 | | |
501 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], |
502 | | const StringRef& options_value, size_t input_rows_count, |
503 | | ColumnString::Chars& result_data, |
504 | 1 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
505 | 1 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
506 | 1 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
507 | 1 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get()); |
508 | | // 3 args |
509 | 6 | for (size_t i = 0; i < input_rows_count; ++i) { |
510 | 5 | if (null_map[i]) { |
511 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
512 | 0 | continue; |
513 | 0 | } |
514 | 5 | _execute_inner_loop<true>(context, str_col, pattern_col, replace_col, options_value, |
515 | 5 | result_data, result_offset, null_map, i); |
516 | 5 | } |
517 | 1 | } |
518 | | template <bool Const> |
519 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
520 | | const ColumnString* pattern_col, |
521 | | const ColumnString* replace_col, const StringRef& options_value, |
522 | | ColumnString::Chars& result_data, |
523 | | ColumnString::Offsets& result_offset, NullMap& null_map, |
524 | 95 | const size_t index_now) { |
525 | 95 | re2::RE2* re = reinterpret_cast<re2::RE2*>( |
526 | 95 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
527 | 95 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr |
528 | 95 | if (re == nullptr) { |
529 | 72 | std::string error_str; |
530 | 72 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); |
531 | 72 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
532 | 72 | options_value, scoped_re); |
533 | 72 | if (!st) { |
534 | 0 | context->add_warning(error_str.c_str()); |
535 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); |
536 | 0 | return; |
537 | 0 | } |
538 | 72 | re = scoped_re.get(); |
539 | 72 | } |
540 | | |
541 | 95 | re2::StringPiece replace_str = re2::StringPiece( |
542 | 95 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); |
543 | | |
544 | 95 | std::string result_str(str_col->get_data_at(index_now).to_string()); |
545 | 95 | re2::RE2::Replace(&result_str, *re, replace_str); |
546 | 95 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); |
547 | 95 | } _ZN5doris20RegexpReplaceOneImpl19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 524 | 5 | const size_t index_now) { | 525 | 5 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 526 | 5 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 527 | 5 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 528 | 5 | if (re == nullptr) { | 529 | 0 | std::string error_str; | 530 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 531 | 0 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 532 | 0 | options_value, scoped_re); | 533 | 0 | if (!st) { | 534 | 0 | context->add_warning(error_str.c_str()); | 535 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 536 | 0 | return; | 537 | 0 | } | 538 | 0 | re = scoped_re.get(); | 539 | 0 | } | 540 | | | 541 | 5 | re2::StringPiece replace_str = re2::StringPiece( | 542 | 5 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 543 | | | 544 | 5 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 545 | 5 | re2::RE2::Replace(&result_str, *re, replace_str); | 546 | 5 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 547 | 5 | } |
_ZN5doris20RegexpReplaceOneImpl19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 524 | 90 | const size_t index_now) { | 525 | 90 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 526 | 90 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 527 | 90 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 528 | 90 | if (re == nullptr) { | 529 | 72 | std::string error_str; | 530 | 72 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 531 | 72 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 532 | 72 | options_value, scoped_re); | 533 | 72 | if (!st) { | 534 | 0 | context->add_warning(error_str.c_str()); | 535 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 536 | 0 | return; | 537 | 0 | } | 538 | 72 | re = scoped_re.get(); | 539 | 72 | } | 540 | | | 541 | 90 | re2::StringPiece replace_str = re2::StringPiece( | 542 | 90 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 543 | | | 544 | 90 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 545 | 90 | re2::RE2::Replace(&result_str, *re, replace_str); | 546 | 90 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 547 | 90 | } |
|
548 | | }; |
549 | | |
550 | | template <bool ReturnNull> |
551 | | struct RegexpExtractImpl { |
552 | | static constexpr auto name = ReturnNull ? "regexp_extract_or_null" : "regexp_extract"; |
553 | | // 3 args |
554 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
555 | | size_t input_rows_count, ColumnString::Chars& result_data, |
556 | 59 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
557 | 59 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
558 | 59 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
559 | 59 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); |
560 | 188 | for (size_t i = 0; i < input_rows_count; ++i) { |
561 | 129 | if (null_map[i]) { |
562 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
563 | 0 | continue; |
564 | 0 | } |
565 | 129 | const auto& index_data = index_col->get_int(i); |
566 | 129 | if (index_data < 0) { |
567 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) |
568 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); |
569 | 0 | continue; |
570 | 0 | } |
571 | 129 | _execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data, |
572 | 129 | result_offset, null_map, i); |
573 | 129 | } |
574 | 59 | } _ZN5doris17RegexpExtractImplILb1EE12execute_implEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ Line | Count | Source | 556 | 18 | ColumnString::Offsets& result_offset, NullMap& null_map) { | 557 | 18 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); | 558 | 18 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); | 559 | 18 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); | 560 | 36 | for (size_t i = 0; i < input_rows_count; ++i) { | 561 | 18 | if (null_map[i]) { | 562 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); | 563 | 0 | continue; | 564 | 0 | } | 565 | 18 | const auto& index_data = index_col->get_int(i); | 566 | 18 | if (index_data < 0) { | 567 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) | 568 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); | 569 | 0 | continue; | 570 | 0 | } | 571 | 18 | _execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data, | 572 | 18 | result_offset, null_map, i); | 573 | 18 | } | 574 | 18 | } |
_ZN5doris17RegexpExtractImplILb0EE12execute_implEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ Line | Count | Source | 556 | 41 | ColumnString::Offsets& result_offset, NullMap& null_map) { | 557 | 41 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); | 558 | 41 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); | 559 | 41 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); | 560 | 152 | for (size_t i = 0; i < input_rows_count; ++i) { | 561 | 111 | if (null_map[i]) { | 562 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); | 563 | 0 | continue; | 564 | 0 | } | 565 | 111 | const auto& index_data = index_col->get_int(i); | 566 | 111 | if (index_data < 0) { | 567 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) | 568 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); | 569 | 0 | continue; | 570 | 0 | } | 571 | 111 | _execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data, | 572 | 111 | result_offset, null_map, i); | 573 | 111 | } | 574 | 41 | } |
|
575 | | |
576 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], |
577 | | size_t input_rows_count, ColumnString::Chars& result_data, |
578 | 1 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
579 | 1 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
580 | 1 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
581 | 1 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); |
582 | | |
583 | 1 | const auto& index_data = index_col->get_int(0); |
584 | 1 | if (index_data < 0) { |
585 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { |
586 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) |
587 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); |
588 | 0 | } |
589 | 0 | return; |
590 | 0 | } |
591 | | |
592 | 8 | for (size_t i = 0; i < input_rows_count; ++i) { |
593 | 7 | if (null_map[i]) { |
594 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
595 | 0 | continue; |
596 | 0 | } |
597 | | |
598 | 7 | _execute_inner_loop<true>(context, str_col, pattern_col, index_data, result_data, |
599 | 7 | result_offset, null_map, i); |
600 | 7 | } |
601 | 1 | } Unexecuted instantiation: _ZN5doris17RegexpExtractImplILb1EE23execute_impl_const_argsEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ _ZN5doris17RegexpExtractImplILb0EE23execute_impl_const_argsEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ Line | Count | Source | 578 | 1 | ColumnString::Offsets& result_offset, NullMap& null_map) { | 579 | 1 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); | 580 | 1 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); | 581 | 1 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); | 582 | | | 583 | 1 | const auto& index_data = index_col->get_int(0); | 584 | 1 | if (index_data < 0) { | 585 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { | 586 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) | 587 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); | 588 | 0 | } | 589 | 0 | return; | 590 | 0 | } | 591 | | | 592 | 8 | for (size_t i = 0; i < input_rows_count; ++i) { | 593 | 7 | if (null_map[i]) { | 594 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); | 595 | 0 | continue; | 596 | 0 | } | 597 | | | 598 | 7 | _execute_inner_loop<true>(context, str_col, pattern_col, index_data, result_data, | 599 | 7 | result_offset, null_map, i); | 600 | 7 | } | 601 | 1 | } |
|
602 | | template <bool Const> |
603 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
604 | | const ColumnString* pattern_col, const Int64 index_data, |
605 | | ColumnString::Chars& result_data, |
606 | | ColumnString::Offsets& result_offset, NullMap& null_map, |
607 | 136 | const size_t index_now) { |
608 | 136 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( |
609 | 136 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
610 | 136 | std::unique_ptr<RegexpExtractEngine> scoped_engine; |
611 | | |
612 | 136 | if (engine == nullptr) { |
613 | 78 | std::string error_str; |
614 | 78 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); |
615 | 78 | scoped_engine = std::make_unique<RegexpExtractEngine>(); |
616 | 78 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, |
617 | 78 | context->state()->enable_extended_regex()); |
618 | 78 | if (!st) { |
619 | 0 | context->add_warning(error_str.c_str()); |
620 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); |
621 | 0 | return; |
622 | 0 | } |
623 | 78 | engine = scoped_engine.get(); |
624 | 78 | } |
625 | | |
626 | 136 | const auto& str = str_col->get_data_at(index_now); |
627 | | |
628 | 136 | int max_matches = 1 + engine->number_of_capturing_groups(); |
629 | 136 | if (index_data >= max_matches) { |
630 | 84 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) |
631 | 84 | : StringOP::push_empty_string(index_now, result_data, result_offset); |
632 | 84 | return; |
633 | 84 | } |
634 | | |
635 | 52 | std::string match_result; |
636 | 52 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), |
637 | 52 | match_result); |
638 | | |
639 | 52 | if (!success) { |
640 | 13 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) |
641 | 13 | : StringOP::push_empty_string(index_now, result_data, result_offset); |
642 | 13 | return; |
643 | 13 | } |
644 | | |
645 | 39 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), |
646 | 39 | index_now, result_data, result_offset); |
647 | 39 | } Unexecuted instantiation: _ZN5doris17RegexpExtractImplILb1EE19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m _ZN5doris17RegexpExtractImplILb1EE19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m Line | Count | Source | 607 | 18 | const size_t index_now) { | 608 | 18 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 609 | 18 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 610 | 18 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 611 | | | 612 | 18 | if (engine == nullptr) { | 613 | 0 | std::string error_str; | 614 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 615 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 616 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 617 | 0 | context->state()->enable_extended_regex()); | 618 | 0 | if (!st) { | 619 | 0 | context->add_warning(error_str.c_str()); | 620 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 621 | 0 | return; | 622 | 0 | } | 623 | 0 | engine = scoped_engine.get(); | 624 | 0 | } | 625 | | | 626 | 18 | const auto& str = str_col->get_data_at(index_now); | 627 | | | 628 | 18 | int max_matches = 1 + engine->number_of_capturing_groups(); | 629 | 18 | if (index_data >= max_matches) { | 630 | 1 | ReturnNull ? 
StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 631 | 1 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 632 | 1 | return; | 633 | 1 | } | 634 | | | 635 | 17 | std::string match_result; | 636 | 17 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), | 637 | 17 | match_result); | 638 | | | 639 | 17 | if (!success) { | 640 | 1 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 641 | 1 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 642 | 1 | return; | 643 | 1 | } | 644 | | | 645 | 16 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), | 646 | 16 | index_now, result_data, result_offset); | 647 | 16 | } |
_ZN5doris17RegexpExtractImplILb0EE19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m Line | Count | Source | 607 | 7 | const size_t index_now) { | 608 | 7 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 609 | 7 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 610 | 7 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 611 | | | 612 | 7 | if (engine == nullptr) { | 613 | 0 | std::string error_str; | 614 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 615 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 616 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 617 | 0 | context->state()->enable_extended_regex()); | 618 | 0 | if (!st) { | 619 | 0 | context->add_warning(error_str.c_str()); | 620 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 621 | 0 | return; | 622 | 0 | } | 623 | 0 | engine = scoped_engine.get(); | 624 | 0 | } | 625 | | | 626 | 7 | const auto& str = str_col->get_data_at(index_now); | 627 | | | 628 | 7 | int max_matches = 1 + engine->number_of_capturing_groups(); | 629 | 7 | if (index_data >= max_matches) { | 630 | 0 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 631 | 0 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 632 | 0 | return; | 633 | 0 | } | 634 | | | 635 | 7 | std::string match_result; | 636 | 7 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), | 637 | 7 | match_result); | 638 | | | 639 | 7 | if (!success) { | 640 | 7 | ReturnNull ? 
StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 641 | 7 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 642 | 7 | return; | 643 | 7 | } | 644 | | | 645 | 0 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), | 646 | 0 | index_now, result_data, result_offset); | 647 | 0 | } |
_ZN5doris17RegexpExtractImplILb0EE19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m Line | Count | Source | 607 | 111 | const size_t index_now) { | 608 | 111 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 609 | 111 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 610 | 111 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 611 | | | 612 | 111 | if (engine == nullptr) { | 613 | 78 | std::string error_str; | 614 | 78 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 615 | 78 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 616 | 78 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 617 | 78 | context->state()->enable_extended_regex()); | 618 | 78 | if (!st) { | 619 | 0 | context->add_warning(error_str.c_str()); | 620 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 621 | 0 | return; | 622 | 0 | } | 623 | 78 | engine = scoped_engine.get(); | 624 | 78 | } | 625 | | | 626 | 111 | const auto& str = str_col->get_data_at(index_now); | 627 | | | 628 | 111 | int max_matches = 1 + engine->number_of_capturing_groups(); | 629 | 111 | if (index_data >= max_matches) { | 630 | 83 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 631 | 83 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 632 | 83 | return; | 633 | 83 | } | 634 | | | 635 | 28 | std::string match_result; | 636 | 28 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), | 637 | 28 | match_result); | 638 | | | 639 | 28 | if (!success) { | 640 | 5 | ReturnNull ? 
StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 641 | 5 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 642 | 5 | return; | 643 | 5 | } | 644 | | | 645 | 23 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), | 646 | 23 | index_now, result_data, result_offset); | 647 | 23 | } |
|
648 | | }; |
649 | | |
650 | | struct RegexpExtractAllImpl { |
651 | | static constexpr auto name = "regexp_extract_all"; |
652 | | |
653 | 0 | size_t get_number_of_arguments() const { return 2; } |
654 | | |
655 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
656 | | size_t input_rows_count, ColumnString::Chars& result_data, |
657 | 35 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
658 | 35 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
659 | 35 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
660 | 116 | for (int i = 0; i < input_rows_count; ++i) { |
661 | 81 | if (null_map[i]) { |
662 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
663 | 0 | continue; |
664 | 0 | } |
665 | 81 | _execute_inner_loop<false>(context, str_col, pattern_col, result_data, result_offset, |
666 | 81 | null_map, i); |
667 | 81 | } |
668 | 35 | } |
669 | | |
670 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], |
671 | | size_t input_rows_count, ColumnString::Chars& result_data, |
672 | 8 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
673 | 8 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
674 | 8 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
675 | 22 | for (int i = 0; i < input_rows_count; ++i) { |
676 | 14 | if (null_map[i]) { |
677 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
678 | 0 | continue; |
679 | 0 | } |
680 | 14 | _execute_inner_loop<true>(context, str_col, pattern_col, result_data, result_offset, |
681 | 14 | null_map, i); |
682 | 14 | } |
683 | 8 | } |
684 | | template <bool Const> |
685 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
686 | | const ColumnString* pattern_col, |
687 | | ColumnString::Chars& result_data, |
688 | | ColumnString::Offsets& result_offset, NullMap& null_map, |
689 | 95 | const size_t index_now) { |
690 | 95 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( |
691 | 95 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
692 | 95 | std::unique_ptr<RegexpExtractEngine> scoped_engine; |
693 | | |
694 | 95 | if (engine == nullptr) { |
695 | 64 | std::string error_str; |
696 | 64 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); |
697 | 64 | scoped_engine = std::make_unique<RegexpExtractEngine>(); |
698 | 64 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, |
699 | 64 | context->state()->enable_extended_regex()); |
700 | 64 | if (!st) { |
701 | 0 | context->add_warning(error_str.c_str()); |
702 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); |
703 | 0 | return; |
704 | 0 | } |
705 | 64 | engine = scoped_engine.get(); |
706 | 64 | } |
707 | | |
708 | 95 | if (engine->number_of_capturing_groups() == 0) { |
709 | 65 | StringOP::push_empty_string(index_now, result_data, result_offset); |
710 | 65 | return; |
711 | 65 | } |
712 | 30 | const auto& str = str_col->get_data_at(index_now); |
713 | 30 | std::vector<std::string> res_matches; |
714 | 30 | engine->match_all_and_extract(str.data, str.size, res_matches); |
715 | | |
716 | 30 | if (res_matches.empty()) { |
717 | 10 | StringOP::push_empty_string(index_now, result_data, result_offset); |
718 | 10 | return; |
719 | 10 | } |
720 | | |
721 | 20 | std::string res = "["; |
722 | 59 | for (int j = 0; j < res_matches.size(); ++j) { |
723 | 39 | res += "'" + res_matches[j] + "'"; |
724 | 39 | if (j < res_matches.size() - 1) { |
725 | 19 | res += ","; |
726 | 19 | } |
727 | 39 | } |
728 | 20 | res += "]"; |
729 | 20 | StringOP::push_value_string(std::string_view(res), index_now, result_data, result_offset); |
730 | 20 | } _ZN5doris20RegexpExtractAllImpl19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_RNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS8_IjLm4096ESB_Lm16ELm15EEESD_m Line | Count | Source | 689 | 14 | const size_t index_now) { | 690 | 14 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 691 | 14 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 692 | 14 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 693 | | | 694 | 14 | if (engine == nullptr) { | 695 | 0 | std::string error_str; | 696 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 697 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 698 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 699 | 0 | context->state()->enable_extended_regex()); | 700 | 0 | if (!st) { | 701 | 0 | context->add_warning(error_str.c_str()); | 702 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 703 | 0 | return; | 704 | 0 | } | 705 | 0 | engine = scoped_engine.get(); | 706 | 0 | } | 707 | | | 708 | 14 | if (engine->number_of_capturing_groups() == 0) { | 709 | 0 | StringOP::push_empty_string(index_now, result_data, result_offset); | 710 | 0 | return; | 711 | 0 | } | 712 | 14 | const auto& str = str_col->get_data_at(index_now); | 713 | 14 | std::vector<std::string> res_matches; | 714 | 14 | engine->match_all_and_extract(str.data, str.size, res_matches); | 715 | | | 716 | 14 | if (res_matches.empty()) { | 717 | 7 | StringOP::push_empty_string(index_now, result_data, result_offset); | 718 | 7 | return; | 719 | 7 | } | 720 | | | 721 | 7 | std::string res = "["; | 722 | 19 | for (int j = 0; j < res_matches.size(); ++j) { | 723 | 12 | res += "'" + res_matches[j] + "'"; | 724 | 12 | if (j < res_matches.size() - 1) { | 725 | 5 | res += ","; | 726 | 5 | } | 727 | 12 | } | 728 | 7 | res += "]"; | 729 | 7 | 
StringOP::push_value_string(std::string_view(res), index_now, result_data, result_offset); | 730 | 7 | } |
_ZN5doris20RegexpExtractAllImpl19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_RNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS8_IjLm4096ESB_Lm16ELm15EEESD_m Line | Count | Source | 689 | 81 | const size_t index_now) { | 690 | 81 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 691 | 81 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 692 | 81 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 693 | | | 694 | 81 | if (engine == nullptr) { | 695 | 64 | std::string error_str; | 696 | 64 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 697 | 64 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 698 | 64 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 699 | 64 | context->state()->enable_extended_regex()); | 700 | 64 | if (!st) { | 701 | 0 | context->add_warning(error_str.c_str()); | 702 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 703 | 0 | return; | 704 | 0 | } | 705 | 64 | engine = scoped_engine.get(); | 706 | 64 | } | 707 | | | 708 | 81 | if (engine->number_of_capturing_groups() == 0) { | 709 | 65 | StringOP::push_empty_string(index_now, result_data, result_offset); | 710 | 65 | return; | 711 | 65 | } | 712 | 16 | const auto& str = str_col->get_data_at(index_now); | 713 | 16 | std::vector<std::string> res_matches; | 714 | 16 | engine->match_all_and_extract(str.data, str.size, res_matches); | 715 | | | 716 | 16 | if (res_matches.empty()) { | 717 | 3 | StringOP::push_empty_string(index_now, result_data, result_offset); | 718 | 3 | return; | 719 | 3 | } | 720 | | | 721 | 13 | std::string res = "["; | 722 | 40 | for (int j = 0; j < res_matches.size(); ++j) { | 723 | 27 | res += "'" + res_matches[j] + "'"; | 724 | 27 | if (j < res_matches.size() - 1) { | 725 | 14 | res += ","; | 726 | 14 | } | 727 | 27 | } | 728 | 13 | res += "]"; | 729 | 13 | 
StringOP::push_value_string(std::string_view(res), index_now, result_data, result_offset); | 730 | 13 | } |
|
731 | | }; |
732 | | |
733 | | // template FunctionRegexpFunctionality is used for regexp_xxxx series functions, not for regexp match. |
734 | | template <typename Impl> |
735 | | class FunctionRegexpFunctionality : public IFunction { |
736 | | public: |
737 | | static constexpr auto name = Impl::name; |
738 | | |
739 | 117 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); }_ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE6createEv Line | Count | Source | 739 | 30 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); } |
_ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE6createEv Line | Count | Source | 739 | 43 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); } |
_ZN5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE6createEv Line | Count | Source | 739 | 44 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); } |
|
740 | | |
741 | 3 | String get_name() const override { return name; }_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE8get_nameB5cxx11Ev Line | Count | Source | 741 | 1 | String get_name() const override { return name; } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE8get_nameB5cxx11Ev Line | Count | Source | 741 | 1 | String get_name() const override { return name; } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE8get_nameB5cxx11Ev Line | Count | Source | 741 | 1 | String get_name() const override { return name; } |
|
742 | | |
743 | 90 | size_t get_number_of_arguments() const override { |
744 | 90 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { |
745 | 35 | return 2; |
746 | 35 | } |
747 | 0 | return 3; |
748 | 90 | } _ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE23get_number_of_argumentsEv Line | Count | Source | 743 | 21 | size_t get_number_of_arguments() const override { | 744 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 745 | | return 2; | 746 | | } | 747 | 21 | return 3; | 748 | 21 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE23get_number_of_argumentsEv Line | Count | Source | 743 | 34 | size_t get_number_of_arguments() const override { | 744 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 745 | | return 2; | 746 | | } | 747 | 34 | return 3; | 748 | 34 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE23get_number_of_argumentsEv Line | Count | Source | 743 | 35 | size_t get_number_of_arguments() const override { | 744 | 35 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 745 | 35 | return 2; | 746 | 35 | } | 747 | 0 | return 3; | 748 | 35 | } |
|
749 | | |
750 | 90 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
751 | 90 | return make_nullable(std::make_shared<DataTypeString>()); |
752 | 90 | } _ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 750 | 21 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 751 | 21 | return make_nullable(std::make_shared<DataTypeString>()); | 752 | 21 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 750 | 34 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 751 | 34 | return make_nullable(std::make_shared<DataTypeString>()); | 752 | 34 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE Line | Count | Source | 750 | 35 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 751 | 35 | return make_nullable(std::make_shared<DataTypeString>()); | 752 | 35 | } |
|
753 | | |
754 | 281 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
755 | 281 | if (scope == FunctionContext::THREAD_LOCAL) { |
756 | 191 | if (context->is_col_constant(1)) { |
757 | 125 | DCHECK(!context->get_function_state(scope)); |
758 | 125 | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
759 | 125 | const auto& pattern = pattern_col->get_data_at(0); |
760 | 125 | if (pattern.size == 0) { |
761 | 3 | return Status::OK(); |
762 | 3 | } |
763 | | |
764 | 122 | std::string error_str; |
765 | 122 | auto engine = std::make_shared<RegexpExtractEngine>(); |
766 | 122 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, |
767 | 122 | context->state()->enable_extended_regex()); |
768 | 122 | if (!st) { |
769 | 3 | context->set_error(error_str.c_str()); |
770 | 3 | return Status::InvalidArgument(error_str); |
771 | 3 | } |
772 | 119 | context->set_function_state(scope, engine); |
773 | 119 | } |
774 | 191 | } |
775 | 275 | return Status::OK(); |
776 | 281 | } _ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 754 | 52 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 755 | 52 | if (scope == FunctionContext::THREAD_LOCAL) { | 756 | 31 | if (context->is_col_constant(1)) { | 757 | 31 | DCHECK(!context->get_function_state(scope)); | 758 | 31 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 759 | 31 | const auto& pattern = pattern_col->get_data_at(0); | 760 | 31 | if (pattern.size == 0) { | 761 | 1 | return Status::OK(); | 762 | 1 | } | 763 | | | 764 | 30 | std::string error_str; | 765 | 30 | auto engine = std::make_shared<RegexpExtractEngine>(); | 766 | 30 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, | 767 | 30 | context->state()->enable_extended_regex()); | 768 | 30 | if (!st) { | 769 | 1 | context->set_error(error_str.c_str()); | 770 | 1 | return Status::InvalidArgument(error_str); | 771 | 1 | } | 772 | 29 | context->set_function_state(scope, engine); | 773 | 29 | } | 774 | 31 | } | 775 | 50 | return Status::OK(); | 776 | 52 | } |
_ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 754 | 114 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 755 | 114 | if (scope == FunctionContext::THREAD_LOCAL) { | 756 | 80 | if (context->is_col_constant(1)) { | 757 | 44 | DCHECK(!context->get_function_state(scope)); | 758 | 44 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 759 | 44 | const auto& pattern = pattern_col->get_data_at(0); | 760 | 44 | if (pattern.size == 0) { | 761 | 1 | return Status::OK(); | 762 | 1 | } | 763 | | | 764 | 43 | std::string error_str; | 765 | 43 | auto engine = std::make_shared<RegexpExtractEngine>(); | 766 | 43 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, | 767 | 43 | context->state()->enable_extended_regex()); | 768 | 43 | if (!st) { | 769 | 1 | context->set_error(error_str.c_str()); | 770 | 1 | return Status::InvalidArgument(error_str); | 771 | 1 | } | 772 | 42 | context->set_function_state(scope, engine); | 773 | 42 | } | 774 | 80 | } | 775 | 112 | return Status::OK(); | 776 | 114 | } |
_ZN5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE4openEPNS_15FunctionContextENS3_18FunctionStateScopeE Line | Count | Source | 754 | 115 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 755 | 115 | if (scope == FunctionContext::THREAD_LOCAL) { | 756 | 80 | if (context->is_col_constant(1)) { | 757 | 50 | DCHECK(!context->get_function_state(scope)); | 758 | 50 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 759 | 50 | const auto& pattern = pattern_col->get_data_at(0); | 760 | 50 | if (pattern.size == 0) { | 761 | 1 | return Status::OK(); | 762 | 1 | } | 763 | | | 764 | 49 | std::string error_str; | 765 | 49 | auto engine = std::make_shared<RegexpExtractEngine>(); | 766 | 49 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, | 767 | 49 | context->state()->enable_extended_regex()); | 768 | 49 | if (!st) { | 769 | 1 | context->set_error(error_str.c_str()); | 770 | 1 | return Status::InvalidArgument(error_str); | 771 | 1 | } | 772 | 48 | context->set_function_state(scope, engine); | 773 | 48 | } | 774 | 80 | } | 775 | 113 | return Status::OK(); | 776 | 115 | } |
|
777 | | |
778 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
779 | 103 | uint32_t result, size_t input_rows_count) const override { |
780 | 103 | size_t argument_size = arguments.size(); |
781 | | |
782 | 103 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); |
783 | 103 | auto result_data_column = ColumnString::create(); |
784 | 103 | auto& result_data = result_data_column->get_chars(); |
785 | 103 | auto& result_offset = result_data_column->get_offsets(); |
786 | 103 | result_offset.resize(input_rows_count); |
787 | | |
788 | 103 | bool col_const[3]; |
789 | 103 | ColumnPtr argument_columns[3]; |
790 | 369 | for (int i = 0; i < argument_size; ++i) { |
791 | 266 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); |
792 | 266 | } |
793 | 103 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( |
794 | 4 | *block.get_by_position(arguments[0]).column) |
795 | 4 | .convert_to_full_column() |
796 | 103 | : block.get_by_position(arguments[0]).column; |
797 | 103 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { |
798 | 43 | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, |
799 | 43 | arguments); |
800 | 60 | } else { |
801 | 60 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, |
802 | 60 | arguments); |
803 | 60 | } |
804 | | |
805 | 103 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { |
806 | 43 | if (col_const[1]) { |
807 | 8 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, |
808 | 8 | result_data, result_offset, |
809 | 8 | result_null_map->get_data()); |
810 | 35 | } else { |
811 | 35 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, |
812 | 35 | result_offset, result_null_map->get_data()); |
813 | 35 | } |
814 | 60 | } else { |
815 | 60 | if (col_const[1] && col_const[2]) { |
816 | 1 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, |
817 | 1 | result_data, result_offset, |
818 | 1 | result_null_map->get_data()); |
819 | 59 | } else { |
820 | 59 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, |
821 | 59 | result_offset, result_null_map->get_data()); |
822 | 59 | } |
823 | 60 | } |
824 | | |
825 | 103 | block.get_by_position(result).column = |
826 | 103 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); |
827 | 103 | return Status::OK(); |
828 | 103 | } _ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 779 | 18 | uint32_t result, size_t input_rows_count) const override { | 780 | 18 | size_t argument_size = arguments.size(); | 781 | | | 782 | 18 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 783 | 18 | auto result_data_column = ColumnString::create(); | 784 | 18 | auto& result_data = result_data_column->get_chars(); | 785 | 18 | auto& result_offset = result_data_column->get_offsets(); | 786 | 18 | result_offset.resize(input_rows_count); | 787 | | | 788 | 18 | bool col_const[3]; | 789 | 18 | ColumnPtr argument_columns[3]; | 790 | 72 | for (int i = 0; i < argument_size; ++i) { | 791 | 54 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 792 | 54 | } | 793 | 18 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 794 | 0 | *block.get_by_position(arguments[0]).column) | 795 | 0 | .convert_to_full_column() | 796 | 18 | : block.get_by_position(arguments[0]).column; | 797 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 798 | | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, | 799 | | arguments); | 800 | 18 | } else { | 801 | 18 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, | 802 | 18 | arguments); | 803 | 18 | } | 804 | | | 805 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 806 | | if (col_const[1]) { | 807 | | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 808 | | result_data, result_offset, | 809 | | result_null_map->get_data()); | 810 | | } else { | 811 | | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 812 | | result_offset, result_null_map->get_data()); | 813 | | } | 814 | 18 | } else { | 815 | 18 | if (col_const[1] && col_const[2]) { | 816 | 0 | 
Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 817 | 0 | result_data, result_offset, | 818 | 0 | result_null_map->get_data()); | 819 | 18 | } else { | 820 | 18 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 821 | 18 | result_offset, result_null_map->get_data()); | 822 | 18 | } | 823 | 18 | } | 824 | | | 825 | 18 | block.get_by_position(result).column = | 826 | 18 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 827 | 18 | return Status::OK(); | 828 | 18 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 779 | 42 | uint32_t result, size_t input_rows_count) const override { | 780 | 42 | size_t argument_size = arguments.size(); | 781 | | | 782 | 42 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 783 | 42 | auto result_data_column = ColumnString::create(); | 784 | 42 | auto& result_data = result_data_column->get_chars(); | 785 | 42 | auto& result_offset = result_data_column->get_offsets(); | 786 | 42 | result_offset.resize(input_rows_count); | 787 | | | 788 | 42 | bool col_const[3]; | 789 | 42 | ColumnPtr argument_columns[3]; | 790 | 168 | for (int i = 0; i < argument_size; ++i) { | 791 | 126 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 792 | 126 | } | 793 | 42 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 794 | 3 | *block.get_by_position(arguments[0]).column) | 795 | 3 | .convert_to_full_column() | 796 | 42 | : block.get_by_position(arguments[0]).column; | 797 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 798 | | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, | 799 | | arguments); | 800 | 42 | } else { | 801 | 42 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, | 802 | 42 | arguments); | 803 | 42 | } | 804 | | | 805 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 806 | | if (col_const[1]) { | 807 | | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 808 | | result_data, result_offset, | 809 | | result_null_map->get_data()); | 810 | | } else { | 811 | | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 812 | | result_offset, result_null_map->get_data()); | 813 | | } | 814 | 42 | } else { | 815 | 42 | if (col_const[1] && col_const[2]) { | 816 | 1 | 
Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 817 | 1 | result_data, result_offset, | 818 | 1 | result_null_map->get_data()); | 819 | 41 | } else { | 820 | 41 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 821 | 41 | result_offset, result_null_map->get_data()); | 822 | 41 | } | 823 | 42 | } | 824 | | | 825 | 42 | block.get_by_position(result).column = | 826 | 42 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 827 | 42 | return Status::OK(); | 828 | 42 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 779 | 43 | uint32_t result, size_t input_rows_count) const override { | 780 | 43 | size_t argument_size = arguments.size(); | 781 | | | 782 | 43 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 783 | 43 | auto result_data_column = ColumnString::create(); | 784 | 43 | auto& result_data = result_data_column->get_chars(); | 785 | 43 | auto& result_offset = result_data_column->get_offsets(); | 786 | 43 | result_offset.resize(input_rows_count); | 787 | | | 788 | 43 | bool col_const[3]; | 789 | 43 | ColumnPtr argument_columns[3]; | 790 | 129 | for (int i = 0; i < argument_size; ++i) { | 791 | 86 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 792 | 86 | } | 793 | 43 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 794 | 1 | *block.get_by_position(arguments[0]).column) | 795 | 1 | .convert_to_full_column() | 796 | 43 | : block.get_by_position(arguments[0]).column; | 797 | 43 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 798 | 43 | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, | 799 | 43 | arguments); | 800 | | } else { | 801 | | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, | 802 | | arguments); | 803 | | } | 804 | | | 805 | 43 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 806 | 43 | if (col_const[1]) { | 807 | 8 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 808 | 8 | result_data, result_offset, | 809 | 8 | result_null_map->get_data()); | 810 | 35 | } else { | 811 | 35 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 812 | 35 | result_offset, result_null_map->get_data()); | 813 | 35 | } | 814 | | } else { | 815 | | if (col_const[1] && col_const[2]) { | 816 | | 
Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 817 | | result_data, result_offset, | 818 | | result_null_map->get_data()); | 819 | | } else { | 820 | | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 821 | | result_offset, result_null_map->get_data()); | 822 | | } | 823 | | } | 824 | | | 825 | 43 | block.get_by_position(result).column = | 826 | 43 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 827 | 43 | return Status::OK(); | 828 | 43 | } |
|
829 | | }; |
830 | | |
831 | 8 | void register_function_regexp_extract(SimpleFunctionFactory& factory) { |
832 | 8 | factory.register_function<FunctionRegexpReplace<RegexpReplaceImpl, ThreeParamTypes>>(); |
833 | 8 | factory.register_function<FunctionRegexpReplace<RegexpReplaceImpl, FourParamTypes>>(); |
834 | 8 | factory.register_function<FunctionRegexpReplace<RegexpReplaceOneImpl, ThreeParamTypes>>(); |
835 | 8 | factory.register_function<FunctionRegexpReplace<RegexpReplaceOneImpl, FourParamTypes>>(); |
836 | 8 | factory.register_function<FunctionRegexpFunctionality<RegexpExtractImpl<true>>>(); |
837 | 8 | factory.register_function<FunctionRegexpFunctionality<RegexpExtractImpl<false>>>(); |
838 | 8 | factory.register_function<FunctionRegexpFunctionality<RegexpExtractAllImpl>>(); |
839 | 8 | factory.register_function<FunctionRegexpCount>(); |
840 | 8 | } |
841 | | |
842 | | } // namespace doris |