be/src/exprs/function/function_regexp.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <glog/logging.h> |
19 | | #include <re2/re2.h> |
20 | | #include <re2/stringpiece.h> |
21 | | #include <stddef.h> |
22 | | |
23 | | #include <boost/regex.hpp> |
24 | | #include <memory> |
25 | | #include <string> |
26 | | #include <string_view> |
27 | | #include <type_traits> |
28 | | #include <utility> |
29 | | #include <vector> |
30 | | |
31 | | #include "common/status.h" |
32 | | #include "core/block/block.h" |
33 | | #include "core/block/column_numbers.h" |
34 | | #include "core/block/column_with_type_and_name.h" |
35 | | #include "core/column/column.h" |
36 | | #include "core/column/column_const.h" |
37 | | #include "core/column/column_nullable.h" |
38 | | #include "core/column/column_string.h" |
39 | | #include "core/column/column_vector.h" |
40 | | #include "core/data_type/data_type.h" |
41 | | #include "core/data_type/data_type_nullable.h" |
42 | | #include "core/data_type/data_type_number.h" |
43 | | #include "core/data_type/data_type_string.h" |
44 | | #include "core/string_ref.h" |
45 | | #include "core/types.h" |
46 | | #include "exec/common/stringop_substring.h" |
47 | | #include "exprs/aggregate/aggregate_function.h" |
48 | | #include "exprs/function/function.h" |
49 | | #include "exprs/function/simple_function_factory.h" |
50 | | #include "exprs/function_context.h" |
51 | | #include "exprs/string_functions.h" |
52 | | |
53 | | namespace doris { |
54 | | |
55 | | // Helper structure to hold either RE2 or Boost.Regex |
56 | | struct RegexpExtractEngine { |
57 | | std::unique_ptr<re2::RE2> re2_regex; |
58 | | std::unique_ptr<boost::regex> boost_regex; |
59 | | |
60 | 0 | bool is_boost() const { return boost_regex != nullptr; } |
61 | 46 | bool is_re2() const { return re2_regex != nullptr; } |
62 | | |
63 | | // Try to compile with RE2 first, fallback to Boost.Regex if RE2 fails |
64 | | static bool compile(const StringRef& pattern, std::string* error_str, |
65 | 26 | RegexpExtractEngine& engine, bool enable_extended_regex) { |
66 | 26 | re2::RE2::Options options; |
67 | 26 | options.set_log_errors(false); // avoid RE2 printing to stderr; we handle errors ourselves |
68 | 26 | options.set_dot_nl(true); // make '.' match '\n' by default, consistent with REGEXP/LIKE |
69 | 26 | engine.re2_regex = |
70 | 26 | std::make_unique<re2::RE2>(re2::StringPiece(pattern.data, pattern.size), options); |
71 | | |
72 | 26 | if (engine.re2_regex->ok()) { |
73 | 26 | return true; |
74 | 26 | } else if (!enable_extended_regex) { |
75 | 0 | *error_str = fmt::format( |
76 | 0 | "Invalid regex pattern: {}. Error: {}. If you need advanced regex features, " |
77 | 0 | "try setting enable_extended_regex=true", |
78 | 0 | std::string(pattern.data, pattern.size), engine.re2_regex->error()); |
79 | 0 | return false; |
80 | 0 | } |
81 | | |
82 | | // RE2 failed, try Boost.Regex for advanced features like zero-width assertions |
83 | 0 | engine.re2_regex.reset(); |
84 | 0 | try { |
85 | 0 | boost::regex::flag_type flags = boost::regex::normal; |
86 | 0 | engine.boost_regex = std::make_unique<boost::regex>(pattern.data, |
87 | 0 | pattern.data + pattern.size, flags); |
88 | 0 | return true; |
89 | 0 | } catch (const boost::regex_error& e) { |
90 | 0 | if (error_str) { |
91 | 0 | *error_str = fmt::format("Invalid regex pattern: {}. Error: {}", |
92 | 0 | std::string(pattern.data, pattern.size), e.what()); |
93 | 0 | } |
94 | 0 | return false; |
95 | 0 | } |
96 | 0 | } |
97 | | |
98 | | // Get number of capturing groups |
99 | 23 | int number_of_capturing_groups() const { |
100 | 23 | if (is_re2()) { |
101 | 23 | return re2_regex->NumberOfCapturingGroups(); |
102 | 23 | } else if (is_boost()) { |
103 | 0 | return static_cast<int>(boost_regex->mark_count()); |
104 | 0 | } |
105 | 0 | return 0; |
106 | 23 | } |
107 | | |
108 | | // Match function for extraction |
109 | 16 | bool match_and_extract(const char* data, size_t size, int index, std::string& result) const { |
110 | 16 | if (is_re2()) { |
111 | 16 | int max_matches = 1 + re2_regex->NumberOfCapturingGroups(); |
112 | 16 | if (index >= max_matches) { |
113 | 0 | return false; |
114 | 0 | } |
115 | 16 | std::vector<re2::StringPiece> matches(max_matches); |
116 | 16 | bool success = re2_regex->Match(re2::StringPiece(data, size), 0, size, |
117 | 16 | re2::RE2::UNANCHORED, matches.data(), max_matches); |
118 | 16 | if (success && index < matches.size()) { |
119 | 16 | const re2::StringPiece& match = matches[index]; |
120 | 16 | result.assign(match.data(), match.size()); |
121 | 16 | return true; |
122 | 16 | } |
123 | 0 | return false; |
124 | 16 | } else if (is_boost()) { |
125 | 0 | boost::cmatch matches; |
126 | 0 | bool success = boost::regex_search(data, data + size, matches, *boost_regex); |
127 | 0 | if (success && index < matches.size()) { |
128 | 0 | result = matches[index].str(); |
129 | 0 | return true; |
130 | 0 | } |
131 | 0 | return false; |
132 | 0 | } |
133 | 0 | return false; |
134 | 16 | } |
135 | | |
136 | | // Match all occurrences and extract the first capturing group |
137 | | void match_all_and_extract(const char* data, size_t size, |
138 | 7 | std::vector<std::string>& results) const { |
139 | 7 | if (is_re2()) { |
140 | 7 | int max_matches = 1 + re2_regex->NumberOfCapturingGroups(); |
141 | 7 | if (max_matches < 2) { |
142 | 0 | return; // No capturing groups |
143 | 0 | } |
144 | | |
145 | 7 | size_t pos = 0; |
146 | 19 | while (pos < size) { |
147 | 18 | const char* str_pos = data + pos; |
148 | 18 | size_t str_size = size - pos; |
149 | 18 | std::vector<re2::StringPiece> matches(max_matches); |
150 | 18 | bool success = re2_regex->Match(re2::StringPiece(str_pos, str_size), 0, str_size, |
151 | 18 | re2::RE2::UNANCHORED, matches.data(), max_matches); |
152 | 18 | if (!success) { |
153 | 6 | break; |
154 | 6 | } |
155 | 12 | if (matches[0].empty()) { |
156 | 0 | pos += 1; |
157 | 0 | continue; |
158 | 0 | } |
159 | | // Extract first capturing group |
160 | 12 | if (matches.size() > 1 && !matches[1].empty()) { |
161 | 12 | results.emplace_back(matches[1].data(), matches[1].size()); |
162 | 12 | } |
163 | | // Move position forward |
164 | 12 | auto offset = std::string(str_pos, str_size) |
165 | 12 | .find(std::string(matches[0].data(), matches[0].size())); |
166 | 12 | pos += offset + matches[0].size(); |
167 | 12 | } |
168 | 7 | } else if (is_boost()) { |
169 | 0 | const char* search_start = data; |
170 | 0 | const char* search_end = data + size; |
171 | 0 | boost::match_results<const char*> matches; |
172 | |
|
173 | 0 | while (boost::regex_search(search_start, search_end, matches, *boost_regex)) { |
174 | 0 | if (matches.size() > 1 && matches[1].matched) { |
175 | 0 | results.emplace_back(matches[1].str()); |
176 | 0 | } |
177 | 0 | if (matches[0].length() == 0) { |
178 | 0 | if (search_start == search_end) { |
179 | 0 | break; |
180 | 0 | } |
181 | 0 | search_start += 1; |
182 | 0 | } else { |
183 | 0 | search_start = matches[0].second; |
184 | 0 | } |
185 | 0 | } |
186 | 0 | } |
187 | 7 | } |
188 | | }; |
189 | | |
190 | | struct RegexpCountImpl { |
191 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
192 | 0 | size_t input_rows_count, ColumnInt32::Container& result_data) { |
193 | 0 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
194 | 0 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
195 | 0 | for (int i = 0; i < input_rows_count; ++i) { |
196 | 0 | result_data[i] = _execute_inner_loop(context, str_col, pattern_col, i); |
197 | 0 | } |
198 | 0 | } |
199 | | static int _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
200 | 0 | const ColumnString* pattern_col, const size_t index_now) { |
201 | 0 | re2::RE2* re = reinterpret_cast<re2::RE2*>( |
202 | 0 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
203 | 0 | std::unique_ptr<re2::RE2> scoped_re; |
204 | 0 | if (re == nullptr) { |
205 | 0 | std::string error_str; |
206 | 0 | DCHECK(pattern_col); |
207 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, false)); |
208 | 0 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), StringRef(), |
209 | 0 | scoped_re); |
210 | 0 | if (!st) { |
211 | 0 | context->add_warning(error_str.c_str()); |
212 | 0 | throw Exception(Status::InvalidArgument(error_str)); |
213 | 0 | return 0; |
214 | 0 | } |
215 | 0 | re = scoped_re.get(); |
216 | 0 | } |
217 | | |
218 | 0 | const auto& str = str_col->get_data_at(index_now); |
219 | 0 | int count = 0; |
220 | 0 | size_t pos = 0; |
221 | 0 | while (pos < str.size) { |
222 | 0 | auto str_pos = str.data + pos; |
223 | 0 | auto str_size = str.size - pos; |
224 | 0 | re2::StringPiece str_sp_current = re2::StringPiece(str_pos, str_size); |
225 | 0 | re2::StringPiece match; |
226 | |
|
227 | 0 | bool success = re->Match(str_sp_current, 0, str_size, re2::RE2::UNANCHORED, &match, 1); |
228 | 0 | if (!success) { |
229 | 0 | break; |
230 | 0 | } |
231 | 0 | if (match.empty()) { |
232 | 0 | pos += 1; |
233 | 0 | continue; |
234 | 0 | } |
235 | 0 | count++; |
236 | 0 | size_t match_start = match.data() - str_sp_current.data(); |
237 | 0 | pos += match_start + match.size(); |
238 | 0 | } |
239 | |
|
240 | 0 | return count; |
241 | 0 | } |
242 | | }; |
243 | | |
244 | | class FunctionRegexpCount : public IFunction { |
245 | | public: |
246 | | static constexpr auto name = "regexp_count"; |
247 | | |
248 | 2 | static FunctionPtr create() { return std::make_shared<FunctionRegexpCount>(); } |
249 | | |
250 | 1 | String get_name() const override { return name; } |
251 | | |
252 | 0 | size_t get_number_of_arguments() const override { return 2; } |
253 | | |
254 | 0 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
255 | 0 | return std::make_shared<DataTypeInt32>(); |
256 | 0 | } |
257 | | |
258 | 0 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
259 | 0 | if (scope == FunctionContext::THREAD_LOCAL) { |
260 | 0 | if (context->is_col_constant(1)) { |
261 | 0 | DCHECK(!context->get_function_state(scope)); |
262 | 0 | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
263 | 0 | const auto& pattern = pattern_col->get_data_at(0); |
264 | 0 | if (pattern.size == 0) { |
265 | 0 | return Status::OK(); |
266 | 0 | } |
267 | | |
268 | 0 | std::string error_str; |
269 | 0 | std::unique_ptr<re2::RE2> scoped_re; |
270 | 0 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
271 | 0 | StringRef(), scoped_re); |
272 | 0 | if (!st) { |
273 | 0 | context->set_error(error_str.c_str()); |
274 | 0 | return Status::InvalidArgument(error_str); |
275 | 0 | } |
276 | 0 | std::shared_ptr<re2::RE2> re(scoped_re.release()); |
277 | 0 | context->set_function_state(scope, re); |
278 | 0 | } |
279 | 0 | } |
280 | 0 | return Status::OK(); |
281 | 0 | } |
282 | | |
283 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
284 | 0 | uint32_t result, size_t input_rows_count) const override { |
285 | 0 | auto result_data_column = ColumnInt32::create(input_rows_count); |
286 | 0 | auto& result_data = result_data_column->get_data(); |
287 | |
|
288 | 0 | ColumnPtr argument_columns[2]; |
289 | |
|
290 | 0 | argument_columns[0] = block.get_by_position(arguments[0]).column; |
291 | 0 | argument_columns[1] = block.get_by_position(arguments[1]).column; |
292 | 0 | RegexpCountImpl::execute_impl(context, argument_columns, input_rows_count, result_data); |
293 | |
|
294 | 0 | block.get_by_position(result).column = std::move(result_data_column); |
295 | 0 | return Status::OK(); |
296 | 0 | } |
297 | | }; |
298 | | |
299 | | struct ThreeParamTypes { |
300 | 2 | static DataTypes get_variadic_argument_types() { |
301 | 2 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
302 | 2 | std::make_shared<DataTypeString>()}; |
303 | 2 | } |
304 | | }; |
305 | | |
306 | | struct FourParamTypes { |
307 | 2 | static DataTypes get_variadic_argument_types() { |
308 | 2 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
309 | 2 | std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()}; |
310 | 2 | } |
311 | | }; |
312 | | |
313 | | // template FunctionRegexpFunctionality is used for regexp_replace/regexp_replace_one |
314 | | template <typename Impl, typename ParamTypes> |
315 | | class FunctionRegexpReplace : public IFunction { |
316 | | public: |
317 | | static constexpr auto name = Impl::name; |
318 | | |
319 | 24 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); }_ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE6createEv Line | Count | Source | 319 | 10 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
_ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE6createEv Line | Count | Source | 319 | 2 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE6createEv Line | Count | Source | 319 | 10 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE6createEv Line | Count | Source | 319 | 2 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
|
320 | | |
321 | 0 | String get_name() const override { return name; }Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE8get_nameB5cxx11Ev |
322 | | |
323 | 0 | size_t get_number_of_arguments() const override { |
324 | 0 | return get_variadic_argument_types_impl().size(); |
325 | 0 | } Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE23get_number_of_argumentsEv |
326 | | |
327 | 20 | bool is_variadic() const override { return true; }_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE11is_variadicEv Line | Count | Source | 327 | 9 | bool is_variadic() const override { return true; } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE11is_variadicEv Line | Count | Source | 327 | 1 | bool is_variadic() const override { return true; } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE11is_variadicEv Line | Count | Source | 327 | 9 | bool is_variadic() const override { return true; } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE11is_variadicEv Line | Count | Source | 327 | 1 | bool is_variadic() const override { return true; } |
|
328 | | |
329 | 16 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
330 | 16 | return make_nullable(std::make_shared<DataTypeString>()); |
331 | 16 | } _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 329 | 8 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 330 | 8 | return make_nullable(std::make_shared<DataTypeString>()); | 331 | 8 | } |
Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 329 | 8 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 330 | 8 | return make_nullable(std::make_shared<DataTypeString>()); | 331 | 8 | } |
Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE |
332 | | |
333 | 4 | DataTypes get_variadic_argument_types_impl() const override { |
334 | 4 | return ParamTypes::get_variadic_argument_types(); |
335 | 4 | } _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 333 | 1 | DataTypes get_variadic_argument_types_impl() const override { | 334 | 1 | return ParamTypes::get_variadic_argument_types(); | 335 | 1 | } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 333 | 1 | DataTypes get_variadic_argument_types_impl() const override { | 334 | 1 | return ParamTypes::get_variadic_argument_types(); | 335 | 1 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 333 | 1 | DataTypes get_variadic_argument_types_impl() const override { | 334 | 1 | return ParamTypes::get_variadic_argument_types(); | 335 | 1 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 333 | 1 | DataTypes get_variadic_argument_types_impl() const override { | 334 | 1 | return ParamTypes::get_variadic_argument_types(); | 335 | 1 | } |
|
336 | | |
337 | 32 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
338 | 32 | if (scope == FunctionContext::THREAD_LOCAL) { |
339 | 16 | if (context->is_col_constant(1)) { |
340 | 16 | DCHECK(!context->get_function_state(scope)); |
341 | 16 | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
342 | 16 | const auto& pattern = pattern_col->get_data_at(0); |
343 | 16 | if (pattern.size == 0) { |
344 | 4 | return Status::OK(); |
345 | 4 | } |
346 | | |
347 | 12 | std::string error_str; |
348 | 12 | std::unique_ptr<re2::RE2> scoped_re; |
349 | 12 | StringRef options_value; |
350 | 12 | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { |
351 | 0 | DCHECK_EQ(context->get_num_args(), 4); |
352 | 0 | DCHECK(context->is_col_constant(3)); |
353 | 0 | const auto options_col = context->get_constant_col(3)->column_ptr; |
354 | 0 | options_value = options_col->get_data_at(0); |
355 | 0 | } |
356 | | |
357 | 12 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
358 | 12 | options_value, scoped_re); |
359 | 12 | if (!st) { |
360 | 0 | context->set_error(error_str.c_str()); |
361 | 0 | return Status::InvalidArgument(error_str); |
362 | 0 | } |
363 | 12 | std::shared_ptr<re2::RE2> re(scoped_re.release()); |
364 | 12 | context->set_function_state(scope, re); |
365 | 12 | } |
366 | 16 | } |
367 | 28 | return Status::OK(); |
368 | 32 | } _ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 337 | 16 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 338 | 16 | if (scope == FunctionContext::THREAD_LOCAL) { | 339 | 8 | if (context->is_col_constant(1)) { | 340 | 8 | DCHECK(!context->get_function_state(scope)); | 341 | 8 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 342 | 8 | const auto& pattern = pattern_col->get_data_at(0); | 343 | 8 | if (pattern.size == 0) { | 344 | 2 | return Status::OK(); | 345 | 2 | } | 346 | | | 347 | 6 | std::string error_str; | 348 | 6 | std::unique_ptr<re2::RE2> scoped_re; | 349 | 6 | StringRef options_value; | 350 | | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 351 | | DCHECK_EQ(context->get_num_args(), 4); | 352 | | DCHECK(context->is_col_constant(3)); | 353 | | const auto options_col = context->get_constant_col(3)->column_ptr; | 354 | | options_value = options_col->get_data_at(0); | 355 | | } | 356 | | | 357 | 6 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 358 | 6 | options_value, scoped_re); | 359 | 6 | if (!st) { | 360 | 0 | context->set_error(error_str.c_str()); | 361 | 0 | return Status::InvalidArgument(error_str); | 362 | 0 | } | 363 | 6 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 364 | 6 | context->set_function_state(scope, re); | 365 | 6 | } | 366 | 8 | } | 367 | 14 | return Status::OK(); | 368 | 16 | } |
Unexecuted instantiation: _ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE _ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 337 | 16 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 338 | 16 | if (scope == FunctionContext::THREAD_LOCAL) { | 339 | 8 | if (context->is_col_constant(1)) { | 340 | 8 | DCHECK(!context->get_function_state(scope)); | 341 | 8 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 342 | 8 | const auto& pattern = pattern_col->get_data_at(0); | 343 | 8 | if (pattern.size == 0) { | 344 | 2 | return Status::OK(); | 345 | 2 | } | 346 | | | 347 | 6 | std::string error_str; | 348 | 6 | std::unique_ptr<re2::RE2> scoped_re; | 349 | 6 | StringRef options_value; | 350 | | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 351 | | DCHECK_EQ(context->get_num_args(), 4); | 352 | | DCHECK(context->is_col_constant(3)); | 353 | | const auto options_col = context->get_constant_col(3)->column_ptr; | 354 | | options_value = options_col->get_data_at(0); | 355 | | } | 356 | | | 357 | 6 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 358 | 6 | options_value, scoped_re); | 359 | 6 | if (!st) { | 360 | 0 | context->set_error(error_str.c_str()); | 361 | 0 | return Status::InvalidArgument(error_str); | 362 | 0 | } | 363 | 6 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 364 | 6 | context->set_function_state(scope, re); | 365 | 6 | } | 366 | 8 | } | 367 | 14 | return Status::OK(); | 368 | 16 | } |
Unexecuted instantiation: _ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE |
369 | | |
370 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
371 | 12 | uint32_t result, size_t input_rows_count) const override { |
372 | 12 | size_t argument_size = arguments.size(); |
373 | | |
374 | 12 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); |
375 | 12 | auto result_data_column = ColumnString::create(); |
376 | 12 | auto& result_data = result_data_column->get_chars(); |
377 | 12 | auto& result_offset = result_data_column->get_offsets(); |
378 | 12 | result_offset.resize(input_rows_count); |
379 | | |
380 | 12 | bool col_const[3]; |
381 | 12 | ColumnPtr argument_columns[3]; |
382 | 48 | for (int i = 0; i < 3; ++i) { |
383 | 36 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); |
384 | 36 | } |
385 | 12 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( |
386 | 0 | *block.get_by_position(arguments[0]).column) |
387 | 0 | .convert_to_full_column() |
388 | 12 | : block.get_by_position(arguments[0]).column; |
389 | | |
390 | 12 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); |
391 | | |
392 | 12 | StringRef options_value; |
393 | 12 | if (col_const[1] && col_const[2]) { |
394 | 0 | Impl::execute_impl_const_args(context, argument_columns, options_value, |
395 | 0 | input_rows_count, result_data, result_offset, |
396 | 0 | result_null_map->get_data()); |
397 | 12 | } else { |
398 | | // the options have check in FE, so is always const, and get idx of 0 |
399 | 12 | if (argument_size == 4) { |
400 | 0 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); |
401 | 0 | } |
402 | 12 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, |
403 | 12 | result_data, result_offset, result_null_map->get_data()); |
404 | 12 | } |
405 | | |
406 | 12 | block.get_by_position(result).column = |
407 | 12 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); |
408 | 12 | return Status::OK(); |
409 | 12 | } _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 371 | 6 | uint32_t result, size_t input_rows_count) const override { | 372 | 6 | size_t argument_size = arguments.size(); | 373 | | | 374 | 6 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 375 | 6 | auto result_data_column = ColumnString::create(); | 376 | 6 | auto& result_data = result_data_column->get_chars(); | 377 | 6 | auto& result_offset = result_data_column->get_offsets(); | 378 | 6 | result_offset.resize(input_rows_count); | 379 | | | 380 | 6 | bool col_const[3]; | 381 | 6 | ColumnPtr argument_columns[3]; | 382 | 24 | for (int i = 0; i < 3; ++i) { | 383 | 18 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 384 | 18 | } | 385 | 6 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 386 | 0 | *block.get_by_position(arguments[0]).column) | 387 | 0 | .convert_to_full_column() | 388 | 6 | : block.get_by_position(arguments[0]).column; | 389 | | | 390 | 6 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 391 | | | 392 | 6 | StringRef options_value; | 393 | 6 | if (col_const[1] && col_const[2]) { | 394 | 0 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 395 | 0 | input_rows_count, result_data, result_offset, | 396 | 0 | result_null_map->get_data()); | 397 | 6 | } else { | 398 | | // the options have check in FE, so is always const, and get idx of 0 | 399 | 6 | if (argument_size == 4) { | 400 | 0 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 401 | 0 | } | 402 | 6 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 403 | 6 | result_data, result_offset, result_null_map->get_data()); | 404 | 6 | } | 405 | | | 406 | 6 | block.get_by_position(result).column = | 407 | 6 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 408 | 6 | return Status::OK(); | 409 | 6 | } |
Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 371 | 6 | uint32_t result, size_t input_rows_count) const override { | 372 | 6 | size_t argument_size = arguments.size(); | 373 | | | 374 | 6 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 375 | 6 | auto result_data_column = ColumnString::create(); | 376 | 6 | auto& result_data = result_data_column->get_chars(); | 377 | 6 | auto& result_offset = result_data_column->get_offsets(); | 378 | 6 | result_offset.resize(input_rows_count); | 379 | | | 380 | 6 | bool col_const[3]; | 381 | 6 | ColumnPtr argument_columns[3]; | 382 | 24 | for (int i = 0; i < 3; ++i) { | 383 | 18 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 384 | 18 | } | 385 | 6 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 386 | 0 | *block.get_by_position(arguments[0]).column) | 387 | 0 | .convert_to_full_column() | 388 | 6 | : block.get_by_position(arguments[0]).column; | 389 | | | 390 | 6 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 391 | | | 392 | 6 | StringRef options_value; | 393 | 6 | if (col_const[1] && col_const[2]) { | 394 | 0 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 395 | 0 | input_rows_count, result_data, result_offset, | 396 | 0 | result_null_map->get_data()); | 397 | 6 | } else { | 398 | | // the options have check in FE, so is always const, and get idx of 0 | 399 | 6 | if (argument_size == 4) { | 400 | 0 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 401 | 0 | } | 402 | 6 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 403 | 6 | result_data, result_offset, result_null_map->get_data()); | 404 | 6 | } | 405 | | | 406 | 6 | block.get_by_position(result).column = | 407 | 6 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 408 | 6 | return Status::OK(); | 409 | 6 | } |
Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm |
410 | | }; |
411 | | |
412 | | struct RegexpReplaceImpl { |
413 | | static constexpr auto name = "regexp_replace"; |
414 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
415 | | const StringRef& options_value, size_t input_rows_count, |
416 | | ColumnString::Chars& result_data, ColumnString::Offsets& result_offset, |
417 | 6 | NullMap& null_map) { |
418 | 6 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
419 | 6 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
420 | 6 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get()); |
421 | | |
422 | 12 | for (size_t i = 0; i < input_rows_count; ++i) { |
423 | 6 | if (null_map[i]) { |
424 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
425 | 0 | continue; |
426 | 0 | } |
427 | 6 | _execute_inner_loop<false>(context, str_col, pattern_col, replace_col, options_value, |
428 | 6 | result_data, result_offset, null_map, i); |
429 | 6 | } |
430 | 6 | } |
431 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], |
432 | | const StringRef& options_value, size_t input_rows_count, |
433 | | ColumnString::Chars& result_data, |
434 | 0 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
435 | 0 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
436 | 0 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
437 | 0 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get()); |
438 | |
|
439 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { |
440 | 0 | if (null_map[i]) { |
441 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
442 | 0 | continue; |
443 | 0 | } |
444 | 0 | _execute_inner_loop<true>(context, str_col, pattern_col, replace_col, options_value, |
445 | 0 | result_data, result_offset, null_map, i); |
446 | 0 | } |
447 | 0 | } |
448 | | template <bool Const> |
449 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
450 | | const ColumnString* pattern_col, |
451 | | const ColumnString* replace_col, const StringRef& options_value, |
452 | | ColumnString::Chars& result_data, |
453 | | ColumnString::Offsets& result_offset, NullMap& null_map, |
454 | 6 | const size_t index_now) { |
455 | 6 | re2::RE2* re = reinterpret_cast<re2::RE2*>( |
456 | 6 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
457 | 6 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr |
458 | 6 | if (re == nullptr) { |
459 | 2 | std::string error_str; |
460 | 2 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); |
461 | 2 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
462 | 2 | options_value, scoped_re); |
463 | 2 | if (!st) { |
464 | 0 | context->add_warning(error_str.c_str()); |
465 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); |
466 | 0 | return; |
467 | 0 | } |
468 | 2 | re = scoped_re.get(); |
469 | 2 | } |
470 | | |
471 | 6 | re2::StringPiece replace_str = re2::StringPiece( |
472 | 6 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); |
473 | | |
474 | 6 | std::string result_str(str_col->get_data_at(index_now).to_string()); |
475 | 6 | re2::RE2::GlobalReplace(&result_str, *re, replace_str); |
476 | 6 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); |
477 | 6 | } Unexecuted instantiation: _ZN5doris17RegexpReplaceImpl19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m _ZN5doris17RegexpReplaceImpl19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 454 | 6 | const size_t index_now) { | 455 | 6 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 456 | 6 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 457 | 6 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 458 | 6 | if (re == nullptr) { | 459 | 2 | std::string error_str; | 460 | 2 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 461 | 2 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 462 | 2 | options_value, scoped_re); | 463 | 2 | if (!st) { | 464 | 0 | context->add_warning(error_str.c_str()); | 465 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 466 | 0 | return; | 467 | 0 | } | 468 | 2 | re = scoped_re.get(); | 469 | 2 | } | 470 | | | 471 | 6 | re2::StringPiece replace_str = re2::StringPiece( | 472 | 6 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 473 | | | 474 | 6 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 475 | 6 | re2::RE2::GlobalReplace(&result_str, *re, replace_str); | 476 | 6 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 477 | 6 | } |
|
478 | | }; |
479 | | |
480 | | struct RegexpReplaceOneImpl { |
481 | | static constexpr auto name = "regexp_replace_one"; |
482 | | |
483 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
484 | | const StringRef& options_value, size_t input_rows_count, |
485 | | ColumnString::Chars& result_data, ColumnString::Offsets& result_offset, |
486 | 6 | NullMap& null_map) { |
487 | 6 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
488 | 6 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
489 | 6 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get()); |
490 | | // 3 args |
491 | 12 | for (size_t i = 0; i < input_rows_count; ++i) { |
492 | 6 | if (null_map[i]) { |
493 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
494 | 0 | continue; |
495 | 0 | } |
496 | 6 | _execute_inner_loop<false>(context, str_col, pattern_col, replace_col, options_value, |
497 | 6 | result_data, result_offset, null_map, i); |
498 | 6 | } |
499 | 6 | } |
500 | | |
501 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], |
502 | | const StringRef& options_value, size_t input_rows_count, |
503 | | ColumnString::Chars& result_data, |
504 | 0 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
505 | 0 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
506 | 0 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
507 | 0 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get()); |
508 | | // 3 args |
509 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { |
510 | 0 | if (null_map[i]) { |
511 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
512 | 0 | continue; |
513 | 0 | } |
514 | 0 | _execute_inner_loop<true>(context, str_col, pattern_col, replace_col, options_value, |
515 | 0 | result_data, result_offset, null_map, i); |
516 | 0 | } |
517 | 0 | } |
518 | | template <bool Const> |
519 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
520 | | const ColumnString* pattern_col, |
521 | | const ColumnString* replace_col, const StringRef& options_value, |
522 | | ColumnString::Chars& result_data, |
523 | | ColumnString::Offsets& result_offset, NullMap& null_map, |
524 | 6 | const size_t index_now) { |
525 | 6 | re2::RE2* re = reinterpret_cast<re2::RE2*>( |
526 | 6 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
527 | 6 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr |
528 | 6 | if (re == nullptr) { |
529 | 2 | std::string error_str; |
530 | 2 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); |
531 | 2 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
532 | 2 | options_value, scoped_re); |
533 | 2 | if (!st) { |
534 | 0 | context->add_warning(error_str.c_str()); |
535 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); |
536 | 0 | return; |
537 | 0 | } |
538 | 2 | re = scoped_re.get(); |
539 | 2 | } |
540 | | |
541 | 6 | re2::StringPiece replace_str = re2::StringPiece( |
542 | 6 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); |
543 | | |
544 | 6 | std::string result_str(str_col->get_data_at(index_now).to_string()); |
545 | 6 | re2::RE2::Replace(&result_str, *re, replace_str); |
546 | 6 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); |
547 | 6 | } Unexecuted instantiation: _ZN5doris20RegexpReplaceOneImpl19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m _ZN5doris20RegexpReplaceOneImpl19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 524 | 6 | const size_t index_now) { | 525 | 6 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 526 | 6 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 527 | 6 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 528 | 6 | if (re == nullptr) { | 529 | 2 | std::string error_str; | 530 | 2 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 531 | 2 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 532 | 2 | options_value, scoped_re); | 533 | 2 | if (!st) { | 534 | 0 | context->add_warning(error_str.c_str()); | 535 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 536 | 0 | return; | 537 | 0 | } | 538 | 2 | re = scoped_re.get(); | 539 | 2 | } | 540 | | | 541 | 6 | re2::StringPiece replace_str = re2::StringPiece( | 542 | 6 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 543 | | | 544 | 6 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 545 | 6 | re2::RE2::Replace(&result_str, *re, replace_str); | 546 | 6 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 547 | 6 | } |
|
548 | | }; |
549 | | |
550 | | template <bool ReturnNull> |
551 | | struct RegexpExtractImpl { |
552 | | static constexpr auto name = ReturnNull ? "regexp_extract_or_null" : "regexp_extract"; |
553 | | // 3 args |
554 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
555 | | size_t input_rows_count, ColumnString::Chars& result_data, |
556 | 16 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
557 | 16 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
558 | 16 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
559 | 16 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); |
560 | 32 | for (size_t i = 0; i < input_rows_count; ++i) { |
561 | 16 | if (null_map[i]) { |
562 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
563 | 0 | continue; |
564 | 0 | } |
565 | 16 | const auto& index_data = index_col->get_int(i); |
566 | 16 | if (index_data < 0) { |
567 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) |
568 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); |
569 | 0 | continue; |
570 | 0 | } |
571 | 16 | _execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data, |
572 | 16 | result_offset, null_map, i); |
573 | 16 | } |
574 | 16 | } _ZN5doris17RegexpExtractImplILb1EE12execute_implEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ Line | Count | Source | 556 | 8 | ColumnString::Offsets& result_offset, NullMap& null_map) { | 557 | 8 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); | 558 | 8 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); | 559 | 8 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); | 560 | 16 | for (size_t i = 0; i < input_rows_count; ++i) { | 561 | 8 | if (null_map[i]) { | 562 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); | 563 | 0 | continue; | 564 | 0 | } | 565 | 8 | const auto& index_data = index_col->get_int(i); | 566 | 8 | if (index_data < 0) { | 567 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) | 568 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); | 569 | 0 | continue; | 570 | 0 | } | 571 | 8 | _execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data, | 572 | 8 | result_offset, null_map, i); | 573 | 8 | } | 574 | 8 | } |
_ZN5doris17RegexpExtractImplILb0EE12execute_implEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ Line | Count | Source | 556 | 8 | ColumnString::Offsets& result_offset, NullMap& null_map) { | 557 | 8 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); | 558 | 8 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); | 559 | 8 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); | 560 | 16 | for (size_t i = 0; i < input_rows_count; ++i) { | 561 | 8 | if (null_map[i]) { | 562 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); | 563 | 0 | continue; | 564 | 0 | } | 565 | 8 | const auto& index_data = index_col->get_int(i); | 566 | 8 | if (index_data < 0) { | 567 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) | 568 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); | 569 | 0 | continue; | 570 | 0 | } | 571 | 8 | _execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data, | 572 | 8 | result_offset, null_map, i); | 573 | 8 | } | 574 | 8 | } |
|
575 | | |
576 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], |
577 | | size_t input_rows_count, ColumnString::Chars& result_data, |
578 | 0 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
579 | 0 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
580 | 0 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
581 | 0 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); |
582 | |
|
583 | 0 | const auto& index_data = index_col->get_int(0); |
584 | 0 | if (index_data < 0) { |
585 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { |
586 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) |
587 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); |
588 | 0 | } |
589 | 0 | return; |
590 | 0 | } |
591 | | |
592 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { |
593 | 0 | if (null_map[i]) { |
594 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
595 | 0 | continue; |
596 | 0 | } |
597 | | |
598 | 0 | _execute_inner_loop<true>(context, str_col, pattern_col, index_data, result_data, |
599 | 0 | result_offset, null_map, i); |
600 | 0 | } |
601 | 0 | } Unexecuted instantiation: _ZN5doris17RegexpExtractImplILb1EE23execute_impl_const_argsEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ Unexecuted instantiation: _ZN5doris17RegexpExtractImplILb0EE23execute_impl_const_argsEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ |
602 | | template <bool Const> |
603 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
604 | | const ColumnString* pattern_col, const Int64 index_data, |
605 | | ColumnString::Chars& result_data, |
606 | | ColumnString::Offsets& result_offset, NullMap& null_map, |
607 | 16 | const size_t index_now) { |
608 | 16 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( |
609 | 16 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
610 | 16 | std::unique_ptr<RegexpExtractEngine> scoped_engine; |
611 | | |
612 | 16 | if (engine == nullptr) { |
613 | 0 | std::string error_str; |
614 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); |
615 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); |
616 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, |
617 | 0 | context->state()->enable_extended_regex()); |
618 | 0 | if (!st) { |
619 | 0 | context->add_warning(error_str.c_str()); |
620 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); |
621 | 0 | return; |
622 | 0 | } |
623 | 0 | engine = scoped_engine.get(); |
624 | 0 | } |
625 | | |
626 | 16 | const auto& str = str_col->get_data_at(index_now); |
627 | | |
628 | 16 | int max_matches = 1 + engine->number_of_capturing_groups(); |
629 | 16 | if (index_data >= max_matches) { |
630 | 0 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) |
631 | 0 | : StringOP::push_empty_string(index_now, result_data, result_offset); |
632 | 0 | return; |
633 | 0 | } |
634 | | |
635 | 16 | std::string match_result; |
636 | 16 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), |
637 | 16 | match_result); |
638 | | |
639 | 16 | if (!success) { |
640 | 0 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) |
641 | 0 | : StringOP::push_empty_string(index_now, result_data, result_offset); |
642 | 0 | return; |
643 | 0 | } |
644 | | |
645 | 16 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), |
646 | 16 | index_now, result_data, result_offset); |
647 | 16 | } Unexecuted instantiation: _ZN5doris17RegexpExtractImplILb1EE19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m _ZN5doris17RegexpExtractImplILb1EE19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m Line | Count | Source | 607 | 8 | const size_t index_now) { | 608 | 8 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 609 | 8 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 610 | 8 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 611 | | | 612 | 8 | if (engine == nullptr) { | 613 | 0 | std::string error_str; | 614 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 615 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 616 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 617 | 0 | context->state()->enable_extended_regex()); | 618 | 0 | if (!st) { | 619 | 0 | context->add_warning(error_str.c_str()); | 620 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 621 | 0 | return; | 622 | 0 | } | 623 | 0 | engine = scoped_engine.get(); | 624 | 0 | } | 625 | | | 626 | 8 | const auto& str = str_col->get_data_at(index_now); | 627 | | | 628 | 8 | int max_matches = 1 + engine->number_of_capturing_groups(); | 629 | 8 | if (index_data >= max_matches) { | 630 | 0 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 631 | 0 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 632 | 0 | return; | 633 | 0 | } | 634 | | | 635 | 8 | std::string match_result; | 636 | 8 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), | 637 | 8 | match_result); | 638 | | | 639 | 8 | if (!success) { | 640 | 0 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 641 | 0 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 642 | 0 | return; | 643 | 0 | } | 644 | | | 645 | 8 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), | 646 | 8 | index_now, result_data, result_offset); | 647 | 8 | } |
Unexecuted instantiation: _ZN5doris17RegexpExtractImplILb0EE19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m _ZN5doris17RegexpExtractImplILb0EE19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m Line | Count | Source | 607 | 8 | const size_t index_now) { | 608 | 8 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 609 | 8 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 610 | 8 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 611 | | | 612 | 8 | if (engine == nullptr) { | 613 | 0 | std::string error_str; | 614 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 615 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 616 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 617 | 0 | context->state()->enable_extended_regex()); | 618 | 0 | if (!st) { | 619 | 0 | context->add_warning(error_str.c_str()); | 620 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 621 | 0 | return; | 622 | 0 | } | 623 | 0 | engine = scoped_engine.get(); | 624 | 0 | } | 625 | | | 626 | 8 | const auto& str = str_col->get_data_at(index_now); | 627 | | | 628 | 8 | int max_matches = 1 + engine->number_of_capturing_groups(); | 629 | 8 | if (index_data >= max_matches) { | 630 | 0 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 631 | 0 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 632 | 0 | return; | 633 | 0 | } | 634 | | | 635 | 8 | std::string match_result; | 636 | 8 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), | 637 | 8 | match_result); | 638 | | | 639 | 8 | if (!success) { | 640 | 0 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 641 | 0 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 642 | 0 | return; | 643 | 0 | } | 644 | | | 645 | 8 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), | 646 | 8 | index_now, result_data, result_offset); | 647 | 8 | } |
|
648 | | }; |
649 | | |
650 | | struct RegexpExtractAllImpl { |
651 | | static constexpr auto name = "regexp_extract_all"; |
652 | | |
653 | 0 | size_t get_number_of_arguments() const { return 2; } |
654 | | |
655 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
656 | | size_t input_rows_count, ColumnString::Chars& result_data, |
657 | 0 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
658 | 0 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
659 | 0 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
660 | 0 | for (int i = 0; i < input_rows_count; ++i) { |
661 | 0 | if (null_map[i]) { |
662 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
663 | 0 | continue; |
664 | 0 | } |
665 | 0 | _execute_inner_loop<false>(context, str_col, pattern_col, result_data, result_offset, |
666 | 0 | null_map, i); |
667 | 0 | } |
668 | 0 | } |
669 | | |
670 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], |
671 | | size_t input_rows_count, ColumnString::Chars& result_data, |
672 | 7 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
673 | 7 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
674 | 7 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
675 | 14 | for (int i = 0; i < input_rows_count; ++i) { |
676 | 7 | if (null_map[i]) { |
677 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
678 | 0 | continue; |
679 | 0 | } |
680 | 7 | _execute_inner_loop<true>(context, str_col, pattern_col, result_data, result_offset, |
681 | 7 | null_map, i); |
682 | 7 | } |
683 | 7 | } |
684 | | template <bool Const> |
685 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
686 | | const ColumnString* pattern_col, |
687 | | ColumnString::Chars& result_data, |
688 | | ColumnString::Offsets& result_offset, NullMap& null_map, |
689 | 7 | const size_t index_now) { |
690 | 7 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( |
691 | 7 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
692 | 7 | std::unique_ptr<RegexpExtractEngine> scoped_engine; |
693 | | |
694 | 7 | if (engine == nullptr) { |
695 | 0 | std::string error_str; |
696 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); |
697 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); |
698 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, |
699 | 0 | context->state()->enable_extended_regex()); |
700 | 0 | if (!st) { |
701 | 0 | context->add_warning(error_str.c_str()); |
702 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); |
703 | 0 | return; |
704 | 0 | } |
705 | 0 | engine = scoped_engine.get(); |
706 | 0 | } |
707 | | |
708 | 7 | if (engine->number_of_capturing_groups() == 0) { |
709 | 0 | StringOP::push_empty_string(index_now, result_data, result_offset); |
710 | 0 | return; |
711 | 0 | } |
712 | 7 | const auto& str = str_col->get_data_at(index_now); |
713 | 7 | std::vector<std::string> res_matches; |
714 | 7 | engine->match_all_and_extract(str.data, str.size, res_matches); |
715 | | |
716 | 7 | if (res_matches.empty()) { |
717 | 0 | StringOP::push_empty_string(index_now, result_data, result_offset); |
718 | 0 | return; |
719 | 0 | } |
720 | | |
721 | 7 | std::string res = "["; |
722 | 19 | for (int j = 0; j < res_matches.size(); ++j) { |
723 | 12 | res += "'" + res_matches[j] + "'"; |
724 | 12 | if (j < res_matches.size() - 1) { |
725 | 5 | res += ","; |
726 | 5 | } |
727 | 12 | } |
728 | 7 | res += "]"; |
729 | 7 | StringOP::push_value_string(std::string_view(res), index_now, result_data, result_offset); |
730 | 7 | } _ZN5doris20RegexpExtractAllImpl19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_RNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS8_IjLm4096ESB_Lm16ELm15EEESD_m Line | Count | Source | 689 | 7 | const size_t index_now) { | 690 | 7 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 691 | 7 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 692 | 7 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 693 | | | 694 | 7 | if (engine == nullptr) { | 695 | 0 | std::string error_str; | 696 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 697 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 698 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 699 | 0 | context->state()->enable_extended_regex()); | 700 | 0 | if (!st) { | 701 | 0 | context->add_warning(error_str.c_str()); | 702 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 703 | 0 | return; | 704 | 0 | } | 705 | 0 | engine = scoped_engine.get(); | 706 | 0 | } | 707 | | | 708 | 7 | if (engine->number_of_capturing_groups() == 0) { | 709 | 0 | StringOP::push_empty_string(index_now, result_data, result_offset); | 710 | 0 | return; | 711 | 0 | } | 712 | 7 | const auto& str = str_col->get_data_at(index_now); | 713 | 7 | std::vector<std::string> res_matches; | 714 | 7 | engine->match_all_and_extract(str.data, str.size, res_matches); | 715 | | | 716 | 7 | if (res_matches.empty()) { | 717 | 0 | StringOP::push_empty_string(index_now, result_data, result_offset); | 718 | 0 | return; | 719 | 0 | } | 720 | | | 721 | 7 | std::string res = "["; | 722 | 19 | for (int j = 0; j < res_matches.size(); ++j) { | 723 | 12 | res += "'" + res_matches[j] + "'"; | 724 | 12 | if (j < res_matches.size() - 1) { | 725 | 5 | res += ","; | 726 | 5 | } | 727 | 12 | } | 728 | 7 | res += "]"; | 729 | 7 | StringOP::push_value_string(std::string_view(res), index_now, result_data, result_offset); | 730 | 7 | } |
Unexecuted instantiation: _ZN5doris20RegexpExtractAllImpl19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_RNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS8_IjLm4096ESB_Lm16ELm15EEESD_m |
731 | | }; |
732 | | |
733 | | // template FunctionRegexpFunctionality is used for regexp_xxxx series functions, not for regexp match. |
734 | | template <typename Impl> |
735 | | class FunctionRegexpFunctionality : public IFunction { |
736 | | public: |
737 | | static constexpr auto name = Impl::name; |
738 | | |
739 | 35 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); }_ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE6createEv Line | Count | Source | 739 | 12 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); } |
_ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE6createEv Line | Count | Source | 739 | 12 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); } |
_ZN5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE6createEv Line | Count | Source | 739 | 11 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); } |
|
740 | | |
741 | 3 | String get_name() const override { return name; }_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE8get_nameB5cxx11Ev Line | Count | Source | 741 | 1 | String get_name() const override { return name; } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE8get_nameB5cxx11Ev Line | Count | Source | 741 | 1 | String get_name() const override { return name; } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE8get_nameB5cxx11Ev Line | Count | Source | 741 | 1 | String get_name() const override { return name; } |
|
742 | | |
743 | 29 | size_t get_number_of_arguments() const override { |
744 | 29 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { |
745 | 9 | return 2; |
746 | 9 | } |
747 | 0 | return 3; |
748 | 29 | } _ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE23get_number_of_argumentsEv Line | Count | Source | 743 | 10 | size_t get_number_of_arguments() const override { | 744 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 745 | | return 2; | 746 | | } | 747 | 10 | return 3; | 748 | 10 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE23get_number_of_argumentsEv Line | Count | Source | 743 | 10 | size_t get_number_of_arguments() const override { | 744 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 745 | | return 2; | 746 | | } | 747 | 10 | return 3; | 748 | 10 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE23get_number_of_argumentsEv Line | Count | Source | 743 | 9 | size_t get_number_of_arguments() const override { | 744 | 9 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 745 | 9 | return 2; | 746 | 9 | } | 747 | 0 | return 3; | 748 | 9 | } |
|
749 | | |
750 | 29 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
751 | 29 | return make_nullable(std::make_shared<DataTypeString>()); |
752 | 29 | } _ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 750 | 10 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 751 | 10 | return make_nullable(std::make_shared<DataTypeString>()); | 752 | 10 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 750 | 10 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 751 | 10 | return make_nullable(std::make_shared<DataTypeString>()); | 752 | 10 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE Line | Count | Source | 750 | 9 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 751 | 9 | return make_nullable(std::make_shared<DataTypeString>()); | 752 | 9 | } |
|
753 | | |
754 | 58 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
755 | 58 | if (scope == FunctionContext::THREAD_LOCAL) { |
756 | 29 | if (context->is_col_constant(1)) { |
757 | 29 | DCHECK(!context->get_function_state(scope)); |
758 | 29 | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
759 | 29 | const auto& pattern = pattern_col->get_data_at(0); |
760 | 29 | if (pattern.size == 0) { |
761 | 3 | return Status::OK(); |
762 | 3 | } |
763 | | |
764 | 26 | std::string error_str; |
765 | 26 | auto engine = std::make_shared<RegexpExtractEngine>(); |
766 | 26 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, |
767 | 26 | context->state()->enable_extended_regex()); |
768 | 26 | if (!st) { |
769 | 0 | context->set_error(error_str.c_str()); |
770 | 0 | return Status::InvalidArgument(error_str); |
771 | 0 | } |
772 | 26 | context->set_function_state(scope, engine); |
773 | 26 | } |
774 | 29 | } |
775 | 55 | return Status::OK(); |
776 | 58 | } _ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 754 | 20 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 755 | 20 | if (scope == FunctionContext::THREAD_LOCAL) { | 756 | 10 | if (context->is_col_constant(1)) { | 757 | 10 | DCHECK(!context->get_function_state(scope)); | 758 | 10 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 759 | 10 | const auto& pattern = pattern_col->get_data_at(0); | 760 | 10 | if (pattern.size == 0) { | 761 | 1 | return Status::OK(); | 762 | 1 | } | 763 | | | 764 | 9 | std::string error_str; | 765 | 9 | auto engine = std::make_shared<RegexpExtractEngine>(); | 766 | 9 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, | 767 | 9 | context->state()->enable_extended_regex()); | 768 | 9 | if (!st) { | 769 | 0 | context->set_error(error_str.c_str()); | 770 | 0 | return Status::InvalidArgument(error_str); | 771 | 0 | } | 772 | 9 | context->set_function_state(scope, engine); | 773 | 9 | } | 774 | 10 | } | 775 | 19 | return Status::OK(); | 776 | 20 | } |
_ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 754 | 20 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 755 | 20 | if (scope == FunctionContext::THREAD_LOCAL) { | 756 | 10 | if (context->is_col_constant(1)) { | 757 | 10 | DCHECK(!context->get_function_state(scope)); | 758 | 10 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 759 | 10 | const auto& pattern = pattern_col->get_data_at(0); | 760 | 10 | if (pattern.size == 0) { | 761 | 1 | return Status::OK(); | 762 | 1 | } | 763 | | | 764 | 9 | std::string error_str; | 765 | 9 | auto engine = std::make_shared<RegexpExtractEngine>(); | 766 | 9 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, | 767 | 9 | context->state()->enable_extended_regex()); | 768 | 9 | if (!st) { | 769 | 0 | context->set_error(error_str.c_str()); | 770 | 0 | return Status::InvalidArgument(error_str); | 771 | 0 | } | 772 | 9 | context->set_function_state(scope, engine); | 773 | 9 | } | 774 | 10 | } | 775 | 19 | return Status::OK(); | 776 | 20 | } |
_ZN5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE4openEPNS_15FunctionContextENS3_18FunctionStateScopeE Line | Count | Source | 754 | 18 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 755 | 18 | if (scope == FunctionContext::THREAD_LOCAL) { | 756 | 9 | if (context->is_col_constant(1)) { | 757 | 9 | DCHECK(!context->get_function_state(scope)); | 758 | 9 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 759 | 9 | const auto& pattern = pattern_col->get_data_at(0); | 760 | 9 | if (pattern.size == 0) { | 761 | 1 | return Status::OK(); | 762 | 1 | } | 763 | | | 764 | 8 | std::string error_str; | 765 | 8 | auto engine = std::make_shared<RegexpExtractEngine>(); | 766 | 8 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, | 767 | 8 | context->state()->enable_extended_regex()); | 768 | 8 | if (!st) { | 769 | 0 | context->set_error(error_str.c_str()); | 770 | 0 | return Status::InvalidArgument(error_str); | 771 | 0 | } | 772 | 8 | context->set_function_state(scope, engine); | 773 | 8 | } | 774 | 9 | } | 775 | 17 | return Status::OK(); | 776 | 18 | } |
|
777 | | |
778 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
779 | 23 | uint32_t result, size_t input_rows_count) const override { |
780 | 23 | size_t argument_size = arguments.size(); |
781 | | |
782 | 23 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); |
783 | 23 | auto result_data_column = ColumnString::create(); |
784 | 23 | auto& result_data = result_data_column->get_chars(); |
785 | 23 | auto& result_offset = result_data_column->get_offsets(); |
786 | 23 | result_offset.resize(input_rows_count); |
787 | | |
788 | 23 | bool col_const[3]; |
789 | 23 | ColumnPtr argument_columns[3]; |
790 | 85 | for (int i = 0; i < argument_size; ++i) { |
791 | 62 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); |
792 | 62 | } |
793 | 23 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( |
794 | 0 | *block.get_by_position(arguments[0]).column) |
795 | 0 | .convert_to_full_column() |
796 | 23 | : block.get_by_position(arguments[0]).column; |
797 | 23 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { |
798 | 7 | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, |
799 | 7 | arguments); |
800 | 16 | } else { |
801 | 16 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, |
802 | 16 | arguments); |
803 | 16 | } |
804 | | |
805 | 23 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { |
806 | 7 | if (col_const[1]) { |
807 | 7 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, |
808 | 7 | result_data, result_offset, |
809 | 7 | result_null_map->get_data()); |
810 | 7 | } else { |
811 | 0 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, |
812 | 0 | result_offset, result_null_map->get_data()); |
813 | 0 | } |
814 | 16 | } else { |
815 | 16 | if (col_const[1] && col_const[2]) { |
816 | 0 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, |
817 | 0 | result_data, result_offset, |
818 | 0 | result_null_map->get_data()); |
819 | 16 | } else { |
820 | 16 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, |
821 | 16 | result_offset, result_null_map->get_data()); |
822 | 16 | } |
823 | 16 | } |
824 | | |
825 | 23 | block.get_by_position(result).column = |
826 | 23 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); |
827 | 23 | return Status::OK(); |
828 | 23 | } _ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 779 | 8 | uint32_t result, size_t input_rows_count) const override { | 780 | 8 | size_t argument_size = arguments.size(); | 781 | | | 782 | 8 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 783 | 8 | auto result_data_column = ColumnString::create(); | 784 | 8 | auto& result_data = result_data_column->get_chars(); | 785 | 8 | auto& result_offset = result_data_column->get_offsets(); | 786 | 8 | result_offset.resize(input_rows_count); | 787 | | | 788 | 8 | bool col_const[3]; | 789 | 8 | ColumnPtr argument_columns[3]; | 790 | 32 | for (int i = 0; i < argument_size; ++i) { | 791 | 24 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 792 | 24 | } | 793 | 8 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 794 | 0 | *block.get_by_position(arguments[0]).column) | 795 | 0 | .convert_to_full_column() | 796 | 8 | : block.get_by_position(arguments[0]).column; | 797 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 798 | | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, | 799 | | arguments); | 800 | 8 | } else { | 801 | 8 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, | 802 | 8 | arguments); | 803 | 8 | } | 804 | | | 805 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 806 | | if (col_const[1]) { | 807 | | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 808 | | result_data, result_offset, | 809 | | result_null_map->get_data()); | 810 | | } else { | 811 | | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 812 | | result_offset, result_null_map->get_data()); | 813 | | } | 814 | 8 | } else { | 815 | 8 | if (col_const[1] && col_const[2]) { | 816 | 0 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 817 | 0 | result_data, result_offset, | 818 | 0 | result_null_map->get_data()); | 819 | 8 | } else { | 820 | 8 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 821 | 8 | result_offset, result_null_map->get_data()); | 822 | 8 | } | 823 | 8 | } | 824 | | | 825 | 8 | block.get_by_position(result).column = | 826 | 8 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 827 | 8 | return Status::OK(); | 828 | 8 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 779 | 8 | uint32_t result, size_t input_rows_count) const override { | 780 | 8 | size_t argument_size = arguments.size(); | 781 | | | 782 | 8 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 783 | 8 | auto result_data_column = ColumnString::create(); | 784 | 8 | auto& result_data = result_data_column->get_chars(); | 785 | 8 | auto& result_offset = result_data_column->get_offsets(); | 786 | 8 | result_offset.resize(input_rows_count); | 787 | | | 788 | 8 | bool col_const[3]; | 789 | 8 | ColumnPtr argument_columns[3]; | 790 | 32 | for (int i = 0; i < argument_size; ++i) { | 791 | 24 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 792 | 24 | } | 793 | 8 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 794 | 0 | *block.get_by_position(arguments[0]).column) | 795 | 0 | .convert_to_full_column() | 796 | 8 | : block.get_by_position(arguments[0]).column; | 797 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 798 | | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, | 799 | | arguments); | 800 | 8 | } else { | 801 | 8 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, | 802 | 8 | arguments); | 803 | 8 | } | 804 | | | 805 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 806 | | if (col_const[1]) { | 807 | | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 808 | | result_data, result_offset, | 809 | | result_null_map->get_data()); | 810 | | } else { | 811 | | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 812 | | result_offset, result_null_map->get_data()); | 813 | | } | 814 | 8 | } else { | 815 | 8 | if (col_const[1] && col_const[2]) { | 816 | 0 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 817 | 0 | result_data, result_offset, | 818 | 0 | result_null_map->get_data()); | 819 | 8 | } else { | 820 | 8 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 821 | 8 | result_offset, result_null_map->get_data()); | 822 | 8 | } | 823 | 8 | } | 824 | | | 825 | 8 | block.get_by_position(result).column = | 826 | 8 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 827 | 8 | return Status::OK(); | 828 | 8 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 779 | 7 | uint32_t result, size_t input_rows_count) const override { | 780 | 7 | size_t argument_size = arguments.size(); | 781 | | | 782 | 7 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 783 | 7 | auto result_data_column = ColumnString::create(); | 784 | 7 | auto& result_data = result_data_column->get_chars(); | 785 | 7 | auto& result_offset = result_data_column->get_offsets(); | 786 | 7 | result_offset.resize(input_rows_count); | 787 | | | 788 | 7 | bool col_const[3]; | 789 | 7 | ColumnPtr argument_columns[3]; | 790 | 21 | for (int i = 0; i < argument_size; ++i) { | 791 | 14 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 792 | 14 | } | 793 | 7 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 794 | 0 | *block.get_by_position(arguments[0]).column) | 795 | 0 | .convert_to_full_column() | 796 | 7 | : block.get_by_position(arguments[0]).column; | 797 | 7 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 798 | 7 | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, | 799 | 7 | arguments); | 800 | | } else { | 801 | | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, | 802 | | arguments); | 803 | | } | 804 | | | 805 | 7 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 806 | 7 | if (col_const[1]) { | 807 | 7 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 808 | 7 | result_data, result_offset, | 809 | 7 | result_null_map->get_data()); | 810 | 7 | } else { | 811 | 0 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 812 | 0 | result_offset, result_null_map->get_data()); | 813 | 0 | } | 814 | | } else { | 815 | | if (col_const[1] && col_const[2]) { | 816 | | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 817 | | result_data, result_offset, | 818 | | result_null_map->get_data()); | 819 | | } else { | 820 | | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 821 | | result_offset, result_null_map->get_data()); | 822 | | } | 823 | | } | 824 | | | 825 | 7 | block.get_by_position(result).column = | 826 | 7 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 827 | 7 | return Status::OK(); | 828 | 7 | } |
|
829 | | }; |
830 | | |
831 | 1 | void register_function_regexp_extract(SimpleFunctionFactory& factory) { |
832 | 1 | factory.register_function<FunctionRegexpReplace<RegexpReplaceImpl, ThreeParamTypes>>(); |
833 | 1 | factory.register_function<FunctionRegexpReplace<RegexpReplaceImpl, FourParamTypes>>(); |
834 | 1 | factory.register_function<FunctionRegexpReplace<RegexpReplaceOneImpl, ThreeParamTypes>>(); |
835 | 1 | factory.register_function<FunctionRegexpReplace<RegexpReplaceOneImpl, FourParamTypes>>(); |
836 | 1 | factory.register_function<FunctionRegexpFunctionality<RegexpExtractImpl<true>>>(); |
837 | 1 | factory.register_function<FunctionRegexpFunctionality<RegexpExtractImpl<false>>>(); |
838 | 1 | factory.register_function<FunctionRegexpFunctionality<RegexpExtractAllImpl>>(); |
839 | 1 | factory.register_function<FunctionRegexpCount>(); |
840 | 1 | } |
841 | | |
842 | | } // namespace doris |