be/src/exprs/function/function_regexp.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <glog/logging.h> |
19 | | #include <re2/re2.h> |
20 | | #include <re2/stringpiece.h> |
21 | | #include <stddef.h> |
22 | | |
23 | | #include <boost/regex.hpp> |
24 | | #include <memory> |
25 | | #include <string> |
26 | | #include <string_view> |
27 | | #include <type_traits> |
28 | | #include <utility> |
29 | | #include <vector> |
30 | | |
31 | | #include "common/status.h" |
32 | | #include "core/block/block.h" |
33 | | #include "core/block/column_numbers.h" |
34 | | #include "core/block/column_with_type_and_name.h" |
35 | | #include "core/column/column.h" |
36 | | #include "core/column/column_const.h" |
37 | | #include "core/column/column_nullable.h" |
38 | | #include "core/column/column_string.h" |
39 | | #include "core/column/column_vector.h" |
40 | | #include "core/data_type/data_type.h" |
41 | | #include "core/data_type/data_type_nullable.h" |
42 | | #include "core/data_type/data_type_number.h" |
43 | | #include "core/data_type/data_type_string.h" |
44 | | #include "core/string_ref.h" |
45 | | #include "core/types.h" |
46 | | #include "exec/common/stringop_substring.h" |
47 | | #include "exprs/aggregate/aggregate_function.h" |
48 | | #include "exprs/function/function.h" |
49 | | #include "exprs/function/simple_function_factory.h" |
50 | | #include "exprs/function_context.h" |
51 | | #include "exprs/string_functions.h" |
52 | | |
53 | | namespace doris { |
54 | | #include "common/compile_check_begin.h" |
55 | | |
56 | | // Helper structure to hold either RE2 or Boost.Regex |
57 | | struct RegexpExtractEngine { |
58 | | std::unique_ptr<re2::RE2> re2_regex; |
59 | | std::unique_ptr<boost::regex> boost_regex; |
60 | | |
61 | 18 | bool is_boost() const { return boost_regex != nullptr; } |
62 | 313 | bool is_re2() const { return re2_regex != nullptr; } |
63 | | |
64 | | // Try to compile with RE2 first, fallback to Boost.Regex if RE2 fails |
65 | | static bool compile(const StringRef& pattern, std::string* error_str, |
66 | 260 | RegexpExtractEngine& engine, bool enable_extended_regex) { |
67 | 260 | re2::RE2::Options options; |
68 | 260 | options.set_log_errors(false); // avoid RE2 printing to stderr; we handle errors ourselves |
69 | 260 | options.set_dot_nl(true); // make '.' match '\n' by default, consistent with REGEXP/LIKE |
70 | 260 | engine.re2_regex = |
71 | 260 | std::make_unique<re2::RE2>(re2::StringPiece(pattern.data, pattern.size), options); |
72 | | |
73 | 260 | if (engine.re2_regex->ok()) { |
74 | 239 | return true; |
75 | 239 | } else if (!enable_extended_regex) { |
76 | 3 | *error_str = fmt::format( |
77 | 3 | "Invalid regex pattern: {}. Error: {}. If you need advanced regex features, " |
78 | 3 | "try setting enable_extended_regex=true", |
79 | 3 | std::string(pattern.data, pattern.size), engine.re2_regex->error()); |
80 | 3 | return false; |
81 | 3 | } |
82 | | |
83 | | // RE2 failed, try Boost.Regex for advanced features like zero-width assertions |
84 | 18 | engine.re2_regex.reset(); |
85 | 18 | try { |
86 | 18 | boost::regex::flag_type flags = boost::regex::normal; |
87 | 18 | engine.boost_regex = std::make_unique<boost::regex>(pattern.data, |
88 | 18 | pattern.data + pattern.size, flags); |
89 | 18 | return true; |
90 | 18 | } catch (const boost::regex_error& e) { |
91 | 0 | if (error_str) { |
92 | 0 | *error_str = fmt::format("Invalid regex pattern: {}. Error: {}", |
93 | 0 | std::string(pattern.data, pattern.size), e.what()); |
94 | 0 | } |
95 | 0 | return false; |
96 | 0 | } |
97 | 18 | } |
98 | | |
99 | | // Get number of capturing groups |
100 | 231 | int number_of_capturing_groups() const { |
101 | 231 | if (is_re2()) { |
102 | 222 | return re2_regex->NumberOfCapturingGroups(); |
103 | 222 | } else if (is_boost()) { |
104 | 9 | return static_cast<int>(boost_regex->mark_count()); |
105 | 9 | } |
106 | 0 | return 0; |
107 | 231 | } |
108 | | |
109 | | // Match function for extraction |
110 | 52 | bool match_and_extract(const char* data, size_t size, int index, std::string& result) const { |
111 | 52 | if (is_re2()) { |
112 | 47 | int max_matches = 1 + re2_regex->NumberOfCapturingGroups(); |
113 | 47 | if (index >= max_matches) { |
114 | 0 | return false; |
115 | 0 | } |
116 | 47 | std::vector<re2::StringPiece> matches(max_matches); |
117 | 47 | bool success = re2_regex->Match(re2::StringPiece(data, size), 0, size, |
118 | 47 | re2::RE2::UNANCHORED, matches.data(), max_matches); |
119 | 47 | if (success && index < matches.size()) { |
120 | 34 | const re2::StringPiece& match = matches[index]; |
121 | 34 | result.assign(match.data(), match.size()); |
122 | 34 | return true; |
123 | 34 | } |
124 | 13 | return false; |
125 | 47 | } else if (is_boost()) { |
126 | 5 | boost::cmatch matches; |
127 | 5 | bool success = boost::regex_search(data, data + size, matches, *boost_regex); |
128 | 5 | if (success && index < matches.size()) { |
129 | 5 | result = matches[index].str(); |
130 | 5 | return true; |
131 | 5 | } |
132 | 0 | return false; |
133 | 5 | } |
134 | 0 | return false; |
135 | 52 | } |
136 | | |
137 | | // Match all occurrences and extract the first capturing group |
138 | | void match_all_and_extract(const char* data, size_t size, |
139 | 30 | std::vector<std::string>& results) const { |
140 | 30 | if (is_re2()) { |
141 | 26 | int max_matches = 1 + re2_regex->NumberOfCapturingGroups(); |
142 | 26 | if (max_matches < 2) { |
143 | 0 | return; // No capturing groups |
144 | 0 | } |
145 | | |
146 | 26 | size_t pos = 0; |
147 | 67 | while (pos < size) { |
148 | 55 | const char* str_pos = data + pos; |
149 | 55 | size_t str_size = size - pos; |
150 | 55 | std::vector<re2::StringPiece> matches(max_matches); |
151 | 55 | bool success = re2_regex->Match(re2::StringPiece(str_pos, str_size), 0, str_size, |
152 | 55 | re2::RE2::UNANCHORED, matches.data(), max_matches); |
153 | 55 | if (!success) { |
154 | 14 | break; |
155 | 14 | } |
156 | 41 | if (matches[0].empty()) { |
157 | 11 | pos += 1; |
158 | 11 | continue; |
159 | 11 | } |
160 | | // Extract first capturing group |
161 | 30 | if (matches.size() > 1 && !matches[1].empty()) { |
162 | 30 | results.emplace_back(matches[1].data(), matches[1].size()); |
163 | 30 | } |
164 | | // Move position forward |
165 | 30 | auto offset = std::string(str_pos, str_size) |
166 | 30 | .find(std::string(matches[0].data(), matches[0].size())); |
167 | 30 | pos += offset + matches[0].size(); |
168 | 30 | } |
169 | 26 | } else if (is_boost()) { |
170 | 4 | const char* search_start = data; |
171 | 4 | const char* search_end = data + size; |
172 | 4 | boost::match_results<const char*> matches; |
173 | | |
174 | 13 | while (boost::regex_search(search_start, search_end, matches, *boost_regex)) { |
175 | 9 | if (matches.size() > 1 && matches[1].matched) { |
176 | 9 | results.emplace_back(matches[1].str()); |
177 | 9 | } |
178 | 9 | if (matches[0].length() == 0) { |
179 | 0 | if (search_start == search_end) { |
180 | 0 | break; |
181 | 0 | } |
182 | 0 | search_start += 1; |
183 | 9 | } else { |
184 | 9 | search_start = matches[0].second; |
185 | 9 | } |
186 | 9 | } |
187 | 4 | } |
188 | 30 | } |
189 | | }; |
190 | | |
191 | | struct RegexpCountImpl { |
192 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
193 | 15 | size_t input_rows_count, ColumnInt32::Container& result_data) { |
194 | 15 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
195 | 15 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
196 | 48 | for (int i = 0; i < input_rows_count; ++i) { |
197 | 33 | result_data[i] = _execute_inner_loop(context, str_col, pattern_col, i); |
198 | 33 | } |
199 | 15 | } |
200 | | static int _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
201 | 33 | const ColumnString* pattern_col, const size_t index_now) { |
202 | 33 | re2::RE2* re = reinterpret_cast<re2::RE2*>( |
203 | 33 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
204 | 33 | std::unique_ptr<re2::RE2> scoped_re; |
205 | 33 | if (re == nullptr) { |
206 | 12 | std::string error_str; |
207 | 12 | DCHECK(pattern_col); |
208 | 12 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, false)); |
209 | 12 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), StringRef(), |
210 | 12 | scoped_re); |
211 | 12 | if (!st) { |
212 | 0 | context->add_warning(error_str.c_str()); |
213 | 0 | throw Exception(Status::InvalidArgument(error_str)); |
214 | 0 | return 0; |
215 | 0 | } |
216 | 12 | re = scoped_re.get(); |
217 | 12 | } |
218 | | |
219 | 33 | const auto& str = str_col->get_data_at(index_now); |
220 | 33 | int count = 0; |
221 | 33 | size_t pos = 0; |
222 | 101 | while (pos < str.size) { |
223 | 87 | auto str_pos = str.data + pos; |
224 | 87 | auto str_size = str.size - pos; |
225 | 87 | re2::StringPiece str_sp_current = re2::StringPiece(str_pos, str_size); |
226 | 87 | re2::StringPiece match; |
227 | | |
228 | 87 | bool success = re->Match(str_sp_current, 0, str_size, re2::RE2::UNANCHORED, &match, 1); |
229 | 87 | if (!success) { |
230 | 19 | break; |
231 | 19 | } |
232 | 68 | if (match.empty()) { |
233 | 4 | pos += 1; |
234 | 4 | continue; |
235 | 4 | } |
236 | 64 | count++; |
237 | 64 | size_t match_start = match.data() - str_sp_current.data(); |
238 | 64 | pos += match_start + match.size(); |
239 | 64 | } |
240 | | |
241 | 33 | return count; |
242 | 33 | } |
243 | | }; |
244 | | |
245 | | class FunctionRegexpCount : public IFunction { |
246 | | public: |
247 | | static constexpr auto name = "regexp_count"; |
248 | | |
249 | 24 | static FunctionPtr create() { return std::make_shared<FunctionRegexpCount>(); } |
250 | | |
251 | 1 | String get_name() const override { return name; } |
252 | | |
253 | 15 | size_t get_number_of_arguments() const override { return 2; } |
254 | | |
255 | 15 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
256 | 15 | return std::make_shared<DataTypeInt32>(); |
257 | 15 | } |
258 | | |
259 | 71 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
260 | 71 | if (scope == FunctionContext::THREAD_LOCAL) { |
261 | 56 | if (context->is_col_constant(1)) { |
262 | 41 | DCHECK(!context->get_function_state(scope)); |
263 | 41 | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
264 | 41 | const auto& pattern = pattern_col->get_data_at(0); |
265 | 41 | if (pattern.size == 0) { |
266 | 4 | return Status::OK(); |
267 | 4 | } |
268 | | |
269 | 37 | std::string error_str; |
270 | 37 | std::unique_ptr<re2::RE2> scoped_re; |
271 | 37 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
272 | 37 | StringRef(), scoped_re); |
273 | 37 | if (!st) { |
274 | 0 | context->set_error(error_str.c_str()); |
275 | 0 | return Status::InvalidArgument(error_str); |
276 | 0 | } |
277 | 37 | std::shared_ptr<re2::RE2> re(scoped_re.release()); |
278 | 37 | context->set_function_state(scope, re); |
279 | 37 | } |
280 | 56 | } |
281 | 67 | return Status::OK(); |
282 | 71 | } |
283 | | |
284 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
285 | 15 | uint32_t result, size_t input_rows_count) const override { |
286 | 15 | auto result_data_column = ColumnInt32::create(input_rows_count); |
287 | 15 | auto& result_data = result_data_column->get_data(); |
288 | | |
289 | 15 | ColumnPtr argument_columns[2]; |
290 | | |
291 | 15 | argument_columns[0] = block.get_by_position(arguments[0]).column; |
292 | 15 | argument_columns[1] = block.get_by_position(arguments[1]).column; |
293 | 15 | RegexpCountImpl::execute_impl(context, argument_columns, input_rows_count, result_data); |
294 | | |
295 | 15 | block.get_by_position(result).column = std::move(result_data_column); |
296 | 15 | return Status::OK(); |
297 | 15 | } |
298 | | }; |
299 | | |
300 | | struct ThreeParamTypes { |
301 | 16 | static DataTypes get_variadic_argument_types() { |
302 | 16 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
303 | 16 | std::make_shared<DataTypeString>()}; |
304 | 16 | } |
305 | | }; |
306 | | |
307 | | struct FourParamTypes { |
308 | 16 | static DataTypes get_variadic_argument_types() { |
309 | 16 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
310 | 16 | std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()}; |
311 | 16 | } |
312 | | }; |
313 | | |
314 | | // The FunctionRegexpReplace template implements both regexp_replace and regexp_replace_one |
315 | | template <typename Impl, typename ParamTypes> |
316 | | class FunctionRegexpReplace : public IFunction { |
317 | | public: |
318 | | static constexpr auto name = Impl::name; |
319 | | |
320 | 94 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); }_ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE6createEv Line | Count | Source | 320 | 34 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
_ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE6createEv Line | Count | Source | 320 | 17 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE6createEv Line | Count | Source | 320 | 25 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE6createEv Line | Count | Source | 320 | 18 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
|
321 | | |
322 | 0 | String get_name() const override { return name; }Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE8get_nameB5cxx11Ev |
323 | | |
324 | 0 | size_t get_number_of_arguments() const override { |
325 | 0 | return get_variadic_argument_types_impl().size(); |
326 | 0 | } Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE23get_number_of_argumentsEv |
327 | | |
328 | 62 | bool is_variadic() const override { return true; }_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE11is_variadicEv Line | Count | Source | 328 | 26 | bool is_variadic() const override { return true; } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE11is_variadicEv Line | Count | Source | 328 | 9 | bool is_variadic() const override { return true; } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE11is_variadicEv Line | Count | Source | 328 | 17 | bool is_variadic() const override { return true; } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE11is_variadicEv Line | Count | Source | 328 | 10 | bool is_variadic() const override { return true; } |
|
329 | | |
330 | 58 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
331 | 58 | return make_nullable(std::make_shared<DataTypeString>()); |
332 | 58 | } _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 330 | 25 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 331 | 25 | return make_nullable(std::make_shared<DataTypeString>()); | 332 | 25 | } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 330 | 8 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 331 | 8 | return make_nullable(std::make_shared<DataTypeString>()); | 332 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 330 | 16 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 331 | 16 | return make_nullable(std::make_shared<DataTypeString>()); | 332 | 16 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 330 | 9 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 331 | 9 | return make_nullable(std::make_shared<DataTypeString>()); | 332 | 9 | } |
|
333 | | |
334 | 32 | DataTypes get_variadic_argument_types_impl() const override { |
335 | 32 | return ParamTypes::get_variadic_argument_types(); |
336 | 32 | } _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 334 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 335 | 8 | return ParamTypes::get_variadic_argument_types(); | 336 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 334 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 335 | 8 | return ParamTypes::get_variadic_argument_types(); | 336 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 334 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 335 | 8 | return ParamTypes::get_variadic_argument_types(); | 336 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 334 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 335 | 8 | return ParamTypes::get_variadic_argument_types(); | 336 | 8 | } |
|
337 | | |
338 | 345 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
339 | 345 | if (scope == FunctionContext::THREAD_LOCAL) { |
340 | 287 | if (context->is_col_constant(1)) { |
341 | 139 | DCHECK(!context->get_function_state(scope)); |
342 | 139 | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
343 | 139 | const auto& pattern = pattern_col->get_data_at(0); |
344 | 139 | if (pattern.size == 0) { |
345 | 6 | return Status::OK(); |
346 | 6 | } |
347 | | |
348 | 133 | std::string error_str; |
349 | 133 | std::unique_ptr<re2::RE2> scoped_re; |
350 | 133 | StringRef options_value; |
351 | 133 | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { |
352 | 80 | DCHECK_EQ(context->get_num_args(), 4); |
353 | 80 | DCHECK(context->is_col_constant(3)); |
354 | 80 | const auto options_col = context->get_constant_col(3)->column_ptr; |
355 | 80 | options_value = options_col->get_data_at(0); |
356 | 80 | } |
357 | | |
358 | 133 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
359 | 133 | options_value, scoped_re); |
360 | 133 | if (!st) { |
361 | 0 | context->set_error(error_str.c_str()); |
362 | 0 | return Status::InvalidArgument(error_str); |
363 | 0 | } |
364 | 133 | std::shared_ptr<re2::RE2> re(scoped_re.release()); |
365 | 133 | context->set_function_state(scope, re); |
366 | 133 | } |
367 | 287 | } |
368 | 339 | return Status::OK(); |
369 | 345 | } _ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 338 | 100 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 339 | 100 | if (scope == FunctionContext::THREAD_LOCAL) { | 340 | 75 | if (context->is_col_constant(1)) { | 341 | 43 | DCHECK(!context->get_function_state(scope)); | 342 | 43 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 343 | 43 | const auto& pattern = pattern_col->get_data_at(0); | 344 | 43 | if (pattern.size == 0) { | 345 | 4 | return Status::OK(); | 346 | 4 | } | 347 | | | 348 | 39 | std::string error_str; | 349 | 39 | std::unique_ptr<re2::RE2> scoped_re; | 350 | 39 | StringRef options_value; | 351 | | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 352 | | DCHECK_EQ(context->get_num_args(), 4); | 353 | | DCHECK(context->is_col_constant(3)); | 354 | | const auto options_col = context->get_constant_col(3)->column_ptr; | 355 | | options_value = options_col->get_data_at(0); | 356 | | } | 357 | | | 358 | 39 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 359 | 39 | options_value, scoped_re); | 360 | 39 | if (!st) { | 361 | 0 | context->set_error(error_str.c_str()); | 362 | 0 | return Status::InvalidArgument(error_str); | 363 | 0 | } | 364 | 39 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 365 | 39 | context->set_function_state(scope, re); | 366 | 39 | } | 367 | 75 | } | 368 | 96 | return Status::OK(); | 369 | 100 | } |
_ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 338 | 84 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 339 | 84 | if (scope == FunctionContext::THREAD_LOCAL) { | 340 | 76 | if (context->is_col_constant(1)) { | 341 | 40 | DCHECK(!context->get_function_state(scope)); | 342 | 40 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 343 | 40 | const auto& pattern = pattern_col->get_data_at(0); | 344 | 40 | if (pattern.size == 0) { | 345 | 0 | return Status::OK(); | 346 | 0 | } | 347 | | | 348 | 40 | std::string error_str; | 349 | 40 | std::unique_ptr<re2::RE2> scoped_re; | 350 | 40 | StringRef options_value; | 351 | 40 | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 352 | 40 | DCHECK_EQ(context->get_num_args(), 4); | 353 | 40 | DCHECK(context->is_col_constant(3)); | 354 | 40 | const auto options_col = context->get_constant_col(3)->column_ptr; | 355 | 40 | options_value = options_col->get_data_at(0); | 356 | 40 | } | 357 | | | 358 | 40 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 359 | 40 | options_value, scoped_re); | 360 | 40 | if (!st) { | 361 | 0 | context->set_error(error_str.c_str()); | 362 | 0 | return Status::InvalidArgument(error_str); | 363 | 0 | } | 364 | 40 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 365 | 40 | context->set_function_state(scope, re); | 366 | 40 | } | 367 | 76 | } | 368 | 84 | return Status::OK(); | 369 | 84 | } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 338 | 64 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 339 | 64 | if (scope == FunctionContext::THREAD_LOCAL) { | 340 | 48 | if (context->is_col_constant(1)) { | 341 | 16 | DCHECK(!context->get_function_state(scope)); | 342 | 16 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 343 | 16 | const auto& pattern = pattern_col->get_data_at(0); | 344 | 16 | if (pattern.size == 0) { | 345 | 2 | return Status::OK(); | 346 | 2 | } | 347 | | | 348 | 14 | std::string error_str; | 349 | 14 | std::unique_ptr<re2::RE2> scoped_re; | 350 | 14 | StringRef options_value; | 351 | | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 352 | | DCHECK_EQ(context->get_num_args(), 4); | 353 | | DCHECK(context->is_col_constant(3)); | 354 | | const auto options_col = context->get_constant_col(3)->column_ptr; | 355 | | options_value = options_col->get_data_at(0); | 356 | | } | 357 | | | 358 | 14 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 359 | 14 | options_value, scoped_re); | 360 | 14 | if (!st) { | 361 | 0 | context->set_error(error_str.c_str()); | 362 | 0 | return Status::InvalidArgument(error_str); | 363 | 0 | } | 364 | 14 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 365 | 14 | context->set_function_state(scope, re); | 366 | 14 | } | 367 | 48 | } | 368 | 62 | return Status::OK(); | 369 | 64 | } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 338 | 97 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 339 | 97 | if (scope == FunctionContext::THREAD_LOCAL) { | 340 | 88 | if (context->is_col_constant(1)) { | 341 | 40 | DCHECK(!context->get_function_state(scope)); | 342 | 40 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 343 | 40 | const auto& pattern = pattern_col->get_data_at(0); | 344 | 40 | if (pattern.size == 0) { | 345 | 0 | return Status::OK(); | 346 | 0 | } | 347 | | | 348 | 40 | std::string error_str; | 349 | 40 | std::unique_ptr<re2::RE2> scoped_re; | 350 | 40 | StringRef options_value; | 351 | 40 | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 352 | 40 | DCHECK_EQ(context->get_num_args(), 4); | 353 | 40 | DCHECK(context->is_col_constant(3)); | 354 | 40 | const auto options_col = context->get_constant_col(3)->column_ptr; | 355 | 40 | options_value = options_col->get_data_at(0); | 356 | 40 | } | 357 | | | 358 | 40 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 359 | 40 | options_value, scoped_re); | 360 | 40 | if (!st) { | 361 | 0 | context->set_error(error_str.c_str()); | 362 | 0 | return Status::InvalidArgument(error_str); | 363 | 0 | } | 364 | 40 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 365 | 40 | context->set_function_state(scope, re); | 366 | 40 | } | 367 | 88 | } | 368 | 97 | return Status::OK(); | 369 | 97 | } |
|
370 | | |
371 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
372 | 75 | uint32_t result, size_t input_rows_count) const override { |
373 | 75 | size_t argument_size = arguments.size(); |
374 | | |
375 | 75 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); |
376 | 75 | auto result_data_column = ColumnString::create(); |
377 | 75 | auto& result_data = result_data_column->get_chars(); |
378 | 75 | auto& result_offset = result_data_column->get_offsets(); |
379 | 75 | result_offset.resize(input_rows_count); |
380 | | |
381 | 75 | bool col_const[3]; |
382 | 75 | ColumnPtr argument_columns[3]; |
383 | 300 | for (int i = 0; i < 3; ++i) { |
384 | 225 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); |
385 | 225 | } |
386 | 75 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( |
387 | 6 | *block.get_by_position(arguments[0]).column) |
388 | 6 | .convert_to_full_column() |
389 | 75 | : block.get_by_position(arguments[0]).column; |
390 | | |
391 | 75 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); |
392 | | |
393 | 75 | StringRef options_value; |
394 | 75 | if (col_const[1] && col_const[2]) { |
395 | 3 | Impl::execute_impl_const_args(context, argument_columns, options_value, |
396 | 3 | input_rows_count, result_data, result_offset, |
397 | 3 | result_null_map->get_data()); |
398 | 72 | } else { |
399 | | // the options have check in FE, so is always const, and get idx of 0 |
400 | 72 | if (argument_size == 4) { |
401 | 15 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); |
402 | 15 | } |
403 | 72 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, |
404 | 72 | result_data, result_offset, result_null_map->get_data()); |
405 | 72 | } |
406 | | |
407 | 75 | block.get_by_position(result).column = |
408 | 75 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); |
409 | 75 | return Status::OK(); |
410 | 75 | } _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 372 | 32 | uint32_t result, size_t input_rows_count) const override { | 373 | 32 | size_t argument_size = arguments.size(); | 374 | | | 375 | 32 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 376 | 32 | auto result_data_column = ColumnString::create(); | 377 | 32 | auto& result_data = result_data_column->get_chars(); | 378 | 32 | auto& result_offset = result_data_column->get_offsets(); | 379 | 32 | result_offset.resize(input_rows_count); | 380 | | | 381 | 32 | bool col_const[3]; | 382 | 32 | ColumnPtr argument_columns[3]; | 383 | 128 | for (int i = 0; i < 3; ++i) { | 384 | 96 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 385 | 96 | } | 386 | 32 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 387 | 0 | *block.get_by_position(arguments[0]).column) | 388 | 0 | .convert_to_full_column() | 389 | 32 | : block.get_by_position(arguments[0]).column; | 390 | | | 391 | 32 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 392 | | | 393 | 32 | StringRef options_value; | 394 | 32 | if (col_const[1] && col_const[2]) { | 395 | 1 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 396 | 1 | input_rows_count, result_data, result_offset, | 397 | 1 | result_null_map->get_data()); | 398 | 31 | } else { | 399 | | // the options have check in FE, so is always const, and get idx of 0 | 400 | 31 | if (argument_size == 4) { | 401 | 0 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 402 | 0 | } | 403 | 31 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 404 | 31 | result_data, result_offset, result_null_map->get_data()); | 405 | 31 | } | 406 | | | 407 | 32 | block.get_by_position(result).column 
= | 408 | 32 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 409 | 32 | return Status::OK(); | 410 | 32 | } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 372 | 8 | uint32_t result, size_t input_rows_count) const override { | 373 | 8 | size_t argument_size = arguments.size(); | 374 | | | 375 | 8 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 376 | 8 | auto result_data_column = ColumnString::create(); | 377 | 8 | auto& result_data = result_data_column->get_chars(); | 378 | 8 | auto& result_offset = result_data_column->get_offsets(); | 379 | 8 | result_offset.resize(input_rows_count); | 380 | | | 381 | 8 | bool col_const[3]; | 382 | 8 | ColumnPtr argument_columns[3]; | 383 | 32 | for (int i = 0; i < 3; ++i) { | 384 | 24 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 385 | 24 | } | 386 | 8 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 387 | 3 | *block.get_by_position(arguments[0]).column) | 388 | 3 | .convert_to_full_column() | 389 | 8 | : block.get_by_position(arguments[0]).column; | 390 | | | 391 | 8 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 392 | | | 393 | 8 | StringRef options_value; | 394 | 8 | if (col_const[1] && col_const[2]) { | 395 | 1 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 396 | 1 | input_rows_count, result_data, result_offset, | 397 | 1 | result_null_map->get_data()); | 398 | 7 | } else { | 399 | | // the options have check in FE, so is always const, and get idx of 0 | 400 | 7 | if (argument_size == 4) { | 401 | 7 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 402 | 7 | } | 403 | 7 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 404 | 7 | result_data, result_offset, result_null_map->get_data()); | 405 | 7 | } | 406 | | | 407 | 8 | block.get_by_position(result).column = | 408 | 8 | 
ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 409 | 8 | return Status::OK(); | 410 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 372 | 26 | uint32_t result, size_t input_rows_count) const override { | 373 | 26 | size_t argument_size = arguments.size(); | 374 | | | 375 | 26 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 376 | 26 | auto result_data_column = ColumnString::create(); | 377 | 26 | auto& result_data = result_data_column->get_chars(); | 378 | 26 | auto& result_offset = result_data_column->get_offsets(); | 379 | 26 | result_offset.resize(input_rows_count); | 380 | | | 381 | 26 | bool col_const[3]; | 382 | 26 | ColumnPtr argument_columns[3]; | 383 | 104 | for (int i = 0; i < 3; ++i) { | 384 | 78 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 385 | 78 | } | 386 | 26 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 387 | 0 | *block.get_by_position(arguments[0]).column) | 388 | 0 | .convert_to_full_column() | 389 | 26 | : block.get_by_position(arguments[0]).column; | 390 | | | 391 | 26 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 392 | | | 393 | 26 | StringRef options_value; | 394 | 26 | if (col_const[1] && col_const[2]) { | 395 | 0 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 396 | 0 | input_rows_count, result_data, result_offset, | 397 | 0 | result_null_map->get_data()); | 398 | 26 | } else { | 399 | | // the options have check in FE, so is always const, and get idx of 0 | 400 | 26 | if (argument_size == 4) { | 401 | 0 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 402 | 0 | } | 403 | 26 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 404 | 26 | result_data, result_offset, result_null_map->get_data()); | 405 | 26 | } | 406 | | | 407 | 26 | block.get_by_position(result).column = | 408 | 
26 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 409 | 26 | return Status::OK(); | 410 | 26 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 372 | 9 | uint32_t result, size_t input_rows_count) const override { | 373 | 9 | size_t argument_size = arguments.size(); | 374 | | | 375 | 9 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 376 | 9 | auto result_data_column = ColumnString::create(); | 377 | 9 | auto& result_data = result_data_column->get_chars(); | 378 | 9 | auto& result_offset = result_data_column->get_offsets(); | 379 | 9 | result_offset.resize(input_rows_count); | 380 | | | 381 | 9 | bool col_const[3]; | 382 | 9 | ColumnPtr argument_columns[3]; | 383 | 36 | for (int i = 0; i < 3; ++i) { | 384 | 27 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 385 | 27 | } | 386 | 9 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 387 | 3 | *block.get_by_position(arguments[0]).column) | 388 | 3 | .convert_to_full_column() | 389 | 9 | : block.get_by_position(arguments[0]).column; | 390 | | | 391 | 9 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 392 | | | 393 | 9 | StringRef options_value; | 394 | 9 | if (col_const[1] && col_const[2]) { | 395 | 1 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 396 | 1 | input_rows_count, result_data, result_offset, | 397 | 1 | result_null_map->get_data()); | 398 | 8 | } else { | 399 | | // the options have check in FE, so is always const, and get idx of 0 | 400 | 8 | if (argument_size == 4) { | 401 | 8 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 402 | 8 | } | 403 | 8 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 404 | 8 | result_data, result_offset, result_null_map->get_data()); | 405 | 8 | } | 406 | | | 407 | 9 | block.get_by_position(result).column = | 408 | 9 | 
ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 409 | 9 | return Status::OK(); | 410 | 9 | } |
|
411 | | }; |
412 | | |
413 | | struct RegexpReplaceImpl { |
414 | | static constexpr auto name = "regexp_replace"; |
415 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
416 | | const StringRef& options_value, size_t input_rows_count, |
417 | | ColumnString::Chars& result_data, ColumnString::Offsets& result_offset, |
418 | 38 | NullMap& null_map) { |
419 | 38 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
420 | 38 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
421 | 38 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get()); |
422 | | |
423 | 130 | for (size_t i = 0; i < input_rows_count; ++i) { |
424 | 92 | if (null_map[i]) { |
425 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
426 | 0 | continue; |
427 | 0 | } |
428 | 92 | _execute_inner_loop<false>(context, str_col, pattern_col, replace_col, options_value, |
429 | 92 | result_data, result_offset, null_map, i); |
430 | 92 | } |
431 | 38 | } |
432 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], |
433 | | const StringRef& options_value, size_t input_rows_count, |
434 | | ColumnString::Chars& result_data, |
435 | 2 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
436 | 2 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
437 | 2 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
438 | 2 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get()); |
439 | | |
440 | 12 | for (size_t i = 0; i < input_rows_count; ++i) { |
441 | 10 | if (null_map[i]) { |
442 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
443 | 0 | continue; |
444 | 0 | } |
445 | 10 | _execute_inner_loop<true>(context, str_col, pattern_col, replace_col, options_value, |
446 | 10 | result_data, result_offset, null_map, i); |
447 | 10 | } |
448 | 2 | } |
449 | | template <bool Const> |
450 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
451 | | const ColumnString* pattern_col, |
452 | | const ColumnString* replace_col, const StringRef& options_value, |
453 | | ColumnString::Chars& result_data, |
454 | | ColumnString::Offsets& result_offset, NullMap& null_map, |
455 | 102 | const size_t index_now) { |
456 | 102 | re2::RE2* re = reinterpret_cast<re2::RE2*>( |
457 | 102 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
458 | 102 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr |
459 | 102 | if (re == nullptr) { |
460 | 67 | std::string error_str; |
461 | 67 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); |
462 | 67 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
463 | 67 | options_value, scoped_re); |
464 | 67 | if (!st) { |
465 | 0 | context->add_warning(error_str.c_str()); |
466 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); |
467 | 0 | return; |
468 | 0 | } |
469 | 67 | re = scoped_re.get(); |
470 | 67 | } |
471 | | |
472 | 102 | re2::StringPiece replace_str = re2::StringPiece( |
473 | 102 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); |
474 | | |
475 | 102 | std::string result_str(str_col->get_data_at(index_now).to_string()); |
476 | 102 | re2::RE2::GlobalReplace(&result_str, *re, replace_str); |
477 | 102 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); |
478 | 102 | } _ZN5doris17RegexpReplaceImpl19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 455 | 10 | const size_t index_now) { | 456 | 10 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 457 | 10 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 458 | 10 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 459 | 10 | if (re == nullptr) { | 460 | 0 | std::string error_str; | 461 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 462 | 0 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 463 | 0 | options_value, scoped_re); | 464 | 0 | if (!st) { | 465 | 0 | context->add_warning(error_str.c_str()); | 466 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 467 | 0 | return; | 468 | 0 | } | 469 | 0 | re = scoped_re.get(); | 470 | 0 | } | 471 | | | 472 | 10 | re2::StringPiece replace_str = re2::StringPiece( | 473 | 10 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 474 | | | 475 | 10 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 476 | 10 | re2::RE2::GlobalReplace(&result_str, *re, replace_str); | 477 | 10 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 478 | 10 | } |
_ZN5doris17RegexpReplaceImpl19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 455 | 92 | const size_t index_now) { | 456 | 92 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 457 | 92 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 458 | 92 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 459 | 92 | if (re == nullptr) { | 460 | 67 | std::string error_str; | 461 | 67 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 462 | 67 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 463 | 67 | options_value, scoped_re); | 464 | 67 | if (!st) { | 465 | 0 | context->add_warning(error_str.c_str()); | 466 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 467 | 0 | return; | 468 | 0 | } | 469 | 67 | re = scoped_re.get(); | 470 | 67 | } | 471 | | | 472 | 92 | re2::StringPiece replace_str = re2::StringPiece( | 473 | 92 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 474 | | | 475 | 92 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 476 | 92 | re2::RE2::GlobalReplace(&result_str, *re, replace_str); | 477 | 92 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 478 | 92 | } |
|
479 | | }; |
480 | | |
481 | | struct RegexpReplaceOneImpl { |
482 | | static constexpr auto name = "regexp_replace_one"; |
483 | | |
484 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
485 | | const StringRef& options_value, size_t input_rows_count, |
486 | | ColumnString::Chars& result_data, ColumnString::Offsets& result_offset, |
487 | 34 | NullMap& null_map) { |
488 | 34 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
489 | 34 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
490 | 34 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get()); |
491 | | // 3 args |
492 | 126 | for (size_t i = 0; i < input_rows_count; ++i) { |
493 | 92 | if (null_map[i]) { |
494 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
495 | 0 | continue; |
496 | 0 | } |
497 | 92 | _execute_inner_loop<false>(context, str_col, pattern_col, replace_col, options_value, |
498 | 92 | result_data, result_offset, null_map, i); |
499 | 92 | } |
500 | 34 | } |
501 | | |
502 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], |
503 | | const StringRef& options_value, size_t input_rows_count, |
504 | | ColumnString::Chars& result_data, |
505 | 1 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
506 | 1 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
507 | 1 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
508 | 1 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get()); |
509 | | // 3 args |
510 | 6 | for (size_t i = 0; i < input_rows_count; ++i) { |
511 | 5 | if (null_map[i]) { |
512 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
513 | 0 | continue; |
514 | 0 | } |
515 | 5 | _execute_inner_loop<true>(context, str_col, pattern_col, replace_col, options_value, |
516 | 5 | result_data, result_offset, null_map, i); |
517 | 5 | } |
518 | 1 | } |
519 | | template <bool Const> |
520 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
521 | | const ColumnString* pattern_col, |
522 | | const ColumnString* replace_col, const StringRef& options_value, |
523 | | ColumnString::Chars& result_data, |
524 | | ColumnString::Offsets& result_offset, NullMap& null_map, |
525 | 97 | const size_t index_now) { |
526 | 97 | re2::RE2* re = reinterpret_cast<re2::RE2*>( |
527 | 97 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
528 | 97 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr |
529 | 97 | if (re == nullptr) { |
530 | 72 | std::string error_str; |
531 | 72 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); |
532 | 72 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
533 | 72 | options_value, scoped_re); |
534 | 72 | if (!st) { |
535 | 0 | context->add_warning(error_str.c_str()); |
536 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); |
537 | 0 | return; |
538 | 0 | } |
539 | 72 | re = scoped_re.get(); |
540 | 72 | } |
541 | | |
542 | 97 | re2::StringPiece replace_str = re2::StringPiece( |
543 | 97 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); |
544 | | |
545 | 97 | std::string result_str(str_col->get_data_at(index_now).to_string()); |
546 | 97 | re2::RE2::Replace(&result_str, *re, replace_str); |
547 | 97 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); |
548 | 97 | } _ZN5doris20RegexpReplaceOneImpl19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 525 | 5 | const size_t index_now) { | 526 | 5 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 527 | 5 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 528 | 5 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 529 | 5 | if (re == nullptr) { | 530 | 0 | std::string error_str; | 531 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 532 | 0 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 533 | 0 | options_value, scoped_re); | 534 | 0 | if (!st) { | 535 | 0 | context->add_warning(error_str.c_str()); | 536 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 537 | 0 | return; | 538 | 0 | } | 539 | 0 | re = scoped_re.get(); | 540 | 0 | } | 541 | | | 542 | 5 | re2::StringPiece replace_str = re2::StringPiece( | 543 | 5 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 544 | | | 545 | 5 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 546 | 5 | re2::RE2::Replace(&result_str, *re, replace_str); | 547 | 5 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 548 | 5 | } |
_ZN5doris20RegexpReplaceOneImpl19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 525 | 92 | const size_t index_now) { | 526 | 92 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 527 | 92 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 528 | 92 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 529 | 92 | if (re == nullptr) { | 530 | 72 | std::string error_str; | 531 | 72 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 532 | 72 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 533 | 72 | options_value, scoped_re); | 534 | 72 | if (!st) { | 535 | 0 | context->add_warning(error_str.c_str()); | 536 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 537 | 0 | return; | 538 | 0 | } | 539 | 72 | re = scoped_re.get(); | 540 | 72 | } | 541 | | | 542 | 92 | re2::StringPiece replace_str = re2::StringPiece( | 543 | 92 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 544 | | | 545 | 92 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 546 | 92 | re2::RE2::Replace(&result_str, *re, replace_str); | 547 | 92 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 548 | 92 | } |
|
549 | | }; |
550 | | |
551 | | template <bool ReturnNull> |
552 | | struct RegexpExtractImpl { |
553 | | static constexpr auto name = ReturnNull ? "regexp_extract_or_null" : "regexp_extract"; |
554 | | // 3 args |
555 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
556 | | size_t input_rows_count, ColumnString::Chars& result_data, |
557 | 59 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
558 | 59 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
559 | 59 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
560 | 59 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); |
561 | 188 | for (size_t i = 0; i < input_rows_count; ++i) { |
562 | 129 | if (null_map[i]) { |
563 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
564 | 0 | continue; |
565 | 0 | } |
566 | 129 | const auto& index_data = index_col->get_int(i); |
567 | 129 | if (index_data < 0) { |
568 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) |
569 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); |
570 | 0 | continue; |
571 | 0 | } |
572 | 129 | _execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data, |
573 | 129 | result_offset, null_map, i); |
574 | 129 | } |
575 | 59 | } _ZN5doris17RegexpExtractImplILb1EE12execute_implEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ Line | Count | Source | 557 | 18 | ColumnString::Offsets& result_offset, NullMap& null_map) { | 558 | 18 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); | 559 | 18 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); | 560 | 18 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); | 561 | 36 | for (size_t i = 0; i < input_rows_count; ++i) { | 562 | 18 | if (null_map[i]) { | 563 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); | 564 | 0 | continue; | 565 | 0 | } | 566 | 18 | const auto& index_data = index_col->get_int(i); | 567 | 18 | if (index_data < 0) { | 568 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) | 569 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); | 570 | 0 | continue; | 571 | 0 | } | 572 | 18 | _execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data, | 573 | 18 | result_offset, null_map, i); | 574 | 18 | } | 575 | 18 | } |
_ZN5doris17RegexpExtractImplILb0EE12execute_implEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ Line | Count | Source | 557 | 41 | ColumnString::Offsets& result_offset, NullMap& null_map) { | 558 | 41 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); | 559 | 41 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); | 560 | 41 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); | 561 | 152 | for (size_t i = 0; i < input_rows_count; ++i) { | 562 | 111 | if (null_map[i]) { | 563 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); | 564 | 0 | continue; | 565 | 0 | } | 566 | 111 | const auto& index_data = index_col->get_int(i); | 567 | 111 | if (index_data < 0) { | 568 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) | 569 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); | 570 | 0 | continue; | 571 | 0 | } | 572 | 111 | _execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data, | 573 | 111 | result_offset, null_map, i); | 574 | 111 | } | 575 | 41 | } |
|
576 | | |
577 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], |
578 | | size_t input_rows_count, ColumnString::Chars& result_data, |
579 | 1 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
580 | 1 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
581 | 1 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
582 | 1 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); |
583 | | |
584 | 1 | const auto& index_data = index_col->get_int(0); |
585 | 1 | if (index_data < 0) { |
586 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { |
587 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) |
588 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); |
589 | 0 | } |
590 | 0 | return; |
591 | 0 | } |
592 | | |
593 | 8 | for (size_t i = 0; i < input_rows_count; ++i) { |
594 | 7 | if (null_map[i]) { |
595 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
596 | 0 | continue; |
597 | 0 | } |
598 | | |
599 | 7 | _execute_inner_loop<true>(context, str_col, pattern_col, index_data, result_data, |
600 | 7 | result_offset, null_map, i); |
601 | 7 | } |
602 | 1 | } Unexecuted instantiation: _ZN5doris17RegexpExtractImplILb1EE23execute_impl_const_argsEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ _ZN5doris17RegexpExtractImplILb0EE23execute_impl_const_argsEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ Line | Count | Source | 579 | 1 | ColumnString::Offsets& result_offset, NullMap& null_map) { | 580 | 1 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); | 581 | 1 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); | 582 | 1 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); | 583 | | | 584 | 1 | const auto& index_data = index_col->get_int(0); | 585 | 1 | if (index_data < 0) { | 586 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { | 587 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) | 588 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); | 589 | 0 | } | 590 | 0 | return; | 591 | 0 | } | 592 | | | 593 | 8 | for (size_t i = 0; i < input_rows_count; ++i) { | 594 | 7 | if (null_map[i]) { | 595 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); | 596 | 0 | continue; | 597 | 0 | } | 598 | | | 599 | 7 | _execute_inner_loop<true>(context, str_col, pattern_col, index_data, result_data, | 600 | 7 | result_offset, null_map, i); | 601 | 7 | } | 602 | 1 | } |
|
603 | | template <bool Const> |
604 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
605 | | const ColumnString* pattern_col, const Int64 index_data, |
606 | | ColumnString::Chars& result_data, |
607 | | ColumnString::Offsets& result_offset, NullMap& null_map, |
608 | 136 | const size_t index_now) { |
609 | 136 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( |
610 | 136 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
611 | 136 | std::unique_ptr<RegexpExtractEngine> scoped_engine; |
612 | | |
613 | 136 | if (engine == nullptr) { |
614 | 78 | std::string error_str; |
615 | 78 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); |
616 | 78 | scoped_engine = std::make_unique<RegexpExtractEngine>(); |
617 | 78 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, |
618 | 78 | context->state()->enable_extended_regex()); |
619 | 78 | if (!st) { |
620 | 0 | context->add_warning(error_str.c_str()); |
621 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); |
622 | 0 | return; |
623 | 0 | } |
624 | 78 | engine = scoped_engine.get(); |
625 | 78 | } |
626 | | |
627 | 136 | const auto& str = str_col->get_data_at(index_now); |
628 | | |
629 | 136 | int max_matches = 1 + engine->number_of_capturing_groups(); |
630 | 136 | if (index_data >= max_matches) { |
631 | 84 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) |
632 | 84 | : StringOP::push_empty_string(index_now, result_data, result_offset); |
633 | 84 | return; |
634 | 84 | } |
635 | | |
636 | 52 | std::string match_result; |
637 | 52 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), |
638 | 52 | match_result); |
639 | | |
640 | 52 | if (!success) { |
641 | 13 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) |
642 | 13 | : StringOP::push_empty_string(index_now, result_data, result_offset); |
643 | 13 | return; |
644 | 13 | } |
645 | | |
646 | 39 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), |
647 | 39 | index_now, result_data, result_offset); |
648 | 39 | } Unexecuted instantiation: _ZN5doris17RegexpExtractImplILb1EE19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m _ZN5doris17RegexpExtractImplILb1EE19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m Line | Count | Source | 608 | 18 | const size_t index_now) { | 609 | 18 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 610 | 18 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 611 | 18 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 612 | | | 613 | 18 | if (engine == nullptr) { | 614 | 0 | std::string error_str; | 615 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 616 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 617 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 618 | 0 | context->state()->enable_extended_regex()); | 619 | 0 | if (!st) { | 620 | 0 | context->add_warning(error_str.c_str()); | 621 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 622 | 0 | return; | 623 | 0 | } | 624 | 0 | engine = scoped_engine.get(); | 625 | 0 | } | 626 | | | 627 | 18 | const auto& str = str_col->get_data_at(index_now); | 628 | | | 629 | 18 | int max_matches = 1 + engine->number_of_capturing_groups(); | 630 | 18 | if (index_data >= max_matches) { | 631 | 1 | ReturnNull ? 
StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 632 | 1 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 633 | 1 | return; | 634 | 1 | } | 635 | | | 636 | 17 | std::string match_result; | 637 | 17 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), | 638 | 17 | match_result); | 639 | | | 640 | 17 | if (!success) { | 641 | 1 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 642 | 1 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 643 | 1 | return; | 644 | 1 | } | 645 | | | 646 | 16 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), | 647 | 16 | index_now, result_data, result_offset); | 648 | 16 | } |
_ZN5doris17RegexpExtractImplILb0EE19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m Line | Count | Source | 608 | 7 | const size_t index_now) { | 609 | 7 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 610 | 7 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 611 | 7 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 612 | | | 613 | 7 | if (engine == nullptr) { | 614 | 0 | std::string error_str; | 615 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 616 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 617 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 618 | 0 | context->state()->enable_extended_regex()); | 619 | 0 | if (!st) { | 620 | 0 | context->add_warning(error_str.c_str()); | 621 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 622 | 0 | return; | 623 | 0 | } | 624 | 0 | engine = scoped_engine.get(); | 625 | 0 | } | 626 | | | 627 | 7 | const auto& str = str_col->get_data_at(index_now); | 628 | | | 629 | 7 | int max_matches = 1 + engine->number_of_capturing_groups(); | 630 | 7 | if (index_data >= max_matches) { | 631 | 0 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 632 | 0 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 633 | 0 | return; | 634 | 0 | } | 635 | | | 636 | 7 | std::string match_result; | 637 | 7 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), | 638 | 7 | match_result); | 639 | | | 640 | 7 | if (!success) { | 641 | 7 | ReturnNull ? 
StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 642 | 7 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 643 | 7 | return; | 644 | 7 | } | 645 | | | 646 | 0 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), | 647 | 0 | index_now, result_data, result_offset); | 648 | 0 | } |
_ZN5doris17RegexpExtractImplILb0EE19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m Line | Count | Source | 608 | 111 | const size_t index_now) { | 609 | 111 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 610 | 111 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 611 | 111 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 612 | | | 613 | 111 | if (engine == nullptr) { | 614 | 78 | std::string error_str; | 615 | 78 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 616 | 78 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 617 | 78 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 618 | 78 | context->state()->enable_extended_regex()); | 619 | 78 | if (!st) { | 620 | 0 | context->add_warning(error_str.c_str()); | 621 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 622 | 0 | return; | 623 | 0 | } | 624 | 78 | engine = scoped_engine.get(); | 625 | 78 | } | 626 | | | 627 | 111 | const auto& str = str_col->get_data_at(index_now); | 628 | | | 629 | 111 | int max_matches = 1 + engine->number_of_capturing_groups(); | 630 | 111 | if (index_data >= max_matches) { | 631 | 83 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 632 | 83 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 633 | 83 | return; | 634 | 83 | } | 635 | | | 636 | 28 | std::string match_result; | 637 | 28 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), | 638 | 28 | match_result); | 639 | | | 640 | 28 | if (!success) { | 641 | 5 | ReturnNull ? 
StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 642 | 5 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 643 | 5 | return; | 644 | 5 | } | 645 | | | 646 | 23 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), | 647 | 23 | index_now, result_data, result_offset); | 648 | 23 | } |
|
649 | | }; |
650 | | |
651 | | struct RegexpExtractAllImpl { |
652 | | static constexpr auto name = "regexp_extract_all"; |
653 | | |
654 | 0 | size_t get_number_of_arguments() const { return 2; } |
655 | | |
656 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
657 | | size_t input_rows_count, ColumnString::Chars& result_data, |
658 | 35 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
659 | 35 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
660 | 35 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
661 | 116 | for (int i = 0; i < input_rows_count; ++i) { |
662 | 81 | if (null_map[i]) { |
663 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
664 | 0 | continue; |
665 | 0 | } |
666 | 81 | _execute_inner_loop<false>(context, str_col, pattern_col, result_data, result_offset, |
667 | 81 | null_map, i); |
668 | 81 | } |
669 | 35 | } |
670 | | |
671 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], |
672 | | size_t input_rows_count, ColumnString::Chars& result_data, |
673 | 8 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
674 | 8 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
675 | 8 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
676 | 22 | for (int i = 0; i < input_rows_count; ++i) { |
677 | 14 | if (null_map[i]) { |
678 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
679 | 0 | continue; |
680 | 0 | } |
681 | 14 | _execute_inner_loop<true>(context, str_col, pattern_col, result_data, result_offset, |
682 | 14 | null_map, i); |
683 | 14 | } |
684 | 8 | } |
685 | | template <bool Const> |
686 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
687 | | const ColumnString* pattern_col, |
688 | | ColumnString::Chars& result_data, |
689 | | ColumnString::Offsets& result_offset, NullMap& null_map, |
690 | 95 | const size_t index_now) { |
691 | 95 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( |
692 | 95 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
693 | 95 | std::unique_ptr<RegexpExtractEngine> scoped_engine; |
694 | | |
695 | 95 | if (engine == nullptr) { |
696 | 64 | std::string error_str; |
697 | 64 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); |
698 | 64 | scoped_engine = std::make_unique<RegexpExtractEngine>(); |
699 | 64 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, |
700 | 64 | context->state()->enable_extended_regex()); |
701 | 64 | if (!st) { |
702 | 0 | context->add_warning(error_str.c_str()); |
703 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); |
704 | 0 | return; |
705 | 0 | } |
706 | 64 | engine = scoped_engine.get(); |
707 | 64 | } |
708 | | |
709 | 95 | if (engine->number_of_capturing_groups() == 0) { |
710 | 65 | StringOP::push_empty_string(index_now, result_data, result_offset); |
711 | 65 | return; |
712 | 65 | } |
713 | 30 | const auto& str = str_col->get_data_at(index_now); |
714 | 30 | std::vector<std::string> res_matches; |
715 | 30 | engine->match_all_and_extract(str.data, str.size, res_matches); |
716 | | |
717 | 30 | if (res_matches.empty()) { |
718 | 10 | StringOP::push_empty_string(index_now, result_data, result_offset); |
719 | 10 | return; |
720 | 10 | } |
721 | | |
722 | 20 | std::string res = "["; |
723 | 59 | for (int j = 0; j < res_matches.size(); ++j) { |
724 | 39 | res += "'" + res_matches[j] + "'"; |
725 | 39 | if (j < res_matches.size() - 1) { |
726 | 19 | res += ","; |
727 | 19 | } |
728 | 39 | } |
729 | 20 | res += "]"; |
730 | 20 | StringOP::push_value_string(std::string_view(res), index_now, result_data, result_offset); |
731 | 20 | } _ZN5doris20RegexpExtractAllImpl19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_RNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNS8_IjLm4096ESB_Lm16ELm15EEESD_m Line | Count | Source | 690 | 14 | const size_t index_now) { | 691 | 14 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 692 | 14 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 693 | 14 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 694 | | | 695 | 14 | if (engine == nullptr) { | 696 | 0 | std::string error_str; | 697 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 698 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 699 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 700 | 0 | context->state()->enable_extended_regex()); | 701 | 0 | if (!st) { | 702 | 0 | context->add_warning(error_str.c_str()); | 703 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 704 | 0 | return; | 705 | 0 | } | 706 | 0 | engine = scoped_engine.get(); | 707 | 0 | } | 708 | | | 709 | 14 | if (engine->number_of_capturing_groups() == 0) { | 710 | 0 | StringOP::push_empty_string(index_now, result_data, result_offset); | 711 | 0 | return; | 712 | 0 | } | 713 | 14 | const auto& str = str_col->get_data_at(index_now); | 714 | 14 | std::vector<std::string> res_matches; | 715 | 14 | engine->match_all_and_extract(str.data, str.size, res_matches); | 716 | | | 717 | 14 | if (res_matches.empty()) { | 718 | 7 | StringOP::push_empty_string(index_now, result_data, result_offset); | 719 | 7 | return; | 720 | 7 | } | 721 | | | 722 | 7 | std::string res = "["; | 723 | 19 | for (int j = 0; j < res_matches.size(); ++j) { | 724 | 12 | res += "'" + res_matches[j] + "'"; | 725 | 12 | if (j < res_matches.size() - 1) { | 726 | 5 | res += ","; | 727 | 5 | } | 728 | 12 | } | 729 | 7 | res += "]"; | 730 | 7 | 
StringOP::push_value_string(std::string_view(res), index_now, result_data, result_offset); | 731 | 7 | } |
_ZN5doris20RegexpExtractAllImpl19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_RNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERNS8_IjLm4096ESB_Lm16ELm15EEESD_m Line | Count | Source | 690 | 81 | const size_t index_now) { | 691 | 81 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 692 | 81 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 693 | 81 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 694 | | | 695 | 81 | if (engine == nullptr) { | 696 | 64 | std::string error_str; | 697 | 64 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 698 | 64 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 699 | 64 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 700 | 64 | context->state()->enable_extended_regex()); | 701 | 64 | if (!st) { | 702 | 0 | context->add_warning(error_str.c_str()); | 703 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 704 | 0 | return; | 705 | 0 | } | 706 | 64 | engine = scoped_engine.get(); | 707 | 64 | } | 708 | | | 709 | 81 | if (engine->number_of_capturing_groups() == 0) { | 710 | 65 | StringOP::push_empty_string(index_now, result_data, result_offset); | 711 | 65 | return; | 712 | 65 | } | 713 | 16 | const auto& str = str_col->get_data_at(index_now); | 714 | 16 | std::vector<std::string> res_matches; | 715 | 16 | engine->match_all_and_extract(str.data, str.size, res_matches); | 716 | | | 717 | 16 | if (res_matches.empty()) { | 718 | 3 | StringOP::push_empty_string(index_now, result_data, result_offset); | 719 | 3 | return; | 720 | 3 | } | 721 | | | 722 | 13 | std::string res = "["; | 723 | 40 | for (int j = 0; j < res_matches.size(); ++j) { | 724 | 27 | res += "'" + res_matches[j] + "'"; | 725 | 27 | if (j < res_matches.size() - 1) { | 726 | 14 | res += ","; | 727 | 14 | } | 728 | 27 | } | 729 | 13 | res += "]"; | 730 | 13 | 
StringOP::push_value_string(std::string_view(res), index_now, result_data, result_offset); | 731 | 13 | } |
|
732 | | }; |
733 | | |
734 | | // template FunctionRegexpFunctionality is used for regexp_xxxx series functions, not for regexp match. |
735 | | template <typename Impl> |
736 | | class FunctionRegexpFunctionality : public IFunction { |
737 | | public: |
738 | | static constexpr auto name = Impl::name; |
739 | | |
740 | 115 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); }_ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE6createEv Line | Count | Source | 740 | 30 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); } |
_ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE6createEv Line | Count | Source | 740 | 42 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); } |
_ZN5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE6createEv Line | Count | Source | 740 | 43 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); } |
|
741 | | |
742 | 3 | String get_name() const override { return name; }_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE8get_nameB5cxx11Ev Line | Count | Source | 742 | 1 | String get_name() const override { return name; } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE8get_nameB5cxx11Ev Line | Count | Source | 742 | 1 | String get_name() const override { return name; } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE8get_nameB5cxx11Ev Line | Count | Source | 742 | 1 | String get_name() const override { return name; } |
|
743 | | |
744 | 88 | size_t get_number_of_arguments() const override { |
745 | 88 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { |
746 | 34 | return 2; |
747 | 34 | } |
748 | 0 | return 3; |
749 | 88 | } _ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE23get_number_of_argumentsEv Line | Count | Source | 744 | 21 | size_t get_number_of_arguments() const override { | 745 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 746 | | return 2; | 747 | | } | 748 | 21 | return 3; | 749 | 21 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE23get_number_of_argumentsEv Line | Count | Source | 744 | 33 | size_t get_number_of_arguments() const override { | 745 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 746 | | return 2; | 747 | | } | 748 | 33 | return 3; | 749 | 33 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE23get_number_of_argumentsEv Line | Count | Source | 744 | 34 | size_t get_number_of_arguments() const override { | 745 | 34 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 746 | 34 | return 2; | 747 | 34 | } | 748 | 0 | return 3; | 749 | 34 | } |
|
750 | | |
751 | 88 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
752 | 88 | return make_nullable(std::make_shared<DataTypeString>()); |
753 | 88 | } _ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 751 | 21 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 752 | 21 | return make_nullable(std::make_shared<DataTypeString>()); | 753 | 21 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 751 | 33 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 752 | 33 | return make_nullable(std::make_shared<DataTypeString>()); | 753 | 33 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE Line | Count | Source | 751 | 34 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 752 | 34 | return make_nullable(std::make_shared<DataTypeString>()); | 753 | 34 | } |
|
754 | | |
755 | 290 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
756 | 290 | if (scope == FunctionContext::THREAD_LOCAL) { |
757 | 202 | if (context->is_col_constant(1)) { |
758 | 121 | DCHECK(!context->get_function_state(scope)); |
759 | 121 | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
760 | 121 | const auto& pattern = pattern_col->get_data_at(0); |
761 | 121 | if (pattern.size == 0) { |
762 | 3 | return Status::OK(); |
763 | 3 | } |
764 | | |
765 | 118 | std::string error_str; |
766 | 118 | auto engine = std::make_shared<RegexpExtractEngine>(); |
767 | 118 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, |
768 | 118 | context->state()->enable_extended_regex()); |
769 | 118 | if (!st) { |
770 | 3 | context->set_error(error_str.c_str()); |
771 | 3 | return Status::InvalidArgument(error_str); |
772 | 3 | } |
773 | 115 | context->set_function_state(scope, engine); |
774 | 115 | } |
775 | 202 | } |
776 | 284 | return Status::OK(); |
777 | 290 | } _ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 755 | 52 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 756 | 52 | if (scope == FunctionContext::THREAD_LOCAL) { | 757 | 31 | if (context->is_col_constant(1)) { | 758 | 31 | DCHECK(!context->get_function_state(scope)); | 759 | 31 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 760 | 31 | const auto& pattern = pattern_col->get_data_at(0); | 761 | 31 | if (pattern.size == 0) { | 762 | 1 | return Status::OK(); | 763 | 1 | } | 764 | | | 765 | 30 | std::string error_str; | 766 | 30 | auto engine = std::make_shared<RegexpExtractEngine>(); | 767 | 30 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, | 768 | 30 | context->state()->enable_extended_regex()); | 769 | 30 | if (!st) { | 770 | 1 | context->set_error(error_str.c_str()); | 771 | 1 | return Status::InvalidArgument(error_str); | 772 | 1 | } | 773 | 29 | context->set_function_state(scope, engine); | 774 | 29 | } | 775 | 31 | } | 776 | 50 | return Status::OK(); | 777 | 52 | } |
_ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 755 | 118 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 756 | 118 | if (scope == FunctionContext::THREAD_LOCAL) { | 757 | 85 | if (context->is_col_constant(1)) { | 758 | 42 | DCHECK(!context->get_function_state(scope)); | 759 | 42 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 760 | 42 | const auto& pattern = pattern_col->get_data_at(0); | 761 | 42 | if (pattern.size == 0) { | 762 | 1 | return Status::OK(); | 763 | 1 | } | 764 | | | 765 | 41 | std::string error_str; | 766 | 41 | auto engine = std::make_shared<RegexpExtractEngine>(); | 767 | 41 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, | 768 | 41 | context->state()->enable_extended_regex()); | 769 | 41 | if (!st) { | 770 | 1 | context->set_error(error_str.c_str()); | 771 | 1 | return Status::InvalidArgument(error_str); | 772 | 1 | } | 773 | 40 | context->set_function_state(scope, engine); | 774 | 40 | } | 775 | 85 | } | 776 | 116 | return Status::OK(); | 777 | 118 | } |
_ZN5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE4openEPNS_15FunctionContextENS3_18FunctionStateScopeE Line | Count | Source | 755 | 120 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 756 | 120 | if (scope == FunctionContext::THREAD_LOCAL) { | 757 | 86 | if (context->is_col_constant(1)) { | 758 | 48 | DCHECK(!context->get_function_state(scope)); | 759 | 48 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 760 | 48 | const auto& pattern = pattern_col->get_data_at(0); | 761 | 48 | if (pattern.size == 0) { | 762 | 1 | return Status::OK(); | 763 | 1 | } | 764 | | | 765 | 47 | std::string error_str; | 766 | 47 | auto engine = std::make_shared<RegexpExtractEngine>(); | 767 | 47 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, | 768 | 47 | context->state()->enable_extended_regex()); | 769 | 47 | if (!st) { | 770 | 1 | context->set_error(error_str.c_str()); | 771 | 1 | return Status::InvalidArgument(error_str); | 772 | 1 | } | 773 | 46 | context->set_function_state(scope, engine); | 774 | 46 | } | 775 | 86 | } | 776 | 118 | return Status::OK(); | 777 | 120 | } |
|
778 | | |
779 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
780 | 103 | uint32_t result, size_t input_rows_count) const override { |
781 | 103 | size_t argument_size = arguments.size(); |
782 | | |
783 | 103 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); |
784 | 103 | auto result_data_column = ColumnString::create(); |
785 | 103 | auto& result_data = result_data_column->get_chars(); |
786 | 103 | auto& result_offset = result_data_column->get_offsets(); |
787 | 103 | result_offset.resize(input_rows_count); |
788 | | |
789 | 103 | bool col_const[3]; |
790 | 103 | ColumnPtr argument_columns[3]; |
791 | 369 | for (int i = 0; i < argument_size; ++i) { |
792 | 266 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); |
793 | 266 | } |
794 | 103 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( |
795 | 4 | *block.get_by_position(arguments[0]).column) |
796 | 4 | .convert_to_full_column() |
797 | 103 | : block.get_by_position(arguments[0]).column; |
798 | 103 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { |
799 | 43 | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, |
800 | 43 | arguments); |
801 | 60 | } else { |
802 | 60 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, |
803 | 60 | arguments); |
804 | 60 | } |
805 | | |
806 | 103 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { |
807 | 43 | if (col_const[1]) { |
808 | 8 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, |
809 | 8 | result_data, result_offset, |
810 | 8 | result_null_map->get_data()); |
811 | 35 | } else { |
812 | 35 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, |
813 | 35 | result_offset, result_null_map->get_data()); |
814 | 35 | } |
815 | 60 | } else { |
816 | 60 | if (col_const[1] && col_const[2]) { |
817 | 1 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, |
818 | 1 | result_data, result_offset, |
819 | 1 | result_null_map->get_data()); |
820 | 59 | } else { |
821 | 59 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, |
822 | 59 | result_offset, result_null_map->get_data()); |
823 | 59 | } |
824 | 60 | } |
825 | | |
826 | 103 | block.get_by_position(result).column = |
827 | 103 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); |
828 | 103 | return Status::OK(); |
829 | 103 | } _ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 780 | 18 | uint32_t result, size_t input_rows_count) const override { | 781 | 18 | size_t argument_size = arguments.size(); | 782 | | | 783 | 18 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 784 | 18 | auto result_data_column = ColumnString::create(); | 785 | 18 | auto& result_data = result_data_column->get_chars(); | 786 | 18 | auto& result_offset = result_data_column->get_offsets(); | 787 | 18 | result_offset.resize(input_rows_count); | 788 | | | 789 | 18 | bool col_const[3]; | 790 | 18 | ColumnPtr argument_columns[3]; | 791 | 72 | for (int i = 0; i < argument_size; ++i) { | 792 | 54 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 793 | 54 | } | 794 | 18 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 795 | 0 | *block.get_by_position(arguments[0]).column) | 796 | 0 | .convert_to_full_column() | 797 | 18 | : block.get_by_position(arguments[0]).column; | 798 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 799 | | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, | 800 | | arguments); | 801 | 18 | } else { | 802 | 18 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, | 803 | 18 | arguments); | 804 | 18 | } | 805 | | | 806 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 807 | | if (col_const[1]) { | 808 | | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 809 | | result_data, result_offset, | 810 | | result_null_map->get_data()); | 811 | | } else { | 812 | | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 813 | | result_offset, result_null_map->get_data()); | 814 | | } | 815 | 18 | } else { | 816 | 18 | if (col_const[1] && col_const[2]) { | 817 | 0 | 
Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 818 | 0 | result_data, result_offset, | 819 | 0 | result_null_map->get_data()); | 820 | 18 | } else { | 821 | 18 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 822 | 18 | result_offset, result_null_map->get_data()); | 823 | 18 | } | 824 | 18 | } | 825 | | | 826 | 18 | block.get_by_position(result).column = | 827 | 18 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 828 | 18 | return Status::OK(); | 829 | 18 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 780 | 42 | uint32_t result, size_t input_rows_count) const override { | 781 | 42 | size_t argument_size = arguments.size(); | 782 | | | 783 | 42 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 784 | 42 | auto result_data_column = ColumnString::create(); | 785 | 42 | auto& result_data = result_data_column->get_chars(); | 786 | 42 | auto& result_offset = result_data_column->get_offsets(); | 787 | 42 | result_offset.resize(input_rows_count); | 788 | | | 789 | 42 | bool col_const[3]; | 790 | 42 | ColumnPtr argument_columns[3]; | 791 | 168 | for (int i = 0; i < argument_size; ++i) { | 792 | 126 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 793 | 126 | } | 794 | 42 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 795 | 3 | *block.get_by_position(arguments[0]).column) | 796 | 3 | .convert_to_full_column() | 797 | 42 | : block.get_by_position(arguments[0]).column; | 798 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 799 | | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, | 800 | | arguments); | 801 | 42 | } else { | 802 | 42 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, | 803 | 42 | arguments); | 804 | 42 | } | 805 | | | 806 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 807 | | if (col_const[1]) { | 808 | | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 809 | | result_data, result_offset, | 810 | | result_null_map->get_data()); | 811 | | } else { | 812 | | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 813 | | result_offset, result_null_map->get_data()); | 814 | | } | 815 | 42 | } else { | 816 | 42 | if (col_const[1] && col_const[2]) { | 817 | 1 | 
Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 818 | 1 | result_data, result_offset, | 819 | 1 | result_null_map->get_data()); | 820 | 41 | } else { | 821 | 41 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 822 | 41 | result_offset, result_null_map->get_data()); | 823 | 41 | } | 824 | 42 | } | 825 | | | 826 | 42 | block.get_by_position(result).column = | 827 | 42 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 828 | 42 | return Status::OK(); | 829 | 42 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 780 | 43 | uint32_t result, size_t input_rows_count) const override { | 781 | 43 | size_t argument_size = arguments.size(); | 782 | | | 783 | 43 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 784 | 43 | auto result_data_column = ColumnString::create(); | 785 | 43 | auto& result_data = result_data_column->get_chars(); | 786 | 43 | auto& result_offset = result_data_column->get_offsets(); | 787 | 43 | result_offset.resize(input_rows_count); | 788 | | | 789 | 43 | bool col_const[3]; | 790 | 43 | ColumnPtr argument_columns[3]; | 791 | 129 | for (int i = 0; i < argument_size; ++i) { | 792 | 86 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 793 | 86 | } | 794 | 43 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 795 | 1 | *block.get_by_position(arguments[0]).column) | 796 | 1 | .convert_to_full_column() | 797 | 43 | : block.get_by_position(arguments[0]).column; | 798 | 43 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 799 | 43 | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, | 800 | 43 | arguments); | 801 | | } else { | 802 | | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, | 803 | | arguments); | 804 | | } | 805 | | | 806 | 43 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 807 | 43 | if (col_const[1]) { | 808 | 8 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 809 | 8 | result_data, result_offset, | 810 | 8 | result_null_map->get_data()); | 811 | 35 | } else { | 812 | 35 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 813 | 35 | result_offset, result_null_map->get_data()); | 814 | 35 | } | 815 | | } else { | 816 | | if (col_const[1] && col_const[2]) { | 817 | | 
Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 818 | | result_data, result_offset, | 819 | | result_null_map->get_data()); | 820 | | } else { | 821 | | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 822 | | result_offset, result_null_map->get_data()); | 823 | | } | 824 | | } | 825 | | | 826 | 43 | block.get_by_position(result).column = | 827 | 43 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 828 | 43 | return Status::OK(); | 829 | 43 | } |
|
830 | | }; |
831 | | |
832 | 8 | void register_function_regexp_extract(SimpleFunctionFactory& factory) { |
833 | 8 | factory.register_function<FunctionRegexpReplace<RegexpReplaceImpl, ThreeParamTypes>>(); |
834 | 8 | factory.register_function<FunctionRegexpReplace<RegexpReplaceImpl, FourParamTypes>>(); |
835 | 8 | factory.register_function<FunctionRegexpReplace<RegexpReplaceOneImpl, ThreeParamTypes>>(); |
836 | 8 | factory.register_function<FunctionRegexpReplace<RegexpReplaceOneImpl, FourParamTypes>>(); |
837 | 8 | factory.register_function<FunctionRegexpFunctionality<RegexpExtractImpl<true>>>(); |
838 | 8 | factory.register_function<FunctionRegexpFunctionality<RegexpExtractImpl<false>>>(); |
839 | 8 | factory.register_function<FunctionRegexpFunctionality<RegexpExtractAllImpl>>(); |
840 | 8 | factory.register_function<FunctionRegexpCount>(); |
841 | 8 | } |
842 | | |
843 | | } // namespace doris |