be/src/exprs/function/function_regexp.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <glog/logging.h> |
19 | | #include <re2/re2.h> |
20 | | #include <re2/stringpiece.h> |
21 | | #include <stddef.h> |
22 | | |
23 | | #include <boost/regex.hpp> |
24 | | #include <memory> |
25 | | #include <string> |
26 | | #include <string_view> |
27 | | #include <type_traits> |
28 | | #include <utility> |
29 | | #include <vector> |
30 | | |
31 | | #include "common/status.h" |
32 | | #include "core/block/block.h" |
33 | | #include "core/block/column_numbers.h" |
34 | | #include "core/block/column_with_type_and_name.h" |
35 | | #include "core/column/column.h" |
36 | | #include "core/column/column_const.h" |
37 | | #include "core/column/column_execute_util.h" |
38 | | #include "core/column/column_nullable.h" |
39 | | #include "core/column/column_string.h" |
40 | | #include "core/column/column_vector.h" |
41 | | #include "core/data_type/data_type.h" |
42 | | #include "core/data_type/data_type_nullable.h" |
43 | | #include "core/data_type/data_type_number.h" |
44 | | #include "core/data_type/data_type_string.h" |
45 | | #include "core/string_ref.h" |
46 | | #include "core/types.h" |
47 | | #include "exec/common/stringop_substring.h" |
48 | | #include "exprs/aggregate/aggregate_function.h" |
49 | | #include "exprs/function/function.h" |
50 | | #include "exprs/function/simple_function_factory.h" |
51 | | #include "exprs/function_context.h" |
52 | | #include "exprs/string_functions.h" |
53 | | |
54 | | namespace doris { |
55 | | |
56 | | // Helper structure to hold either RE2 or Boost.Regex |
57 | | struct RegexpExtractEngine { |
58 | | std::unique_ptr<re2::RE2> re2_regex; |
59 | | std::unique_ptr<boost::regex> boost_regex; |
60 | | |
61 | 18 | bool is_boost() const { return boost_regex != nullptr; } |
62 | 313 | bool is_re2() const { return re2_regex != nullptr; } |
63 | | |
64 | | // Try to compile with RE2 first, fallback to Boost.Regex if RE2 fails |
65 | | static bool compile(const StringRef& pattern, std::string* error_str, |
66 | 264 | RegexpExtractEngine& engine, bool enable_extended_regex) { |
67 | 264 | re2::RE2::Options options; |
68 | 264 | options.set_log_errors(false); // avoid RE2 printing to stderr; we handle errors ourselves |
69 | 264 | options.set_dot_nl(true); // make '.' match '\n' by default, consistent with REGEXP/LIKE |
70 | 264 | engine.re2_regex = |
71 | 264 | std::make_unique<re2::RE2>(re2::StringPiece(pattern.data, pattern.size), options); |
72 | | |
73 | 264 | if (engine.re2_regex->ok()) { |
74 | 242 | return true; |
75 | 242 | } else if (!enable_extended_regex) { |
76 | 3 | *error_str = fmt::format( |
77 | 3 | "Invalid regex pattern: {}. Error: {}. If you need advanced regex features, " |
78 | 3 | "try setting enable_extended_regex=true", |
79 | 3 | std::string(pattern.data, pattern.size), engine.re2_regex->error()); |
80 | 3 | return false; |
81 | 3 | } |
82 | | |
83 | | // RE2 failed, try Boost.Regex for advanced features like zero-width assertions |
84 | 19 | engine.re2_regex.reset(); |
85 | 19 | try { |
86 | 19 | boost::regex::flag_type flags = boost::regex::normal; |
87 | 19 | engine.boost_regex = std::make_unique<boost::regex>(pattern.data, |
88 | 19 | pattern.data + pattern.size, flags); |
89 | 19 | return true; |
90 | 19 | } catch (const boost::regex_error& e) { |
91 | 0 | if (error_str) { |
92 | 0 | *error_str = fmt::format("Invalid regex pattern: {}. Error: {}", |
93 | 0 | std::string(pattern.data, pattern.size), e.what()); |
94 | 0 | } |
95 | 0 | return false; |
96 | 0 | } |
97 | 19 | } |
98 | | |
99 | | // Get number of capturing groups |
100 | 231 | int number_of_capturing_groups() const { |
101 | 231 | if (is_re2()) { |
102 | 222 | return re2_regex->NumberOfCapturingGroups(); |
103 | 222 | } else if (is_boost()) { |
104 | 9 | return static_cast<int>(boost_regex->mark_count()); |
105 | 9 | } |
106 | 0 | return 0; |
107 | 231 | } |
108 | | |
109 | | // Match function for extraction |
110 | 52 | bool match_and_extract(const char* data, size_t size, int index, std::string& result) const { |
111 | 52 | if (is_re2()) { |
112 | 47 | int max_matches = 1 + re2_regex->NumberOfCapturingGroups(); |
113 | 47 | if (index >= max_matches) { |
114 | 0 | return false; |
115 | 0 | } |
116 | 47 | std::vector<re2::StringPiece> matches(max_matches); |
117 | 47 | bool success = re2_regex->Match(re2::StringPiece(data, size), 0, size, |
118 | 47 | re2::RE2::UNANCHORED, matches.data(), max_matches); |
119 | 47 | if (success && index < matches.size()) { |
120 | 34 | const re2::StringPiece& match = matches[index]; |
121 | 34 | result.assign(match.data(), match.size()); |
122 | 34 | return true; |
123 | 34 | } |
124 | 13 | return false; |
125 | 47 | } else if (is_boost()) { |
126 | 5 | boost::cmatch matches; |
127 | 5 | bool success = boost::regex_search(data, data + size, matches, *boost_regex); |
128 | 5 | if (success && index < matches.size()) { |
129 | 5 | result = matches[index].str(); |
130 | 5 | return true; |
131 | 5 | } |
132 | 0 | return false; |
133 | 5 | } |
134 | 0 | return false; |
135 | 52 | } |
136 | | |
137 | | // Match all occurrences and extract the first capturing group |
138 | | void match_all_and_extract(const char* data, size_t size, |
139 | 30 | std::vector<std::string>& results) const { |
140 | 30 | if (is_re2()) { |
141 | 26 | int max_matches = 1 + re2_regex->NumberOfCapturingGroups(); |
142 | 26 | if (max_matches < 2) { |
143 | 0 | return; // No capturing groups |
144 | 0 | } |
145 | | |
146 | 26 | size_t pos = 0; |
147 | 67 | while (pos < size) { |
148 | 55 | const char* str_pos = data + pos; |
149 | 55 | size_t str_size = size - pos; |
150 | 55 | std::vector<re2::StringPiece> matches(max_matches); |
151 | 55 | bool success = re2_regex->Match(re2::StringPiece(str_pos, str_size), 0, str_size, |
152 | 55 | re2::RE2::UNANCHORED, matches.data(), max_matches); |
153 | 55 | if (!success) { |
154 | 14 | break; |
155 | 14 | } |
156 | 41 | if (matches[0].empty()) { |
157 | 11 | pos += 1; |
158 | 11 | continue; |
159 | 11 | } |
160 | | // Extract first capturing group |
161 | 30 | if (matches.size() > 1 && !matches[1].empty()) { |
162 | 30 | results.emplace_back(matches[1].data(), matches[1].size()); |
163 | 30 | } |
164 | | // Move position forward |
165 | 30 | auto offset = std::string(str_pos, str_size) |
166 | 30 | .find(std::string(matches[0].data(), matches[0].size())); |
167 | 30 | pos += offset + matches[0].size(); |
168 | 30 | } |
169 | 26 | } else if (is_boost()) { |
170 | 4 | const char* search_start = data; |
171 | 4 | const char* search_end = data + size; |
172 | 4 | boost::match_results<const char*> matches; |
173 | | |
174 | 13 | while (boost::regex_search(search_start, search_end, matches, *boost_regex)) { |
175 | 9 | if (matches.size() > 1 && matches[1].matched) { |
176 | 9 | results.emplace_back(matches[1].str()); |
177 | 9 | } |
178 | 9 | if (matches[0].length() == 0) { |
179 | 0 | if (search_start == search_end) { |
180 | 0 | break; |
181 | 0 | } |
182 | 0 | search_start += 1; |
183 | 9 | } else { |
184 | 9 | search_start = matches[0].second; |
185 | 9 | } |
186 | 9 | } |
187 | 4 | } |
188 | 30 | } |
189 | | }; |
190 | | |
191 | | struct RegexpCountImpl { |
192 | | using StringColumnView = ColumnView<TYPE_STRING>; |
193 | | |
194 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
195 | 28 | size_t input_rows_count, ColumnInt32::Container& result_data) { |
196 | 28 | auto str_col = StringColumnView::create(argument_columns[0]); |
197 | 28 | auto pattern_col = StringColumnView::create(argument_columns[1]); |
198 | 79 | for (size_t i = 0; i < input_rows_count; ++i) { |
199 | 51 | DCHECK(!str_col.is_null_at(i)); |
200 | 51 | DCHECK(!pattern_col.is_null_at(i)); |
201 | 51 | result_data[i] = _execute_inner_loop(context, str_col, pattern_col, i); |
202 | 51 | } |
203 | 28 | } |
204 | | static int _execute_inner_loop(FunctionContext* context, const StringColumnView& str_col, |
205 | 51 | const StringColumnView& pattern_col, const size_t index_now) { |
206 | 51 | re2::RE2* re = reinterpret_cast<re2::RE2*>( |
207 | 51 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
208 | 51 | std::unique_ptr<re2::RE2> scoped_re; |
209 | 51 | if (re == nullptr) { |
210 | 24 | std::string error_str; |
211 | 24 | const auto pattern = pattern_col.value_at(index_now); |
212 | 24 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), StringRef(), |
213 | 24 | scoped_re); |
214 | 24 | if (!st) { |
215 | 0 | context->add_warning(error_str.c_str()); |
216 | 0 | throw Exception(Status::InvalidArgument(error_str)); |
217 | 0 | return 0; |
218 | 0 | } |
219 | 24 | re = scoped_re.get(); |
220 | 24 | } |
221 | | |
222 | 51 | const auto str = str_col.value_at(index_now); |
223 | 51 | int count = 0; |
224 | 51 | size_t pos = 0; |
225 | 171 | while (pos < str.size) { |
226 | 151 | auto str_pos = str.data + pos; |
227 | 151 | auto str_size = str.size - pos; |
228 | 151 | re2::StringPiece str_sp_current = re2::StringPiece(str_pos, str_size); |
229 | 151 | re2::StringPiece match; |
230 | | |
231 | 151 | bool success = re->Match(str_sp_current, 0, str_size, re2::RE2::UNANCHORED, &match, 1); |
232 | 151 | if (!success) { |
233 | 31 | break; |
234 | 31 | } |
235 | 120 | if (match.empty()) { |
236 | 24 | pos += 1; |
237 | 24 | continue; |
238 | 24 | } |
239 | 96 | count++; |
240 | 96 | size_t match_start = match.data() - str_sp_current.data(); |
241 | 96 | pos += match_start + match.size(); |
242 | 96 | } |
243 | | |
244 | 51 | return count; |
245 | 51 | } |
246 | | }; |
247 | | |
248 | | class FunctionRegexpCount : public IFunction { |
249 | | public: |
250 | | static constexpr auto name = "regexp_count"; |
251 | | |
252 | 43 | static FunctionPtr create() { return std::make_shared<FunctionRegexpCount>(); } |
253 | | |
254 | 1 | String get_name() const override { return name; } |
255 | | |
256 | 34 | size_t get_number_of_arguments() const override { return 2; } |
257 | | |
258 | 34 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
259 | 34 | return std::make_shared<DataTypeInt32>(); |
260 | 34 | } |
261 | | |
262 | 111 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
263 | 111 | if (scope == FunctionContext::THREAD_LOCAL) { |
264 | 77 | if (context->is_col_constant(1)) { |
265 | 54 | DCHECK(!context->get_function_state(scope)); |
266 | 54 | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
267 | 54 | const auto& pattern = pattern_col->get_data_at(0); |
268 | 54 | if (pattern.size == 0) { |
269 | 8 | return Status::OK(); |
270 | 8 | } |
271 | | |
272 | 46 | std::string error_str; |
273 | 46 | std::unique_ptr<re2::RE2> scoped_re; |
274 | 46 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
275 | 46 | StringRef(), scoped_re); |
276 | 46 | if (!st) { |
277 | 0 | context->set_error(error_str.c_str()); |
278 | 0 | return Status::InvalidArgument(error_str); |
279 | 0 | } |
280 | 46 | std::shared_ptr<re2::RE2> re(scoped_re.release()); |
281 | 46 | context->set_function_state(scope, re); |
282 | 46 | } |
283 | 77 | } |
284 | 103 | return Status::OK(); |
285 | 111 | } |
286 | | |
287 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
288 | 28 | uint32_t result, size_t input_rows_count) const override { |
289 | 28 | auto result_data_column = ColumnInt32::create(input_rows_count); |
290 | 28 | auto& result_data = result_data_column->get_data(); |
291 | | |
292 | 28 | ColumnPtr argument_columns[2]; |
293 | | |
294 | 28 | argument_columns[0] = block.get_by_position(arguments[0]).column; |
295 | 28 | argument_columns[1] = block.get_by_position(arguments[1]).column; |
296 | 28 | RegexpCountImpl::execute_impl(context, argument_columns, input_rows_count, result_data); |
297 | | |
298 | 28 | block.get_by_position(result).column = std::move(result_data_column); |
299 | 28 | return Status::OK(); |
300 | 28 | } |
301 | | }; |
302 | | |
303 | | struct ThreeParamTypes { |
304 | 16 | static DataTypes get_variadic_argument_types() { |
305 | 16 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
306 | 16 | std::make_shared<DataTypeString>()}; |
307 | 16 | } |
308 | | }; |
309 | | |
310 | | struct FourParamTypes { |
311 | 16 | static DataTypes get_variadic_argument_types() { |
312 | 16 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
313 | 16 | std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()}; |
314 | 16 | } |
315 | | }; |
316 | | |
317 | | // The FunctionRegexpReplace template is shared by regexp_replace and regexp_replace_one |
318 | | template <typename Impl, typename ParamTypes> |
319 | | class FunctionRegexpReplace : public IFunction { |
320 | | public: |
321 | | static constexpr auto name = Impl::name; |
322 | | |
323 | 92 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); }_ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE6createEv Line | Count | Source | 323 | 33 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
_ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE6createEv Line | Count | Source | 323 | 17 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE6createEv Line | Count | Source | 323 | 24 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE6createEv Line | Count | Source | 323 | 18 | static FunctionPtr create() { return std::make_shared<FunctionRegexpReplace>(); } |
|
324 | | |
325 | 0 | String get_name() const override { return name; }Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE8get_nameB5cxx11Ev |
326 | | |
327 | 0 | size_t get_number_of_arguments() const override { |
328 | 0 | return get_variadic_argument_types_impl().size(); |
329 | 0 | } Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE23get_number_of_argumentsEv |
330 | | |
331 | 60 | bool is_variadic() const override { return true; }_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE11is_variadicEv Line | Count | Source | 331 | 25 | bool is_variadic() const override { return true; } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE11is_variadicEv Line | Count | Source | 331 | 9 | bool is_variadic() const override { return true; } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE11is_variadicEv Line | Count | Source | 331 | 16 | bool is_variadic() const override { return true; } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE11is_variadicEv Line | Count | Source | 331 | 10 | bool is_variadic() const override { return true; } |
|
332 | | |
333 | 56 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
334 | 56 | return make_nullable(std::make_shared<DataTypeString>()); |
335 | 56 | } _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 333 | 24 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 334 | 24 | return make_nullable(std::make_shared<DataTypeString>()); | 335 | 24 | } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 333 | 8 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 334 | 8 | return make_nullable(std::make_shared<DataTypeString>()); | 335 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 333 | 15 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 334 | 15 | return make_nullable(std::make_shared<DataTypeString>()); | 335 | 15 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 333 | 9 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 334 | 9 | return make_nullable(std::make_shared<DataTypeString>()); | 335 | 9 | } |
|
336 | | |
337 | 32 | DataTypes get_variadic_argument_types_impl() const override { |
338 | 32 | return ParamTypes::get_variadic_argument_types(); |
339 | 32 | } _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 337 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 338 | 8 | return ParamTypes::get_variadic_argument_types(); | 339 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 337 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 338 | 8 | return ParamTypes::get_variadic_argument_types(); | 339 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 337 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 338 | 8 | return ParamTypes::get_variadic_argument_types(); | 339 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE32get_variadic_argument_types_implEv Line | Count | Source | 337 | 8 | DataTypes get_variadic_argument_types_impl() const override { | 338 | 8 | return ParamTypes::get_variadic_argument_types(); | 339 | 8 | } |
|
340 | | |
341 | 245 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
342 | 245 | if (scope == FunctionContext::THREAD_LOCAL) { |
343 | 189 | if (context->is_col_constant(1)) { |
344 | 84 | DCHECK(!context->get_function_state(scope)); |
345 | 84 | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
346 | 84 | const auto& pattern = pattern_col->get_data_at(0); |
347 | 84 | if (pattern.size == 0) { |
348 | 6 | return Status::OK(); |
349 | 6 | } |
350 | | |
351 | 78 | std::string error_str; |
352 | 78 | std::unique_ptr<re2::RE2> scoped_re; |
353 | 78 | StringRef options_value; |
354 | 78 | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { |
355 | 38 | DCHECK_EQ(context->get_num_args(), 4); |
356 | 38 | DCHECK(context->is_col_constant(3)); |
357 | 38 | const auto options_col = context->get_constant_col(3)->column_ptr; |
358 | 38 | options_value = options_col->get_data_at(0); |
359 | 38 | } |
360 | | |
361 | 78 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), |
362 | 78 | options_value, scoped_re); |
363 | 78 | if (!st) { |
364 | 0 | context->set_error(error_str.c_str()); |
365 | 0 | return Status::InvalidArgument(error_str); |
366 | 0 | } |
367 | 78 | std::shared_ptr<re2::RE2> re(scoped_re.release()); |
368 | 78 | context->set_function_state(scope, re); |
369 | 78 | } |
370 | 189 | } |
371 | 239 | return Status::OK(); |
372 | 245 | } _ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 341 | 92 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 342 | 92 | if (scope == FunctionContext::THREAD_LOCAL) { | 343 | 68 | if (context->is_col_constant(1)) { | 344 | 32 | DCHECK(!context->get_function_state(scope)); | 345 | 32 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 346 | 32 | const auto& pattern = pattern_col->get_data_at(0); | 347 | 32 | if (pattern.size == 0) { | 348 | 4 | return Status::OK(); | 349 | 4 | } | 350 | | | 351 | 28 | std::string error_str; | 352 | 28 | std::unique_ptr<re2::RE2> scoped_re; | 353 | 28 | StringRef options_value; | 354 | | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 355 | | DCHECK_EQ(context->get_num_args(), 4); | 356 | | DCHECK(context->is_col_constant(3)); | 357 | | const auto options_col = context->get_constant_col(3)->column_ptr; | 358 | | options_value = options_col->get_data_at(0); | 359 | | } | 360 | | | 361 | 28 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 362 | 28 | options_value, scoped_re); | 363 | 28 | if (!st) { | 364 | 0 | context->set_error(error_str.c_str()); | 365 | 0 | return Status::InvalidArgument(error_str); | 366 | 0 | } | 367 | 28 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 368 | 28 | context->set_function_state(scope, re); | 369 | 28 | } | 370 | 68 | } | 371 | 88 | return Status::OK(); | 372 | 92 | } |
_ZN5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 341 | 42 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 342 | 42 | if (scope == FunctionContext::THREAD_LOCAL) { | 343 | 34 | if (context->is_col_constant(1)) { | 344 | 19 | DCHECK(!context->get_function_state(scope)); | 345 | 19 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 346 | 19 | const auto& pattern = pattern_col->get_data_at(0); | 347 | 19 | if (pattern.size == 0) { | 348 | 0 | return Status::OK(); | 349 | 0 | } | 350 | | | 351 | 19 | std::string error_str; | 352 | 19 | std::unique_ptr<re2::RE2> scoped_re; | 353 | 19 | StringRef options_value; | 354 | 19 | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 355 | 19 | DCHECK_EQ(context->get_num_args(), 4); | 356 | 19 | DCHECK(context->is_col_constant(3)); | 357 | 19 | const auto options_col = context->get_constant_col(3)->column_ptr; | 358 | 19 | options_value = options_col->get_data_at(0); | 359 | 19 | } | 360 | | | 361 | 19 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 362 | 19 | options_value, scoped_re); | 363 | 19 | if (!st) { | 364 | 0 | context->set_error(error_str.c_str()); | 365 | 0 | return Status::InvalidArgument(error_str); | 366 | 0 | } | 367 | 19 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 368 | 19 | context->set_function_state(scope, re); | 369 | 19 | } | 370 | 34 | } | 371 | 42 | return Status::OK(); | 372 | 42 | } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 341 | 64 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 342 | 64 | if (scope == FunctionContext::THREAD_LOCAL) { | 343 | 49 | if (context->is_col_constant(1)) { | 344 | 14 | DCHECK(!context->get_function_state(scope)); | 345 | 14 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 346 | 14 | const auto& pattern = pattern_col->get_data_at(0); | 347 | 14 | if (pattern.size == 0) { | 348 | 2 | return Status::OK(); | 349 | 2 | } | 350 | | | 351 | 12 | std::string error_str; | 352 | 12 | std::unique_ptr<re2::RE2> scoped_re; | 353 | 12 | StringRef options_value; | 354 | | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 355 | | DCHECK_EQ(context->get_num_args(), 4); | 356 | | DCHECK(context->is_col_constant(3)); | 357 | | const auto options_col = context->get_constant_col(3)->column_ptr; | 358 | | options_value = options_col->get_data_at(0); | 359 | | } | 360 | | | 361 | 12 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 362 | 12 | options_value, scoped_re); | 363 | 12 | if (!st) { | 364 | 0 | context->set_error(error_str.c_str()); | 365 | 0 | return Status::InvalidArgument(error_str); | 366 | 0 | } | 367 | 12 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 368 | 12 | context->set_function_state(scope, re); | 369 | 12 | } | 370 | 49 | } | 371 | 62 | return Status::OK(); | 372 | 64 | } |
_ZN5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 341 | 47 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 342 | 47 | if (scope == FunctionContext::THREAD_LOCAL) { | 343 | 38 | if (context->is_col_constant(1)) { | 344 | 19 | DCHECK(!context->get_function_state(scope)); | 345 | 19 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 346 | 19 | const auto& pattern = pattern_col->get_data_at(0); | 347 | 19 | if (pattern.size == 0) { | 348 | 0 | return Status::OK(); | 349 | 0 | } | 350 | | | 351 | 19 | std::string error_str; | 352 | 19 | std::unique_ptr<re2::RE2> scoped_re; | 353 | 19 | StringRef options_value; | 354 | 19 | if constexpr (std::is_same_v<FourParamTypes, ParamTypes>) { | 355 | 19 | DCHECK_EQ(context->get_num_args(), 4); | 356 | 19 | DCHECK(context->is_col_constant(3)); | 357 | 19 | const auto options_col = context->get_constant_col(3)->column_ptr; | 358 | 19 | options_value = options_col->get_data_at(0); | 359 | 19 | } | 360 | | | 361 | 19 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 362 | 19 | options_value, scoped_re); | 363 | 19 | if (!st) { | 364 | 0 | context->set_error(error_str.c_str()); | 365 | 0 | return Status::InvalidArgument(error_str); | 366 | 0 | } | 367 | 19 | std::shared_ptr<re2::RE2> re(scoped_re.release()); | 368 | 19 | context->set_function_state(scope, re); | 369 | 19 | } | 370 | 38 | } | 371 | 47 | return Status::OK(); | 372 | 47 | } |
|
373 | | |
374 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
375 | 71 | uint32_t result, size_t input_rows_count) const override { |
376 | 71 | size_t argument_size = arguments.size(); |
377 | | |
378 | 71 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); |
379 | 71 | auto result_data_column = ColumnString::create(); |
380 | 71 | auto& result_data = result_data_column->get_chars(); |
381 | 71 | auto& result_offset = result_data_column->get_offsets(); |
382 | 71 | result_offset.resize(input_rows_count); |
383 | | |
384 | 71 | bool col_const[3]; |
385 | 71 | ColumnPtr argument_columns[3]; |
386 | 284 | for (int i = 0; i < 3; ++i) { |
387 | 213 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); |
388 | 213 | } |
389 | 71 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( |
390 | 6 | *block.get_by_position(arguments[0]).column) |
391 | 6 | .convert_to_full_column() |
392 | 71 | : block.get_by_position(arguments[0]).column; |
393 | | |
394 | 71 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); |
395 | | |
396 | 71 | StringRef options_value; |
397 | 71 | if (col_const[1] && col_const[2]) { |
398 | 3 | Impl::execute_impl_const_args(context, argument_columns, options_value, |
399 | 3 | input_rows_count, result_data, result_offset, |
400 | 3 | result_null_map->get_data()); |
401 | 68 | } else { |
402 | | // the options have check in FE, so is always const, and get idx of 0 |
403 | 68 | if (argument_size == 4) { |
404 | 15 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); |
405 | 15 | } |
406 | 68 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, |
407 | 68 | result_data, result_offset, result_null_map->get_data()); |
408 | 68 | } |
409 | | |
410 | 71 | block.get_by_position(result).column = |
411 | 71 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); |
412 | 71 | return Status::OK(); |
413 | 71 | } _ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_15ThreeParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 375 | 30 | uint32_t result, size_t input_rows_count) const override { | 376 | 30 | size_t argument_size = arguments.size(); | 377 | | | 378 | 30 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 379 | 30 | auto result_data_column = ColumnString::create(); | 380 | 30 | auto& result_data = result_data_column->get_chars(); | 381 | 30 | auto& result_offset = result_data_column->get_offsets(); | 382 | 30 | result_offset.resize(input_rows_count); | 383 | | | 384 | 30 | bool col_const[3]; | 385 | 30 | ColumnPtr argument_columns[3]; | 386 | 120 | for (int i = 0; i < 3; ++i) { | 387 | 90 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 388 | 90 | } | 389 | 30 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 390 | 0 | *block.get_by_position(arguments[0]).column) | 391 | 0 | .convert_to_full_column() | 392 | 30 | : block.get_by_position(arguments[0]).column; | 393 | | | 394 | 30 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 395 | | | 396 | 30 | StringRef options_value; | 397 | 30 | if (col_const[1] && col_const[2]) { | 398 | 1 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 399 | 1 | input_rows_count, result_data, result_offset, | 400 | 1 | result_null_map->get_data()); | 401 | 29 | } else { | 402 | | // the options have check in FE, so is always const, and get idx of 0 | 403 | 29 | if (argument_size == 4) { | 404 | 0 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 405 | 0 | } | 406 | 29 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 407 | 29 | result_data, result_offset, result_null_map->get_data()); | 408 | 29 | } | 409 | | | 410 | 30 | block.get_by_position(result).column 
= | 411 | 30 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 412 | 30 | return Status::OK(); | 413 | 30 | } |
_ZNK5doris21FunctionRegexpReplaceINS_17RegexpReplaceImplENS_14FourParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 375 | 8 | uint32_t result, size_t input_rows_count) const override { | 376 | 8 | size_t argument_size = arguments.size(); | 377 | | | 378 | 8 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 379 | 8 | auto result_data_column = ColumnString::create(); | 380 | 8 | auto& result_data = result_data_column->get_chars(); | 381 | 8 | auto& result_offset = result_data_column->get_offsets(); | 382 | 8 | result_offset.resize(input_rows_count); | 383 | | | 384 | 8 | bool col_const[3]; | 385 | 8 | ColumnPtr argument_columns[3]; | 386 | 32 | for (int i = 0; i < 3; ++i) { | 387 | 24 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 388 | 24 | } | 389 | 8 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 390 | 3 | *block.get_by_position(arguments[0]).column) | 391 | 3 | .convert_to_full_column() | 392 | 8 | : block.get_by_position(arguments[0]).column; | 393 | | | 394 | 8 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 395 | | | 396 | 8 | StringRef options_value; | 397 | 8 | if (col_const[1] && col_const[2]) { | 398 | 1 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 399 | 1 | input_rows_count, result_data, result_offset, | 400 | 1 | result_null_map->get_data()); | 401 | 7 | } else { | 402 | | // the options have check in FE, so is always const, and get idx of 0 | 403 | 7 | if (argument_size == 4) { | 404 | 7 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 405 | 7 | } | 406 | 7 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 407 | 7 | result_data, result_offset, result_null_map->get_data()); | 408 | 7 | } | 409 | | | 410 | 8 | block.get_by_position(result).column = | 411 | 8 | 
ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 412 | 8 | return Status::OK(); | 413 | 8 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_15ThreeParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 375 | 24 | uint32_t result, size_t input_rows_count) const override { | 376 | 24 | size_t argument_size = arguments.size(); | 377 | | | 378 | 24 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 379 | 24 | auto result_data_column = ColumnString::create(); | 380 | 24 | auto& result_data = result_data_column->get_chars(); | 381 | 24 | auto& result_offset = result_data_column->get_offsets(); | 382 | 24 | result_offset.resize(input_rows_count); | 383 | | | 384 | 24 | bool col_const[3]; | 385 | 24 | ColumnPtr argument_columns[3]; | 386 | 96 | for (int i = 0; i < 3; ++i) { | 387 | 72 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 388 | 72 | } | 389 | 24 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 390 | 0 | *block.get_by_position(arguments[0]).column) | 391 | 0 | .convert_to_full_column() | 392 | 24 | : block.get_by_position(arguments[0]).column; | 393 | | | 394 | 24 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 395 | | | 396 | 24 | StringRef options_value; | 397 | 24 | if (col_const[1] && col_const[2]) { | 398 | 0 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 399 | 0 | input_rows_count, result_data, result_offset, | 400 | 0 | result_null_map->get_data()); | 401 | 24 | } else { | 402 | | // the options have check in FE, so is always const, and get idx of 0 | 403 | 24 | if (argument_size == 4) { | 404 | 0 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 405 | 0 | } | 406 | 24 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 407 | 24 | result_data, result_offset, result_null_map->get_data()); | 408 | 24 | } | 409 | | | 410 | 24 | block.get_by_position(result).column = | 411 | 
24 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 412 | 24 | return Status::OK(); | 413 | 24 | } |
_ZNK5doris21FunctionRegexpReplaceINS_20RegexpReplaceOneImplENS_14FourParamTypesEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 375 | 9 | uint32_t result, size_t input_rows_count) const override { | 376 | 9 | size_t argument_size = arguments.size(); | 377 | | | 378 | 9 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 379 | 9 | auto result_data_column = ColumnString::create(); | 380 | 9 | auto& result_data = result_data_column->get_chars(); | 381 | 9 | auto& result_offset = result_data_column->get_offsets(); | 382 | 9 | result_offset.resize(input_rows_count); | 383 | | | 384 | 9 | bool col_const[3]; | 385 | 9 | ColumnPtr argument_columns[3]; | 386 | 36 | for (int i = 0; i < 3; ++i) { | 387 | 27 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 388 | 27 | } | 389 | 9 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 390 | 3 | *block.get_by_position(arguments[0]).column) | 391 | 3 | .convert_to_full_column() | 392 | 9 | : block.get_by_position(arguments[0]).column; | 393 | | | 394 | 9 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, arguments); | 395 | | | 396 | 9 | StringRef options_value; | 397 | 9 | if (col_const[1] && col_const[2]) { | 398 | 1 | Impl::execute_impl_const_args(context, argument_columns, options_value, | 399 | 1 | input_rows_count, result_data, result_offset, | 400 | 1 | result_null_map->get_data()); | 401 | 8 | } else { | 402 | | // the options have check in FE, so is always const, and get idx of 0 | 403 | 8 | if (argument_size == 4) { | 404 | 8 | options_value = block.get_by_position(arguments[3]).column->get_data_at(0); | 405 | 8 | } | 406 | 8 | Impl::execute_impl(context, argument_columns, options_value, input_rows_count, | 407 | 8 | result_data, result_offset, result_null_map->get_data()); | 408 | 8 | } | 409 | | | 410 | 9 | block.get_by_position(result).column = | 411 | 9 | 
ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 412 | 9 | return Status::OK(); | 413 | 9 | } |
|
414 | | }; |
415 | | |
// RegexpReplaceImpl: per-row kernel exposing name = "regexp_replace".
// Every row's input string is rewritten with re2::RE2::GlobalReplace, so all
// matches of the pattern are substituted.
// NOTE(review): the rows below come from a coverage listing; each row reads
// "<source line> | <hit count> | <code>" and only the last column is program text.
416 | | struct RegexpReplaceImpl {
417 | | static constexpr auto name = "regexp_replace";
// execute_impl: general entry point; the caller picks it when the pattern and
// replacement columns are not both constant.
//   context          - function context; supplies the cached regex and receives warnings
//   argument_columns - [0] input strings, [1] patterns, [2] replacements
//                      (assumes all three are ColumnString; the casts are not null-checked)
//   options_value    - forwarded unchanged to StringFunctions::compile_regex
//   input_rows_count - number of rows to produce
//   result_data / result_offset - output string bytes and offsets, written per row
//   null_map         - per-row null flags; a row already flagged gets a null entry
418 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[],
419 | | const StringRef& options_value, size_t input_rows_count,
420 | | ColumnString::Chars& result_data, ColumnString::Offsets& result_offset,
421 | 36 | NullMap& null_map) {
422 | 36 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get());
423 | 36 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get());
424 | 36 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get());
425 | |
426 | 126 | for (size_t i = 0; i < input_rows_count; ++i) {
// Rows flagged as null skip the regex work entirely (hit count 0 in this report).
427 | 90 | if (null_map[i]) {
428 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map);
429 | 0 | continue;
430 | 0 | }
// Const=false: pattern and replacement are looked up with the current row index.
431 | 90 | _execute_inner_loop<false>(context, str_col, pattern_col, replace_col, options_value,
432 | 90 | result_data, result_offset, null_map, i);
433 | 90 | }
434 | 36 | }
// execute_impl_const_args: same contract as execute_impl, but the caller uses it
// when both the pattern and the replacement columns are constant, so the row
// helper is instantiated with Const=true.
435 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[],
436 | | const StringRef& options_value, size_t input_rows_count,
437 | | ColumnString::Chars& result_data,
438 | 2 | ColumnString::Offsets& result_offset, NullMap& null_map) {
439 | 2 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get());
440 | 2 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get());
441 | 2 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get());
442 | |
443 | 12 | for (size_t i = 0; i < input_rows_count; ++i) {
444 | 10 | if (null_map[i]) {
445 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map);
446 | 0 | continue;
447 | 0 | }
448 | 10 | _execute_inner_loop<true>(context, str_col, pattern_col, replace_col, options_value,
449 | 10 | result_data, result_offset, null_map, i);
450 | 10 | }
451 | 2 | }
// _execute_inner_loop<Const>: produce the output for the single row index_now.
//   1. Reuse the RE2 object stored in the THREAD_LOCAL function state if there is one.
//   2. Otherwise compile this row's pattern with options_value; scoped_re owns the
//      freshly compiled regex so it is released when the function returns.
//   3. If compilation fails, add the error text as a warning and emit a null row.
//   4. Copy the input string, apply GlobalReplace to the copy, append it to the output.
// Const is passed to index_check_const to pick the pattern/replacement row
// (helper defined elsewhere; presumably row 0 when Const is true - TODO confirm).
452 | | template <bool Const>
453 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col,
454 | | const ColumnString* pattern_col,
455 | | const ColumnString* replace_col, const StringRef& options_value,
456 | | ColumnString::Chars& result_data,
457 | | ColumnString::Offsets& result_offset, NullMap& null_map,
458 | 100 | const size_t index_now) {
459 | 100 | re2::RE2* re = reinterpret_cast<re2::RE2*>(
460 | 100 | context->get_function_state(FunctionContext::THREAD_LOCAL));
461 | 100 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr
// No cached regex: compile one for this row only.
462 | 100 | if (re == nullptr) {
463 | 67 | std::string error_str;
464 | 67 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const));
465 | 67 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(),
466 | 67 | options_value, scoped_re);
467 | 67 | if (!st) {
468 | 0 | context->add_warning(error_str.c_str());
469 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map);
470 | 0 | return;
471 | 0 | }
472 | 67 | re = scoped_re.get();
473 | 67 | }
474 | |
475 | 100 | re2::StringPiece replace_str = re2::StringPiece(
476 | 100 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view());
477 | |
// GlobalReplace edits its first argument in place, hence the std::string copy of the input.
478 | 100 | std::string result_str(str_col->get_data_at(index_now).to_string());
479 | 100 | re2::RE2::GlobalReplace(&result_str, *re, replace_str);
480 | 100 | StringOP::push_value_string(result_str, index_now, result_data, result_offset);
// The next row closes the function; the remainder of that row and the row after it
// are per-instantiation coverage listings of the same source (Const=true: 10 calls,
// never taking the compile branch; Const=false: 90 calls, 67 of them compiling).
481 | 100 | } _ZN5doris17RegexpReplaceImpl19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 458 | 10 | const size_t index_now) { | 459 | 10 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 460 | 10 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 461 | 10 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 462 | 10 | if (re == nullptr) { | 463 | 0 | std::string error_str; | 464 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 465 | 0 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 466 | 0 | options_value, scoped_re); | 467 | 0 | if (!st) { | 468 | 0 | context->add_warning(error_str.c_str()); | 469 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 470 | 0 | return; | 471 | 0 | } | 472 | 0 | re = scoped_re.get(); | 473 | 0 | } | 474 | | | 475 | 10 | re2::StringPiece replace_str = re2::StringPiece( | 476 | 10 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 477 | | | 478 | 10 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 479 | 10 | re2::RE2::GlobalReplace(&result_str, *re, replace_str); | 480 | 10 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 481 | 10 | } |
_ZN5doris17RegexpReplaceImpl19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 458 | 90 | const size_t index_now) { | 459 | 90 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 460 | 90 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 461 | 90 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 462 | 90 | if (re == nullptr) { | 463 | 67 | std::string error_str; | 464 | 67 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 465 | 67 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 466 | 67 | options_value, scoped_re); | 467 | 67 | if (!st) { | 468 | 0 | context->add_warning(error_str.c_str()); | 469 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 470 | 0 | return; | 471 | 0 | } | 472 | 67 | re = scoped_re.get(); | 473 | 67 | } | 474 | | | 475 | 90 | re2::StringPiece replace_str = re2::StringPiece( | 476 | 90 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 477 | | | 478 | 90 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 479 | 90 | re2::RE2::GlobalReplace(&result_str, *re, replace_str); | 480 | 90 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 481 | 90 | } |
|
482 | | };
483 | | |
// RegexpReplaceOneImpl: per-row kernel exposing name = "regexp_replace_one".
// Structured like the "regexp_replace" kernel, but the row helper calls
// re2::RE2::Replace instead of GlobalReplace (per the RE2 documentation, Replace
// substitutes only the first match).
// NOTE(review): the rows below come from a coverage listing; each row reads
// "<source line> | <hit count> | <code>" and only the last column is program text.
484 | | struct RegexpReplaceOneImpl {
485 | | static constexpr auto name = "regexp_replace_one";
486 | |
// execute_impl: general entry point (row helper instantiated with Const=false).
//   context          - function context; supplies the cached regex and receives warnings
//   argument_columns - [0] input strings, [1] patterns, [2] replacements
//                      (assumes all three are ColumnString; the casts are not null-checked)
//   options_value    - forwarded unchanged to StringFunctions::compile_regex
//   input_rows_count - number of rows to produce
//   result_data / result_offset - output string bytes and offsets, written per row
//   null_map         - per-row null flags; a row already flagged gets a null entry
487 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[],
488 | | const StringRef& options_value, size_t input_rows_count,
489 | | ColumnString::Chars& result_data, ColumnString::Offsets& result_offset,
490 | 32 | NullMap& null_map) {
491 | 32 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get());
492 | 32 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get());
493 | 32 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get());
494 | | // 3 args
495 | 122 | for (size_t i = 0; i < input_rows_count; ++i) {
// Rows flagged as null skip the regex work entirely (hit count 0 in this report).
496 | 90 | if (null_map[i]) {
497 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map);
498 | 0 | continue;
499 | 0 | }
500 | 90 | _execute_inner_loop<false>(context, str_col, pattern_col, replace_col, options_value,
501 | 90 | result_data, result_offset, null_map, i);
502 | 90 | }
503 | 32 | }
504 | |
// execute_impl_const_args: same contract as execute_impl, but the row helper is
// instantiated with Const=true for pattern/replacement lookups.
505 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[],
506 | | const StringRef& options_value, size_t input_rows_count,
507 | | ColumnString::Chars& result_data,
508 | 1 | ColumnString::Offsets& result_offset, NullMap& null_map) {
509 | 1 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get());
510 | 1 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get());
511 | 1 | const auto* replace_col = check_and_get_column<ColumnString>(argument_columns[2].get());
512 | | // 3 args
513 | 6 | for (size_t i = 0; i < input_rows_count; ++i) {
514 | 5 | if (null_map[i]) {
515 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map);
516 | 0 | continue;
517 | 0 | }
518 | 5 | _execute_inner_loop<true>(context, str_col, pattern_col, replace_col, options_value,
519 | 5 | result_data, result_offset, null_map, i);
520 | 5 | }
521 | 1 | }
// _execute_inner_loop<Const>: produce the output for the single row index_now.
//   1. Reuse the RE2 object stored in the THREAD_LOCAL function state if there is one.
//   2. Otherwise compile this row's pattern with options_value; scoped_re owns the
//      freshly compiled regex so it is released when the function returns.
//   3. If compilation fails, add the error text as a warning and emit a null row.
//   4. Copy the input string, apply RE2::Replace to the copy, append it to the output.
// Const is passed to index_check_const to pick the pattern/replacement row
// (helper defined elsewhere; presumably row 0 when Const is true - TODO confirm).
522 | | template <bool Const>
523 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col,
524 | | const ColumnString* pattern_col,
525 | | const ColumnString* replace_col, const StringRef& options_value,
526 | | ColumnString::Chars& result_data,
527 | | ColumnString::Offsets& result_offset, NullMap& null_map,
528 | 95 | const size_t index_now) {
529 | 95 | re2::RE2* re = reinterpret_cast<re2::RE2*>(
530 | 95 | context->get_function_state(FunctionContext::THREAD_LOCAL));
531 | 95 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr
// No cached regex: compile one for this row only.
532 | 95 | if (re == nullptr) {
533 | 72 | std::string error_str;
534 | 72 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const));
535 | 72 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(),
536 | 72 | options_value, scoped_re);
537 | 72 | if (!st) {
538 | 0 | context->add_warning(error_str.c_str());
539 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map);
540 | 0 | return;
541 | 0 | }
542 | 72 | re = scoped_re.get();
543 | 72 | }
544 | |
545 | 95 | re2::StringPiece replace_str = re2::StringPiece(
546 | 95 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view());
547 | |
// Replace edits its first argument in place, hence the std::string copy of the input.
548 | 95 | std::string result_str(str_col->get_data_at(index_now).to_string());
549 | 95 | re2::RE2::Replace(&result_str, *re, replace_str);
550 | 95 | StringOP::push_value_string(result_str, index_now, result_data, result_offset);
// The next row closes the function; the remainder of that row and the row after it
// are per-instantiation coverage listings of the same source (Const=true: 5 calls,
// never taking the compile branch; Const=false: 90 calls, 72 of them compiling).
551 | 95 | } _ZN5doris20RegexpReplaceOneImpl19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 528 | 5 | const size_t index_now) { | 529 | 5 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 530 | 5 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 531 | 5 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 532 | 5 | if (re == nullptr) { | 533 | 0 | std::string error_str; | 534 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 535 | 0 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 536 | 0 | options_value, scoped_re); | 537 | 0 | if (!st) { | 538 | 0 | context->add_warning(error_str.c_str()); | 539 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 540 | 0 | return; | 541 | 0 | } | 542 | 0 | re = scoped_re.get(); | 543 | 0 | } | 544 | | | 545 | 5 | re2::StringPiece replace_str = re2::StringPiece( | 546 | 5 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 547 | | | 548 | 5 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 549 | 5 | re2::RE2::Replace(&result_str, *re, replace_str); | 550 | 5 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 551 | 5 | } |
_ZN5doris20RegexpReplaceOneImpl19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_S7_RKNS_9StringRefERNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSB_IjLm4096ESE_Lm16ELm15EEESG_m Line | Count | Source | 528 | 90 | const size_t index_now) { | 529 | 90 | re2::RE2* re = reinterpret_cast<re2::RE2*>( | 530 | 90 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 531 | 90 | std::unique_ptr<re2::RE2> scoped_re; // destroys re if state->re is nullptr | 532 | 90 | if (re == nullptr) { | 533 | 72 | std::string error_str; | 534 | 72 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 535 | 72 | bool st = StringFunctions::compile_regex(pattern, &error_str, StringRef(), | 536 | 72 | options_value, scoped_re); | 537 | 72 | if (!st) { | 538 | 0 | context->add_warning(error_str.c_str()); | 539 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 540 | 0 | return; | 541 | 0 | } | 542 | 72 | re = scoped_re.get(); | 543 | 72 | } | 544 | | | 545 | 90 | re2::StringPiece replace_str = re2::StringPiece( | 546 | 90 | replace_col->get_data_at(index_check_const(index_now, Const)).to_string_view()); | 547 | | | 548 | 90 | std::string result_str(str_col->get_data_at(index_now).to_string()); | 549 | 90 | re2::RE2::Replace(&result_str, *re, replace_str); | 550 | 90 | StringOP::push_value_string(result_str, index_now, result_data, result_offset); | 551 | 90 | } |
|
552 | | };
553 | | |
// RegexpExtractImpl<ReturnNull>: per-row kernel for "regexp_extract" (ReturnNull=false)
// and "regexp_extract_or_null" (ReturnNull=true).
// For each row it matches the pattern against the input string and emits the text
// selected by an Int64 index. Whenever nothing can be extracted (negative index,
// index beyond the pattern's groups, or no match) the row becomes NULL if ReturnNull
// is true and an empty string otherwise.
// NOTE(review): the rows below come from a coverage listing; each row reads
// "<source line> | <hit count> | <code>" and only the last column is program text.
554 | | template <bool ReturnNull>
555 | | struct RegexpExtractImpl {
556 | | static constexpr auto name = ReturnNull ? "regexp_extract_or_null" : "regexp_extract";
557 | | // 3 args
// execute_impl: general entry point; the index is read per row.
//   context          - function context; supplies the cached engine and receives warnings
//   argument_columns - [0] input strings (ColumnString), [1] patterns (ColumnString),
//                      [2] indexes (ColumnInt64); the casts are not null-checked
//   input_rows_count - number of rows to produce
//   result_data / result_offset - output string bytes and offsets, written per row
//   null_map         - per-row null flags; a row already flagged gets a null entry
558 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[],
559 | | size_t input_rows_count, ColumnString::Chars& result_data,
560 | 59 | ColumnString::Offsets& result_offset, NullMap& null_map) {
561 | 59 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get());
562 | 59 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get());
563 | 59 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get());
564 | 188 | for (size_t i = 0; i < input_rows_count; ++i) {
565 | 129 | if (null_map[i]) {
566 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map);
567 | 0 | continue;
568 | 0 | }
569 | 129 | const auto& index_data = index_col->get_int(i);
// A negative index can never select anything; answer without touching the regex.
570 | 129 | if (index_data < 0) {
571 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map)
572 | 0 | : StringOP::push_empty_string(i, result_data, result_offset);
573 | 0 | continue;
574 | 0 | }
575 | 129 | _execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data,
576 | 129 | result_offset, null_map, i);
577 | 129 | }
// The next row closes execute_impl; the remainder of that row and the row after it
// are per-instantiation coverage listings of the same source (ReturnNull=true: 18
// calls, ReturnNull=false: 41 calls).
578 | 59 | } _ZN5doris17RegexpExtractImplILb1EE12execute_implEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ Line | Count | Source | 560 | 18 | ColumnString::Offsets& result_offset, NullMap& null_map) { | 561 | 18 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); | 562 | 18 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); | 563 | 18 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); | 564 | 36 | for (size_t i = 0; i < input_rows_count; ++i) { | 565 | 18 | if (null_map[i]) { | 566 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); | 567 | 0 | continue; | 568 | 0 | } | 569 | 18 | const auto& index_data = index_col->get_int(i); | 570 | 18 | if (index_data < 0) { | 571 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) | 572 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); | 573 | 0 | continue; | 574 | 0 | } | 575 | 18 | _execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data, | 576 | 18 | result_offset, null_map, i); | 577 | 18 | } | 578 | 18 | } |
_ZN5doris17RegexpExtractImplILb0EE12execute_implEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ Line | Count | Source | 560 | 41 | ColumnString::Offsets& result_offset, NullMap& null_map) { | 561 | 41 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); | 562 | 41 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); | 563 | 41 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); | 564 | 152 | for (size_t i = 0; i < input_rows_count; ++i) { | 565 | 111 | if (null_map[i]) { | 566 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); | 567 | 0 | continue; | 568 | 0 | } | 569 | 111 | const auto& index_data = index_col->get_int(i); | 570 | 111 | if (index_data < 0) { | 571 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) | 572 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); | 573 | 0 | continue; | 574 | 0 | } | 575 | 111 | _execute_inner_loop<false>(context, str_col, pattern_col, index_data, result_data, | 576 | 111 | result_offset, null_map, i); | 577 | 111 | } | 578 | 41 | } |
|
579 | |
// execute_impl_const_args: same contract as execute_impl, but the index is read once
// from row 0 and reused for every row, and the row helper is instantiated with
// Const=true for the pattern lookup.
580 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[],
581 | | size_t input_rows_count, ColumnString::Chars& result_data,
582 | 1 | ColumnString::Offsets& result_offset, NullMap& null_map) {
583 | 1 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get());
584 | 1 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get());
585 | 1 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get());
586 | |
587 | 1 | const auto& index_data = index_col->get_int(0);
// The shared index is negative: every row gets the "nothing extracted" result, and
// the function returns before the per-row loop (null_map is not consulted here).
588 | 1 | if (index_data < 0) {
589 | 0 | for (size_t i = 0; i < input_rows_count; ++i) {
590 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map)
591 | 0 | : StringOP::push_empty_string(i, result_data, result_offset);
592 | 0 | }
593 | 0 | return;
594 | 0 | }
595 | |
596 | 8 | for (size_t i = 0; i < input_rows_count; ++i) {
597 | 7 | if (null_map[i]) {
598 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map);
599 | 0 | continue;
600 | 0 | }
601 | |
602 | 7 | _execute_inner_loop<true>(context, str_col, pattern_col, index_data, result_data,
603 | 7 | result_offset, null_map, i);
604 | 7 | }
// The next row closes execute_impl_const_args; the remainder of that row is the
// per-instantiation coverage listing (the ReturnNull=true instantiation was never
// executed in this run; ReturnNull=false ran once).
605 | 1 | } Unexecuted instantiation: _ZN5doris17RegexpExtractImplILb1EE23execute_impl_const_argsEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ _ZN5doris17RegexpExtractImplILb0EE23execute_impl_const_argsEPNS_15FunctionContextEPNS_3COWINS_7IColumnEE13immutable_ptrIS5_EEmRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNSA_IjLm4096ESD_Lm16ELm15EEESF_ Line | Count | Source | 582 | 1 | ColumnString::Offsets& result_offset, NullMap& null_map) { | 583 | 1 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); | 584 | 1 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); | 585 | 1 | const auto* index_col = check_and_get_column<ColumnInt64>(argument_columns[2].get()); | 586 | | | 587 | 1 | const auto& index_data = index_col->get_int(0); | 588 | 1 | if (index_data < 0) { | 589 | 0 | for (size_t i = 0; i < input_rows_count; ++i) { | 590 | 0 | ReturnNull ? StringOP::push_null_string(i, result_data, result_offset, null_map) | 591 | 0 | : StringOP::push_empty_string(i, result_data, result_offset); | 592 | 0 | } | 593 | 0 | return; | 594 | 0 | } | 595 | | | 596 | 8 | for (size_t i = 0; i < input_rows_count; ++i) { | 597 | 7 | if (null_map[i]) { | 598 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); | 599 | 0 | continue; | 600 | 0 | } | 601 | | | 602 | 7 | _execute_inner_loop<true>(context, str_col, pattern_col, index_data, result_data, | 603 | 7 | result_offset, null_map, i); | 604 | 7 | } | 605 | 1 | } |
|
// _execute_inner_loop<Const>: produce the output for the single row index_now.
//   1. Reuse the RegexpExtractEngine stored in the THREAD_LOCAL function state if any.
//   2. Otherwise compile this row's pattern into a temporary engine owned by
//      scoped_engine, passing the query's enable_extended_regex() setting.
//   3. If compilation fails, add the error text as a warning and emit a null row
//      (NULL regardless of ReturnNull).
//   4. Reject an index >= 1 + number_of_capturing_groups().
//   5. Run match_and_extract; on failure emit the "nothing extracted" result,
//      otherwise append the extracted text.
// Both callers check index_data < 0 before calling, so index_data is non-negative here.
// Const is passed to index_check_const to pick the pattern row (helper defined
// elsewhere; presumably row 0 when Const is true - TODO confirm).
606 | | template <bool Const>
607 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col,
608 | | const ColumnString* pattern_col, const Int64 index_data,
609 | | ColumnString::Chars& result_data,
610 | | ColumnString::Offsets& result_offset, NullMap& null_map,
611 | 136 | const size_t index_now) {
612 | 136 | auto* engine = reinterpret_cast<RegexpExtractEngine*>(
613 | 136 | context->get_function_state(FunctionContext::THREAD_LOCAL));
614 | 136 | std::unique_ptr<RegexpExtractEngine> scoped_engine;
615 | |
// No cached engine: build one for this row only.
616 | 136 | if (engine == nullptr) {
617 | 78 | std::string error_str;
618 | 78 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const));
619 | 78 | scoped_engine = std::make_unique<RegexpExtractEngine>();
620 | 78 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine,
621 | 78 | context->state()->enable_extended_regex());
622 | 78 | if (!st) {
623 | 0 | context->add_warning(error_str.c_str());
624 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map);
625 | 0 | return;
626 | 0 | }
627 | 78 | engine = scoped_engine.get();
628 | 78 | }
629 | |
630 | 136 | const auto& str = str_col->get_data_at(index_now);
631 | |
// Valid indexes are 0 .. number_of_capturing_groups() inclusive
// (presumably 0 selects the whole match - TODO confirm against RegexpExtractEngine).
632 | 136 | int max_matches = 1 + engine->number_of_capturing_groups();
633 | 136 | if (index_data >= max_matches) {
634 | 84 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map)
635 | 84 | : StringOP::push_empty_string(index_now, result_data, result_offset);
636 | 84 | return;
637 | 84 | }
638 | |
// The narrowing cast is safe: index_data is already known to be below max_matches, an int.
639 | 52 | std::string match_result;
640 | 52 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data),
641 | 52 | match_result);
642 | |
643 | 52 | if (!success) {
644 | 13 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map)
645 | 13 | : StringOP::push_empty_string(index_now, result_data, result_offset);
646 | 13 | return;
647 | 13 | }
648 | |
649 | 39 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()),
650 | 39 | index_now, result_data, result_offset);
// The next row closes the function; the remainder of that row and the rows after it
// are per-instantiation coverage listings of the same source. The
// <ReturnNull=true, Const=true> instantiation was never executed in this run.
651 | 39 | } Unexecuted instantiation: _ZN5doris17RegexpExtractImplILb1EE19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m _ZN5doris17RegexpExtractImplILb1EE19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m Line | Count | Source | 611 | 18 | const size_t index_now) { | 612 | 18 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 613 | 18 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 614 | 18 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 615 | | | 616 | 18 | if (engine == nullptr) { | 617 | 0 | std::string error_str; | 618 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 619 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 620 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 621 | 0 | context->state()->enable_extended_regex()); | 622 | 0 | if (!st) { | 623 | 0 | context->add_warning(error_str.c_str()); | 624 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 625 | 0 | return; | 626 | 0 | } | 627 | 0 | engine = scoped_engine.get(); | 628 | 0 | } | 629 | | | 630 | 18 | const auto& str = str_col->get_data_at(index_now); | 631 | | | 632 | 18 | int max_matches = 1 + engine->number_of_capturing_groups(); | 633 | 18 | if (index_data >= max_matches) { | 634 | 1 | ReturnNull ? 
StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 635 | 1 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 636 | 1 | return; | 637 | 1 | } | 638 | | | 639 | 17 | std::string match_result; | 640 | 17 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), | 641 | 17 | match_result); | 642 | | | 643 | 17 | if (!success) { | 644 | 1 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 645 | 1 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 646 | 1 | return; | 647 | 1 | } | 648 | | | 649 | 16 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), | 650 | 16 | index_now, result_data, result_offset); | 651 | 16 | } |
_ZN5doris17RegexpExtractImplILb0EE19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m Line | Count | Source | 611 | 7 | const size_t index_now) { | 612 | 7 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 613 | 7 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 614 | 7 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 615 | | | 616 | 7 | if (engine == nullptr) { | 617 | 0 | std::string error_str; | 618 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 619 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 620 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 621 | 0 | context->state()->enable_extended_regex()); | 622 | 0 | if (!st) { | 623 | 0 | context->add_warning(error_str.c_str()); | 624 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 625 | 0 | return; | 626 | 0 | } | 627 | 0 | engine = scoped_engine.get(); | 628 | 0 | } | 629 | | | 630 | 7 | const auto& str = str_col->get_data_at(index_now); | 631 | | | 632 | 7 | int max_matches = 1 + engine->number_of_capturing_groups(); | 633 | 7 | if (index_data >= max_matches) { | 634 | 0 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 635 | 0 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 636 | 0 | return; | 637 | 0 | } | 638 | | | 639 | 7 | std::string match_result; | 640 | 7 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), | 641 | 7 | match_result); | 642 | | | 643 | 7 | if (!success) { | 644 | 7 | ReturnNull ? 
StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 645 | 7 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 646 | 7 | return; | 647 | 7 | } | 648 | | | 649 | 0 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), | 650 | 0 | index_now, result_data, result_offset); | 651 | 0 | } |
_ZN5doris17RegexpExtractImplILb0EE19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES8_lRNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS9_IjLm4096ESC_Lm16ELm15EEESE_m Line | Count | Source | 611 | 111 | const size_t index_now) { | 612 | 111 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 613 | 111 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 614 | 111 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 615 | | | 616 | 111 | if (engine == nullptr) { | 617 | 78 | std::string error_str; | 618 | 78 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 619 | 78 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 620 | 78 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 621 | 78 | context->state()->enable_extended_regex()); | 622 | 78 | if (!st) { | 623 | 0 | context->add_warning(error_str.c_str()); | 624 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 625 | 0 | return; | 626 | 0 | } | 627 | 78 | engine = scoped_engine.get(); | 628 | 78 | } | 629 | | | 630 | 111 | const auto& str = str_col->get_data_at(index_now); | 631 | | | 632 | 111 | int max_matches = 1 + engine->number_of_capturing_groups(); | 633 | 111 | if (index_data >= max_matches) { | 634 | 83 | ReturnNull ? StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 635 | 83 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 636 | 83 | return; | 637 | 83 | } | 638 | | | 639 | 28 | std::string match_result; | 640 | 28 | bool success = engine->match_and_extract(str.data, str.size, static_cast<int>(index_data), | 641 | 28 | match_result); | 642 | | | 643 | 28 | if (!success) { | 644 | 5 | ReturnNull ? 
StringOP::push_null_string(index_now, result_data, result_offset, null_map) | 645 | 5 | : StringOP::push_empty_string(index_now, result_data, result_offset); | 646 | 5 | return; | 647 | 5 | } | 648 | | | 649 | 23 | StringOP::push_value_string(std::string_view(match_result.data(), match_result.size()), | 650 | 23 | index_now, result_data, result_offset); | 651 | 23 | } |
|
652 | | };
653 | | |
654 | | struct RegexpExtractAllImpl { |
655 | | static constexpr auto name = "regexp_extract_all"; |
656 | | |
657 | 0 | size_t get_number_of_arguments() const { return 2; } |
658 | | |
659 | | static void execute_impl(FunctionContext* context, ColumnPtr argument_columns[], |
660 | | size_t input_rows_count, ColumnString::Chars& result_data, |
661 | 35 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
662 | 35 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
663 | 35 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
664 | 116 | for (int i = 0; i < input_rows_count; ++i) { |
665 | 81 | if (null_map[i]) { |
666 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
667 | 0 | continue; |
668 | 0 | } |
669 | 81 | _execute_inner_loop<false>(context, str_col, pattern_col, result_data, result_offset, |
670 | 81 | null_map, i); |
671 | 81 | } |
672 | 35 | } |
673 | | |
674 | | static void execute_impl_const_args(FunctionContext* context, ColumnPtr argument_columns[], |
675 | | size_t input_rows_count, ColumnString::Chars& result_data, |
676 | 8 | ColumnString::Offsets& result_offset, NullMap& null_map) { |
677 | 8 | const auto* str_col = check_and_get_column<ColumnString>(argument_columns[0].get()); |
678 | 8 | const auto* pattern_col = check_and_get_column<ColumnString>(argument_columns[1].get()); |
679 | 22 | for (int i = 0; i < input_rows_count; ++i) { |
680 | 14 | if (null_map[i]) { |
681 | 0 | StringOP::push_null_string(i, result_data, result_offset, null_map); |
682 | 0 | continue; |
683 | 0 | } |
684 | 14 | _execute_inner_loop<true>(context, str_col, pattern_col, result_data, result_offset, |
685 | 14 | null_map, i); |
686 | 14 | } |
687 | 8 | } |
688 | | template <bool Const> |
689 | | static void _execute_inner_loop(FunctionContext* context, const ColumnString* str_col, |
690 | | const ColumnString* pattern_col, |
691 | | ColumnString::Chars& result_data, |
692 | | ColumnString::Offsets& result_offset, NullMap& null_map, |
693 | 95 | const size_t index_now) { |
694 | 95 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( |
695 | 95 | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
696 | 95 | std::unique_ptr<RegexpExtractEngine> scoped_engine; |
697 | | |
698 | 95 | if (engine == nullptr) { |
699 | 64 | std::string error_str; |
700 | 64 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); |
701 | 64 | scoped_engine = std::make_unique<RegexpExtractEngine>(); |
702 | 64 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, |
703 | 64 | context->state()->enable_extended_regex()); |
704 | 64 | if (!st) { |
705 | 0 | context->add_warning(error_str.c_str()); |
706 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); |
707 | 0 | return; |
708 | 0 | } |
709 | 64 | engine = scoped_engine.get(); |
710 | 64 | } |
711 | | |
712 | 95 | if (engine->number_of_capturing_groups() == 0) { |
713 | 65 | StringOP::push_empty_string(index_now, result_data, result_offset); |
714 | 65 | return; |
715 | 65 | } |
716 | 30 | const auto& str = str_col->get_data_at(index_now); |
717 | 30 | std::vector<std::string> res_matches; |
718 | 30 | engine->match_all_and_extract(str.data, str.size, res_matches); |
719 | | |
720 | 30 | if (res_matches.empty()) { |
721 | 10 | StringOP::push_empty_string(index_now, result_data, result_offset); |
722 | 10 | return; |
723 | 10 | } |
724 | | |
725 | 20 | std::string res = "["; |
726 | 59 | for (int j = 0; j < res_matches.size(); ++j) { |
727 | 39 | res += "'" + res_matches[j] + "'"; |
728 | 39 | if (j < res_matches.size() - 1) { |
729 | 19 | res += ","; |
730 | 19 | } |
731 | 39 | } |
732 | 20 | res += "]"; |
733 | 20 | StringOP::push_value_string(std::string_view(res), index_now, result_data, result_offset); |
734 | 20 | } _ZN5doris20RegexpExtractAllImpl19_execute_inner_loopILb1EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_RNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS8_IjLm4096ESB_Lm16ELm15EEESD_m Line | Count | Source | 693 | 14 | const size_t index_now) { | 694 | 14 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 695 | 14 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 696 | 14 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 697 | | | 698 | 14 | if (engine == nullptr) { | 699 | 0 | std::string error_str; | 700 | 0 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 701 | 0 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 702 | 0 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 703 | 0 | context->state()->enable_extended_regex()); | 704 | 0 | if (!st) { | 705 | 0 | context->add_warning(error_str.c_str()); | 706 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 707 | 0 | return; | 708 | 0 | } | 709 | 0 | engine = scoped_engine.get(); | 710 | 0 | } | 711 | | | 712 | 14 | if (engine->number_of_capturing_groups() == 0) { | 713 | 0 | StringOP::push_empty_string(index_now, result_data, result_offset); | 714 | 0 | return; | 715 | 0 | } | 716 | 14 | const auto& str = str_col->get_data_at(index_now); | 717 | 14 | std::vector<std::string> res_matches; | 718 | 14 | engine->match_all_and_extract(str.data, str.size, res_matches); | 719 | | | 720 | 14 | if (res_matches.empty()) { | 721 | 7 | StringOP::push_empty_string(index_now, result_data, result_offset); | 722 | 7 | return; | 723 | 7 | } | 724 | | | 725 | 7 | std::string res = "["; | 726 | 19 | for (int j = 0; j < res_matches.size(); ++j) { | 727 | 12 | res += "'" + res_matches[j] + "'"; | 728 | 12 | if (j < res_matches.size() - 1) { | 729 | 5 | res += ","; | 730 | 5 | } | 731 | 12 | } | 732 | 7 | res += "]"; | 733 | 7 | 
StringOP::push_value_string(std::string_view(res), index_now, result_data, result_offset); | 734 | 7 | } |
_ZN5doris20RegexpExtractAllImpl19_execute_inner_loopILb0EEEvPNS_15FunctionContextEPKNS_9ColumnStrIjEES7_RNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERNS8_IjLm4096ESB_Lm16ELm15EEESD_m Line | Count | Source | 693 | 81 | const size_t index_now) { | 694 | 81 | auto* engine = reinterpret_cast<RegexpExtractEngine*>( | 695 | 81 | context->get_function_state(FunctionContext::THREAD_LOCAL)); | 696 | 81 | std::unique_ptr<RegexpExtractEngine> scoped_engine; | 697 | | | 698 | 81 | if (engine == nullptr) { | 699 | 64 | std::string error_str; | 700 | 64 | const auto& pattern = pattern_col->get_data_at(index_check_const(index_now, Const)); | 701 | 64 | scoped_engine = std::make_unique<RegexpExtractEngine>(); | 702 | 64 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *scoped_engine, | 703 | 64 | context->state()->enable_extended_regex()); | 704 | 64 | if (!st) { | 705 | 0 | context->add_warning(error_str.c_str()); | 706 | 0 | StringOP::push_null_string(index_now, result_data, result_offset, null_map); | 707 | 0 | return; | 708 | 0 | } | 709 | 64 | engine = scoped_engine.get(); | 710 | 64 | } | 711 | | | 712 | 81 | if (engine->number_of_capturing_groups() == 0) { | 713 | 65 | StringOP::push_empty_string(index_now, result_data, result_offset); | 714 | 65 | return; | 715 | 65 | } | 716 | 16 | const auto& str = str_col->get_data_at(index_now); | 717 | 16 | std::vector<std::string> res_matches; | 718 | 16 | engine->match_all_and_extract(str.data, str.size, res_matches); | 719 | | | 720 | 16 | if (res_matches.empty()) { | 721 | 3 | StringOP::push_empty_string(index_now, result_data, result_offset); | 722 | 3 | return; | 723 | 3 | } | 724 | | | 725 | 13 | std::string res = "["; | 726 | 40 | for (int j = 0; j < res_matches.size(); ++j) { | 727 | 27 | res += "'" + res_matches[j] + "'"; | 728 | 27 | if (j < res_matches.size() - 1) { | 729 | 14 | res += ","; | 730 | 14 | } | 731 | 27 | } | 732 | 13 | res += "]"; | 733 | 13 | 
StringOP::push_value_string(std::string_view(res), index_now, result_data, result_offset); | 734 | 13 | } |
|
735 | | }; |
736 | | |
737 | | // template FunctionRegexpFunctionality is used for regexp_xxxx series functions, not for regexp match. |
738 | | template <typename Impl> |
739 | | class FunctionRegexpFunctionality : public IFunction { |
740 | | public: |
741 | | static constexpr auto name = Impl::name; |
742 | | |
743 | 117 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); }_ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE6createEv Line | Count | Source | 743 | 30 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); } |
_ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE6createEv Line | Count | Source | 743 | 43 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); } |
_ZN5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE6createEv Line | Count | Source | 743 | 44 | static FunctionPtr create() { return std::make_shared<FunctionRegexpFunctionality>(); } |
|
744 | | |
745 | 3 | String get_name() const override { return name; }_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE8get_nameB5cxx11Ev Line | Count | Source | 745 | 1 | String get_name() const override { return name; } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE8get_nameB5cxx11Ev Line | Count | Source | 745 | 1 | String get_name() const override { return name; } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE8get_nameB5cxx11Ev Line | Count | Source | 745 | 1 | String get_name() const override { return name; } |
|
746 | | |
747 | 90 | size_t get_number_of_arguments() const override { |
748 | 90 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { |
749 | 35 | return 2; |
750 | 35 | } |
751 | 0 | return 3; |
752 | 90 | } _ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE23get_number_of_argumentsEv Line | Count | Source | 747 | 21 | size_t get_number_of_arguments() const override { | 748 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 749 | | return 2; | 750 | | } | 751 | 21 | return 3; | 752 | 21 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE23get_number_of_argumentsEv Line | Count | Source | 747 | 34 | size_t get_number_of_arguments() const override { | 748 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 749 | | return 2; | 750 | | } | 751 | 34 | return 3; | 752 | 34 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE23get_number_of_argumentsEv Line | Count | Source | 747 | 35 | size_t get_number_of_arguments() const override { | 748 | 35 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 749 | 35 | return 2; | 750 | 35 | } | 751 | 0 | return 3; | 752 | 35 | } |
|
753 | | |
754 | 90 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
755 | 90 | return make_nullable(std::make_shared<DataTypeString>()); |
756 | 90 | } _ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 754 | 21 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 755 | 21 | return make_nullable(std::make_shared<DataTypeString>()); | 756 | 21 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 754 | 34 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 755 | 34 | return make_nullable(std::make_shared<DataTypeString>()); | 756 | 34 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE Line | Count | Source | 754 | 35 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 755 | 35 | return make_nullable(std::make_shared<DataTypeString>()); | 756 | 35 | } |
|
757 | | |
758 | 303 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { |
759 | 303 | if (scope == FunctionContext::THREAD_LOCAL) { |
760 | 213 | if (context->is_col_constant(1)) { |
761 | 125 | DCHECK(!context->get_function_state(scope)); |
762 | 125 | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
763 | 125 | const auto& pattern = pattern_col->get_data_at(0); |
764 | 125 | if (pattern.size == 0) { |
765 | 3 | return Status::OK(); |
766 | 3 | } |
767 | | |
768 | 122 | std::string error_str; |
769 | 122 | auto engine = std::make_shared<RegexpExtractEngine>(); |
770 | 122 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, |
771 | 122 | context->state()->enable_extended_regex()); |
772 | 122 | if (!st) { |
773 | 3 | context->set_error(error_str.c_str()); |
774 | 3 | return Status::InvalidArgument(error_str); |
775 | 3 | } |
776 | 119 | context->set_function_state(scope, engine); |
777 | 119 | } |
778 | 213 | } |
779 | 297 | return Status::OK(); |
780 | 303 | } _ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 758 | 52 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 759 | 52 | if (scope == FunctionContext::THREAD_LOCAL) { | 760 | 31 | if (context->is_col_constant(1)) { | 761 | 31 | DCHECK(!context->get_function_state(scope)); | 762 | 31 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 763 | 31 | const auto& pattern = pattern_col->get_data_at(0); | 764 | 31 | if (pattern.size == 0) { | 765 | 1 | return Status::OK(); | 766 | 1 | } | 767 | | | 768 | 30 | std::string error_str; | 769 | 30 | auto engine = std::make_shared<RegexpExtractEngine>(); | 770 | 30 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, | 771 | 30 | context->state()->enable_extended_regex()); | 772 | 30 | if (!st) { | 773 | 1 | context->set_error(error_str.c_str()); | 774 | 1 | return Status::InvalidArgument(error_str); | 775 | 1 | } | 776 | 29 | context->set_function_state(scope, engine); | 777 | 29 | } | 778 | 31 | } | 779 | 50 | return Status::OK(); | 780 | 52 | } |
_ZN5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE4openEPNS_15FunctionContextENS4_18FunctionStateScopeE Line | Count | Source | 758 | 125 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 759 | 125 | if (scope == FunctionContext::THREAD_LOCAL) { | 760 | 91 | if (context->is_col_constant(1)) { | 761 | 44 | DCHECK(!context->get_function_state(scope)); | 762 | 44 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 763 | 44 | const auto& pattern = pattern_col->get_data_at(0); | 764 | 44 | if (pattern.size == 0) { | 765 | 1 | return Status::OK(); | 766 | 1 | } | 767 | | | 768 | 43 | std::string error_str; | 769 | 43 | auto engine = std::make_shared<RegexpExtractEngine>(); | 770 | 43 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, | 771 | 43 | context->state()->enable_extended_regex()); | 772 | 43 | if (!st) { | 773 | 1 | context->set_error(error_str.c_str()); | 774 | 1 | return Status::InvalidArgument(error_str); | 775 | 1 | } | 776 | 42 | context->set_function_state(scope, engine); | 777 | 42 | } | 778 | 91 | } | 779 | 123 | return Status::OK(); | 780 | 125 | } |
_ZN5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE4openEPNS_15FunctionContextENS3_18FunctionStateScopeE Line | Count | Source | 758 | 126 | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override { | 759 | 126 | if (scope == FunctionContext::THREAD_LOCAL) { | 760 | 91 | if (context->is_col_constant(1)) { | 761 | 50 | DCHECK(!context->get_function_state(scope)); | 762 | 50 | const auto pattern_col = context->get_constant_col(1)->column_ptr; | 763 | 50 | const auto& pattern = pattern_col->get_data_at(0); | 764 | 50 | if (pattern.size == 0) { | 765 | 1 | return Status::OK(); | 766 | 1 | } | 767 | | | 768 | 49 | std::string error_str; | 769 | 49 | auto engine = std::make_shared<RegexpExtractEngine>(); | 770 | 49 | bool st = RegexpExtractEngine::compile(pattern, &error_str, *engine, | 771 | 49 | context->state()->enable_extended_regex()); | 772 | 49 | if (!st) { | 773 | 1 | context->set_error(error_str.c_str()); | 774 | 1 | return Status::InvalidArgument(error_str); | 775 | 1 | } | 776 | 48 | context->set_function_state(scope, engine); | 777 | 48 | } | 778 | 91 | } | 779 | 124 | return Status::OK(); | 780 | 126 | } |
|
781 | | |
782 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
783 | 103 | uint32_t result, size_t input_rows_count) const override { |
784 | 103 | size_t argument_size = arguments.size(); |
785 | | |
786 | 103 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); |
787 | 103 | auto result_data_column = ColumnString::create(); |
788 | 103 | auto& result_data = result_data_column->get_chars(); |
789 | 103 | auto& result_offset = result_data_column->get_offsets(); |
790 | 103 | result_offset.resize(input_rows_count); |
791 | | |
792 | 103 | bool col_const[3]; |
793 | 103 | ColumnPtr argument_columns[3]; |
794 | 369 | for (int i = 0; i < argument_size; ++i) { |
795 | 266 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); |
796 | 266 | } |
797 | 103 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( |
798 | 4 | *block.get_by_position(arguments[0]).column) |
799 | 4 | .convert_to_full_column() |
800 | 103 | : block.get_by_position(arguments[0]).column; |
801 | 103 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { |
802 | 43 | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, |
803 | 43 | arguments); |
804 | 60 | } else { |
805 | 60 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, |
806 | 60 | arguments); |
807 | 60 | } |
808 | | |
809 | 103 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { |
810 | 43 | if (col_const[1]) { |
811 | 8 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, |
812 | 8 | result_data, result_offset, |
813 | 8 | result_null_map->get_data()); |
814 | 35 | } else { |
815 | 35 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, |
816 | 35 | result_offset, result_null_map->get_data()); |
817 | 35 | } |
818 | 60 | } else { |
819 | 60 | if (col_const[1] && col_const[2]) { |
820 | 1 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, |
821 | 1 | result_data, result_offset, |
822 | 1 | result_null_map->get_data()); |
823 | 59 | } else { |
824 | 59 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, |
825 | 59 | result_offset, result_null_map->get_data()); |
826 | 59 | } |
827 | 60 | } |
828 | | |
829 | 103 | block.get_by_position(result).column = |
830 | 103 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); |
831 | 103 | return Status::OK(); |
832 | 103 | } _ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb1EEEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 783 | 18 | uint32_t result, size_t input_rows_count) const override { | 784 | 18 | size_t argument_size = arguments.size(); | 785 | | | 786 | 18 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 787 | 18 | auto result_data_column = ColumnString::create(); | 788 | 18 | auto& result_data = result_data_column->get_chars(); | 789 | 18 | auto& result_offset = result_data_column->get_offsets(); | 790 | 18 | result_offset.resize(input_rows_count); | 791 | | | 792 | 18 | bool col_const[3]; | 793 | 18 | ColumnPtr argument_columns[3]; | 794 | 72 | for (int i = 0; i < argument_size; ++i) { | 795 | 54 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 796 | 54 | } | 797 | 18 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 798 | 0 | *block.get_by_position(arguments[0]).column) | 799 | 0 | .convert_to_full_column() | 800 | 18 | : block.get_by_position(arguments[0]).column; | 801 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 802 | | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, | 803 | | arguments); | 804 | 18 | } else { | 805 | 18 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, | 806 | 18 | arguments); | 807 | 18 | } | 808 | | | 809 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 810 | | if (col_const[1]) { | 811 | | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 812 | | result_data, result_offset, | 813 | | result_null_map->get_data()); | 814 | | } else { | 815 | | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 816 | | result_offset, result_null_map->get_data()); | 817 | | } | 818 | 18 | } else { | 819 | 18 | if (col_const[1] && col_const[2]) { | 820 | 0 | 
Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 821 | 0 | result_data, result_offset, | 822 | 0 | result_null_map->get_data()); | 823 | 18 | } else { | 824 | 18 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 825 | 18 | result_offset, result_null_map->get_data()); | 826 | 18 | } | 827 | 18 | } | 828 | | | 829 | 18 | block.get_by_position(result).column = | 830 | 18 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 831 | 18 | return Status::OK(); | 832 | 18 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_17RegexpExtractImplILb0EEEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 783 | 42 | uint32_t result, size_t input_rows_count) const override { | 784 | 42 | size_t argument_size = arguments.size(); | 785 | | | 786 | 42 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 787 | 42 | auto result_data_column = ColumnString::create(); | 788 | 42 | auto& result_data = result_data_column->get_chars(); | 789 | 42 | auto& result_offset = result_data_column->get_offsets(); | 790 | 42 | result_offset.resize(input_rows_count); | 791 | | | 792 | 42 | bool col_const[3]; | 793 | 42 | ColumnPtr argument_columns[3]; | 794 | 168 | for (int i = 0; i < argument_size; ++i) { | 795 | 126 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 796 | 126 | } | 797 | 42 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 798 | 3 | *block.get_by_position(arguments[0]).column) | 799 | 3 | .convert_to_full_column() | 800 | 42 | : block.get_by_position(arguments[0]).column; | 801 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 802 | | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, | 803 | | arguments); | 804 | 42 | } else { | 805 | 42 | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, | 806 | 42 | arguments); | 807 | 42 | } | 808 | | | 809 | | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 810 | | if (col_const[1]) { | 811 | | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 812 | | result_data, result_offset, | 813 | | result_null_map->get_data()); | 814 | | } else { | 815 | | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 816 | | result_offset, result_null_map->get_data()); | 817 | | } | 818 | 42 | } else { | 819 | 42 | if (col_const[1] && col_const[2]) { | 820 | 1 | 
Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 821 | 1 | result_data, result_offset, | 822 | 1 | result_null_map->get_data()); | 823 | 41 | } else { | 824 | 41 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 825 | 41 | result_offset, result_null_map->get_data()); | 826 | 41 | } | 827 | 42 | } | 828 | | | 829 | 42 | block.get_by_position(result).column = | 830 | 42 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 831 | 42 | return Status::OK(); | 832 | 42 | } |
_ZNK5doris27FunctionRegexpFunctionalityINS_20RegexpExtractAllImplEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 783 | 43 | uint32_t result, size_t input_rows_count) const override { | 784 | 43 | size_t argument_size = arguments.size(); | 785 | | | 786 | 43 | auto result_null_map = ColumnUInt8::create(input_rows_count, 0); | 787 | 43 | auto result_data_column = ColumnString::create(); | 788 | 43 | auto& result_data = result_data_column->get_chars(); | 789 | 43 | auto& result_offset = result_data_column->get_offsets(); | 790 | 43 | result_offset.resize(input_rows_count); | 791 | | | 792 | 43 | bool col_const[3]; | 793 | 43 | ColumnPtr argument_columns[3]; | 794 | 129 | for (int i = 0; i < argument_size; ++i) { | 795 | 86 | col_const[i] = is_column_const(*block.get_by_position(arguments[i]).column); | 796 | 86 | } | 797 | 43 | argument_columns[0] = col_const[0] ? static_cast<const ColumnConst&>( | 798 | 1 | *block.get_by_position(arguments[0]).column) | 799 | 1 | .convert_to_full_column() | 800 | 43 | : block.get_by_position(arguments[0]).column; | 801 | 43 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 802 | 43 | default_preprocess_parameter_columns(argument_columns, col_const, {1}, block, | 803 | 43 | arguments); | 804 | | } else { | 805 | | default_preprocess_parameter_columns(argument_columns, col_const, {1, 2}, block, | 806 | | arguments); | 807 | | } | 808 | | | 809 | 43 | if constexpr (std::is_same_v<Impl, RegexpExtractAllImpl>) { | 810 | 43 | if (col_const[1]) { | 811 | 8 | Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 812 | 8 | result_data, result_offset, | 813 | 8 | result_null_map->get_data()); | 814 | 35 | } else { | 815 | 35 | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 816 | 35 | result_offset, result_null_map->get_data()); | 817 | 35 | } | 818 | | } else { | 819 | | if (col_const[1] && col_const[2]) { | 820 | | 
Impl::execute_impl_const_args(context, argument_columns, input_rows_count, | 821 | | result_data, result_offset, | 822 | | result_null_map->get_data()); | 823 | | } else { | 824 | | Impl::execute_impl(context, argument_columns, input_rows_count, result_data, | 825 | | result_offset, result_null_map->get_data()); | 826 | | } | 827 | | } | 828 | | | 829 | 43 | block.get_by_position(result).column = | 830 | 43 | ColumnNullable::create(std::move(result_data_column), std::move(result_null_map)); | 831 | 43 | return Status::OK(); | 832 | 43 | } |
|
833 | | }; |
834 | | |
835 | 8 | void register_function_regexp_extract(SimpleFunctionFactory& factory) { |
836 | 8 | factory.register_function<FunctionRegexpReplace<RegexpReplaceImpl, ThreeParamTypes>>(); |
837 | 8 | factory.register_function<FunctionRegexpReplace<RegexpReplaceImpl, FourParamTypes>>(); |
838 | 8 | factory.register_function<FunctionRegexpReplace<RegexpReplaceOneImpl, ThreeParamTypes>>(); |
839 | 8 | factory.register_function<FunctionRegexpReplace<RegexpReplaceOneImpl, FourParamTypes>>(); |
840 | 8 | factory.register_function<FunctionRegexpFunctionality<RegexpExtractImpl<true>>>(); |
841 | 8 | factory.register_function<FunctionRegexpFunctionality<RegexpExtractImpl<false>>>(); |
842 | 8 | factory.register_function<FunctionRegexpFunctionality<RegexpExtractAllImpl>>(); |
843 | 8 | factory.register_function<FunctionRegexpCount>(); |
844 | 8 | } |
845 | | |
846 | | } // namespace doris |