be/src/exprs/function/function_split_by_regexp.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <fmt/format.h> |
19 | | #include <glog/logging.h> |
20 | | |
21 | | #include "common/status.h" |
22 | | #include "core/column/column_array.h" |
23 | | #include "core/column/column_const.h" |
24 | | #include "core/data_type/data_type_array.h" |
25 | | #include "core/data_type/data_type_number.h" |
26 | | #include "core/data_type/data_type_string.h" |
27 | | #include "core/types.h" |
28 | | #include "exprs/function/function.h" |
29 | | #include "exprs/function/function_string.h" |
30 | | #include "exprs/function/simple_function_factory.h" |
31 | | |
32 | | namespace doris { |
33 | | #include "common/compile_check_begin.h" |
34 | | |
// Location of one regex group inside the subject string handed to
// RegexpSplit::match().
struct Match {
    // Byte offset of the group relative to the start of the subject;
    // RegexpSplit::match() stores std::string::npos for an empty group.
    std::string::size_type offset;
    // Length of the group in bytes.
    std::string::size_type length;
};
39 | | |
40 | | class RegexpSplit { |
41 | | public: |
42 | | void init(re2::RE2* re2, int32_t max_splits); |
43 | | void set(const char* pos, const char* end); |
44 | | bool get(const char*& token_begin, const char*& token_end); |
45 | | |
46 | | private: |
47 | | const char* _pos; |
48 | | const char* _end; |
49 | | |
50 | | std::int32_t _max_splits = 0; |
51 | | std::vector<Match> _matches; |
52 | | int32_t _splits; |
53 | | re2::RE2* _re2 = nullptr; |
54 | | unsigned _number_of_subpatterns = 0; |
55 | | |
56 | | unsigned match(const char* subject, size_t subject_size, std::vector<Match>& matches, |
57 | | unsigned limit) const; |
58 | | }; |
59 | | |
60 | | unsigned RegexpSplit::match(const char* subject, size_t subject_size, std::vector<Match>& matches, |
61 | 0 | unsigned limit) const { |
62 | 0 | matches.clear(); |
63 | |
|
64 | 0 | if (limit == 0) { |
65 | 0 | return 0; |
66 | 0 | } |
67 | | |
68 | 0 | limit = std::min(limit, _number_of_subpatterns + 1); |
69 | 0 | std::vector<re2::StringPiece> pieces(limit); |
70 | |
|
71 | 0 | if (!_re2->Match({subject, subject_size}, 0, subject_size, re2::RE2::UNANCHORED, pieces.data(), |
72 | 0 | limit)) { |
73 | 0 | return 0; |
74 | 0 | } else { |
75 | 0 | matches.resize(limit); |
76 | 0 | for (size_t i = 0; i < limit; ++i) { |
77 | 0 | if (pieces[i].empty()) { |
78 | 0 | matches[i].offset = std::string::npos; |
79 | 0 | matches[i].length = 0; |
80 | 0 | } else { |
81 | 0 | matches[i].offset = pieces[i].data() - subject; |
82 | 0 | matches[i].length = pieces[i].length(); |
83 | 0 | } |
84 | 0 | } |
85 | 0 | return limit; |
86 | 0 | } |
87 | 0 | } |
88 | | |
89 | 0 | void RegexpSplit::init(re2::RE2* re2, int32_t max_splits) { |
90 | 0 | _max_splits = max_splits; |
91 | 0 | _re2 = re2; |
92 | 0 | if (_re2) { |
93 | 0 | _number_of_subpatterns = _re2->NumberOfCapturingGroups(); |
94 | 0 | } |
95 | 0 | } |
96 | | |
97 | | // Called for each next string. |
98 | 0 | void RegexpSplit::set(const char* pos, const char* end) { |
99 | 0 | _pos = pos; |
100 | 0 | _end = end; |
101 | 0 | _splits = 0; |
102 | 0 | } |
103 | | |
104 | | // Get the next token, if any, or return false. |
105 | 0 | bool RegexpSplit::get(const char*& token_begin, const char*& token_end) { |
106 | 0 | if (!_re2) { |
107 | 0 | if (_pos == _end) { |
108 | 0 | return false; |
109 | 0 | } |
110 | | |
111 | 0 | token_begin = _pos; |
112 | 0 | if (_max_splits != -1) { |
113 | 0 | if (_splits == _max_splits - 1) { |
114 | 0 | token_end = _end; |
115 | 0 | _pos = _end; |
116 | 0 | return true; |
117 | 0 | } |
118 | 0 | } |
119 | | |
120 | 0 | _pos += 1; |
121 | 0 | token_end = _pos; |
122 | 0 | ++_splits; |
123 | 0 | } else { |
124 | 0 | if (!_pos || _pos > _end) { |
125 | 0 | return false; |
126 | 0 | } |
127 | | |
128 | 0 | token_begin = _pos; |
129 | 0 | if (_max_splits != -1) { |
130 | 0 | if (_splits == _max_splits - 1) { |
131 | 0 | token_end = _end; |
132 | 0 | _pos = nullptr; |
133 | 0 | return true; |
134 | 0 | } |
135 | 0 | } |
136 | | |
137 | 0 | if (!match(_pos, _end - _pos, _matches, _number_of_subpatterns + 1) || |
138 | 0 | !_matches[0].length) { |
139 | 0 | token_end = _end; |
140 | 0 | _pos = _end + 1; |
141 | 0 | } else { |
142 | 0 | token_end = _pos + _matches[0].offset; |
143 | 0 | _pos = token_end + _matches[0].length; |
144 | 0 | ++_splits; |
145 | 0 | } |
146 | 0 | } |
147 | | |
148 | 0 | return true; |
149 | 0 | } |
150 | | |
151 | | template <typename Impl> |
152 | | class SplitByRegexp : public IFunction { |
153 | | public: |
154 | | static constexpr auto name = "split_by_regexp"; |
155 | | |
156 | 4 | static FunctionPtr create() { return std::make_shared<SplitByRegexp>(); }_ZN5doris13SplitByRegexpINS_15TwoArgumentImplEE6createEv Line | Count | Source | 156 | 2 | static FunctionPtr create() { return std::make_shared<SplitByRegexp>(); } |
_ZN5doris13SplitByRegexpINS_17ThreeArgumentImplEE6createEv Line | Count | Source | 156 | 2 | static FunctionPtr create() { return std::make_shared<SplitByRegexp>(); } |
|
157 | | |
158 | 0 | String get_name() const override { return name; }Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE8get_nameB5cxx11Ev Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE8get_nameB5cxx11Ev |
159 | | |
160 | 0 | size_t get_number_of_arguments() const override { |
161 | 0 | return get_variadic_argument_types_impl().size(); |
162 | 0 | } Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE23get_number_of_argumentsEv Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE23get_number_of_argumentsEv |
163 | | |
164 | 2 | bool is_variadic() const override { return true; }_ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE11is_variadicEv Line | Count | Source | 164 | 1 | bool is_variadic() const override { return true; } |
_ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE11is_variadicEv Line | Count | Source | 164 | 1 | bool is_variadic() const override { return true; } |
|
165 | | |
166 | 2 | DataTypes get_variadic_argument_types_impl() const override { |
167 | 2 | return Impl::get_variadic_argument_types(); |
168 | 2 | } _ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE32get_variadic_argument_types_implEv Line | Count | Source | 166 | 1 | DataTypes get_variadic_argument_types_impl() const override { | 167 | 1 | return Impl::get_variadic_argument_types(); | 168 | 1 | } |
_ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE32get_variadic_argument_types_implEv Line | Count | Source | 166 | 1 | DataTypes get_variadic_argument_types_impl() const override { | 167 | 1 | return Impl::get_variadic_argument_types(); | 168 | 1 | } |
|
169 | | |
170 | 0 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
171 | 0 | DCHECK(is_string_type(arguments[0]->get_primitive_type())) |
172 | 0 | << "first argument for function: " << name << " should be string" |
173 | 0 | << " and arguments[0] is " << arguments[0]->get_name(); |
174 | 0 | DCHECK(is_string_type(arguments[1]->get_primitive_type())) |
175 | 0 | << "second argument for function: " << name << " should be string" |
176 | 0 | << " and arguments[1] is " << arguments[1]->get_name(); |
177 | 0 | auto nullable_string_type = make_nullable(std::make_shared<DataTypeString>()); |
178 | 0 | return std::make_shared<DataTypeArray>(nullable_string_type); |
179 | 0 | } Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE |
180 | | |
181 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
182 | 0 | uint32_t result, size_t input_rows_count) const override { |
183 | 0 | return Impl::execute_impl(context, block, arguments, result, input_rows_count); |
184 | 0 | } Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_15TwoArgumentImplEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Unexecuted instantiation: _ZNK5doris13SplitByRegexpINS_17ThreeArgumentImplEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm |
185 | | }; |
186 | | |
187 | | struct ExecuteImpl { |
188 | | using NullMapType = PaddedPODArray<UInt8>; |
189 | | static Status execute_impl(FunctionContext* context, Block& block, |
190 | | const ColumnNumbers& arguments, uint32_t result, |
191 | 0 | size_t input_rows_count) { |
192 | 0 | const auto& [first_column, left_const] = |
193 | 0 | unpack_if_const(block.get_by_position(arguments[0]).column); |
194 | 0 | const auto& [second_column, right_const] = |
195 | 0 | unpack_if_const(block.get_by_position(arguments[1]).column); |
196 | 0 | const auto& [three_column, three_is_const] = |
197 | 0 | unpack_if_const(block.get_by_position(arguments[2]).column); |
198 | 0 | auto limit_value = assert_cast<const ColumnInt32&>(*three_column).get_element(0); |
199 | 0 | const auto& src_column = assert_cast<const ColumnString&>(*first_column); |
200 | 0 | const auto& pattern_column = assert_cast<const ColumnString&>(*second_column); |
201 | |
|
202 | 0 | auto nullable_string_type = make_nullable(std::make_shared<DataTypeString>()); |
203 | 0 | auto dest_column_ptr = ColumnArray::create(nullable_string_type->create_column(), |
204 | 0 | ColumnArray::ColumnOffsets::create()); |
205 | 0 | IColumn* dest_nested_column = &dest_column_ptr->get_data(); |
206 | 0 | auto& dest_offsets = dest_column_ptr->get_offsets(); |
207 | 0 | DCHECK(dest_nested_column != nullptr); |
208 | |
|
209 | 0 | NullMapType* dest_nested_null_map = nullptr; |
210 | 0 | auto* dest_nullable_col = assert_cast<ColumnNullable*>(dest_nested_column); |
211 | 0 | auto& dest_column_string = |
212 | 0 | assert_cast<ColumnString&>(*(dest_nullable_col->get_nested_column_ptr())); |
213 | 0 | dest_nested_null_map = &dest_nullable_col->get_null_map_column().get_data(); |
214 | 0 | RE2::Options opts; |
215 | 0 | opts.set_never_nl(false); |
216 | 0 | opts.set_dot_nl(true); |
217 | | // split_by_regexp(ColumnString, "xxx") |
218 | 0 | if (right_const) { |
219 | 0 | RETURN_IF_ERROR(_execute_constant_pattern( |
220 | 0 | src_column, pattern_column.get_data_at(0), dest_column_string, dest_offsets, |
221 | 0 | dest_nested_null_map, limit_value, input_rows_count, &opts)); |
222 | 0 | } else if (left_const) { |
223 | | // split_by_regexp("xxx", ColumnString) |
224 | 0 | _execute_constant_src_string(src_column.get_data_at(0), pattern_column, |
225 | 0 | dest_column_string, dest_offsets, dest_nested_null_map, |
226 | 0 | limit_value, input_rows_count, &opts); |
227 | 0 | } else { |
228 | | // split_by_regexp(ColumnString, ColumnString) |
229 | 0 | _execute_vector_vector(src_column, pattern_column, dest_column_string, dest_offsets, |
230 | 0 | dest_nested_null_map, limit_value, input_rows_count, &opts); |
231 | 0 | } |
232 | | |
233 | 0 | block.replace_by_position(result, std::move(dest_column_ptr)); |
234 | 0 | return Status::OK(); |
235 | 0 | } |
236 | | |
237 | | private: |
238 | | static Status _execute_constant_pattern(const ColumnString& src_column_string, |
239 | | const StringRef& pattern_ref, |
240 | | ColumnString& dest_column_string, |
241 | | ColumnArray::Offsets64& dest_offsets, |
242 | | NullMapType* dest_nested_null_map, Int32 limit_value, |
243 | 0 | size_t input_rows_count, RE2::Options* opts) { |
244 | 0 | const char* token_begin = nullptr; |
245 | 0 | const char* token_end = nullptr; |
246 | 0 | UInt64 index = 0; |
247 | 0 | std::unique_ptr<re2::RE2> re2_ptr = nullptr; |
248 | 0 | if (pattern_ref.size) { |
249 | 0 | re2_ptr = std::make_unique<re2::RE2>(pattern_ref.to_string_view(), *opts); |
250 | 0 | } |
251 | 0 | RegexpSplit RegexpSplit; |
252 | 0 | RegexpSplit.init(re2_ptr.get(), limit_value); |
253 | 0 | for (int row = 0; row < input_rows_count; ++row) { |
254 | 0 | auto str_data = src_column_string.get_data_at(row); |
255 | 0 | RegexpSplit.set(str_data.begin(), str_data.end()); |
256 | 0 | while (RegexpSplit.get(token_begin, token_end)) { |
257 | 0 | size_t token_size = token_end - token_begin; |
258 | 0 | dest_column_string.insert_data(token_begin, token_size); |
259 | 0 | dest_nested_null_map->push_back(false); |
260 | 0 | index += 1; |
261 | 0 | } |
262 | 0 | dest_offsets.push_back(index); |
263 | 0 | } |
264 | 0 | return Status::OK(); |
265 | 0 | } |
266 | | |
267 | | static void _execute_constant_src_string(const StringRef& str_ref, |
268 | | const ColumnString& pattern_column, |
269 | | ColumnString& dest_column_string, |
270 | | ColumnArray::Offsets64& dest_offsets, |
271 | | NullMapType* dest_nested_null_map, Int32 limit_value, |
272 | 0 | size_t input_rows_count, RE2::Options* opts) { |
273 | 0 | const char* token_begin = nullptr; |
274 | 0 | const char* token_end = nullptr; |
275 | 0 | UInt64 index = 0; |
276 | 0 | RegexpSplit RegexpSplit; |
277 | |
|
278 | 0 | for (int row = 0; row < input_rows_count; ++row) { |
279 | 0 | std::unique_ptr<re2::RE2> re2_ptr = nullptr; |
280 | 0 | auto pattern = pattern_column.get_data_at(row); |
281 | 0 | if (pattern.size) { |
282 | 0 | re2_ptr = std::make_unique<re2::RE2>(pattern.to_string_view(), *opts); |
283 | 0 | if (!re2_ptr->ok()) { |
284 | 0 | dest_column_string.insert_default(); |
285 | 0 | dest_nested_null_map->push_back(true); |
286 | 0 | index += 1; |
287 | 0 | dest_offsets.push_back(index); |
288 | 0 | continue; |
289 | 0 | } |
290 | 0 | } |
291 | | |
292 | 0 | RegexpSplit.init(re2_ptr.get(), limit_value); |
293 | 0 | RegexpSplit.set(str_ref.begin(), str_ref.end()); |
294 | 0 | while (RegexpSplit.get(token_begin, token_end)) { |
295 | 0 | size_t token_size = token_end - token_begin; |
296 | 0 | dest_column_string.insert_data(token_begin, token_size); |
297 | 0 | dest_nested_null_map->push_back(false); |
298 | 0 | index += 1; |
299 | 0 | } |
300 | 0 | dest_offsets.push_back(index); |
301 | 0 | } |
302 | 0 | } |
303 | | |
304 | | static void _execute_vector_vector(const ColumnString& src_column_string, |
305 | | const ColumnString& pattern_column, |
306 | | ColumnString& dest_column_string, |
307 | | ColumnArray::Offsets64& dest_offsets, |
308 | | NullMapType* dest_nested_null_map, Int32 limit_value, |
309 | 0 | size_t input_rows_count, RE2::Options* opts) { |
310 | 0 | const char* token_begin = nullptr; |
311 | 0 | const char* token_end = nullptr; |
312 | 0 | UInt64 index = 0; |
313 | 0 | RegexpSplit RegexpSplit; |
314 | |
|
315 | 0 | for (int row = 0; row < input_rows_count; ++row) { |
316 | 0 | std::unique_ptr<re2::RE2> re2_ptr = nullptr; |
317 | 0 | auto str_data = src_column_string.get_data_at(row); |
318 | 0 | auto pattern = pattern_column.get_data_at(row); |
319 | 0 | if (pattern.size) { |
320 | 0 | re2_ptr = std::make_unique<re2::RE2>(pattern.to_string_view(), *opts); |
321 | 0 | if (!re2_ptr->ok()) { |
322 | 0 | dest_column_string.insert_default(); |
323 | 0 | dest_nested_null_map->push_back(true); |
324 | 0 | index += 1; |
325 | 0 | dest_offsets.push_back(index); |
326 | 0 | continue; |
327 | 0 | } |
328 | 0 | } |
329 | 0 | RegexpSplit.init(re2_ptr.get(), limit_value); |
330 | 0 | RegexpSplit.set(str_data.begin(), str_data.end()); |
331 | 0 | while (RegexpSplit.get(token_begin, token_end)) { |
332 | 0 | size_t token_size = token_end - token_begin; |
333 | 0 | dest_column_string.insert_data(token_begin, token_size); |
334 | 0 | dest_nested_null_map->push_back(false); |
335 | 0 | index += 1; |
336 | 0 | } |
337 | 0 | dest_offsets.push_back(index); |
338 | 0 | } |
339 | 0 | } |
340 | | }; |
341 | | |
342 | | struct TwoArgumentImpl { |
343 | 1 | static DataTypes get_variadic_argument_types() { |
344 | 1 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>()}; |
345 | 1 | } |
346 | | |
347 | | static Status execute_impl(FunctionContext* context, Block& block, |
348 | | const ColumnNumbers& arguments, uint32_t result, |
349 | 0 | size_t input_rows_count) { |
350 | 0 | DCHECK_EQ(arguments.size(), 2); |
351 | 0 | auto max_limit = ColumnConst::create(ColumnInt32::create(1, -1), input_rows_count); |
352 | 0 | block.insert({std::move(max_limit), std::make_shared<DataTypeInt32>(), "max_limit"}); |
353 | 0 | ColumnNumbers temp_arguments = {arguments[0], arguments[1], block.columns() - 1}; |
354 | 0 | return ExecuteImpl::execute_impl(context, block, temp_arguments, result, input_rows_count); |
355 | 0 | } |
356 | | }; |
357 | | |
358 | | struct ThreeArgumentImpl { |
359 | 1 | static DataTypes get_variadic_argument_types() { |
360 | 1 | return {std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>(), |
361 | 1 | std::make_shared<DataTypeInt32>()}; |
362 | 1 | } |
363 | | static Status execute_impl(FunctionContext* context, Block& block, |
364 | | const ColumnNumbers& arguments, uint32_t result, |
365 | 0 | size_t input_rows_count) { |
366 | 0 | DCHECK_EQ(arguments.size(), 3); |
367 | 0 | return ExecuteImpl::execute_impl(context, block, arguments, result, input_rows_count); |
368 | 0 | } |
369 | | }; |
370 | | |
371 | 1 | void register_function_split_by_regexp(SimpleFunctionFactory& factory) { |
372 | 1 | factory.register_function<SplitByRegexp<TwoArgumentImpl>>(); |
373 | 1 | factory.register_function<SplitByRegexp<ThreeArgumentImpl>>(); |
374 | 1 | } |
375 | | |
376 | | } // namespace doris |