be/src/exprs/function/like.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "exprs/function/like.h" |
19 | | |
20 | | #include <fmt/format.h> |
21 | | #include <hs/hs_compile.h> |
22 | | #include <re2/stringpiece.h> |
23 | | |
24 | | #include <cstddef> |
25 | | #include <ostream> |
26 | | #include <utility> |
27 | | #include <vector> |
28 | | |
29 | | #include "common/logging.h" |
30 | | #include "core/block/block.h" |
31 | | #include "core/block/column_with_type_and_name.h" |
32 | | #include "core/column/column.h" |
33 | | #include "core/column/column_const.h" |
34 | | #include "core/column/column_vector.h" |
35 | | #include "core/string_ref.h" |
36 | | #include "exprs/function/simple_function_factory.h" |
37 | | |
38 | | namespace doris { |
39 | | #include "common/compile_check_begin.h" |
40 | | // A regex to match any regex pattern which is equivalent to a substring search. |
41 | | static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)"); |
42 | | |
43 | | // A regex to match any regex pattern which is equivalent to matching a constant string |
44 | | // at the end of the string values. |
45 | | static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)"); |
46 | | |
47 | | // A regex to match any regex pattern which is equivalent to matching a constant string |
48 | | // at the start of the string values. |
49 | | static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)"); |
50 | | |
51 | | // A regex to match any regex pattern which is equivalent to a constant string match. |
52 | | static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)"); |
53 | | // A regex to match any regex pattern that consists solely of one or more `.*` groups. |
54 | | static const RE2 ALLPASS_RE(R"((\.\*)+)"); |
55 | | |
56 | | // LIKE patterns: the counterparts of the regexes above for SQL LIKE syntax ('%' wildcard runs). |
57 | | static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))"); |
58 | | static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)"); |
59 | | static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))"); |
60 | | static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)"); |
61 | | static const re2::RE2 LIKE_ALLPASS_RE("%+"); |
62 | | |
63 | | struct VectorAllpassSearchState : public VectorPatternSearchState { |
64 | 451 | VectorAllpassSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_allpass_fn) {} |
65 | | |
66 | | ~VectorAllpassSearchState() override = default; |
67 | | |
68 | 424 | void like_pattern_match(const std::string& pattern_str) override { |
69 | 424 | if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) { |
70 | 26 | _search_strings->insert_default(); |
71 | 398 | } else { |
72 | 398 | _pattern_matched = false; |
73 | 398 | } |
74 | 424 | } |
75 | | |
76 | 32 | void regexp_pattern_match(const std::string& pattern_str) override { |
77 | 32 | if (RE2::FullMatch(pattern_str, ALLPASS_RE)) { |
78 | 0 | _search_strings->insert_default(); |
79 | 32 | } else { |
80 | 32 | _pattern_matched = false; |
81 | 32 | } |
82 | 32 | } |
83 | | }; |
84 | | |
85 | | struct VectorEqualSearchState : public VectorPatternSearchState { |
86 | 451 | VectorEqualSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_equals_fn) {} |
87 | | |
88 | | ~VectorEqualSearchState() override = default; |
89 | | |
90 | 450 | void like_pattern_match(const std::string& pattern_str) override { |
91 | 450 | _search_string.clear(); |
92 | 450 | if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &_search_string)) { |
93 | 119 | FunctionLike::remove_escape_character(&_search_string); |
94 | 119 | _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
95 | 331 | } else { |
96 | 331 | _pattern_matched = false; |
97 | 331 | } |
98 | 450 | } |
99 | | |
100 | 32 | void regexp_pattern_match(const std::string& pattern_str) override { |
101 | 32 | _search_string.clear(); |
102 | 32 | if (RE2::FullMatch(pattern_str, EQUALS_RE, &_search_string)) { |
103 | 0 | _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
104 | 32 | } else { |
105 | 32 | _pattern_matched = false; |
106 | 32 | } |
107 | 32 | } |
108 | | }; |
109 | | |
110 | | struct VectorSubStringSearchState : public VectorPatternSearchState { |
111 | | VectorSubStringSearchState() |
112 | 451 | : VectorPatternSearchState(FunctionLikeBase::vector_substring_fn) {} |
113 | | |
114 | | ~VectorSubStringSearchState() override = default; |
115 | | |
116 | 442 | void like_pattern_match(const std::string& pattern_str) override { |
117 | 442 | _search_string.clear(); |
118 | 442 | if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &_search_string)) { |
119 | 53 | FunctionLike::remove_escape_character(&_search_string); |
120 | 53 | _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
121 | 389 | } else { |
122 | 389 | _pattern_matched = false; |
123 | 389 | } |
124 | 442 | } |
125 | | |
126 | 55 | void regexp_pattern_match(const std::string& pattern_str) override { |
127 | 55 | _search_string.clear(); |
128 | 55 | if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &_search_string)) { |
129 | 34 | _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
130 | 34 | } else { |
131 | 21 | _pattern_matched = false; |
132 | 21 | } |
133 | 55 | } |
134 | | }; |
135 | | |
136 | | struct VectorStartsWithSearchState : public VectorPatternSearchState { |
137 | | VectorStartsWithSearchState() |
138 | 451 | : VectorPatternSearchState(FunctionLikeBase::vector_starts_with_fn) {} |
139 | | |
140 | | ~VectorStartsWithSearchState() override = default; |
141 | | |
142 | 431 | void like_pattern_match(const std::string& pattern_str) override { |
143 | 431 | _search_string.clear(); |
144 | 431 | if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &_search_string)) { |
145 | 83 | FunctionLike::remove_escape_character(&_search_string); |
146 | 83 | _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
147 | 348 | } else { |
148 | 348 | _pattern_matched = false; |
149 | 348 | } |
150 | 431 | } |
151 | | |
152 | 50 | void regexp_pattern_match(const std::string& pattern_str) override { |
153 | 50 | _search_string.clear(); |
154 | 50 | if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &_search_string)) { |
155 | 23 | _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
156 | 27 | } else { |
157 | 27 | _pattern_matched = false; |
158 | 27 | } |
159 | 50 | } |
160 | | }; |
161 | | |
162 | | struct VectorEndsWithSearchState : public VectorPatternSearchState { |
163 | 451 | VectorEndsWithSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_ends_with_fn) {} |
164 | | |
165 | | ~VectorEndsWithSearchState() override = default; |
166 | | |
167 | 430 | void like_pattern_match(const std::string& pattern_str) override { |
168 | 430 | _search_string.clear(); |
169 | 430 | if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &_search_string)) { |
170 | 42 | FunctionLike::remove_escape_character(&_search_string); |
171 | 42 | _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
172 | 388 | } else { |
173 | 388 | _pattern_matched = false; |
174 | 388 | } |
175 | 430 | } |
176 | | |
177 | 38 | void regexp_pattern_match(const std::string& pattern_str) override { |
178 | 38 | _search_string.clear(); |
179 | 38 | if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &_search_string)) { |
180 | 9 | _search_strings->insert_data(_search_string.c_str(), _search_string.size()); |
181 | 29 | } else { |
182 | 29 | _pattern_matched = false; |
183 | 29 | } |
184 | 38 | } |
185 | | }; |
186 | | |
187 | 419 | Status LikeSearchState::clone(LikeSearchState& cloned) { |
188 | 419 | cloned.set_search_string(search_string); |
189 | | |
190 | 419 | std::string re_pattern; |
191 | 419 | FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern); |
192 | 419 | if (hs_database) { // use hyperscan |
193 | 159 | hs_database_t* database = nullptr; |
194 | 159 | hs_scratch_t* scratch = nullptr; |
195 | 159 | RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch)); |
196 | | |
197 | 159 | cloned.hs_database.reset(database); |
198 | 159 | cloned.hs_scratch.reset(scratch); |
199 | 260 | } else { // fallback to re2 |
200 | 260 | cloned.hs_database.reset(); |
201 | 260 | cloned.hs_scratch.reset(); |
202 | | |
203 | 260 | RE2::Options opts; |
204 | 260 | opts.set_never_nl(false); |
205 | 260 | opts.set_dot_nl(true); |
206 | 260 | cloned.regex = std::make_unique<RE2>(re_pattern, opts); |
207 | 260 | if (!cloned.regex->ok()) { |
208 | 0 | return Status::InternalError("Invalid regex expression: {}", re_pattern); |
209 | 0 | } |
210 | 260 | } |
211 | | |
212 | 419 | return Status::OK(); |
213 | 419 | } |
214 | | |
215 | | Status FunctionLikeBase::constant_allpass_fn(const LikeSearchState* state, const ColumnString& vals, |
216 | | const StringRef& pattern, |
217 | 49 | ColumnUInt8::Container& result) { |
218 | 49 | memset(result.data(), 1, vals.size()); |
219 | 49 | return Status::OK(); |
220 | 49 | } |
221 | | |
222 | | Status FunctionLikeBase::constant_allpass_fn_scalar(const LikeSearchState* state, |
223 | | const StringRef& val, const StringRef& pattern, |
224 | 72 | unsigned char* result) { |
225 | 72 | *result = 1; |
226 | 72 | return Status::OK(); |
227 | 72 | } |
228 | | |
229 | | Status FunctionLikeBase::vector_allpass_fn(const ColumnString& vals, |
230 | | const ColumnString& search_strings, |
231 | 21 | ColumnUInt8::Container& result) { |
232 | 21 | DCHECK(vals.size() == search_strings.size()); |
233 | 21 | DCHECK(vals.size() == result.size()); |
234 | 21 | memset(result.data(), 1, vals.size()); |
235 | 21 | return Status::OK(); |
236 | 21 | } |
237 | | |
238 | | Status FunctionLikeBase::constant_starts_with_fn(const LikeSearchState* state, |
239 | | const ColumnString& val, const StringRef& pattern, |
240 | 273 | ColumnUInt8::Container& result) { |
241 | 273 | auto sz = val.size(); |
242 | 68.5k | for (size_t i = 0; i < sz; i++) { |
243 | 68.2k | const auto& str_ref = val.get_data_at(i); |
244 | 68.2k | result[i] = (str_ref.size >= state->search_string_sv.size) && |
245 | 68.3k | str_ref.start_with(state->search_string_sv); |
246 | 68.2k | } |
247 | 273 | return Status::OK(); |
248 | 273 | } |
249 | | |
250 | | Status FunctionLikeBase::constant_starts_with_fn_scalar(const LikeSearchState* state, |
251 | | const StringRef& val, |
252 | | const StringRef& pattern, |
253 | 192k | unsigned char* result) { |
254 | 192k | *result = (val.size >= state->search_string_sv.size) && |
255 | 193k | (state->search_string_sv == val.substring(0, state->search_string_sv.size)); |
256 | 192k | return Status::OK(); |
257 | 192k | } |
258 | | |
259 | | Status FunctionLikeBase::vector_starts_with_fn(const ColumnString& vals, |
260 | | const ColumnString& search_strings, |
261 | 76 | ColumnUInt8::Container& result) { |
262 | 76 | DCHECK(vals.size() == search_strings.size()); |
263 | 76 | DCHECK(vals.size() == result.size()); |
264 | 76 | auto sz = vals.size(); |
265 | 174 | for (size_t i = 0; i < sz; ++i) { |
266 | 98 | const auto& str_sv = vals.get_data_at(i); |
267 | 98 | const auto& search_string_sv = search_strings.get_data_at(i); |
268 | 98 | result[i] = (str_sv.size >= search_string_sv.size) && str_sv.start_with(search_string_sv); |
269 | 98 | } |
270 | 76 | return Status::OK(); |
271 | 76 | } |
272 | | |
273 | | Status FunctionLikeBase::constant_ends_with_fn(const LikeSearchState* state, |
274 | | const ColumnString& val, const StringRef& pattern, |
275 | 129 | ColumnUInt8::Container& result) { |
276 | 129 | auto sz = val.size(); |
277 | 5.21k | for (size_t i = 0; i < sz; i++) { |
278 | 5.09k | const auto& str_ref = val.get_data_at(i); |
279 | 5.09k | result[i] = (str_ref.size >= state->search_string_sv.size) && |
280 | 5.09k | str_ref.end_with(state->search_string_sv); |
281 | 5.09k | } |
282 | 129 | return Status::OK(); |
283 | 129 | } |
284 | | |
285 | | Status FunctionLikeBase::constant_ends_with_fn_scalar(const LikeSearchState* state, |
286 | | const StringRef& val, |
287 | | const StringRef& pattern, |
288 | 4.38k | unsigned char* result) { |
289 | 4.38k | *result = (val.size >= state->search_string_sv.size) && |
290 | 4.38k | (state->search_string_sv == val.substring(val.size - state->search_string_sv.size, |
291 | 4.37k | state->search_string_sv.size)); |
292 | 4.38k | return Status::OK(); |
293 | 4.38k | } |
294 | | |
295 | | Status FunctionLikeBase::vector_ends_with_fn(const ColumnString& vals, |
296 | | const ColumnString& search_strings, |
297 | 34 | ColumnUInt8::Container& result) { |
298 | 34 | DCHECK(vals.size() == search_strings.size()); |
299 | 34 | DCHECK(vals.size() == result.size()); |
300 | 34 | auto sz = vals.size(); |
301 | 78 | for (size_t i = 0; i < sz; ++i) { |
302 | 44 | const auto& str_sv = vals.get_data_at(i); |
303 | 44 | const auto& search_string_sv = search_strings.get_data_at(i); |
304 | 44 | result[i] = (str_sv.size >= search_string_sv.size) && str_sv.end_with(search_string_sv); |
305 | 44 | } |
306 | 34 | return Status::OK(); |
307 | 34 | } |
308 | | |
309 | | Status FunctionLikeBase::constant_equals_fn(const LikeSearchState* state, const ColumnString& val, |
310 | | const StringRef& pattern, |
311 | 50 | ColumnUInt8::Container& result) { |
312 | 50 | auto sz = val.size(); |
313 | 127 | for (size_t i = 0; i < sz; i++) { |
314 | 77 | result[i] = (val.get_data_at(i) == state->search_string_sv); |
315 | 77 | } |
316 | 50 | return Status::OK(); |
317 | 50 | } |
318 | | |
319 | | Status FunctionLikeBase::constant_equals_fn_scalar(const LikeSearchState* state, |
320 | | const StringRef& val, const StringRef& pattern, |
321 | 150 | unsigned char* result) { |
322 | 150 | *result = (val == state->search_string_sv); |
323 | 150 | return Status::OK(); |
324 | 150 | } |
325 | | |
326 | | Status FunctionLikeBase::vector_equals_fn(const ColumnString& vals, |
327 | | const ColumnString& search_strings, |
328 | 88 | ColumnUInt8::Container& result) { |
329 | 88 | DCHECK(vals.size() == search_strings.size()); |
330 | 88 | DCHECK(vals.size() == result.size()); |
331 | 88 | auto sz = vals.size(); |
332 | 197 | for (size_t i = 0; i < sz; ++i) { |
333 | 109 | const auto& str_sv = vals.get_data_at(i); |
334 | 109 | const auto& search_string_sv = search_strings.get_data_at(i); |
335 | 109 | result[i] = str_sv == search_string_sv; |
336 | 109 | } |
337 | 88 | return Status::OK(); |
338 | 88 | } |
339 | | |
340 | | Status FunctionLikeBase::constant_substring_fn(const LikeSearchState* state, |
341 | | const ColumnString& val, const StringRef& pattern, |
342 | 0 | ColumnUInt8::Container& result) { |
343 | 0 | auto sz = val.size(); |
344 | 0 | for (size_t i = 0; i < sz; i++) { |
345 | 0 | if (state->search_string_sv.size == 0) { |
346 | 0 | result[i] = true; |
347 | 0 | continue; |
348 | 0 | } |
349 | 0 | result[i] = state->substring_pattern.search(val.get_data_at(i)) != -1; |
350 | 0 | } |
351 | 0 | return Status::OK(); |
352 | 0 | } |
353 | | |
354 | | Status FunctionLikeBase::constant_substring_fn_scalar(const LikeSearchState* state, |
355 | | const StringRef& val, |
356 | | const StringRef& pattern, |
357 | 401k | unsigned char* result) { |
358 | 401k | if (state->search_string_sv.size == 0) { |
359 | 0 | *result = true; |
360 | 0 | return Status::OK(); |
361 | 0 | } |
362 | 401k | *result = state->substring_pattern.search(val) != -1; |
363 | 401k | return Status::OK(); |
364 | 401k | } |
365 | | |
366 | | Status FunctionLikeBase::vector_substring_fn(const ColumnString& vals, |
367 | | const ColumnString& search_strings, |
368 | 41 | ColumnUInt8::Container& result) { |
369 | 41 | DCHECK(vals.size() == search_strings.size()); |
370 | 41 | DCHECK(vals.size() == result.size()); |
371 | 41 | auto sz = vals.size(); |
372 | 107 | for (size_t i = 0; i < sz; ++i) { |
373 | 66 | const auto& str_sv = vals.get_data_at(i); |
374 | 66 | const auto& search_string_sv = search_strings.get_data_at(i); |
375 | 66 | if (search_string_sv.size == 0) { |
376 | 2 | result[i] = true; |
377 | 2 | continue; |
378 | 2 | } |
379 | 64 | doris::StringSearch substring_search(&search_string_sv); |
380 | 64 | result[i] = substring_search.search(str_sv) != -1; |
381 | 64 | } |
382 | 41 | return Status::OK(); |
383 | 41 | } |
384 | | |
385 | | Status FunctionLikeBase::constant_regex_fn_scalar(const LikeSearchState* state, |
386 | | const StringRef& val, const StringRef& pattern, |
387 | 1.94k | unsigned char* result) { |
388 | 1.94k | if (state->hs_database) { // use hyperscan |
389 | 1.76k | auto ret = hs_scan(state->hs_database.get(), val.data, (int)val.size, 0, |
390 | 1.76k | state->hs_scratch.get(), doris::LikeSearchState::hs_match_handler, |
391 | 1.76k | (void*)result); |
392 | 1.76k | if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { |
393 | 0 | return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); |
394 | 0 | } |
395 | 1.76k | } else if (state->boost_regex) { // use boost::regex for advanced features |
396 | 4 | *result = boost::regex_search(val.data, val.data + val.size, *state->boost_regex); |
397 | 178 | } else { // fallback to re2 |
398 | 178 | *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex); |
399 | 178 | } |
400 | | |
401 | 1.94k | return Status::OK(); |
402 | 1.94k | } |
403 | | |
404 | | Status FunctionLikeBase::regexp_fn_scalar(const LikeSearchState* state, const StringRef& val, |
405 | 104 | const StringRef& pattern, unsigned char* result) { |
406 | 104 | RE2::Options opts; |
407 | 104 | opts.set_never_nl(false); |
408 | 104 | opts.set_dot_nl(true); |
409 | 104 | re2::RE2 re(re2::StringPiece(pattern.data, pattern.size), opts); |
410 | 104 | if (re.ok()) { |
411 | 104 | *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), re); |
412 | 104 | } else { |
413 | 0 | return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string()); |
414 | 0 | } |
415 | | |
416 | 104 | return Status::OK(); |
417 | 104 | } |
418 | | |
419 | | Status FunctionLikeBase::constant_regex_fn(const LikeSearchState* state, const ColumnString& val, |
420 | | const StringRef& pattern, |
421 | 555 | ColumnUInt8::Container& result) { |
422 | 555 | auto sz = val.size(); |
423 | 555 | if (state->hs_database) { // use hyperscan |
424 | 892k | for (size_t i = 0; i < sz; i++) { |
425 | 891k | const auto& str_ref = val.get_data_at(i); |
426 | 891k | auto ret = hs_scan(state->hs_database.get(), str_ref.data, (int)str_ref.size, 0, |
427 | 891k | state->hs_scratch.get(), doris::LikeSearchState::hs_match_handler, |
428 | 891k | (void*)(result.data() + i)); |
429 | 891k | if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { |
430 | 0 | return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); |
431 | 0 | } |
432 | 891k | } |
433 | 552 | } else if (state->boost_regex) { // use boost::regex for advanced features |
434 | 0 | for (size_t i = 0; i < sz; i++) { |
435 | 0 | const auto& str_ref = val.get_data_at(i); |
436 | 0 | *(result.data() + i) = boost::regex_search(str_ref.data, str_ref.data + str_ref.size, |
437 | 0 | *state->boost_regex); |
438 | 0 | } |
439 | 3 | } else { // fallback to re2 |
440 | 7 | for (size_t i = 0; i < sz; i++) { |
441 | 4 | const auto& str_ref = val.get_data_at(i); |
442 | 4 | *(result.data() + i) = |
443 | 4 | RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex); |
444 | 4 | } |
445 | 3 | } |
446 | | |
447 | 555 | return Status::OK(); |
448 | 555 | } |
449 | | |
450 | | Status FunctionLikeBase::regexp_fn(const LikeSearchState* state, const ColumnString& val, |
451 | 0 | const StringRef& pattern, ColumnUInt8::Container& result) { |
452 | 0 | std::string re_pattern(pattern.data, pattern.size); |
453 | |
|
454 | 0 | hs_database_t* database = nullptr; |
455 | 0 | hs_scratch_t* scratch = nullptr; |
456 | 0 | if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan |
457 | 0 | auto sz = val.size(); |
458 | 0 | for (size_t i = 0; i < sz; i++) { |
459 | 0 | const auto& str_ref = val.get_data_at(i); |
460 | 0 | auto ret = |
461 | 0 | hs_scan(database, str_ref.data, (int)str_ref.size, 0, scratch, |
462 | 0 | doris::LikeSearchState::hs_match_handler, (void*)(result.data() + i)); |
463 | 0 | if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { |
464 | 0 | return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); |
465 | 0 | } |
466 | 0 | } |
467 | | |
468 | 0 | hs_free_scratch(scratch); |
469 | 0 | hs_free_database(database); |
470 | 0 | } else { // fallback to re2 |
471 | 0 | RE2::Options opts; |
472 | 0 | opts.set_never_nl(false); |
473 | 0 | opts.set_dot_nl(true); |
474 | 0 | re2::RE2 re(re_pattern, opts); |
475 | 0 | if (re.ok()) { |
476 | 0 | auto sz = val.size(); |
477 | 0 | for (size_t i = 0; i < sz; i++) { |
478 | 0 | const auto& str_ref = val.get_data_at(i); |
479 | 0 | *(result.data() + i) = |
480 | 0 | RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), re); |
481 | 0 | } |
482 | 0 | } else { |
483 | 0 | return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string()); |
484 | 0 | } |
485 | 0 | } |
486 | | |
487 | 0 | return Status::OK(); |
488 | 0 | } |
489 | | |
490 | | // hyperscan compile expression to database and allocate scratch space |
491 | | Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression, |
492 | 1.47k | hs_database_t** database, hs_scratch_t** scratch) { |
493 | 1.47k | hs_compile_error_t* compile_err; |
494 | 1.47k | auto res = hs_compile(expression, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8, |
495 | 1.47k | HS_MODE_BLOCK, nullptr, database, &compile_err); |
496 | | |
497 | 1.47k | if (res != HS_SUCCESS) { |
498 | 19 | *database = nullptr; |
499 | 19 | std::string error_message = compile_err->message; |
500 | 19 | hs_free_compile_error(compile_err); |
501 | | // Do not call FunctionContext::set_error here, since we do not want to cancel the query here. |
502 | 19 | return Status::RuntimeError<false>("hs_compile regex pattern error:" + error_message); |
503 | 19 | } |
504 | 1.45k | hs_free_compile_error(compile_err); |
505 | | |
506 | 1.45k | if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) { |
507 | 0 | hs_free_database(*database); |
508 | 0 | *database = nullptr; |
509 | 0 | *scratch = nullptr; |
510 | | // Do not call FunctionContext::set_error here, since we do not want to cancel the query here. |
511 | 0 | return Status::RuntimeError<false>("hs_alloc_scratch allocate scratch space error"); |
512 | 0 | } |
513 | | |
514 | 1.45k | return Status::OK(); |
515 | 1.45k | } |
516 | | |
517 | | Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block, |
518 | | const ColumnNumbers& arguments, uint32_t result, |
519 | 1.73k | size_t input_rows_count) const { |
520 | 1.73k | const auto values_col = |
521 | 1.73k | block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); |
522 | 1.73k | const auto* values = check_and_get_column<ColumnString>(values_col.get()); |
523 | | |
524 | 1.73k | if (!values) { |
525 | 0 | return Status::InternalError("Not supported input arguments types"); |
526 | 0 | } |
527 | | // result column |
528 | 1.73k | auto res = ColumnUInt8::create(); |
529 | 1.73k | ColumnUInt8::Container& vec_res = res->get_data(); |
530 | | // set default value to 0, and match functions only need to set 1/true |
531 | 1.73k | vec_res.resize_fill(input_rows_count); |
532 | 1.73k | auto* state = reinterpret_cast<LikeState*>( |
533 | 1.73k | context->get_function_state(FunctionContext::THREAD_LOCAL)); |
534 | | // for constant_substring_fn, use long run length search for performance |
535 | 1.73k | if (constant_substring_fn == |
536 | 1.73k | *(state->function |
537 | 1.73k | .target<doris::Status (*)(const LikeSearchState* state, const ColumnString&, |
538 | 1.73k | const StringRef&, ColumnUInt8::Container&)>())) { |
539 | 231 | RETURN_IF_ERROR(execute_substring(values->get_chars(), values->get_offsets(), vec_res, |
540 | 231 | &state->search_state)); |
541 | 1.50k | } else { |
542 | 1.50k | const auto pattern_col = block.get_by_position(arguments[1]).column; |
543 | 1.50k | if (const auto* str_patterns = check_and_get_column<ColumnString>(pattern_col.get())) { |
544 | 451 | RETURN_IF_ERROR( |
545 | 451 | vector_non_const(*values, *str_patterns, vec_res, state, input_rows_count)); |
546 | 1.05k | } else if (const auto* const_patterns = |
547 | 1.05k | check_and_get_column<ColumnConst>(pattern_col.get())) { |
548 | 1.05k | const auto& pattern_val = const_patterns->get_data_at(0); |
549 | 1.05k | RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function, |
550 | 1.05k | &state->search_state)); |
551 | 18.4E | } else { |
552 | 18.4E | return Status::InternalError("Not supported input arguments types"); |
553 | 18.4E | } |
554 | 1.50k | } |
555 | 1.73k | block.replace_by_position(result, std::move(res)); |
556 | 1.73k | return Status::OK(); |
557 | 1.73k | } |
558 | | |
559 | | Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values, |
560 | | const ColumnString::Offsets& value_offsets, |
561 | | ColumnUInt8::Container& result, |
562 | 231 | LikeSearchState* search_state) const { |
563 | | // treat continuous multi string data as a long string data |
564 | 231 | const UInt8* begin = values.data(); |
565 | 231 | const UInt8* end = begin + values.size(); |
566 | 231 | const UInt8* pos = begin; |
567 | | |
568 | | /// Current index in the array of strings. |
569 | 231 | size_t i = 0; |
570 | 231 | size_t needle_size = search_state->substring_pattern.get_pattern_length(); |
571 | | |
572 | | /// We will search for the next occurrence in all strings at once. |
573 | 1.49k | while (pos < end) { |
574 | | // search return matched substring start offset |
575 | 1.35k | pos = (UInt8*)search_state->substring_pattern.search((char*)pos, end - pos); |
576 | 1.35k | if (pos >= end) { |
577 | 95 | break; |
578 | 95 | } |
579 | | |
580 | | /// Determine which index it refers to. |
581 | | /// begin + value_offsets[i] is the start offset of string at i+1 |
582 | 1.29k | while (i < value_offsets.size() && begin + value_offsets[i] < pos) { |
583 | 29 | ++i; |
584 | 29 | } |
585 | | |
586 | | /// We check that the entry does not pass through the boundaries of strings. |
587 | 1.26k | if (pos + needle_size <= begin + value_offsets[i]) { |
588 | 1.24k | result[i] = 1; |
589 | 1.24k | } |
590 | | |
591 | | // move to next string offset |
592 | 1.26k | pos = begin + value_offsets[i]; |
593 | 1.26k | ++i; |
594 | 1.26k | } |
595 | | |
596 | 231 | return Status::OK(); |
597 | 231 | } |
598 | | |
599 | | Status FunctionLikeBase::vector_const(const ColumnString& values, const StringRef* pattern_val, |
600 | | ColumnUInt8::Container& result, const LikeFn& function, |
601 | 1.05k | LikeSearchState* search_state) const { |
602 | 1.05k | RETURN_IF_ERROR((function)(search_state, values, |
603 | 1.05k | *reinterpret_cast<const StringRef*>(pattern_val), result)); |
604 | 1.05k | return Status::OK(); |
605 | 1.05k | } |
606 | | |
607 | | template <bool LIKE_PATTERN> |
608 | 451 | VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) { |
609 | 451 | VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>(); |
610 | 451 | VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>(); |
611 | 451 | VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>(); |
612 | 451 | VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>(); |
613 | 451 | VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>(); |
614 | 451 | size_t size = patterns.size(); |
615 | | |
616 | 1.03k | for (size_t i = 0; i < size; ++i) { |
617 | 592 | if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched && |
618 | 592 | !substring_state->_pattern_matched && !starts_with_state->_pattern_matched && |
619 | 592 | !ends_with_state->_pattern_matched) { |
620 | 12 | return nullptr; |
621 | 12 | } |
622 | 580 | std::string pattern_str = patterns.get_data_at(i).to_string(); |
623 | 580 | if (allpass_state->_pattern_matched) { |
624 | 456 | if constexpr (LIKE_PATTERN) { |
625 | 424 | allpass_state->like_pattern_match(pattern_str); |
626 | 424 | } else { |
627 | 32 | allpass_state->regexp_pattern_match(pattern_str); |
628 | 32 | } |
629 | 456 | } |
630 | 580 | if (equal_state->_pattern_matched) { |
631 | 482 | if constexpr (LIKE_PATTERN) { |
632 | 450 | equal_state->like_pattern_match(pattern_str); |
633 | 450 | } else { |
634 | 32 | equal_state->regexp_pattern_match(pattern_str); |
635 | 32 | } |
636 | 482 | } |
637 | 580 | if (substring_state->_pattern_matched) { |
638 | 497 | if constexpr (LIKE_PATTERN) { |
639 | 442 | substring_state->like_pattern_match(pattern_str); |
640 | 442 | } else { |
641 | 55 | substring_state->regexp_pattern_match(pattern_str); |
642 | 55 | } |
643 | 497 | } |
644 | 580 | if (starts_with_state->_pattern_matched) { |
645 | 481 | if constexpr (LIKE_PATTERN) { |
646 | 431 | starts_with_state->like_pattern_match(pattern_str); |
647 | 431 | } else { |
648 | 50 | starts_with_state->regexp_pattern_match(pattern_str); |
649 | 50 | } |
650 | 481 | } |
651 | 580 | if (ends_with_state->_pattern_matched) { |
652 | 468 | if constexpr (LIKE_PATTERN) { |
653 | 430 | ends_with_state->like_pattern_match(pattern_str); |
654 | 430 | } else { |
655 | 38 | ends_with_state->regexp_pattern_match(pattern_str); |
656 | 38 | } |
657 | 468 | } |
658 | 580 | } |
659 | | |
660 | 439 | if (allpass_state->_pattern_matched) { |
661 | 21 | return allpass_state; |
662 | 418 | } else if (equal_state->_pattern_matched) { |
663 | 88 | return equal_state; |
664 | 330 | } else if (substring_state->_pattern_matched) { |
665 | 41 | return substring_state; |
666 | 289 | } else if (starts_with_state->_pattern_matched) { |
667 | 76 | return starts_with_state; |
668 | 213 | } else if (ends_with_state->_pattern_matched) { |
669 | 34 | return ends_with_state; |
670 | 179 | } else { |
671 | 179 | return nullptr; |
672 | 179 | } |
673 | 439 | } _ZN5doris16FunctionLikeBase24pattern_type_recognitionILb1EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE Line | Count | Source | 608 | 419 | VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) { | 609 | 419 | VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>(); | 610 | 419 | VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>(); | 611 | 419 | VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>(); | 612 | 419 | VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>(); | 613 | 419 | VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>(); | 614 | 419 | size_t size = patterns.size(); | 615 | | | 616 | 920 | for (size_t i = 0; i < size; ++i) { | 617 | 513 | if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched && | 618 | 513 | !substring_state->_pattern_matched && !starts_with_state->_pattern_matched && | 619 | 513 | !ends_with_state->_pattern_matched) { | 620 | 12 | return nullptr; | 621 | 12 | } | 622 | 501 | std::string pattern_str = patterns.get_data_at(i).to_string(); | 623 | 501 | if (allpass_state->_pattern_matched) { | 624 | 424 | if constexpr (LIKE_PATTERN) { | 625 | 424 | allpass_state->like_pattern_match(pattern_str); | 626 | | } else { | 627 | | allpass_state->regexp_pattern_match(pattern_str); | 628 | | } | 629 | 424 | } | 630 | 501 | if (equal_state->_pattern_matched) { | 631 | 450 | if constexpr (LIKE_PATTERN) { | 632 | 450 | equal_state->like_pattern_match(pattern_str); | 633 | | } else { | 634 | | equal_state->regexp_pattern_match(pattern_str); | 635 | | } | 636 | 450 | } | 637 | 501 | if (substring_state->_pattern_matched) { | 638 | 442 | if constexpr (LIKE_PATTERN) { | 639 | 442 | substring_state->like_pattern_match(pattern_str); | 640 | | } else { | 641 | | 
substring_state->regexp_pattern_match(pattern_str); | 642 | | } | 643 | 442 | } | 644 | 501 | if (starts_with_state->_pattern_matched) { | 645 | 431 | if constexpr (LIKE_PATTERN) { | 646 | 431 | starts_with_state->like_pattern_match(pattern_str); | 647 | | } else { | 648 | | starts_with_state->regexp_pattern_match(pattern_str); | 649 | | } | 650 | 431 | } | 651 | 501 | if (ends_with_state->_pattern_matched) { | 652 | 430 | if constexpr (LIKE_PATTERN) { | 653 | 430 | ends_with_state->like_pattern_match(pattern_str); | 654 | | } else { | 655 | | ends_with_state->regexp_pattern_match(pattern_str); | 656 | | } | 657 | 430 | } | 658 | 501 | } | 659 | | | 660 | 407 | if (allpass_state->_pattern_matched) { | 661 | 21 | return allpass_state; | 662 | 386 | } else if (equal_state->_pattern_matched) { | 663 | 88 | return equal_state; | 664 | 298 | } else if (substring_state->_pattern_matched) { | 665 | 30 | return substring_state; | 666 | 268 | } else if (starts_with_state->_pattern_matched) { | 667 | 71 | return starts_with_state; | 668 | 197 | } else if (ends_with_state->_pattern_matched) { | 669 | 31 | return ends_with_state; | 670 | 166 | } else { | 671 | 166 | return nullptr; | 672 | 166 | } | 673 | 407 | } |
_ZN5doris16FunctionLikeBase24pattern_type_recognitionILb0EEESt10shared_ptrINS_24VectorPatternSearchStateEERKNS_9ColumnStrIjEE Line | Count | Source | 608 | 32 | VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) { | 609 | 32 | VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>(); | 610 | 32 | VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>(); | 611 | 32 | VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>(); | 612 | 32 | VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>(); | 613 | 32 | VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>(); | 614 | 32 | size_t size = patterns.size(); | 615 | | | 616 | 111 | for (size_t i = 0; i < size; ++i) { | 617 | 79 | if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched && | 618 | 79 | !substring_state->_pattern_matched && !starts_with_state->_pattern_matched && | 619 | 79 | !ends_with_state->_pattern_matched) { | 620 | 0 | return nullptr; | 621 | 0 | } | 622 | 79 | std::string pattern_str = patterns.get_data_at(i).to_string(); | 623 | 79 | if (allpass_state->_pattern_matched) { | 624 | | if constexpr (LIKE_PATTERN) { | 625 | | allpass_state->like_pattern_match(pattern_str); | 626 | 32 | } else { | 627 | 32 | allpass_state->regexp_pattern_match(pattern_str); | 628 | 32 | } | 629 | 32 | } | 630 | 79 | if (equal_state->_pattern_matched) { | 631 | | if constexpr (LIKE_PATTERN) { | 632 | | equal_state->like_pattern_match(pattern_str); | 633 | 32 | } else { | 634 | 32 | equal_state->regexp_pattern_match(pattern_str); | 635 | 32 | } | 636 | 32 | } | 637 | 79 | if (substring_state->_pattern_matched) { | 638 | | if constexpr (LIKE_PATTERN) { | 639 | | substring_state->like_pattern_match(pattern_str); | 640 | 55 | } else { | 641 | 55 | substring_state->regexp_pattern_match(pattern_str); | 642 | 55 | } | 
643 | 55 | } | 644 | 79 | if (starts_with_state->_pattern_matched) { | 645 | | if constexpr (LIKE_PATTERN) { | 646 | | starts_with_state->like_pattern_match(pattern_str); | 647 | 50 | } else { | 648 | 50 | starts_with_state->regexp_pattern_match(pattern_str); | 649 | 50 | } | 650 | 50 | } | 651 | 79 | if (ends_with_state->_pattern_matched) { | 652 | | if constexpr (LIKE_PATTERN) { | 653 | | ends_with_state->like_pattern_match(pattern_str); | 654 | 38 | } else { | 655 | 38 | ends_with_state->regexp_pattern_match(pattern_str); | 656 | 38 | } | 657 | 38 | } | 658 | 79 | } | 659 | | | 660 | 32 | if (allpass_state->_pattern_matched) { | 661 | 0 | return allpass_state; | 662 | 32 | } else if (equal_state->_pattern_matched) { | 663 | 0 | return equal_state; | 664 | 32 | } else if (substring_state->_pattern_matched) { | 665 | 11 | return substring_state; | 666 | 21 | } else if (starts_with_state->_pattern_matched) { | 667 | 5 | return starts_with_state; | 668 | 16 | } else if (ends_with_state->_pattern_matched) { | 669 | 3 | return ends_with_state; | 670 | 13 | } else { | 671 | 13 | return nullptr; | 672 | 13 | } | 673 | 32 | } |
|
674 | | |
675 | | Status FunctionLikeBase::vector_non_const(const ColumnString& values, const ColumnString& patterns, |
676 | | ColumnUInt8::Container& result, LikeState* state, |
677 | 451 | size_t input_rows_count) const { |
678 | 451 | ColumnString::MutablePtr replaced_patterns; |
679 | 451 | VPatternSearchStateSPtr vector_search_state; |
680 | 451 | if (state->is_like_pattern) { |
681 | 419 | if (state->has_custom_escape) { |
682 | 5 | replaced_patterns = ColumnString::create(); |
683 | 10 | for (int i = 0; i < input_rows_count; ++i) { |
684 | 5 | std::string val = |
685 | 5 | replace_pattern_by_escape(patterns.get_data_at(i), state->escape_char); |
686 | 5 | replaced_patterns->insert_data(val.c_str(), val.size()); |
687 | 5 | } |
688 | 5 | vector_search_state = pattern_type_recognition<true>(*replaced_patterns); |
689 | 414 | } else { |
690 | 414 | vector_search_state = pattern_type_recognition<true>(patterns); |
691 | 414 | } |
692 | 419 | } else { |
693 | 32 | vector_search_state = pattern_type_recognition<false>(patterns); |
694 | 32 | } |
695 | | |
696 | 451 | const ColumnString& real_pattern = state->has_custom_escape ? *replaced_patterns : patterns; |
697 | | |
698 | 451 | if (vector_search_state == nullptr) { |
699 | | // pattern type recognition failed, use default case |
700 | 548 | for (int i = 0; i < input_rows_count; ++i) { |
701 | 357 | const auto pattern_val = real_pattern.get_data_at(i); |
702 | 357 | const auto value_val = values.get_data_at(i); |
703 | 357 | RETURN_IF_ERROR((state->scalar_function)(&state->search_state, value_val, pattern_val, |
704 | 357 | &result[i])); |
705 | 357 | } |
706 | 191 | return Status::OK(); |
707 | 191 | } |
708 | 260 | const auto* search_strings = |
709 | 260 | static_cast<const ColumnString*>(vector_search_state->_search_strings.get()); |
710 | 260 | return (vector_search_state->_vector_function)(values, *search_strings, result); |
711 | 451 | } |
712 | | |
713 | | Status FunctionLike::like_fn(const LikeSearchState* state, const ColumnString& val, |
714 | 0 | const StringRef& pattern, ColumnUInt8::Container& result) { |
715 | 0 | std::string re_pattern; |
716 | 0 | convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern); |
717 | 0 | return regexp_fn(state, val, {re_pattern.c_str(), re_pattern.size()}, result); |
718 | 0 | } |
719 | | |
720 | | Status FunctionLike::like_fn_scalar(const LikeSearchState* state, const StringRef& val, |
721 | 243 | const StringRef& pattern, unsigned char* result) { |
722 | | // Try to use fast path to avoid regex compilation |
723 | 243 | std::string search_string; |
724 | 243 | LikeFastPath fast_path = extract_like_fast_path(pattern.data, pattern.size, search_string); |
725 | | |
726 | 243 | switch (fast_path) { |
727 | 12 | case LikeFastPath::ALLPASS: |
728 | 12 | *result = 1; |
729 | 12 | return Status::OK(); |
730 | 61 | case LikeFastPath::EQUALS: |
731 | 61 | *result = (val.size == search_string.size() && |
732 | 61 | (search_string.empty() || |
733 | 33 | memcmp(val.data, search_string.data(), search_string.size()) == 0)); |
734 | 61 | return Status::OK(); |
735 | 22 | case LikeFastPath::STARTS_WITH: |
736 | 22 | *result = (val.size >= search_string.size() && |
737 | 22 | memcmp(val.data, search_string.data(), search_string.size()) == 0); |
738 | 22 | return Status::OK(); |
739 | 16 | case LikeFastPath::ENDS_WITH: |
740 | 16 | *result = (val.size >= search_string.size() && |
741 | 16 | memcmp(val.data + val.size - search_string.size(), search_string.data(), |
742 | 15 | search_string.size()) == 0); |
743 | 16 | return Status::OK(); |
744 | 34 | case LikeFastPath::SUBSTRING: |
745 | 34 | if (search_string.empty()) { |
746 | 0 | *result = 1; |
747 | 34 | } else { |
748 | | // Use memmem for substring search |
749 | 34 | *result = (memmem(val.data, val.size, search_string.data(), search_string.size()) != |
750 | 34 | nullptr); |
751 | 34 | } |
752 | 34 | return Status::OK(); |
753 | 98 | case LikeFastPath::REGEX: |
754 | 98 | default: |
755 | | // Fall back to regex matching |
756 | 98 | std::string re_pattern; |
757 | 98 | convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern); |
758 | 98 | return regexp_fn_scalar(state, StringRef(val.data, val.size), |
759 | 98 | {re_pattern.c_str(), re_pattern.size()}, result); |
760 | 243 | } |
761 | 243 | } |
762 | | |
763 | | void FunctionLike::convert_like_pattern(const LikeSearchState* state, const std::string& pattern, |
764 | 1.71k | std::string* re_pattern) { |
765 | 1.71k | re_pattern->clear(); |
766 | | |
767 | 1.71k | if (pattern.empty()) { |
768 | 0 | re_pattern->append("^$"); |
769 | 0 | return; |
770 | 0 | } |
771 | | |
772 | | // add ^ to pattern head to match line head |
773 | 1.71k | if (!pattern.empty() && pattern[0] != '%') { |
774 | 1.07k | re_pattern->append("^"); |
775 | 1.07k | } |
776 | | |
777 | | // expect % and _, all chars should keep it literal mean. |
778 | 13.7k | for (size_t i = 0; i < pattern.size(); i++) { |
779 | 12.0k | char c = pattern[i]; |
780 | 12.0k | if (c == '\\' && i + 1 < pattern.size()) { |
781 | 549 | char next_c = pattern[i + 1]; |
782 | 549 | if (next_c == '%' || next_c == '_') { |
783 | | // convert "\%" and "\_" to literal "%" and "_" |
784 | 222 | re_pattern->append(1, next_c); |
785 | 222 | i++; |
786 | 222 | continue; |
787 | 327 | } else if (next_c == '\\') { |
788 | | // keep valid escape "\\" |
789 | 308 | re_pattern->append("\\\\"); |
790 | 308 | i++; |
791 | 308 | continue; |
792 | 308 | } |
793 | 549 | } |
794 | | |
795 | 11.5k | if (c == '%') { |
796 | 1.94k | re_pattern->append(".*"); |
797 | 9.59k | } else if (c == '_') { |
798 | 1.49k | re_pattern->append("."); |
799 | 8.10k | } else { |
800 | | // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ? |
801 | 8.11k | if (c == '[' || c == ']' || c == '(' || c == ')' || c == '{' || c == '}' || c == '-' || |
802 | 8.10k | c == '*' || c == '+' || c == '\\' || c == '|' || c == '/' || c == ':' || c == '^' || |
803 | 8.10k | c == '.' || c == '$' || c == '?') { |
804 | 119 | re_pattern->append(1, '\\'); |
805 | 119 | } |
806 | 8.10k | re_pattern->append(1, c); |
807 | 8.10k | } |
808 | 11.5k | } |
809 | | |
810 | | // add $ to pattern tail to match line tail |
811 | 1.71k | if (!pattern.empty() && re_pattern->back() != '*') { |
812 | 856 | re_pattern->append("$"); |
813 | 856 | } |
814 | 1.71k | } |
815 | | |
816 | 2.31k | void FunctionLike::remove_escape_character(std::string* search_string) { |
817 | 2.31k | std::string tmp_search_string; |
818 | 2.31k | tmp_search_string.swap(*search_string); |
819 | 2.31k | int64_t len = tmp_search_string.length(); |
820 | | // sometime 'like' may allowed converted to 'equals/start_with/end_with/sub_with' |
821 | | // so we need to remove escape from pattern to construct search string and use to do 'equals/start_with/end_with/sub_with' |
822 | 11.6k | for (int i = 0; i < len;) { |
823 | 9.37k | if (tmp_search_string[i] == '\\' && i + 1 < len && |
824 | 9.37k | (tmp_search_string[i + 1] == '%' || tmp_search_string[i + 1] == '_' || |
825 | 156 | tmp_search_string[i + 1] == '\\')) { |
826 | 147 | search_string->append(1, tmp_search_string[i + 1]); |
827 | 147 | i += 2; |
828 | 9.23k | } else { |
829 | 9.23k | search_string->append(1, tmp_search_string[i]); |
830 | 9.23k | i++; |
831 | 9.23k | } |
832 | 9.37k | } |
833 | 2.31k | } |
834 | | |
835 | 0 | bool re2_full_match(const std::string& str, const RE2& re, std::vector<std::string>& results) { |
836 | 0 | if (!re.ok()) { |
837 | 0 | return false; |
838 | 0 | } |
839 | | |
840 | 0 | std::vector<RE2::Arg> arguments; |
841 | 0 | std::vector<RE2::Arg*> arguments_ptrs; |
842 | 0 | std::size_t args_count = re.NumberOfCapturingGroups(); |
843 | 0 | arguments.resize(args_count); |
844 | 0 | arguments_ptrs.resize(args_count); |
845 | 0 | results.resize(args_count); |
846 | 0 | for (std::size_t i = 0; i < args_count; ++i) { |
847 | 0 | arguments[i] = &results[i]; |
848 | 0 | arguments_ptrs[i] = &arguments[i]; |
849 | 0 | } |
850 | |
|
851 | 0 | return RE2::FullMatchN(str, re, arguments_ptrs.data(), (int)args_count); |
852 | 0 | } |
853 | | |
854 | 0 | void verbose_log_match(const std::string& str, const std::string& pattern_name, const RE2& re) { |
855 | 0 | std::vector<std::string> results; |
856 | 0 | VLOG_DEBUG << "arg str: " << str << ", size: " << str.size() << ", pattern " << pattern_name |
857 | 0 | << ": " << re.pattern() << ", size: " << re.pattern().size(); |
858 | 0 | if (re2_full_match(str, re, results)) { |
859 | 0 | for (int i = 0; i < results.size(); ++i) { |
860 | 0 | VLOG_DEBUG << "match " << i << ": " << results[i] << ", size: " << results[i].size(); |
861 | 0 | } |
862 | 0 | } else { |
863 | 0 | VLOG_DEBUG << "no match"; |
864 | 0 | } |
865 | 0 | } |
866 | | |
867 | | Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern, |
868 | | std::shared_ptr<LikeState>& state, |
869 | 3.38k | bool try_hyperscan) { |
870 | 3.38k | std::string pattern_str; |
871 | 3.38k | if (state->has_custom_escape) { |
872 | 11 | pattern_str = replace_pattern_by_escape(pattern, state->escape_char); |
873 | 3.37k | } else { |
874 | 3.37k | pattern_str = pattern.to_string(); |
875 | 3.37k | } |
876 | 3.38k | state->search_state.pattern_str = pattern_str; |
877 | 3.38k | std::string search_string; |
878 | | |
879 | 3.38k | if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) { |
880 | 157 | state->search_state.set_search_string(""); |
881 | 157 | state->function = constant_allpass_fn; |
882 | 157 | state->scalar_function = constant_allpass_fn_scalar; |
883 | 3.22k | } else if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) { |
884 | 195 | if (VLOG_DEBUG_IS_ON) { |
885 | 0 | verbose_log_match(pattern_str, "LIKE_EQUALS_RE", LIKE_EQUALS_RE); |
886 | 0 | VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size(); |
887 | 0 | } |
888 | 195 | remove_escape_character(&search_string); |
889 | 195 | if (VLOG_DEBUG_IS_ON) { |
890 | 0 | VLOG_DEBUG << "search_string escape removed: " << search_string |
891 | 0 | << ", size: " << search_string.size(); |
892 | 0 | } |
893 | 195 | state->search_state.set_search_string(search_string); |
894 | 195 | state->function = constant_equals_fn; |
895 | 195 | state->scalar_function = constant_equals_fn_scalar; |
896 | 3.03k | } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) { |
897 | 729 | if (VLOG_DEBUG_IS_ON) { |
898 | 0 | verbose_log_match(pattern_str, "LIKE_STARTS_WITH_RE", LIKE_STARTS_WITH_RE); |
899 | 0 | VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size(); |
900 | 0 | } |
901 | 729 | remove_escape_character(&search_string); |
902 | 729 | if (VLOG_DEBUG_IS_ON) { |
903 | 0 | VLOG_DEBUG << "search_string escape removed: " << search_string |
904 | 0 | << ", size: " << search_string.size(); |
905 | 0 | } |
906 | 729 | state->search_state.set_search_string(search_string); |
907 | 729 | state->function = constant_starts_with_fn; |
908 | 729 | state->scalar_function = constant_starts_with_fn_scalar; |
909 | 2.30k | } else if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) { |
910 | 233 | if (VLOG_DEBUG_IS_ON) { |
911 | 0 | verbose_log_match(pattern_str, "LIKE_ENDS_WITH_RE", LIKE_ENDS_WITH_RE); |
912 | 0 | VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size(); |
913 | 0 | } |
914 | 233 | remove_escape_character(&search_string); |
915 | 233 | if (VLOG_DEBUG_IS_ON) { |
916 | 0 | VLOG_DEBUG << "search_string escape removed: " << search_string |
917 | 0 | << ", size: " << search_string.size(); |
918 | 0 | } |
919 | 233 | state->search_state.set_search_string(search_string); |
920 | 233 | state->function = constant_ends_with_fn; |
921 | 233 | state->scalar_function = constant_ends_with_fn_scalar; |
922 | 2.07k | } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) { |
923 | 872 | if (VLOG_DEBUG_IS_ON) { |
924 | 0 | verbose_log_match(pattern_str, "LIKE_SUBSTRING_RE", LIKE_SUBSTRING_RE); |
925 | 0 | VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size(); |
926 | 0 | } |
927 | 872 | remove_escape_character(&search_string); |
928 | 872 | if (VLOG_DEBUG_IS_ON) { |
929 | 0 | VLOG_DEBUG << "search_string escape removed: " << search_string |
930 | 0 | << ", size: " << search_string.size(); |
931 | 0 | } |
932 | 872 | state->search_state.set_search_string(search_string); |
933 | 872 | state->function = constant_substring_fn; |
934 | 872 | state->scalar_function = constant_substring_fn_scalar; |
935 | 1.19k | } else { |
936 | 1.19k | std::string re_pattern; |
937 | 1.19k | convert_like_pattern(&state->search_state, pattern_str, &re_pattern); |
938 | 1.19k | if (VLOG_DEBUG_IS_ON) { |
939 | 0 | VLOG_DEBUG << "hyperscan, pattern str: " << pattern_str |
940 | 0 | << ", size: " << pattern_str.size() << ", re pattern: " << re_pattern |
941 | 0 | << ", size: " << re_pattern.size(); |
942 | 0 | } |
943 | | |
944 | 1.19k | hs_database_t* database = nullptr; |
945 | 1.19k | hs_scratch_t* scratch = nullptr; |
946 | 1.19k | if (try_hyperscan && hs_prepare(context, re_pattern.c_str(), &database, &scratch).ok()) { |
947 | | // use hyperscan |
948 | 1.10k | state->search_state.hs_database.reset(database); |
949 | 1.10k | state->search_state.hs_scratch.reset(scratch); |
950 | 1.10k | } else { |
951 | | // fallback to re2 |
952 | | // reset hs_database to nullptr to indicate not use hyperscan |
953 | 97 | state->search_state.hs_database.reset(); |
954 | 97 | state->search_state.hs_scratch.reset(); |
955 | | |
956 | 97 | RE2::Options opts; |
957 | 97 | opts.set_never_nl(false); |
958 | 97 | opts.set_dot_nl(true); |
959 | 97 | state->search_state.regex = std::make_unique<RE2>(re_pattern, opts); |
960 | 97 | if (!state->search_state.regex->ok()) { |
961 | 0 | return Status::InternalError("Invalid regex expression: {}(origin: {})", re_pattern, |
962 | 0 | pattern_str); |
963 | 0 | } |
964 | 97 | } |
965 | | |
966 | 1.19k | state->function = constant_regex_fn; |
967 | 1.19k | state->scalar_function = constant_regex_fn_scalar; |
968 | 1.19k | } |
969 | 3.38k | return Status::OK(); |
970 | 3.38k | } |
971 | | |
972 | 4.46k | Status FunctionLike::open(FunctionContext* context, FunctionContext::FunctionStateScope scope) { |
973 | 4.46k | if (scope != FunctionContext::THREAD_LOCAL) { |
974 | 1.00k | return Status::OK(); |
975 | 1.00k | } |
976 | 3.46k | std::shared_ptr<LikeState> state = std::make_shared<LikeState>(); |
977 | 3.46k | state->is_like_pattern = true; |
978 | 3.46k | state->function = like_fn; |
979 | 3.46k | state->scalar_function = like_fn_scalar; |
980 | 3.46k | if (context->is_col_constant(2)) { |
981 | 11 | state->has_custom_escape = true; |
982 | 11 | const auto escape_col = context->get_constant_col(2)->column_ptr; |
983 | 11 | const auto& escape = escape_col->get_data_at(0); |
984 | 11 | if (escape.size != 1) { |
985 | 0 | return Status::InternalError("Escape character must be a single character, got: {}", |
986 | 0 | escape.to_string()); |
987 | 0 | } |
988 | 11 | state->escape_char = escape.data[0]; |
989 | 11 | } |
990 | 3.46k | if (context->is_col_constant(1)) { |
991 | 3.21k | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
992 | 3.21k | const auto& pattern = pattern_col->get_data_at(0); |
993 | 3.21k | RETURN_IF_ERROR(construct_like_const_state(context, pattern, state)); |
994 | 3.21k | } |
995 | 3.46k | context->set_function_state(scope, state); |
996 | | |
997 | 3.46k | return Status::OK(); |
998 | 3.46k | } |
999 | | |
1000 | | Status FunctionRegexpLike::open(FunctionContext* context, |
1001 | 652 | FunctionContext::FunctionStateScope scope) { |
1002 | 652 | if (scope != FunctionContext::THREAD_LOCAL) { |
1003 | 138 | return Status::OK(); |
1004 | 138 | } |
1005 | 514 | std::shared_ptr<LikeState> state = std::make_shared<LikeState>(); |
1006 | 514 | context->set_function_state(scope, state); |
1007 | 514 | state->is_like_pattern = false; |
1008 | 514 | state->function = regexp_fn; |
1009 | 514 | state->scalar_function = regexp_fn_scalar; |
1010 | 514 | if (context->is_col_constant(1)) { |
1011 | 464 | const auto pattern_col = context->get_constant_col(1)->column_ptr; |
1012 | 464 | const auto& pattern = pattern_col->get_data_at(0); |
1013 | | |
1014 | 464 | std::string pattern_str = pattern.to_string(); |
1015 | 464 | std::string search_string; |
1016 | 464 | if (RE2::FullMatch(pattern_str, ALLPASS_RE)) { |
1017 | 16 | state->search_state.set_search_string(""); |
1018 | 16 | state->function = constant_allpass_fn; |
1019 | 16 | state->scalar_function = constant_allpass_fn_scalar; |
1020 | 448 | } else if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) { |
1021 | 7 | state->search_state.set_search_string(search_string); |
1022 | 7 | state->function = constant_equals_fn; |
1023 | 7 | state->scalar_function = constant_equals_fn_scalar; |
1024 | 441 | } else if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) { |
1025 | 90 | state->search_state.set_search_string(search_string); |
1026 | 90 | state->function = constant_starts_with_fn; |
1027 | 90 | state->scalar_function = constant_starts_with_fn_scalar; |
1028 | 351 | } else if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) { |
1029 | 75 | state->search_state.set_search_string(search_string); |
1030 | 75 | state->function = constant_ends_with_fn; |
1031 | 75 | state->scalar_function = constant_ends_with_fn_scalar; |
1032 | 276 | } else if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) { |
1033 | 62 | state->search_state.set_search_string(search_string); |
1034 | 62 | state->function = constant_substring_fn; |
1035 | 62 | state->scalar_function = constant_substring_fn_scalar; |
1036 | 214 | } else { |
1037 | 214 | hs_database_t* database = nullptr; |
1038 | 214 | hs_scratch_t* scratch = nullptr; |
1039 | 214 | if (hs_prepare(context, pattern_str.c_str(), &database, &scratch).ok()) { |
1040 | | // use hyperscan |
1041 | 196 | state->search_state.hs_database.reset(database); |
1042 | 196 | state->search_state.hs_scratch.reset(scratch); |
1043 | 196 | } else { |
1044 | | // fallback to re2 |
1045 | | // reset hs_database to nullptr to indicate not use hyperscan |
1046 | 18 | state->search_state.hs_database.reset(); |
1047 | 18 | state->search_state.hs_scratch.reset(); |
1048 | 18 | RE2::Options opts; |
1049 | 18 | opts.set_never_nl(false); |
1050 | 18 | opts.set_dot_nl(true); |
1051 | 18 | state->search_state.regex = std::make_unique<RE2>(pattern_str, opts); |
1052 | 18 | if (!state->search_state.regex->ok()) { |
1053 | 9 | if (!context->state()->enable_extended_regex()) { |
1054 | 1 | return Status::InternalError( |
1055 | 1 | "Invalid regex expression: {}. Error: {}. If you need advanced " |
1056 | 1 | "regex features, try setting enable_extended_regex=true", |
1057 | 1 | pattern_str, state->search_state.regex->error()); |
1058 | 1 | } |
1059 | | |
1060 | | // RE2 failed, fallback to Boost.Regex |
1061 | | // This handles advanced regex features like zero-width assertions |
1062 | 8 | state->search_state.regex.reset(); |
1063 | 8 | try { |
1064 | 8 | state->search_state.boost_regex = |
1065 | 8 | std::make_unique<boost::regex>(pattern_str); |
1066 | 8 | } catch (const boost::regex_error& e) { |
1067 | 0 | return Status::InternalError("Invalid regex expression: {}. Error: {}", |
1068 | 0 | pattern_str, e.what()); |
1069 | 0 | } |
1070 | 8 | } |
1071 | 18 | } |
1072 | 213 | state->function = constant_regex_fn; |
1073 | 213 | state->scalar_function = constant_regex_fn_scalar; |
1074 | 213 | } |
1075 | 464 | } |
1076 | 513 | return Status::OK(); |
1077 | 514 | } |
1078 | | |
1079 | 8 | void register_function_like(SimpleFunctionFactory& factory) { |
1080 | 8 | factory.register_function<FunctionLike>(); |
1081 | 8 | } |
1082 | | |
1083 | 8 | void register_function_regexp(SimpleFunctionFactory& factory) { |
1084 | 8 | factory.register_function<FunctionRegexpLike>(); |
1085 | 8 | factory.register_alias(FunctionRegexpLike::name, FunctionRegexpLike::alias); |
1086 | 8 | } |
1087 | | #include "common/compile_check_end.h" |
1088 | | } // namespace doris |