/root/doris/be/src/vec/functions/like.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <hs/hs_common.h> |
21 | | #include <hs/hs_runtime.h> |
22 | | #include <re2/re2.h> |
23 | | |
24 | | #include <algorithm> |
25 | | #include <boost/iterator/iterator_facade.hpp> |
26 | | #include <boost/regex.hpp> |
27 | | #include <cstddef> |
28 | | #include <cstdint> |
29 | | #include <functional> |
30 | | #include <memory> |
31 | | #include <string> |
32 | | |
33 | | #include "common/status.h" |
34 | | #include "runtime/define_primitive_type.h" |
35 | | #include "runtime/string_search.hpp" |
36 | | #include "udf/udf.h" |
37 | | #include "vec/aggregate_functions/aggregate_function.h" |
38 | | #include "vec/columns/column_string.h" |
39 | | #include "vec/columns/predicate_column.h" |
40 | | #include "vec/common/string_ref.h" |
41 | | #include "vec/core/column_numbers.h" |
42 | | #include "vec/core/types.h" |
43 | | #include "vec/data_types/data_type_number.h" |
44 | | #include "vec/functions/function.h" |
45 | | |
46 | | namespace doris::vectorized { |
47 | | class Block; |
48 | | |
49 | | // FastPath types for LIKE pattern matching optimization |
50 | | // This allows per-row pattern analysis to avoid regex when possible |
51 | | enum class LikeFastPath { |
52 | | ALLPASS, // Pattern is just '%' or '%%...' - matches everything |
53 | | EQUALS, // No wildcards - exact string match |
54 | | STARTS_WITH, // Pattern ends with '%' only - prefix match |
55 | | ENDS_WITH, // Pattern starts with '%' only - suffix match |
56 | | SUBSTRING, // Pattern is '%xxx%' - substring search |
57 | | REGEX // Contains '_' or multiple '%' - needs regex |
58 | | }; |
59 | | |
60 | | // Lightweight pattern analysis without RE2 |
61 | | // Returns the fast path type and extracts the search string (without wildcards) |
62 | | // Correctly handles escape sequences: backslash-% -> literal %, backslash-_ -> literal _ |
63 | | inline LikeFastPath extract_like_fast_path(const char* pattern, size_t len, |
64 | 209 | std::string& search_string) { |
65 | 209 | search_string.clear(); |
66 | 209 | if (len == 0) { |
67 | 14 | return LikeFastPath::EQUALS; |
68 | 14 | } |
69 | | |
70 | | // Returns true if the character is NOT escaped (even number of preceding backslashes) |
71 | 195 | auto is_unescaped = [&pattern](size_t pos) -> bool { |
72 | 115 | size_t backslash_count = 0; |
73 | 132 | while (pos > 0 && pattern[pos - 1] == '\\') { |
74 | 17 | backslash_count++; |
75 | 17 | pos--; |
76 | 17 | } |
77 | 115 | return (backslash_count % 2 == 0); |
78 | 115 | }; |
79 | | |
80 | 195 | bool starts_with_percent = (pattern[0] == '%'); |
81 | 195 | bool ends_with_percent = (pattern[len - 1] == '%' && is_unescaped(len - 1)); |
82 | | |
83 | | // Quick check: if starts or ends with unescaped '_', need regex |
84 | 195 | if (pattern[0] == '_') { |
85 | 19 | return LikeFastPath::REGEX; |
86 | 19 | } |
87 | 176 | if (pattern[len - 1] == '_' && is_unescaped(len - 1)) { |
88 | 26 | return LikeFastPath::REGEX; |
89 | 26 | } |
90 | | |
91 | | // Helper lambda: check if character is a wildcard that needs escaping |
92 | 150 | auto is_wildcard = [](char c) { return c == '%' || c == '_' || c == '\\'; }; |
93 | | |
94 | 150 | size_t i = 0; |
95 | | // Skip leading '%' characters (unescaped) |
96 | 209 | while (i < len && pattern[i] == '%') { |
97 | 59 | i++; |
98 | 59 | } |
99 | | // If pattern is all '%', it's ALLPASS |
100 | 150 | if (i >= len) { |
101 | 6 | return LikeFastPath::ALLPASS; |
102 | 6 | } |
103 | | |
104 | 144 | search_string.reserve(len); |
105 | 574 | while (i < len) { |
106 | 519 | char c = pattern[i]; |
107 | | // Escaped character - add the literal |
108 | 519 | if (c == '\\' && i + 1 < len && is_wildcard(pattern[i + 1])) { |
109 | 31 | search_string.push_back(pattern[i + 1]); |
110 | 31 | i += 2; |
111 | 31 | continue; |
112 | 31 | } |
113 | | |
114 | | // Unescaped '_' requires regex |
115 | 488 | if (c == '_') { |
116 | 33 | return LikeFastPath::REGEX; |
117 | 33 | } |
118 | | |
119 | | // Check for trailing '%' or middle '%' (which needs regex) |
120 | 455 | if (c == '%') { |
121 | | // Check if this is a trailing '%' sequence |
122 | 56 | size_t j = i; |
123 | 112 | while (j < len && pattern[j] == '%') { |
124 | 56 | j++; |
125 | 56 | } |
126 | 56 | if (j >= len) { |
127 | | // All remaining chars are '%', we're done parsing |
128 | 44 | break; |
129 | 44 | } |
130 | | // '%' in the middle with more content after - need regex |
131 | 12 | return LikeFastPath::REGEX; |
132 | 56 | } |
133 | | |
134 | 399 | search_string.push_back(c); |
135 | 399 | i++; |
136 | 399 | } |
137 | | |
138 | | // Determine the pattern type based on '%' positions |
139 | 99 | if (starts_with_percent && ends_with_percent) { |
140 | 28 | return LikeFastPath::SUBSTRING; |
141 | 71 | } else if (starts_with_percent) { |
142 | 10 | return LikeFastPath::ENDS_WITH; |
143 | 61 | } else if (ends_with_percent) { |
144 | 16 | return LikeFastPath::STARTS_WITH; |
145 | 45 | } else { |
146 | 45 | return LikeFastPath::EQUALS; |
147 | 45 | } |
148 | 99 | } |
149 | | |
150 | 8 | inline std::string replace_pattern_by_escape(const StringRef& pattern, char escape_char) { |
151 | 8 | std::string result; |
152 | 8 | result.reserve(pattern.size); |
153 | 59 | for (size_t i = 0; i < pattern.size; ++i) { |
154 | 51 | if (i + 1 < pattern.size && pattern.data[i] == escape_char && |
155 | 51 | (pattern.data[i + 1] == escape_char || pattern.data[i + 1] == '%' || |
156 | 13 | pattern.data[i + 1] == '_')) { |
157 | | // "^^" -> "^" |
158 | | // "^%" -> "\%" |
159 | | // "^_" -> "\_" |
160 | 10 | if ((pattern.data[i + 1] == '%' || pattern.data[i + 1] == '_')) { |
161 | 4 | result.push_back('\\'); |
162 | 4 | } |
163 | 10 | result.push_back(pattern.data[i + 1]); |
164 | 10 | ++i; // skip next char |
165 | 41 | } else if (pattern.data[i] == '\\') { |
166 | | // "\" -> "\\" |
167 | 1 | result.append("\\\\"); |
168 | 40 | } else { |
169 | 40 | result.push_back(pattern.data[i]); |
170 | 40 | } |
171 | 51 | } |
172 | 8 | return result; |
173 | 8 | } |
174 | | |
175 | | // TODO: replace with std::string_view when `LikeSearchState.substring_pattern` can |
176 | | // construct from std::string_view. |
177 | | struct LikeSearchState { |
178 | | static constexpr char escape_char = '\\'; |
179 | | |
180 | | /// Holds the string the StringRef points to and is set any time StringRef is |
181 | | /// used. |
182 | | std::string search_string; |
183 | | |
184 | | std::string pattern_str; |
185 | | |
186 | | /// Used for LIKE predicates if the pattern is a constant argument, and is either a |
187 | | /// constant string or has a constant string at the beginning or end of the pattern. |
188 | | /// This will be set in order to check for that pattern in the corresponding part of |
189 | | /// the string. |
190 | | StringRef search_string_sv; |
191 | | |
192 | | /// Used for LIKE predicates if the pattern is a constant argument and has a constant |
193 | | /// string in the middle of it. This will be use in order to check for the substring |
194 | | /// in the value. |
195 | | doris::StringSearch substring_pattern; |
196 | | |
197 | | /// Used for RLIKE and REGEXP predicates if the pattern is a constant argument. |
198 | | std::unique_ptr<re2::RE2> regex; |
199 | | |
200 | | /// Used for REGEXP predicates when RE2 doesn't support the pattern (e.g., zero-width assertions like `?=`, `?!`, `?<=`, `?<!`) |
201 | | std::unique_ptr<boost::regex> boost_regex; |
202 | | |
203 | | template <typename Deleter, Deleter deleter> |
204 | | struct HyperscanDeleter { |
205 | | template <typename T> |
206 | 264 | void operator()(T* ptr) const { |
207 | 264 | deleter(ptr); |
208 | 264 | } _ZNK5doris10vectorized15LikeSearchState16HyperscanDeleterIPFiP11hs_databaseEXadL_Z16hs_free_databaseEEEclIS3_EEvPT_ Line | Count | Source | 206 | 132 | void operator()(T* ptr) const { | 207 | 132 | deleter(ptr); | 208 | 132 | } |
_ZNK5doris10vectorized15LikeSearchState16HyperscanDeleterIPFiP10hs_scratchEXadL_Z15hs_free_scratchEEEclIS3_EEvPT_ Line | Count | Source | 206 | 132 | void operator()(T* ptr) const { | 207 | 132 | deleter(ptr); | 208 | 132 | } |
|
209 | | }; |
210 | | |
211 | | // hyperscan compiled pattern database and scratch space, reused for performance |
212 | | std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>> |
213 | | hs_database; |
214 | | std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>> |
215 | | hs_scratch; |
216 | | |
217 | | // hyperscan match callback |
218 | | static int hs_match_handler(unsigned int /* from */, // NOLINT |
219 | | unsigned long long /* from */, // NOLINT |
220 | | unsigned long long /* to */, // NOLINT |
221 | 71 | unsigned int /* flags */, void* ctx) { |
222 | | // set result to 1 for matched row |
223 | 71 | *((unsigned char*)ctx) = 1; |
224 | | /// return non-zero to indicate hyperscan stop after first matched |
225 | 71 | return 1; |
226 | 71 | } |
227 | | |
228 | 597 | LikeSearchState() = default; |
229 | | |
230 | | Status clone(LikeSearchState& cloned); |
231 | | |
232 | 276 | void set_search_string(const std::string& search_string_arg) { |
233 | 276 | search_string = search_string_arg; |
234 | 276 | search_string_sv = StringRef(search_string); |
235 | 276 | substring_pattern.set_pattern(&search_string_sv); |
236 | 276 | } |
237 | | }; |
238 | | |
239 | | using LikeFn = std::function<doris::Status(const LikeSearchState*, const ColumnString&, |
240 | | const StringRef&, ColumnUInt8::Container&)>; |
241 | | |
242 | | using ScalarLikeFn = std::function<doris::Status(const LikeSearchState*, const StringRef&, |
243 | | const StringRef&, unsigned char*)>; |
244 | | |
245 | | using VectorLikeFn = std::function<doris::Status(const ColumnString&, const ColumnString&, |
246 | | ColumnUInt8::Container&)>; |
247 | | |
248 | | struct LikeState { |
249 | | bool is_like_pattern; |
250 | | bool has_custom_escape = false; |
251 | | char escape_char = {}; |
252 | | LikeSearchState search_state; |
253 | | LikeFn function; |
254 | | ScalarLikeFn scalar_function; |
255 | | }; |
256 | | |
257 | | struct VectorPatternSearchState { |
258 | | MutableColumnPtr _search_strings; |
259 | | std::string _search_string; |
260 | | VectorLikeFn _vector_function; |
261 | | bool _pattern_matched; |
262 | | |
263 | | VectorPatternSearchState(VectorLikeFn vector_function) |
264 | 1.53k | : _search_strings(ColumnString::create()), |
265 | 1.53k | _vector_function(vector_function), |
266 | 1.53k | _pattern_matched(true) {} |
267 | | |
268 | 1.53k | virtual ~VectorPatternSearchState() = default; |
269 | | |
270 | | virtual void like_pattern_match(const std::string& pattern_str) = 0; |
271 | | |
272 | | virtual void regexp_pattern_match(const std::string& pattern_str) = 0; |
273 | | }; |
274 | | |
275 | | using VPatternSearchStateSPtr = std::shared_ptr<VectorPatternSearchState>; |
276 | | |
277 | | class FunctionLikeBase : public IFunction { |
278 | | public: |
279 | 0 | size_t get_number_of_arguments() const override { return 0; } |
280 | 601 | bool is_variadic() const override { return true; } |
281 | | |
282 | 599 | DataTypePtr get_return_type_impl(const DataTypes& /*arguments*/) const override { |
283 | 599 | return std::make_shared<DataTypeUInt8>(); |
284 | 599 | } |
285 | | |
286 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
287 | | uint32_t result, size_t /*input_rows_count*/) const override; |
288 | | |
289 | | friend struct VectorAllpassSearchState; |
290 | | friend struct VectorEqualSearchState; |
291 | | friend struct VectorSubStringSearchState; |
292 | | friend struct VectorStartsWithSearchState; |
293 | | friend struct VectorEndsWithSearchState; |
294 | | |
295 | | protected: |
296 | | Status vector_const(const ColumnString& values, const StringRef* pattern_val, |
297 | | ColumnUInt8::Container& result, const LikeFn& function, |
298 | | LikeSearchState* search_state) const; |
299 | | |
300 | | Status vector_non_const(const ColumnString& values, const ColumnString& patterns, |
301 | | ColumnUInt8::Container& result, LikeState* state, |
302 | | size_t input_rows_count) const; |
303 | | |
304 | | Status execute_substring(const ColumnString::Chars& values, |
305 | | const ColumnString::Offsets& value_offsets, |
306 | | ColumnUInt8::Container& result, LikeSearchState* search_state) const; |
307 | | |
308 | | template <bool LIKE_PATTERN> |
309 | | static VPatternSearchStateSPtr pattern_type_recognition(const ColumnString& patterns); |
310 | | |
311 | | static Status constant_allpass_fn(const LikeSearchState* state, const ColumnString& val, |
312 | | const StringRef& pattern, ColumnUInt8::Container& result); |
313 | | |
314 | | static Status constant_allpass_fn_scalar(const LikeSearchState* state, const StringRef& val, |
315 | | const StringRef& pattern, unsigned char* result); |
316 | | |
317 | | static Status vector_allpass_fn(const ColumnString& vals, const ColumnString& search_strings, |
318 | | ColumnUInt8::Container& result); |
319 | | |
320 | | static Status constant_starts_with_fn(const LikeSearchState* state, const ColumnString& val, |
321 | | const StringRef& pattern, ColumnUInt8::Container& result); |
322 | | |
323 | | static Status constant_starts_with_fn_scalar(const LikeSearchState* state, const StringRef& val, |
324 | | const StringRef& pattern, unsigned char* result); |
325 | | |
326 | | static Status vector_starts_with_fn(const ColumnString& vals, |
327 | | const ColumnString& search_strings, |
328 | | ColumnUInt8::Container& result); |
329 | | |
330 | | static Status constant_ends_with_fn(const LikeSearchState* state, const ColumnString& val, |
331 | | const StringRef& pattern, ColumnUInt8::Container& result); |
332 | | |
333 | | static Status constant_ends_with_fn_scalar(const LikeSearchState* state, const StringRef& val, |
334 | | const StringRef& pattern, unsigned char* result); |
335 | | |
336 | | static Status vector_ends_with_fn(const ColumnString& vals, const ColumnString& search_strings, |
337 | | ColumnUInt8::Container& result); |
338 | | |
339 | | static Status constant_equals_fn(const LikeSearchState* state, const ColumnString& val, |
340 | | const StringRef& pattern, ColumnUInt8::Container& result); |
341 | | |
342 | | static Status constant_equals_fn_scalar(const LikeSearchState* state, const StringRef& val, |
343 | | const StringRef& pattern, unsigned char* result); |
344 | | |
345 | | static Status vector_equals_fn(const ColumnString& vals, const ColumnString& search_strings, |
346 | | ColumnUInt8::Container& result); |
347 | | |
348 | | static Status constant_substring_fn(const LikeSearchState* state, const ColumnString& val, |
349 | | const StringRef& pattern, ColumnUInt8::Container& result); |
350 | | |
351 | | static Status constant_substring_fn_scalar(const LikeSearchState* state, const StringRef& val, |
352 | | const StringRef& pattern, unsigned char* result); |
353 | | |
354 | | static Status vector_substring_fn(const ColumnString& vals, const ColumnString& search_strings, |
355 | | ColumnUInt8::Container& result); |
356 | | |
357 | | static Status constant_regex_fn(const LikeSearchState* state, const ColumnString& val, |
358 | | const StringRef& pattern, ColumnUInt8::Container& result); |
359 | | |
360 | | static Status constant_regex_fn_scalar(const LikeSearchState* state, const StringRef& val, |
361 | | const StringRef& pattern, unsigned char* result); |
362 | | |
363 | | static Status regexp_fn(const LikeSearchState* state, const ColumnString& val, |
364 | | const StringRef& pattern, ColumnUInt8::Container& result); |
365 | | |
366 | | static Status regexp_fn_scalar(const LikeSearchState* state, const StringRef& val, |
367 | | const StringRef& pattern, unsigned char* result); |
368 | | |
369 | | // hyperscan compile expression to database and allocate scratch space |
370 | | static Status hs_prepare(FunctionContext* context, const char* expression, |
371 | | hs_database_t** database, hs_scratch_t** scratch); |
372 | | }; |
373 | | |
374 | | class FunctionLike : public FunctionLikeBase { |
375 | | public: |
376 | | static constexpr auto name = "like"; |
377 | | |
378 | 540 | static FunctionPtr create() { return std::make_shared<FunctionLike>(); } |
379 | | |
380 | 0 | String get_name() const override { return name; } |
381 | | |
382 | | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override; |
383 | | |
384 | | static Status construct_like_const_state(FunctionContext* ctx, const StringRef& pattern, |
385 | | std::shared_ptr<LikeState>& state, |
386 | | bool try_hyperscan = true); |
387 | | |
388 | | friend struct LikeSearchState; |
389 | | friend struct VectorAllpassSearchState; |
390 | | friend struct VectorEqualSearchState; |
391 | | friend struct VectorSubStringSearchState; |
392 | | friend struct VectorStartsWithSearchState; |
393 | | friend struct VectorEndsWithSearchState; |
394 | | |
395 | | private: |
396 | | static Status like_fn(const LikeSearchState* state, const ColumnString& val, |
397 | | const StringRef& pattern, ColumnUInt8::Container& result); |
398 | | |
399 | | static Status like_fn_scalar(const LikeSearchState* state, const StringRef& val, |
400 | | const StringRef& pattern, unsigned char* result); |
401 | | |
402 | | static void convert_like_pattern(const LikeSearchState* state, const std::string& pattern, |
403 | | std::string* re_pattern); |
404 | | |
405 | | static void remove_escape_character(std::string* search_string); |
406 | | }; |
407 | | |
408 | | class FunctionRegexpLike : public FunctionLikeBase { |
409 | | public: |
410 | | static constexpr auto name = "regexp"; |
411 | | static constexpr auto alias = "rlike"; |
412 | | |
413 | 63 | static FunctionPtr create() { return std::make_shared<FunctionRegexpLike>(); } |
414 | | |
415 | 0 | String get_name() const override { return name; } |
416 | | |
417 | | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override; |
418 | | }; |
419 | | |
420 | | } // namespace doris::vectorized |