be/src/exprs/function/like.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <hs/hs_common.h> |
21 | | #include <hs/hs_runtime.h> |
22 | | #include <re2/re2.h> |
23 | | |
24 | | #include <algorithm> |
25 | | #include <boost/iterator/iterator_facade.hpp> |
26 | | #include <boost/regex.hpp> |
27 | | #include <cstddef> |
28 | | #include <cstdint> |
29 | | #include <functional> |
30 | | #include <memory> |
31 | | #include <string> |
32 | | |
33 | | #include "common/status.h" |
34 | | #include "core/block/column_numbers.h" |
35 | | #include "core/column/column_string.h" |
36 | | #include "core/data_type/data_type_number.h" |
37 | | #include "core/data_type/define_primitive_type.h" |
38 | | #include "core/string_ref.h" |
39 | | #include "core/types.h" |
40 | | #include "exprs/aggregate/aggregate_function.h" |
41 | | #include "exprs/function/function.h" |
42 | | #include "exprs/function_context.h" |
43 | | #include "util/string_search.hpp" |
44 | | |
45 | | namespace doris { |
46 | | class Block; |
47 | | |
48 | | // FastPath types for LIKE pattern matching optimization |
49 | | // This allows per-row pattern analysis to avoid regex when possible |
50 | | enum class LikeFastPath { |
51 | | ALLPASS, // Pattern is just '%' or '%%...' - matches everything |
52 | | EQUALS, // No wildcards - exact string match |
53 | | STARTS_WITH, // Pattern ends with '%' only - prefix match |
54 | | ENDS_WITH, // Pattern starts with '%' only - suffix match |
55 | | SUBSTRING, // Pattern is '%xxx%' - substring search |
56 | | REGEX // Contains '_' or multiple '%' - needs regex |
57 | | }; |
58 | | |
59 | | // Lightweight pattern analysis without RE2 |
60 | | // Returns the fast path type and extracts the search string (without wildcards) |
61 | | // Correctly handles escape sequences: backslash-% -> literal %, backslash-_ -> literal _ |
62 | | inline LikeFastPath extract_like_fast_path(const char* pattern, size_t len, |
63 | 209 | std::string& search_string) { |
64 | 209 | search_string.clear(); |
65 | 209 | if (len == 0) { |
66 | 14 | return LikeFastPath::EQUALS; |
67 | 14 | } |
68 | | |
69 | | // Returns true if the character is NOT escaped (even number of preceding backslashes) |
70 | 195 | auto is_unescaped = [&pattern](size_t pos) -> bool { |
71 | 115 | size_t backslash_count = 0; |
72 | 132 | while (pos > 0 && pattern[pos - 1] == '\\') { |
73 | 17 | backslash_count++; |
74 | 17 | pos--; |
75 | 17 | } |
76 | 115 | return (backslash_count % 2 == 0); |
77 | 115 | }; |
78 | | |
79 | 195 | bool starts_with_percent = (pattern[0] == '%'); |
80 | 195 | bool ends_with_percent = (pattern[len - 1] == '%' && is_unescaped(len - 1)); |
81 | | |
82 | | // Quick check: if starts or ends with unescaped '_', need regex |
83 | 195 | if (pattern[0] == '_') { |
84 | 19 | return LikeFastPath::REGEX; |
85 | 19 | } |
86 | 176 | if (pattern[len - 1] == '_' && is_unescaped(len - 1)) { |
87 | 26 | return LikeFastPath::REGEX; |
88 | 26 | } |
89 | | |
90 | | // Helper lambda: check if character is a wildcard that needs escaping |
91 | 150 | auto is_wildcard = [](char c) { return c == '%' || c == '_' || c == '\\'; }; |
92 | | |
93 | 150 | size_t i = 0; |
94 | | // Skip leading '%' characters (unescaped) |
95 | 209 | while (i < len && pattern[i] == '%') { |
96 | 59 | i++; |
97 | 59 | } |
98 | | // If pattern is all '%', it's ALLPASS |
99 | 150 | if (i >= len) { |
100 | 6 | return LikeFastPath::ALLPASS; |
101 | 6 | } |
102 | | |
103 | 144 | search_string.reserve(len); |
104 | 574 | while (i < len) { |
105 | 519 | char c = pattern[i]; |
106 | | // Escaped character - add the literal |
107 | 519 | if (c == '\\' && i + 1 < len && is_wildcard(pattern[i + 1])) { |
108 | 31 | search_string.push_back(pattern[i + 1]); |
109 | 31 | i += 2; |
110 | 31 | continue; |
111 | 31 | } |
112 | | |
113 | | // Unescaped '_' requires regex |
114 | 488 | if (c == '_') { |
115 | 33 | return LikeFastPath::REGEX; |
116 | 33 | } |
117 | | |
118 | | // Check for trailing '%' or middle '%' (which needs regex) |
119 | 455 | if (c == '%') { |
120 | | // Check if this is a trailing '%' sequence |
121 | 56 | size_t j = i; |
122 | 112 | while (j < len && pattern[j] == '%') { |
123 | 56 | j++; |
124 | 56 | } |
125 | 56 | if (j >= len) { |
126 | | // All remaining chars are '%', we're done parsing |
127 | 44 | break; |
128 | 44 | } |
129 | | // '%' in the middle with more content after - need regex |
130 | 12 | return LikeFastPath::REGEX; |
131 | 56 | } |
132 | | |
133 | 399 | search_string.push_back(c); |
134 | 399 | i++; |
135 | 399 | } |
136 | | |
137 | | // Determine the pattern type based on '%' positions |
138 | 99 | if (starts_with_percent && ends_with_percent) { |
139 | 28 | return LikeFastPath::SUBSTRING; |
140 | 71 | } else if (starts_with_percent) { |
141 | 10 | return LikeFastPath::ENDS_WITH; |
142 | 61 | } else if (ends_with_percent) { |
143 | 16 | return LikeFastPath::STARTS_WITH; |
144 | 45 | } else { |
145 | 45 | return LikeFastPath::EQUALS; |
146 | 45 | } |
147 | 99 | } |
148 | | |
149 | 8 | inline std::string replace_pattern_by_escape(const StringRef& pattern, char escape_char) { |
150 | 8 | std::string result; |
151 | 8 | result.reserve(pattern.size); |
152 | 59 | for (size_t i = 0; i < pattern.size; ++i) { |
153 | 51 | if (i + 1 < pattern.size && pattern.data[i] == escape_char && |
154 | 51 | (pattern.data[i + 1] == escape_char || pattern.data[i + 1] == '%' || |
155 | 13 | pattern.data[i + 1] == '_')) { |
156 | | // "^^" -> "^" |
157 | | // "^%" -> "\%" |
158 | | // "^_" -> "\_" |
159 | 10 | if ((pattern.data[i + 1] == '%' || pattern.data[i + 1] == '_')) { |
160 | 4 | result.push_back('\\'); |
161 | 4 | } |
162 | 10 | result.push_back(pattern.data[i + 1]); |
163 | 10 | ++i; // skip next char |
164 | 41 | } else if (pattern.data[i] == '\\') { |
165 | | // "\" -> "\\" |
166 | 1 | result.append("\\\\"); |
167 | 40 | } else { |
168 | 40 | result.push_back(pattern.data[i]); |
169 | 40 | } |
170 | 51 | } |
171 | 8 | return result; |
172 | 8 | } |
173 | | |
174 | | // TODO: replace with std::string_view when `LikeSearchState.substring_pattern` can |
175 | | // construct from std::string_view. |
176 | | struct LikeSearchState { |
177 | | static constexpr char escape_char = '\\'; |
178 | | |
179 | | /// Holds the string the StringRef points to and is set any time StringRef is |
180 | | /// used. |
181 | | std::string search_string; |
182 | | |
183 | | std::string pattern_str; |
184 | | |
185 | | /// Used for LIKE predicates if the pattern is a constant argument, and is either a |
186 | | /// constant string or has a constant string at the beginning or end of the pattern. |
187 | | /// This will be set in order to check for that pattern in the corresponding part of |
188 | | /// the string. |
189 | | StringRef search_string_sv; |
190 | | |
191 | | /// Used for LIKE predicates if the pattern is a constant argument and has a constant |
192 | | /// string in the middle of it. This will be use in order to check for the substring |
193 | | /// in the value. |
194 | | doris::StringSearch substring_pattern; |
195 | | |
196 | | /// Used for RLIKE and REGEXP predicates if the pattern is a constant argument. |
197 | | std::unique_ptr<re2::RE2> regex; |
198 | | |
199 | | /// Used for REGEXP predicates when RE2 doesn't support the pattern (e.g., zero-width assertions like `?=`, `?!`, `?<=`, `?<!`) |
200 | | std::unique_ptr<boost::regex> boost_regex; |
201 | | |
202 | | template <typename Deleter, Deleter deleter> |
203 | | struct HyperscanDeleter { |
204 | | template <typename T> |
205 | 264 | void operator()(T* ptr) const { |
206 | 264 | deleter(ptr); |
207 | 264 | } _ZNK5doris15LikeSearchState16HyperscanDeleterIPFiP10hs_scratchEXadL_Z15hs_free_scratchEEEclIS2_EEvPT_ Line | Count | Source | 205 | 132 | void operator()(T* ptr) const { | 206 | 132 | deleter(ptr); | 207 | 132 | } |
_ZNK5doris15LikeSearchState16HyperscanDeleterIPFiP11hs_databaseEXadL_Z16hs_free_databaseEEEclIS2_EEvPT_ Line | Count | Source | 205 | 132 | void operator()(T* ptr) const { | 206 | 132 | deleter(ptr); | 207 | 132 | } |
|
208 | | }; |
209 | | |
210 | | // hyperscan compiled pattern database and scratch space, reused for performance |
211 | | std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>> |
212 | | hs_database; |
213 | | std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>> |
214 | | hs_scratch; |
215 | | |
216 | | // hyperscan match callback |
217 | | static int hs_match_handler(unsigned int /* from */, // NOLINT |
218 | | unsigned long long /* from */, // NOLINT |
219 | | unsigned long long /* to */, // NOLINT |
220 | 71 | unsigned int /* flags */, void* ctx) { |
221 | | // set result to 1 for matched row |
222 | 71 | *((unsigned char*)ctx) = 1; |
223 | | /// return non-zero to indicate hyperscan stop after first matched |
224 | 71 | return 1; |
225 | 71 | } |
226 | | |
227 | 597 | LikeSearchState() = default; |
228 | | |
229 | | Status clone(LikeSearchState& cloned); |
230 | | |
231 | 276 | void set_search_string(const std::string& search_string_arg) { |
232 | 276 | search_string = search_string_arg; |
233 | 276 | search_string_sv = StringRef(search_string); |
234 | 276 | substring_pattern.set_pattern(&search_string_sv); |
235 | 276 | } |
236 | | }; |
237 | | |
238 | | using LikeFn = std::function<doris::Status(const LikeSearchState*, const ColumnString&, |
239 | | const StringRef&, ColumnUInt8::Container&)>; |
240 | | |
241 | | using ScalarLikeFn = std::function<doris::Status(const LikeSearchState*, const StringRef&, |
242 | | const StringRef&, unsigned char*)>; |
243 | | |
244 | | using VectorLikeFn = std::function<doris::Status(const ColumnString&, const ColumnString&, |
245 | | ColumnUInt8::Container&)>; |
246 | | |
247 | | struct LikeState { |
248 | | bool is_like_pattern; |
249 | | bool has_custom_escape = false; |
250 | | char escape_char = {}; |
251 | | LikeSearchState search_state; |
252 | | LikeFn function; |
253 | | ScalarLikeFn scalar_function; |
254 | | }; |
255 | | |
256 | | struct VectorPatternSearchState { |
257 | | MutableColumnPtr _search_strings; |
258 | | std::string _search_string; |
259 | | VectorLikeFn _vector_function; |
260 | | bool _pattern_matched; |
261 | | |
262 | | VectorPatternSearchState(VectorLikeFn vector_function) |
263 | 1.53k | : _search_strings(ColumnString::create()), |
264 | 1.53k | _vector_function(vector_function), |
265 | 1.53k | _pattern_matched(true) {} |
266 | | |
267 | 1.53k | virtual ~VectorPatternSearchState() = default; |
268 | | |
269 | | virtual void like_pattern_match(const std::string& pattern_str) = 0; |
270 | | |
271 | | virtual void regexp_pattern_match(const std::string& pattern_str) = 0; |
272 | | }; |
273 | | |
274 | | using VPatternSearchStateSPtr = std::shared_ptr<VectorPatternSearchState>; |
275 | | |
276 | | class FunctionLikeBase : public IFunction { |
277 | | public: |
278 | 0 | size_t get_number_of_arguments() const override { return 0; } |
279 | 601 | bool is_variadic() const override { return true; } |
280 | | |
281 | 599 | DataTypePtr get_return_type_impl(const DataTypes& /*arguments*/) const override { |
282 | 599 | return std::make_shared<DataTypeUInt8>(); |
283 | 599 | } |
284 | | |
285 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
286 | | uint32_t result, size_t /*input_rows_count*/) const override; |
287 | | |
288 | | friend struct VectorAllpassSearchState; |
289 | | friend struct VectorEqualSearchState; |
290 | | friend struct VectorSubStringSearchState; |
291 | | friend struct VectorStartsWithSearchState; |
292 | | friend struct VectorEndsWithSearchState; |
293 | | |
294 | | protected: |
295 | | Status vector_const(const ColumnString& values, const StringRef* pattern_val, |
296 | | ColumnUInt8::Container& result, const LikeFn& function, |
297 | | LikeSearchState* search_state) const; |
298 | | |
299 | | Status vector_non_const(const ColumnString& values, const ColumnString& patterns, |
300 | | ColumnUInt8::Container& result, LikeState* state, |
301 | | size_t input_rows_count) const; |
302 | | |
303 | | Status execute_substring(const ColumnString::Chars& values, |
304 | | const ColumnString::Offsets& value_offsets, |
305 | | ColumnUInt8::Container& result, LikeSearchState* search_state) const; |
306 | | |
307 | | template <bool LIKE_PATTERN> |
308 | | static VPatternSearchStateSPtr pattern_type_recognition(const ColumnString& patterns); |
309 | | |
310 | | static Status constant_allpass_fn(const LikeSearchState* state, const ColumnString& val, |
311 | | const StringRef& pattern, ColumnUInt8::Container& result); |
312 | | |
313 | | static Status constant_allpass_fn_scalar(const LikeSearchState* state, const StringRef& val, |
314 | | const StringRef& pattern, unsigned char* result); |
315 | | |
316 | | static Status vector_allpass_fn(const ColumnString& vals, const ColumnString& search_strings, |
317 | | ColumnUInt8::Container& result); |
318 | | |
319 | | static Status constant_starts_with_fn(const LikeSearchState* state, const ColumnString& val, |
320 | | const StringRef& pattern, ColumnUInt8::Container& result); |
321 | | |
322 | | static Status constant_starts_with_fn_scalar(const LikeSearchState* state, const StringRef& val, |
323 | | const StringRef& pattern, unsigned char* result); |
324 | | |
325 | | static Status vector_starts_with_fn(const ColumnString& vals, |
326 | | const ColumnString& search_strings, |
327 | | ColumnUInt8::Container& result); |
328 | | |
329 | | static Status constant_ends_with_fn(const LikeSearchState* state, const ColumnString& val, |
330 | | const StringRef& pattern, ColumnUInt8::Container& result); |
331 | | |
332 | | static Status constant_ends_with_fn_scalar(const LikeSearchState* state, const StringRef& val, |
333 | | const StringRef& pattern, unsigned char* result); |
334 | | |
335 | | static Status vector_ends_with_fn(const ColumnString& vals, const ColumnString& search_strings, |
336 | | ColumnUInt8::Container& result); |
337 | | |
338 | | static Status constant_equals_fn(const LikeSearchState* state, const ColumnString& val, |
339 | | const StringRef& pattern, ColumnUInt8::Container& result); |
340 | | |
341 | | static Status constant_equals_fn_scalar(const LikeSearchState* state, const StringRef& val, |
342 | | const StringRef& pattern, unsigned char* result); |
343 | | |
344 | | static Status vector_equals_fn(const ColumnString& vals, const ColumnString& search_strings, |
345 | | ColumnUInt8::Container& result); |
346 | | |
347 | | static Status constant_substring_fn(const LikeSearchState* state, const ColumnString& val, |
348 | | const StringRef& pattern, ColumnUInt8::Container& result); |
349 | | |
350 | | static Status constant_substring_fn_scalar(const LikeSearchState* state, const StringRef& val, |
351 | | const StringRef& pattern, unsigned char* result); |
352 | | |
353 | | static Status vector_substring_fn(const ColumnString& vals, const ColumnString& search_strings, |
354 | | ColumnUInt8::Container& result); |
355 | | |
356 | | static Status constant_regex_fn(const LikeSearchState* state, const ColumnString& val, |
357 | | const StringRef& pattern, ColumnUInt8::Container& result); |
358 | | |
359 | | static Status constant_regex_fn_scalar(const LikeSearchState* state, const StringRef& val, |
360 | | const StringRef& pattern, unsigned char* result); |
361 | | |
362 | | static Status regexp_fn(const LikeSearchState* state, const ColumnString& val, |
363 | | const StringRef& pattern, ColumnUInt8::Container& result); |
364 | | |
365 | | static Status regexp_fn_scalar(const LikeSearchState* state, const StringRef& val, |
366 | | const StringRef& pattern, unsigned char* result); |
367 | | |
368 | | // hyperscan compile expression to database and allocate scratch space |
369 | | static Status hs_prepare(FunctionContext* context, const char* expression, |
370 | | hs_database_t** database, hs_scratch_t** scratch); |
371 | | }; |
372 | | |
373 | | class FunctionLike : public FunctionLikeBase { |
374 | | public: |
375 | | static constexpr auto name = "like"; |
376 | | |
377 | 540 | static FunctionPtr create() { return std::make_shared<FunctionLike>(); } |
378 | | |
379 | 0 | String get_name() const override { return name; } |
380 | | |
381 | | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override; |
382 | | |
383 | | static Status construct_like_const_state(FunctionContext* ctx, const StringRef& pattern, |
384 | | std::shared_ptr<LikeState>& state, |
385 | | bool try_hyperscan = true); |
386 | | |
387 | | friend struct LikeSearchState; |
388 | | friend struct VectorAllpassSearchState; |
389 | | friend struct VectorEqualSearchState; |
390 | | friend struct VectorSubStringSearchState; |
391 | | friend struct VectorStartsWithSearchState; |
392 | | friend struct VectorEndsWithSearchState; |
393 | | |
394 | | private: |
395 | | static Status like_fn(const LikeSearchState* state, const ColumnString& val, |
396 | | const StringRef& pattern, ColumnUInt8::Container& result); |
397 | | |
398 | | static Status like_fn_scalar(const LikeSearchState* state, const StringRef& val, |
399 | | const StringRef& pattern, unsigned char* result); |
400 | | |
401 | | static void convert_like_pattern(const LikeSearchState* state, const std::string& pattern, |
402 | | std::string* re_pattern); |
403 | | |
404 | | static void remove_escape_character(std::string* search_string); |
405 | | }; |
406 | | |
407 | | class FunctionRegexpLike : public FunctionLikeBase { |
408 | | public: |
409 | | static constexpr auto name = "regexp"; |
410 | | static constexpr auto alias = "rlike"; |
411 | | |
412 | 63 | static FunctionPtr create() { return std::make_shared<FunctionRegexpLike>(); } |
413 | | |
414 | 0 | String get_name() const override { return name; } |
415 | | |
416 | | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override; |
417 | | }; |
418 | | |
419 | | } // namespace doris |