/root/doris/be/src/vec/functions/like.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
#include <hs/hs_common.h>
#include <hs/hs_runtime.h>
#include <re2/re2.h>
#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <boost/iterator/iterator_facade.hpp>
#include <boost/regex.hpp>
#include <functional>
#include <memory>
#include <string>
#include <utility>

#include "common/status.h"
#include "runtime/define_primitive_type.h"
#include "runtime/string_search.hpp"
#include "udf/udf.h"
#include "vec/aggregate_functions/aggregate_function.h"
#include "vec/columns/column_string.h"
#include "vec/columns/predicate_column.h"
#include "vec/common/string_ref.h"
#include "vec/core/column_numbers.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type_number.h"
#include "vec/functions/function.h"
45 | | |
// Forward declaration only: this header takes Block by reference, so the
// complete type is not required here.
namespace doris::vectorized {
class Block;
} // namespace doris::vectorized
51 | | |
52 | | namespace doris::vectorized { |
53 | | |
54 | 8 | inline std::string replace_pattern_by_escape(const StringRef& pattern, char escape_char) { |
55 | 8 | std::string result; |
56 | 8 | result.reserve(pattern.size); |
57 | 59 | for (size_t i = 0; i < pattern.size; ++i) { |
58 | 51 | if (i + 1 < pattern.size && pattern.data[i] == escape_char && |
59 | 51 | (pattern.data[i + 1] == escape_char || pattern.data[i + 1] == '%' || |
60 | 13 | pattern.data[i + 1] == '_')) { |
61 | | // "^^" -> "^" |
62 | | // "^%" -> "\%" |
63 | | // "^_" -> "\_" |
64 | 10 | if ((pattern.data[i + 1] == '%' || pattern.data[i + 1] == '_')) { |
65 | 4 | result.push_back('\\'); |
66 | 4 | } |
67 | 10 | result.push_back(pattern.data[i + 1]); |
68 | 10 | ++i; // skip next char |
69 | 41 | } else if (pattern.data[i] == '\\') { |
70 | | // "\" -> "\\" |
71 | 1 | result.append("\\\\"); |
72 | 40 | } else { |
73 | 40 | result.push_back(pattern.data[i]); |
74 | 40 | } |
75 | 51 | } |
76 | 8 | return result; |
77 | 8 | } |
78 | | |
79 | | // TODO: replace with std::string_view when `LikeSearchState.substring_pattern` can |
80 | | // construct from std::string_view. |
81 | | struct LikeSearchState { |
82 | | static constexpr char escape_char = '\\'; |
83 | | |
84 | | /// Holds the string the StringRef points to and is set any time StringRef is |
85 | | /// used. |
86 | | std::string search_string; |
87 | | |
88 | | std::string pattern_str; |
89 | | |
90 | | /// Used for LIKE predicates if the pattern is a constant argument, and is either a |
91 | | /// constant string or has a constant string at the beginning or end of the pattern. |
92 | | /// This will be set in order to check for that pattern in the corresponding part of |
93 | | /// the string. |
94 | | StringRef search_string_sv; |
95 | | |
96 | | /// Used for LIKE predicates if the pattern is a constant argument and has a constant |
97 | | /// string in the middle of it. This will be use in order to check for the substring |
98 | | /// in the value. |
99 | | doris::StringSearch substring_pattern; |
100 | | |
101 | | /// Used for RLIKE and REGEXP predicates if the pattern is a constant argument. |
102 | | std::unique_ptr<re2::RE2> regex; |
103 | | |
104 | | /// Used for REGEXP predicates when RE2 doesn't support the pattern (e.g., zero-width assertions like `?=`, `?!`, `?<=`, `?<!`) |
105 | | std::unique_ptr<boost::regex> boost_regex; |
106 | | |
107 | | template <typename Deleter, Deleter deleter> |
108 | | struct HyperscanDeleter { |
109 | | template <typename T> |
110 | 264 | void operator()(T* ptr) const { |
111 | 264 | deleter(ptr); |
112 | 264 | } _ZNK5doris10vectorized15LikeSearchState16HyperscanDeleterIPFiP10hs_scratchEXadL_Z15hs_free_scratchEEEclIS3_EEvPT_ Line | Count | Source | 110 | 132 | void operator()(T* ptr) const { | 111 | 132 | deleter(ptr); | 112 | 132 | } |
_ZNK5doris10vectorized15LikeSearchState16HyperscanDeleterIPFiP11hs_databaseEXadL_Z16hs_free_databaseEEEclIS3_EEvPT_ Line | Count | Source | 110 | 132 | void operator()(T* ptr) const { | 111 | 132 | deleter(ptr); | 112 | 132 | } |
|
113 | | }; |
114 | | |
115 | | // hyperscan compiled pattern database and scratch space, reused for performance |
116 | | std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>> |
117 | | hs_database; |
118 | | std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>> |
119 | | hs_scratch; |
120 | | |
121 | | // hyperscan match callback |
122 | | static int hs_match_handler(unsigned int /* from */, // NOLINT |
123 | | unsigned long long /* from */, // NOLINT |
124 | | unsigned long long /* to */, // NOLINT |
125 | 71 | unsigned int /* flags */, void* ctx) { |
126 | | // set result to 1 for matched row |
127 | 71 | *((unsigned char*)ctx) = 1; |
128 | | /// return non-zero to indicate hyperscan stop after first matched |
129 | 71 | return 1; |
130 | 71 | } |
131 | | |
132 | 597 | LikeSearchState() = default; |
133 | | |
134 | | Status clone(LikeSearchState& cloned); |
135 | | |
136 | 276 | void set_search_string(const std::string& search_string_arg) { |
137 | 276 | search_string = search_string_arg; |
138 | 276 | search_string_sv = StringRef(search_string); |
139 | 276 | substring_pattern.set_pattern(&search_string_sv); |
140 | 276 | } |
141 | | }; |
142 | | |
/// Callable taking (search state, string column, pattern, result container) and
/// returning a Status. The FunctionLikeBase::constant_*_fn helpers declared in this
/// file have this signature.
using LikeFn = std::function<doris::Status(const LikeSearchState*, const ColumnString&,
                                           const StringRef&, ColumnUInt8::Container&)>;

/// Single-value counterpart of LikeFn: takes (search state, value, pattern, pointer
/// to the result byte). The FunctionLikeBase::*_fn_scalar helpers have this signature.
using ScalarLikeFn = std::function<doris::Status(const LikeSearchState*, const StringRef&,
                                                 const StringRef&, unsigned char*)>;

/// Callable taking two string columns and a result container. The
/// FunctionLikeBase::vector_*_fn helpers have this signature, where the second
/// column is named `search_strings`.
using VectorLikeFn = std::function<doris::Status(const ColumnString&, const ColumnString&,
                                                 ColumnUInt8::Container&)>;
151 | | |
152 | | struct LikeState { |
153 | | bool is_like_pattern; |
154 | | bool has_custom_escape = false; |
155 | | char escape_char = {}; |
156 | | LikeSearchState search_state; |
157 | | LikeFn function; |
158 | | ScalarLikeFn scalar_function; |
159 | | }; |
160 | | |
161 | | struct VectorPatternSearchState { |
162 | | MutableColumnPtr _search_strings; |
163 | | std::string _search_string; |
164 | | VectorLikeFn _vector_function; |
165 | | bool _pattern_matched; |
166 | | |
167 | | VectorPatternSearchState(VectorLikeFn vector_function) |
168 | 1.53k | : _search_strings(ColumnString::create()), |
169 | 1.53k | _vector_function(vector_function), |
170 | 1.53k | _pattern_matched(true) {} |
171 | | |
172 | 1.53k | virtual ~VectorPatternSearchState() = default; |
173 | | |
174 | | virtual void like_pattern_match(const std::string& pattern_str) = 0; |
175 | | |
176 | | virtual void regexp_pattern_match(const std::string& pattern_str) = 0; |
177 | | }; |
178 | | |
179 | | using VPatternSearchStateSPtr = std::shared_ptr<VectorPatternSearchState>; |
180 | | |
/// Common base of FunctionLike and FunctionRegexpLike. Declares the execution
/// entry point and the static matcher helpers; the definitions live outside this
/// header.
class FunctionLikeBase : public IFunction {
public:
    // Reported as 0 because the function is declared variadic (see is_variadic()).
    size_t get_number_of_arguments() const override { return 0; }
    bool is_variadic() const override { return true; }

    // The result type is always DataTypeUInt8, whatever the argument types are.
    DataTypePtr get_return_type_impl(const DataTypes& /*arguments*/) const override {
        return std::make_shared<DataTypeUInt8>();
    }

    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
                        uint32_t result, size_t /*input_rows_count*/) const override;

    // These search-state structs are declared elsewhere; friendship gives them
    // access to the protected static helpers below.
    friend struct VectorAllpassSearchState;
    friend struct VectorEqualSearchState;
    friend struct VectorSubStringSearchState;
    friend struct VectorStartsWithSearchState;
    friend struct VectorEndsWithSearchState;

protected:
    // Takes a values column and a single pattern, plus the LikeFn and search state
    // to use. NOTE(review): presumably the constant-pattern path -- confirm in the
    // definition.
    Status vector_const(const ColumnString& values, const StringRef* pattern_val,
                        ColumnUInt8::Container& result, const LikeFn& function,
                        LikeSearchState* search_state) const;

    // Takes a values column and a patterns column. NOTE(review): presumably the
    // per-row-pattern path -- confirm in the definition.
    Status vector_non_const(const ColumnString& values, const ColumnString& patterns,
                            ColumnUInt8::Container& result, LikeState* state,
                            size_t input_rows_count) const;

    // Operates directly on a string column's raw chars and offsets.
    Status execute_substring(const ColumnString::Chars& values,
                             const ColumnString::Offsets& value_offsets,
                             ColumnUInt8::Container& result, LikeSearchState* search_state) const;

    // Returns a shared VectorPatternSearchState for the given patterns column;
    // LIKE_PATTERN selects the variant at compile time.
    template <bool LIKE_PATTERN>
    static VPatternSearchStateSPtr pattern_type_recognition(const ColumnString& patterns);

    // The helpers below come in up to three forms per matching strategy:
    //   constant_*_fn         -- LikeFn signature
    //   constant_*_fn_scalar  -- ScalarLikeFn signature
    //   vector_*_fn           -- VectorLikeFn signature
    // NOTE(review): the strategy each family implements (allpass, starts_with,
    // ends_with, equals, substring, regex) is taken from the names; confirm the
    // exact semantics in the definitions.

    static Status constant_allpass_fn(const LikeSearchState* state, const ColumnString& val,
                                      const StringRef& pattern, ColumnUInt8::Container& result);

    static Status constant_allpass_fn_scalar(const LikeSearchState* state, const StringRef& val,
                                             const StringRef& pattern, unsigned char* result);

    static Status vector_allpass_fn(const ColumnString& vals, const ColumnString& search_strings,
                                    ColumnUInt8::Container& result);

    static Status constant_starts_with_fn(const LikeSearchState* state, const ColumnString& val,
                                          const StringRef& pattern, ColumnUInt8::Container& result);

    static Status constant_starts_with_fn_scalar(const LikeSearchState* state, const StringRef& val,
                                                 const StringRef& pattern, unsigned char* result);

    static Status vector_starts_with_fn(const ColumnString& vals,
                                        const ColumnString& search_strings,
                                        ColumnUInt8::Container& result);

    static Status constant_ends_with_fn(const LikeSearchState* state, const ColumnString& val,
                                        const StringRef& pattern, ColumnUInt8::Container& result);

    static Status constant_ends_with_fn_scalar(const LikeSearchState* state, const StringRef& val,
                                               const StringRef& pattern, unsigned char* result);

    static Status vector_ends_with_fn(const ColumnString& vals, const ColumnString& search_strings,
                                      ColumnUInt8::Container& result);

    static Status constant_equals_fn(const LikeSearchState* state, const ColumnString& val,
                                     const StringRef& pattern, ColumnUInt8::Container& result);

    static Status constant_equals_fn_scalar(const LikeSearchState* state, const StringRef& val,
                                            const StringRef& pattern, unsigned char* result);

    static Status vector_equals_fn(const ColumnString& vals, const ColumnString& search_strings,
                                   ColumnUInt8::Container& result);

    static Status constant_substring_fn(const LikeSearchState* state, const ColumnString& val,
                                        const StringRef& pattern, ColumnUInt8::Container& result);

    static Status constant_substring_fn_scalar(const LikeSearchState* state, const StringRef& val,
                                               const StringRef& pattern, unsigned char* result);

    static Status vector_substring_fn(const ColumnString& vals, const ColumnString& search_strings,
                                      ColumnUInt8::Container& result);

    static Status constant_regex_fn(const LikeSearchState* state, const ColumnString& val,
                                    const StringRef& pattern, ColumnUInt8::Container& result);

    static Status constant_regex_fn_scalar(const LikeSearchState* state, const StringRef& val,
                                           const StringRef& pattern, unsigned char* result);

    // regexp_fn / regexp_fn_scalar share the LikeFn / ScalarLikeFn signatures.
    // NOTE(review): how they differ from constant_regex_fn is not visible here.
    static Status regexp_fn(const LikeSearchState* state, const ColumnString& val,
                            const StringRef& pattern, ColumnUInt8::Container& result);

    static Status regexp_fn_scalar(const LikeSearchState* state, const StringRef& val,
                                   const StringRef& pattern, unsigned char* result);

    // hyperscan compile expression to database and allocate scratch space
    // The compiled database and scratch space are returned through the two
    // out-pointers.
    static Status hs_prepare(FunctionContext* context, const char* expression,
                             hs_database_t** database, hs_scratch_t** scratch);
};
277 | | |
/// The "like" function. Adds LIKE-specific helpers on top of FunctionLikeBase.
class FunctionLike : public FunctionLikeBase {
public:
    static constexpr auto name = "like";

    // Factory returning a new FunctionLike behind a FunctionPtr.
    static FunctionPtr create() { return std::make_shared<FunctionLike>(); }

    String get_name() const override { return name; }

    Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override;

    // Takes a pattern and a shared LikeState passed by non-const reference.
    // NOTE(review): presumably fills `state` for a constant pattern, with
    // `try_hyperscan` (default true) controlling whether the Hyperscan path is
    // attempted -- confirm in the definition.
    static Status construct_like_const_state(FunctionContext* ctx, const StringRef& pattern,
                                             std::shared_ptr<LikeState>& state,
                                             bool try_hyperscan = true);

    // Friendship gives these structs access to the private helpers below.
    friend struct LikeSearchState;
    friend struct VectorAllpassSearchState;
    friend struct VectorEqualSearchState;
    friend struct VectorSubStringSearchState;
    friend struct VectorStartsWithSearchState;
    friend struct VectorEndsWithSearchState;

private:
    // Same parameter list as LikeFn.
    static Status like_fn(const LikeSearchState* state, const ColumnString& val,
                          const StringRef& pattern, ColumnUInt8::Container& result);

    // Same parameter list as ScalarLikeFn.
    static Status like_fn_scalar(const LikeSearchState* state, const StringRef& val,
                                 const StringRef& pattern, unsigned char* result);

    // Writes its output through `re_pattern`.
    // NOTE(review): presumably translates a LIKE pattern into a regular
    // expression -- confirm in the definition.
    static void convert_like_pattern(const LikeSearchState* state, const std::string& pattern,
                                     std::string* re_pattern);

    // Modifies `*search_string` in place.
    static void remove_escape_character(std::string* search_string);
};
311 | | |
/// The "regexp" function; `alias` carries the alternative name "rlike".
/// NOTE(review): presumably registered under both names -- confirm at the
/// registration site.
class FunctionRegexpLike : public FunctionLikeBase {
public:
    static constexpr auto name = "regexp";
    static constexpr auto alias = "rlike";

    // Factory returning a new FunctionRegexpLike behind a FunctionPtr.
    static FunctionPtr create() { return std::make_shared<FunctionRegexpLike>(); }

    String get_name() const override { return name; }

    Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override;
};
323 | | |
324 | | } // namespace doris::vectorized |