/root/doris/be/src/vec/functions/like.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
#include <hs/hs_common.h>
#include <hs/hs_runtime.h>
#include <re2/re2.h>
#include <stddef.h>
#include <stdint.h>

#include <algorithm>
#include <boost/iterator/iterator_facade.hpp>
#include <functional>
#include <memory>
#include <string>
#include <utility>

#include "common/status.h"
#include "runtime/define_primitive_type.h"
#include "runtime/string_search.hpp"
#include "udf/udf.h"
#include "vec/aggregate_functions/aggregate_function.h"
#include "vec/columns/column_string.h"
#include "vec/columns/predicate_column.h"
#include "vec/common/string_ref.h"
#include "vec/core/column_numbers.h"
#include "vec/core/types.h"
#include "vec/data_types/data_type_number.h"
#include "vec/functions/function.h"

44 | | |
// Forward declaration only: this header uses Block solely through a reference
// parameter, so the full definition is not needed here.
namespace doris::vectorized {
class Block;
} // namespace doris::vectorized
50 | | |
51 | | namespace doris::vectorized { |
52 | | |
53 | 7 | inline std::string replace_pattern_by_escape(const StringRef& pattern, char escape_char) { |
54 | 7 | std::string result; |
55 | 7 | result.reserve(pattern.size); |
56 | 57 | for (size_t i = 0; i < pattern.size; ++i) { |
57 | 50 | if (i + 1 < pattern.size && pattern.data[i] == escape_char && |
58 | 50 | (pattern.data[i + 1] == escape_char || pattern.data[i + 1] == '%' || |
59 | 12 | pattern.data[i + 1] == '_')) { |
60 | | // "^^" -> "^" |
61 | | // "^%" -> "\%" |
62 | | // "^_" -> "\_" |
63 | 9 | if ((pattern.data[i + 1] == '%' || pattern.data[i + 1] == '_')) { |
64 | 4 | result.push_back('\\'); |
65 | 4 | } |
66 | 9 | result.push_back(pattern.data[i + 1]); |
67 | 9 | ++i; // skip next char |
68 | 41 | } else if (pattern.data[i] == '\\') { |
69 | | // "\" -> "\\" |
70 | 1 | result.append("\\\\"); |
71 | 40 | } else { |
72 | 40 | result.push_back(pattern.data[i]); |
73 | 40 | } |
74 | 50 | } |
75 | 7 | return result; |
76 | 7 | } |
77 | | |
78 | | // TODO: replace with std::string_view when `LikeSearchState.substring_pattern` can |
79 | | // construct from std::string_view. |
80 | | struct LikeSearchState { |
81 | | static constexpr char escape_char = '\\'; |
82 | | |
83 | | /// Holds the string the StringRef points to and is set any time StringRef is |
84 | | /// used. |
85 | | std::string search_string; |
86 | | |
87 | | std::string pattern_str; |
88 | | |
89 | | /// Used for LIKE predicates if the pattern is a constant argument, and is either a |
90 | | /// constant string or has a constant string at the beginning or end of the pattern. |
91 | | /// This will be set in order to check for that pattern in the corresponding part of |
92 | | /// the string. |
93 | | StringRef search_string_sv; |
94 | | |
95 | | /// Used for LIKE predicates if the pattern is a constant argument and has a constant |
96 | | /// string in the middle of it. This will be use in order to check for the substring |
97 | | /// in the value. |
98 | | doris::StringSearch substring_pattern; |
99 | | |
100 | | /// Used for RLIKE and REGEXP predicates if the pattern is a constant argument. |
101 | | std::unique_ptr<re2::RE2> regex; |
102 | | |
103 | | template <typename Deleter, Deleter deleter> |
104 | | struct HyperscanDeleter { |
105 | | template <typename T> |
106 | 82 | void operator()(T* ptr) const { |
107 | 82 | deleter(ptr); |
108 | 82 | } _ZNK5doris10vectorized15LikeSearchState16HyperscanDeleterIPFiP10hs_scratchEXadL_Z15hs_free_scratchEEEclIS3_EEvPT_ Line | Count | Source | 106 | 41 | void operator()(T* ptr) const { | 107 | 41 | deleter(ptr); | 108 | 41 | } |
_ZNK5doris10vectorized15LikeSearchState16HyperscanDeleterIPFiP11hs_databaseEXadL_Z16hs_free_databaseEEEclIS3_EEvPT_ Line | Count | Source | 106 | 41 | void operator()(T* ptr) const { | 107 | 41 | deleter(ptr); | 108 | 41 | } |
|
109 | | }; |
110 | | |
111 | | // hyperscan compiled pattern database and scratch space, reused for performance |
112 | | std::unique_ptr<hs_database_t, HyperscanDeleter<decltype(&hs_free_database), &hs_free_database>> |
113 | | hs_database; |
114 | | std::unique_ptr<hs_scratch_t, HyperscanDeleter<decltype(&hs_free_scratch), &hs_free_scratch>> |
115 | | hs_scratch; |
116 | | |
117 | | // hyperscan match callback |
118 | | static int hs_match_handler(unsigned int /* from */, // NOLINT |
119 | | unsigned long long /* from */, // NOLINT |
120 | | unsigned long long /* to */, // NOLINT |
121 | 23 | unsigned int /* flags */, void* ctx) { |
122 | | // set result to 1 for matched row |
123 | 23 | *((unsigned char*)ctx) = 1; |
124 | | /// return non-zero to indicate hyperscan stop after first matched |
125 | 23 | return 1; |
126 | 23 | } |
127 | | |
128 | 153 | LikeSearchState() = default; |
129 | | |
130 | | Status clone(LikeSearchState& cloned); |
131 | | |
132 | 66 | void set_search_string(const std::string& search_string_arg) { |
133 | 66 | search_string = search_string_arg; |
134 | 66 | search_string_sv = StringRef(search_string); |
135 | 66 | substring_pattern.set_pattern(&search_string_sv); |
136 | 66 | } |
137 | | }; |
138 | | |
139 | | using LikeFn = std::function<doris::Status(LikeSearchState*, const ColumnString&, const StringRef&, |
140 | | ColumnUInt8::Container&)>; |
141 | | |
142 | | using ScalarLikeFn = std::function<doris::Status(LikeSearchState*, const StringRef&, |
143 | | const StringRef&, unsigned char*)>; |
144 | | |
145 | | using VectorLikeFn = std::function<doris::Status(const ColumnString&, const ColumnString&, |
146 | | ColumnUInt8::Container&)>; |
147 | | |
148 | | struct LikeState { |
149 | | bool is_like_pattern; |
150 | | bool has_custom_escape = false; |
151 | | char escape_char = {}; |
152 | | LikeSearchState search_state; |
153 | | LikeFn function; |
154 | | ScalarLikeFn scalar_function; |
155 | | }; |
156 | | |
157 | | struct VectorPatternSearchState { |
158 | | MutableColumnPtr _search_strings; |
159 | | std::string _search_string; |
160 | | VectorLikeFn _vector_function; |
161 | | bool _pattern_matched; |
162 | | |
163 | | VectorPatternSearchState(VectorLikeFn vector_function) |
164 | 675 | : _search_strings(ColumnString::create()), |
165 | 675 | _vector_function(vector_function), |
166 | 675 | _pattern_matched(true) {} |
167 | | |
168 | 675 | virtual ~VectorPatternSearchState() = default; |
169 | | |
170 | | virtual void like_pattern_match(const std::string& pattern_str) = 0; |
171 | | |
172 | | virtual void regexp_pattern_match(const std::string& pattern_str) = 0; |
173 | | }; |
174 | | |
175 | | using VPatternSearchStateSPtr = std::shared_ptr<VectorPatternSearchState>; |
176 | | |
177 | | class FunctionLikeBase : public IFunction { |
178 | | public: |
179 | 0 | size_t get_number_of_arguments() const override { return 0; } |
180 | 157 | bool is_variadic() const override { return true; } |
181 | | |
182 | 155 | DataTypePtr get_return_type_impl(const DataTypes& /*arguments*/) const override { |
183 | 155 | return std::make_shared<DataTypeUInt8>(); |
184 | 155 | } |
185 | | |
186 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
187 | | uint32_t result, size_t /*input_rows_count*/) const override; |
188 | | |
189 | | friend struct VectorAllpassSearchState; |
190 | | friend struct VectorEqualSearchState; |
191 | | friend struct VectorSubStringSearchState; |
192 | | friend struct VectorStartsWithSearchState; |
193 | | friend struct VectorEndsWithSearchState; |
194 | | |
195 | | protected: |
196 | | Status vector_const(const ColumnString& values, const StringRef* pattern_val, |
197 | | ColumnUInt8::Container& result, const LikeFn& function, |
198 | | LikeSearchState* search_state) const; |
199 | | |
200 | | Status vector_non_const(const ColumnString& values, const ColumnString& patterns, |
201 | | ColumnUInt8::Container& result, LikeState* state, |
202 | | size_t input_rows_count) const; |
203 | | |
204 | | Status execute_substring(const ColumnString::Chars& values, |
205 | | const ColumnString::Offsets& value_offsets, |
206 | | ColumnUInt8::Container& result, LikeSearchState* search_state) const; |
207 | | |
208 | | template <bool LIKE_PATTERN> |
209 | | static VPatternSearchStateSPtr pattern_type_recognition(const ColumnString& patterns); |
210 | | |
211 | | static Status constant_allpass_fn(LikeSearchState* state, const ColumnString& val, |
212 | | const StringRef& pattern, ColumnUInt8::Container& result); |
213 | | |
214 | | static Status constant_allpass_fn_scalar(LikeSearchState* state, const StringRef& val, |
215 | | const StringRef& pattern, unsigned char* result); |
216 | | |
217 | | static Status vector_allpass_fn(const ColumnString& vals, const ColumnString& search_strings, |
218 | | ColumnUInt8::Container& result); |
219 | | |
220 | | static Status constant_starts_with_fn(LikeSearchState* state, const ColumnString& val, |
221 | | const StringRef& pattern, ColumnUInt8::Container& result); |
222 | | |
223 | | static Status constant_starts_with_fn_scalar(LikeSearchState* state, const StringRef& val, |
224 | | const StringRef& pattern, unsigned char* result); |
225 | | |
226 | | static Status vector_starts_with_fn(const ColumnString& vals, |
227 | | const ColumnString& search_strings, |
228 | | ColumnUInt8::Container& result); |
229 | | |
230 | | static Status constant_ends_with_fn(LikeSearchState* state, const ColumnString& val, |
231 | | const StringRef& pattern, ColumnUInt8::Container& result); |
232 | | |
233 | | static Status constant_ends_with_fn_scalar(LikeSearchState* state, const StringRef& val, |
234 | | const StringRef& pattern, unsigned char* result); |
235 | | |
236 | | static Status vector_ends_with_fn(const ColumnString& vals, const ColumnString& search_strings, |
237 | | ColumnUInt8::Container& result); |
238 | | |
239 | | static Status constant_equals_fn(LikeSearchState* state, const ColumnString& val, |
240 | | const StringRef& pattern, ColumnUInt8::Container& result); |
241 | | |
242 | | static Status constant_equals_fn_scalar(LikeSearchState* state, const StringRef& val, |
243 | | const StringRef& pattern, unsigned char* result); |
244 | | |
245 | | static Status vector_equals_fn(const ColumnString& vals, const ColumnString& search_strings, |
246 | | ColumnUInt8::Container& result); |
247 | | |
248 | | static Status constant_substring_fn(LikeSearchState* state, const ColumnString& val, |
249 | | const StringRef& pattern, ColumnUInt8::Container& result); |
250 | | |
251 | | static Status constant_substring_fn_scalar(LikeSearchState* state, const StringRef& val, |
252 | | const StringRef& pattern, unsigned char* result); |
253 | | |
254 | | static Status vector_substring_fn(const ColumnString& vals, const ColumnString& search_strings, |
255 | | ColumnUInt8::Container& result); |
256 | | |
257 | | static Status constant_regex_fn(LikeSearchState* state, const ColumnString& val, |
258 | | const StringRef& pattern, ColumnUInt8::Container& result); |
259 | | |
260 | | static Status constant_regex_fn_scalar(LikeSearchState* state, const StringRef& val, |
261 | | const StringRef& pattern, unsigned char* result); |
262 | | |
263 | | static Status regexp_fn(LikeSearchState* state, const ColumnString& val, |
264 | | const StringRef& pattern, ColumnUInt8::Container& result); |
265 | | |
266 | | static Status regexp_fn_scalar(LikeSearchState* state, const StringRef& val, |
267 | | const StringRef& pattern, unsigned char* result); |
268 | | |
269 | | // hyperscan compile expression to database and allocate scratch space |
270 | | static Status hs_prepare(FunctionContext* context, const char* expression, |
271 | | hs_database_t** database, hs_scratch_t** scratch); |
272 | | }; |
273 | | |
274 | | class FunctionLike : public FunctionLikeBase { |
275 | | public: |
276 | | static constexpr auto name = "like"; |
277 | | |
278 | 140 | static FunctionPtr create() { return std::make_shared<FunctionLike>(); } |
279 | | |
280 | 0 | String get_name() const override { return name; } |
281 | | |
282 | | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override; |
283 | | |
284 | | static Status construct_like_const_state(FunctionContext* ctx, const StringRef& pattern, |
285 | | std::shared_ptr<LikeState>& state, |
286 | | bool try_hyperscan = true); |
287 | | |
288 | | friend struct LikeSearchState; |
289 | | friend struct VectorAllpassSearchState; |
290 | | friend struct VectorEqualSearchState; |
291 | | friend struct VectorSubStringSearchState; |
292 | | friend struct VectorStartsWithSearchState; |
293 | | friend struct VectorEndsWithSearchState; |
294 | | |
295 | | private: |
296 | | static Status like_fn(LikeSearchState* state, const ColumnString& val, const StringRef& pattern, |
297 | | ColumnUInt8::Container& result); |
298 | | |
299 | | static Status like_fn_scalar(LikeSearchState* state, const StringRef& val, |
300 | | const StringRef& pattern, unsigned char* result); |
301 | | |
302 | | static void convert_like_pattern(LikeSearchState* state, const std::string& pattern, |
303 | | std::string* re_pattern); |
304 | | |
305 | | static void remove_escape_character(std::string* search_string); |
306 | | }; |
307 | | |
308 | | class FunctionRegexpLike : public FunctionLikeBase { |
309 | | public: |
310 | | static constexpr auto name = "regexp"; |
311 | | static constexpr auto alias = "rlike"; |
312 | | |
313 | 19 | static FunctionPtr create() { return std::make_shared<FunctionRegexpLike>(); } |
314 | | |
315 | 0 | String get_name() const override { return name; } |
316 | | |
317 | | Status open(FunctionContext* context, FunctionContext::FunctionStateScope scope) override; |
318 | | }; |
319 | | |
320 | | } // namespace doris::vectorized |