be/src/exprs/function/functions_multi_string_search.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/FunctionsMultiStringSearch.h |
19 | | // and modified by Doris |
20 | | |
21 | | #include <hs/hs_common.h> |
22 | | #include <hs/hs_runtime.h> |
23 | | |
24 | | #include <algorithm> |
25 | | #include <boost/iterator/iterator_facade.hpp> |
26 | | #include <cstddef> |
27 | | #include <limits> |
28 | | #include <memory> |
29 | | #include <optional> |
30 | | #include <utility> |
31 | | #include <vector> |
32 | | |
33 | | #include "common/status.h" |
34 | | #include "core/block/block.h" |
35 | | #include "core/block/column_numbers.h" |
36 | | #include "core/block/column_with_type_and_name.h" |
37 | | #include "core/column/column.h" |
38 | | #include "core/column/column_array.h" |
39 | | #include "core/column/column_const.h" |
40 | | #include "core/column/column_nullable.h" |
41 | | #include "core/column/column_string.h" |
42 | | #include "core/column/column_vector.h" |
43 | | #include "core/data_type/data_type.h" |
44 | | #include "core/data_type/data_type_number.h" // IWYU pragma: keep |
45 | | #include "core/field.h" |
46 | | #include "core/pod_array_fwd.h" |
47 | | #include "core/string_ref.h" |
48 | | #include "core/types.h" |
49 | | #include "exprs/aggregate/aggregate_function.h" |
50 | | #include "exprs/function/function.h" |
51 | | #include "exprs/function/function_helpers.h" |
52 | | #include "exprs/function/regexps.h" |
53 | | #include "exprs/function/simple_function_factory.h" |
54 | | |
55 | | namespace doris { |
56 | | class FunctionContext; |
57 | | } // namespace doris |
58 | | |
59 | | namespace doris { |
60 | | |
61 | | template <typename Impl> |
62 | | class FunctionsMultiStringSearch : public IFunction { |
63 | | public: |
64 | | static constexpr auto name = Impl::name; |
65 | | |
66 | 2 | static FunctionPtr create() { return std::make_shared<FunctionsMultiStringSearch>(); } |
67 | | |
68 | 1 | String get_name() const override { return name; } |
69 | | |
70 | 0 | size_t get_number_of_arguments() const override { return 2; } |
71 | | |
72 | 0 | bool use_default_implementation_for_nulls() const override { return false; } |
73 | | |
74 | 0 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
75 | 0 | return Impl::get_return_type(); |
76 | 0 | } |
77 | | |
78 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
79 | 0 | uint32_t result, size_t input_rows_count) const override { |
80 | 0 | auto haystack_column = block.get_by_position(arguments[0]).column; |
81 | 0 | auto needles_column = block.get_by_position(arguments[1]).column; |
82 | |
|
83 | 0 | auto haystack_ptr = remove_nullable(haystack_column); |
84 | 0 | auto needles_ptr = remove_nullable(needles_column); |
85 | |
|
86 | 0 | const auto* col_haystack_vector = check_and_get_column<ColumnString>(&*haystack_ptr); |
87 | 0 | const ColumnConst* col_haystack_const = |
88 | 0 | check_and_get_column_const<ColumnString>(&*haystack_ptr); |
89 | |
|
90 | 0 | const auto* col_needles_vector = check_and_get_column<ColumnArray>(needles_ptr.get()); |
91 | 0 | const ColumnConst* col_needles_const = |
92 | 0 | check_and_get_column_const<ColumnArray>(needles_ptr.get()); |
93 | |
|
94 | 0 | if (!col_needles_const && !col_needles_vector) { |
95 | 0 | return Status::InvalidArgument( |
96 | 0 | "function '{}' encountered unsupported needles column, found {}", name, |
97 | 0 | needles_column->get_name()); |
98 | 0 | } |
99 | | |
100 | 0 | if (col_haystack_const && col_needles_vector) { |
101 | 0 | return Status::InvalidArgument( |
102 | 0 | "function '{}' doesn't support search with non-constant needles " |
103 | 0 | "in constant haystack", |
104 | 0 | name); |
105 | 0 | } |
106 | | |
107 | 0 | auto col_res = ColumnVector<Impl::ResultPType>::create(); |
108 | 0 | auto col_offsets = ColumnArray::ColumnOffsets::create(); |
109 | |
|
110 | 0 | auto& vec_res = col_res->get_data(); |
111 | 0 | auto& offsets_res = col_offsets->get_data(); |
112 | |
|
113 | 0 | Status status; |
114 | 0 | if (col_needles_const) { |
115 | 0 | status = Impl::vector_constant( |
116 | 0 | col_haystack_vector->get_chars(), col_haystack_vector->get_offsets(), |
117 | 0 | col_needles_const->get_value<TYPE_ARRAY>(), vec_res, offsets_res, |
118 | 0 | allow_hyperscan_, max_hyperscan_regexp_length_, |
119 | 0 | max_hyperscan_regexp_total_length_); |
120 | 0 | } else { |
121 | 0 | status = Impl::vector_vector( |
122 | 0 | col_haystack_vector->get_chars(), col_haystack_vector->get_offsets(), |
123 | 0 | col_needles_vector->get_data(), col_needles_vector->get_offsets(), vec_res, |
124 | 0 | offsets_res, allow_hyperscan_, max_hyperscan_regexp_length_, |
125 | 0 | max_hyperscan_regexp_total_length_); |
126 | 0 | } |
127 | |
|
128 | 0 | if (!status.ok()) { |
129 | 0 | return status; |
130 | 0 | } |
131 | | |
132 | 0 | handle_nullable_column(haystack_column, vec_res, input_rows_count); |
133 | 0 | handle_nullable_column(needles_column, vec_res, input_rows_count); |
134 | |
|
135 | 0 | block.replace_by_position(result, std::move(col_res)); |
136 | |
|
137 | 0 | return status; |
138 | 0 | } |
139 | | |
140 | | private: |
141 | | using ResultType = typename Impl::ResultType; |
142 | | |
143 | | constexpr static bool allow_hyperscan_ = true; |
144 | | constexpr static size_t max_hyperscan_regexp_length_ = 0; // not limited |
145 | | constexpr static size_t max_hyperscan_regexp_total_length_ = 0; // not limited |
146 | | |
147 | | /// Handles nullable column by setting result to 0 if the input is null |
148 | | void handle_nullable_column(const ColumnPtr& column, PaddedPODArray<ResultType>& vec_res, |
149 | 0 | size_t input_rows_count) const { |
150 | 0 | if (const auto* nullable = check_and_get_column<ColumnNullable>(column.get())) { |
151 | 0 | const auto& null_map = nullable->get_null_map_data(); |
152 | 0 | for (size_t i = 0; i != input_rows_count; ++i) { |
153 | 0 | if (null_map[i]) { |
154 | 0 | vec_res[i] = 0; |
155 | 0 | } |
156 | 0 | } |
157 | 0 | } else if (const auto* const_column = check_and_get_column<ColumnConst>(column.get()); |
158 | 0 | const_column && is_column_nullable(const_column->get_data_column())) { |
159 | 0 | const auto& const_nullable = |
160 | 0 | assert_cast<const ColumnNullable&>(const_column->get_data_column()); |
161 | 0 | if (const_nullable.get_null_map_data()[0]) { |
162 | 0 | std::fill(vec_res.begin(), vec_res.begin() + input_rows_count, 0); |
163 | 0 | } |
164 | 0 | } |
165 | 0 | } |
166 | | }; |
167 | | |
168 | | /// For more readable instantiations of MultiMatchAnyImpl<> |
169 | | struct MultiMatchTraits { |
170 | | enum class Find { Any, AnyIndex }; |
171 | | }; |
172 | | |
173 | | template <PrimitiveType PType, MultiMatchTraits::Find Find, bool WithEditDistance> |
174 | | struct FunctionMultiMatchAnyImpl { |
175 | | using ResultType = typename PrimitiveTypeTraits<PType>::CppType; |
176 | | static constexpr PrimitiveType ResultPType = PType; |
177 | | |
178 | | static constexpr bool FindAny = (Find == MultiMatchTraits::Find::Any); |
179 | | static constexpr bool FindAnyIndex = (Find == MultiMatchTraits::Find::AnyIndex); |
180 | | |
181 | | static constexpr auto name = "multi_match_any"; |
182 | | |
183 | 0 | static auto get_return_type() { |
184 | 0 | return std::make_shared<typename PrimitiveTypeTraits<PType>::DataType>(); |
185 | 0 | } |
186 | | |
187 | | /** |
188 | | * Prepares the regular expressions and scratch space for Hyperscan. |
189 | | * |
190 | | * This function takes a vector of needles (substrings to search for) and initializes |
191 | | * the regular expressions and scratch space required for Hyperscan, a high-performance |
192 | | * regular expression matching library. |
193 | | * |
194 | | */ |
195 | | static Status prepare_regexps_and_scratch(const std::vector<StringRef>& needles, |
196 | | multiregexps::Regexps*& regexps, |
197 | 0 | multiregexps::ScratchPtr& smart_scratch) { |
198 | 0 | multiregexps::DeferredConstructedRegexpsPtr deferred_constructed_regexps = |
199 | 0 | multiregexps::getOrSet</*SaveIndices*/ |
200 | 0 | FindAnyIndex, WithEditDistance>(needles, std::nullopt); |
201 | 0 | regexps = deferred_constructed_regexps->get(); |
202 | |
|
203 | 0 | hs_scratch_t* scratch = nullptr; |
204 | 0 | hs_error_t err = hs_clone_scratch(regexps->getScratch(), &scratch); |
205 | |
|
206 | 0 | if (err != HS_SUCCESS) { |
207 | 0 | return Status::InternalError("could not clone scratch space for vectorscan"); |
208 | 0 | } |
209 | | |
210 | 0 | smart_scratch.reset(scratch); |
211 | 0 | return Status::OK(); |
212 | 0 | } |
213 | | |
214 | | /** |
215 | | * Static callback function to handle the match results of the hs_scan function. |
216 | | * |
217 | | * This function is called when a matching substring is found while scanning with |
218 | | * Hyperscan. It updates the result based on the match information. |
219 | | * |
220 | | */ |
221 | | static int on_match([[maybe_unused]] unsigned int id, unsigned long long /* from */, // NOLINT |
222 | | unsigned long long /* to */, // NOLINT |
223 | 0 | unsigned int /* flags */, void* context) { |
224 | | if constexpr (FindAnyIndex) { |
225 | | *reinterpret_cast<ResultType*>(context) = id; |
226 | 0 | } else if constexpr (FindAny) { |
227 | 0 | *reinterpret_cast<ResultType*>(context) = 1; |
228 | 0 | } |
229 | | /// Once we hit the callback, there is no need to search for others. |
230 | 0 | return 1; |
231 | 0 | } |
232 | | |
233 | | static Status vector_constant(const ColumnString::Chars& haystack_data, |
234 | | const ColumnString::Offsets& haystack_offsets, |
235 | | const Array& needles_arr, PaddedPODArray<ResultType>& res, |
236 | | PaddedPODArray<UInt64>& offsets, bool allow_hyperscan, |
237 | | size_t max_hyperscan_regexp_length, |
238 | 0 | size_t max_hyperscan_regexp_total_length) { |
239 | 0 | if (!allow_hyperscan) { |
240 | 0 | return Status::InvalidArgument("Hyperscan functions are disabled"); |
241 | 0 | } |
242 | | |
243 | 0 | std::vector<StringRef> needles; |
244 | 0 | needles.reserve(needles_arr.size()); |
245 | 0 | for (const auto& needle : needles_arr) { |
246 | 0 | const auto& tmp = needle.get<TYPE_STRING>(); |
247 | 0 | needles.emplace_back(StringRef {tmp.data(), tmp.size()}); |
248 | 0 | } |
249 | |
|
250 | 0 | res.resize(haystack_offsets.size()); |
251 | |
|
252 | 0 | if (needles_arr.empty()) { |
253 | 0 | std::fill(res.begin(), res.end(), 0); |
254 | 0 | return Status::OK(); |
255 | 0 | } |
256 | | |
257 | 0 | multiregexps::Regexps* regexps = nullptr; |
258 | 0 | multiregexps::ScratchPtr smart_scratch; |
259 | 0 | RETURN_IF_ERROR(prepare_regexps_and_scratch(needles, regexps, smart_scratch)); |
260 | | |
261 | 0 | const size_t haystack_offsets_size = haystack_offsets.size(); |
262 | 0 | UInt64 offset = 0; |
263 | 0 | for (size_t i = 0; i < haystack_offsets_size; ++i) { |
264 | 0 | UInt64 length = haystack_offsets[i] - offset; |
265 | | /// vectorscan restriction. |
266 | 0 | if (length > std::numeric_limits<UInt32>::max()) { |
267 | 0 | return Status::InternalError("too long string to search"); |
268 | 0 | } |
269 | | /// zero the result, scan, check, update the offset. |
270 | 0 | res[i] = 0; |
271 | 0 | hs_error_t err = hs_scan( |
272 | 0 | regexps->getDB(), reinterpret_cast<const char*>(haystack_data.data()) + offset, |
273 | 0 | static_cast<unsigned>(length), 0, smart_scratch.get(), on_match, &res[i]); |
274 | 0 | if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED) { |
275 | 0 | return Status::InternalError("failed to scan with vectorscan"); |
276 | 0 | } |
277 | 0 | offset = haystack_offsets[i]; |
278 | 0 | } |
279 | | |
280 | 0 | return Status::OK(); |
281 | 0 | } |
282 | | |
283 | | static Status vector_vector(const ColumnString::Chars& haystack_data, |
284 | | const ColumnString::Offsets& haystack_offsets, |
285 | | const IColumn& needles_data, |
286 | | const ColumnArray::Offsets64& needles_offsets, |
287 | | PaddedPODArray<ResultType>& res, PaddedPODArray<UInt64>& offsets, |
288 | | bool allow_hyperscan, size_t max_hyperscan_regexp_length, |
289 | 0 | size_t max_hyperscan_regexp_total_length) { |
290 | 0 | if (!allow_hyperscan) { |
291 | 0 | return Status::InvalidArgument("Hyperscan functions are disabled"); |
292 | 0 | } |
293 | | |
294 | 0 | res.resize(haystack_offsets.size()); |
295 | |
|
296 | 0 | size_t prev_haystack_offset = 0; |
297 | 0 | size_t prev_needles_offset = 0; |
298 | |
|
299 | 0 | const auto& nested_column = |
300 | 0 | assert_cast<const ColumnNullable&>(needles_data).get_nested_column(); |
301 | 0 | const auto* needles_data_string = check_and_get_column<ColumnString>(nested_column); |
302 | |
|
303 | 0 | if (!needles_data_string) { |
304 | 0 | return Status::InvalidArgument("needles should be string column"); |
305 | 0 | } |
306 | | |
307 | 0 | std::vector<StringRef> needles; |
308 | 0 | for (size_t i = 0; i < haystack_offsets.size(); ++i) { |
309 | 0 | needles.reserve(needles_offsets[i] - prev_needles_offset); |
310 | |
|
311 | 0 | for (size_t j = prev_needles_offset; j < needles_offsets[i]; ++j) { |
312 | 0 | needles.emplace_back(needles_data_string->get_data_at(j)); |
313 | 0 | } |
314 | 0 | if (needles.empty()) { |
315 | 0 | res[i] = 0; |
316 | 0 | prev_haystack_offset = haystack_offsets[i]; |
317 | 0 | prev_needles_offset = needles_offsets[i]; |
318 | 0 | continue; |
319 | 0 | } |
320 | | |
321 | 0 | multiregexps::Regexps* regexps = nullptr; |
322 | 0 | multiregexps::ScratchPtr smart_scratch; |
323 | 0 | RETURN_IF_ERROR(prepare_regexps_and_scratch(needles, regexps, smart_scratch)); |
324 | | |
325 | 0 | const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset; |
326 | | |
327 | | /// vectorscan restriction. |
328 | 0 | if (cur_haystack_length > std::numeric_limits<UInt32>::max()) { |
329 | 0 | return Status::InternalError("too long string to search"); |
330 | 0 | } |
331 | | |
332 | | /// zero the result, scan, check, update the offset. |
333 | 0 | res[i] = 0; |
334 | 0 | hs_error_t err = hs_scan( |
335 | 0 | regexps->getDB(), |
336 | 0 | reinterpret_cast<const char*>(haystack_data.data()) + prev_haystack_offset, |
337 | 0 | static_cast<unsigned>(cur_haystack_length), 0, smart_scratch.get(), on_match, |
338 | 0 | &res[i]); |
339 | 0 | if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED) { |
340 | 0 | return Status::InternalError("failed to scan with vectorscan"); |
341 | 0 | } |
342 | | |
343 | 0 | prev_haystack_offset = haystack_offsets[i]; |
344 | 0 | prev_needles_offset = needles_offsets[i]; |
345 | 0 | needles.clear(); |
346 | 0 | } |
347 | | |
348 | 0 | return Status::OK(); |
349 | 0 | } |
350 | | }; |
351 | | |
352 | | using FunctionMultiMatchAny = FunctionsMultiStringSearch<FunctionMultiMatchAnyImpl< |
353 | | TYPE_TINYINT, MultiMatchTraits::Find::Any, /*WithEditDistance*/ false>>; |
354 | | |
355 | 1 | void register_function_multi_string_search(SimpleFunctionFactory& factory) { |
356 | 1 | factory.register_function<FunctionMultiMatchAny>(); |
357 | 1 | } |
358 | | |
359 | | } // namespace doris |