be/src/exprs/function/functions_multi_string_position.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/FunctionsMultiStringPosition.h |
19 | | // and modified by Doris |
20 | | |
21 | | #include <stddef.h> |
22 | | |
23 | | #include <algorithm> |
24 | | #include <boost/iterator/iterator_facade.hpp> |
25 | | #include <cstdint> |
26 | | #include <iterator> |
27 | | #include <limits> |
28 | | #include <memory> |
29 | | #include <utility> |
30 | | #include <vector> |
31 | | |
32 | | #include "common/status.h" |
33 | | #include "core/block/block.h" |
34 | | #include "core/block/column_numbers.h" |
35 | | #include "core/block/column_with_type_and_name.h" |
36 | | #include "core/column/column.h" |
37 | | #include "core/column/column_array.h" |
38 | | #include "core/column/column_const.h" |
39 | | #include "core/column/column_nullable.h" |
40 | | #include "core/column/column_string.h" |
41 | | #include "core/column/column_vector.h" |
42 | | #include "core/data_type/data_type.h" |
43 | | #include "core/data_type/data_type_array.h" |
44 | | #include "core/data_type/data_type_nullable.h" |
45 | | #include "core/data_type/data_type_number.h" |
46 | | #include "core/field.h" |
47 | | #include "core/pod_array_fwd.h" |
48 | | #include "core/string_ref.h" |
49 | | #include "core/types.h" |
50 | | #include "exec/common/string_searcher.h" |
51 | | #include "exprs/aggregate/aggregate_function.h" |
52 | | #include "exprs/function/function.h" |
53 | | #include "exprs/function/function_helpers.h" |
54 | | #include "exprs/function/simple_function_factory.h" |
55 | | |
56 | | namespace doris { |
57 | | class FunctionContext; |
58 | | } // namespace doris |
59 | | |
60 | | namespace doris { |
61 | | |
62 | | template <typename Impl> |
63 | | class FunctionMultiStringPosition : public IFunction { |
64 | | public: |
65 | | static constexpr auto name = Impl::name; |
66 | | |
67 | 2 | static FunctionPtr create() { return std::make_shared<FunctionMultiStringPosition>(); } |
68 | | |
69 | 1 | String get_name() const override { return name; } |
70 | | |
71 | 0 | size_t get_number_of_arguments() const override { return 2; } |
72 | | |
73 | 0 | bool use_default_implementation_for_nulls() const override { return false; } |
74 | | |
75 | 0 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
76 | 0 | return std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeInt32>())); |
77 | 0 | } |
78 | | |
79 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
80 | 0 | uint32_t result, size_t input_rows_count) const override { |
81 | 0 | auto haystack_column = block.get_by_position(arguments[0]).column; |
82 | 0 | auto needles_column = block.get_by_position(arguments[1]).column; |
83 | |
|
84 | 0 | auto haystack_ptr = remove_nullable(haystack_column); |
85 | 0 | auto needles_ptr = remove_nullable(needles_column); |
86 | |
|
87 | 0 | const auto* col_haystack_vector = check_and_get_column<ColumnString>(&*haystack_ptr); |
88 | 0 | const ColumnConst* col_haystack_const = |
89 | 0 | check_and_get_column_const<ColumnString>(&*haystack_ptr); |
90 | |
|
91 | 0 | const auto* col_needles_vector = check_and_get_column<ColumnArray>(needles_ptr.get()); |
92 | 0 | const ColumnConst* col_needles_const = |
93 | 0 | check_and_get_column_const<ColumnArray>(needles_ptr.get()); |
94 | |
|
95 | 0 | if (!col_needles_const && !col_needles_vector) { |
96 | 0 | return Status::InvalidArgument( |
97 | 0 | "function '{}' encountered unsupported needles column, found {}", name, |
98 | 0 | needles_column->get_name()); |
99 | 0 | } |
100 | | |
101 | 0 | if (col_haystack_const && col_needles_vector) { |
102 | 0 | return Status::InvalidArgument( |
103 | 0 | "function '{}' doesn't support search with non-constant needles " |
104 | 0 | "in constant haystack", |
105 | 0 | name); |
106 | 0 | } |
107 | | |
108 | 0 | auto col_res = ColumnVector<Impl::ResultType>::create(); |
109 | 0 | auto col_offsets = ColumnArray::ColumnOffsets::create(); |
110 | |
|
111 | 0 | auto& vec_res = col_res->get_data(); |
112 | 0 | auto& offsets_res = col_offsets->get_data(); |
113 | |
|
114 | 0 | Status status; |
115 | 0 | if (col_needles_const) { |
116 | 0 | status = Impl::vector_constant( |
117 | 0 | col_haystack_vector->get_chars(), col_haystack_vector->get_offsets(), |
118 | 0 | col_needles_const->get_value<TYPE_ARRAY>(), vec_res, offsets_res); |
119 | 0 | } else { |
120 | 0 | status = Impl::vector_vector(col_haystack_vector->get_chars(), |
121 | 0 | col_haystack_vector->get_offsets(), |
122 | 0 | col_needles_vector->get_data(), |
123 | 0 | col_needles_vector->get_offsets(), vec_res, offsets_res); |
124 | 0 | } |
125 | |
|
126 | 0 | if (!status.ok()) { |
127 | 0 | return status; |
128 | 0 | } |
129 | | |
130 | 0 | handle_nullable_column(haystack_column, vec_res, offsets_res, input_rows_count); |
131 | 0 | handle_nullable_column(needles_column, vec_res, offsets_res, input_rows_count); |
132 | |
|
133 | 0 | auto nullable_col = |
134 | 0 | ColumnNullable::create(std::move(col_res), ColumnUInt8::create(col_res->size(), 0)); |
135 | 0 | block.get_by_position(result).column = |
136 | 0 | ColumnArray::create(std::move(nullable_col), std::move(col_offsets)); |
137 | 0 | return status; |
138 | 0 | } |
139 | | |
140 | | private: |
141 | | using ResultContainer = typename ColumnVector<Impl::ResultType>::Container; |
142 | | |
143 | | void fill_result_row_with_zero(ResultContainer& vec_res, |
144 | 0 | const PaddedPODArray<UInt64>& offsets_res, size_t row) const { |
145 | 0 | for (size_t offset = offsets_res[row - 1]; offset != offsets_res[row]; ++offset) { |
146 | 0 | vec_res[offset] = 0; |
147 | 0 | } |
148 | 0 | } |
149 | | |
150 | | void handle_nullable_column(const ColumnPtr& column, ResultContainer& vec_res, |
151 | | const PaddedPODArray<UInt64>& offsets_res, |
152 | 0 | size_t input_rows_count) const { |
153 | 0 | if (const auto* nullable = check_and_get_column<ColumnNullable>(column.get())) { |
154 | 0 | const auto& null_map = nullable->get_null_map_data(); |
155 | 0 | for (size_t i = 0; i != input_rows_count; ++i) { |
156 | 0 | if (null_map[i]) { |
157 | 0 | fill_result_row_with_zero(vec_res, offsets_res, i); |
158 | 0 | } |
159 | 0 | } |
160 | 0 | } else if (const auto* const_column = check_and_get_column<ColumnConst>(column.get()); |
161 | 0 | const_column && is_column_nullable(const_column->get_data_column())) { |
162 | 0 | const auto& const_nullable = |
163 | 0 | assert_cast<const ColumnNullable&>(const_column->get_data_column()); |
164 | 0 | if (const_nullable.get_null_map_data()[0]) { |
165 | 0 | std::fill(vec_res.begin(), vec_res.end(), 0); |
166 | 0 | } |
167 | 0 | } |
168 | 0 | } |
169 | | }; |
170 | | |
171 | | struct FunctionMultiSearchAllPositionsImpl { |
172 | | public: |
173 | | static constexpr PrimitiveType ResultType = TYPE_INT; |
174 | | using SingleSearcher = ASCIICaseSensitiveStringSearcher; |
175 | | static constexpr auto name = "multi_search_all_positions"; |
176 | | |
177 | | static Status vector_constant(const ColumnString::Chars& haystack_data, |
178 | | const ColumnString::Offsets& haystack_offsets, |
179 | | const Array& needles_arr, PaddedPODArray<Int32>& vec_res, |
180 | 0 | PaddedPODArray<UInt64>& offsets_res) { |
181 | 0 | if (needles_arr.size() > std::numeric_limits<UInt8>::max()) { |
182 | 0 | return Status::InvalidArgument( |
183 | 0 | "number of arguments for function {} doesn't match: " |
184 | 0 | "passed {}, should be at most 255", |
185 | 0 | name, needles_arr.size()); |
186 | 0 | } |
187 | | |
188 | 0 | const size_t needles_size = needles_arr.size(); |
189 | 0 | std::vector<SingleSearcher> searchers; |
190 | 0 | searchers.reserve(needles_size); |
191 | 0 | for (const auto& needle : needles_arr) { |
192 | 0 | if (!is_string_type(needle.get_type())) { |
193 | 0 | return Status::InvalidArgument("invalid type of needle {}", needle.get_type_name()); |
194 | 0 | } |
195 | 0 | searchers.emplace_back(needle.get<TYPE_STRING>().data(), |
196 | 0 | needle.get<TYPE_STRING>().size()); |
197 | 0 | } |
198 | | |
199 | 0 | const size_t haystack_size = haystack_offsets.size(); |
200 | 0 | vec_res.resize(haystack_size * needles_size); |
201 | 0 | offsets_res.resize(haystack_size); |
202 | |
|
203 | 0 | std::fill(vec_res.begin(), vec_res.end(), 0); |
204 | | |
205 | | // we traverse to generator answer by Vector's slot of ColumnVector, not by Vector. |
206 | | // TODO: check if the order of loop is best. The large data may make us writing across the line which size out of L2 cache. |
207 | 0 | for (size_t ans_slot_in_row = 0; ans_slot_in_row < searchers.size(); ans_slot_in_row++) { |
208 | | // is i.e. answer slot index in one Vector(row) of answer |
209 | 0 | auto& searcher = searchers[ans_slot_in_row]; |
210 | 0 | size_t prev_haystack_offset = 0; |
211 | |
|
212 | 0 | for (size_t haystack_index = 0, res_index = ans_slot_in_row; |
213 | 0 | haystack_index < haystack_size; ++haystack_index, res_index += needles_size) { |
214 | 0 | const auto* haystack = &haystack_data[prev_haystack_offset]; |
215 | 0 | const auto* haystack_end = |
216 | 0 | haystack - prev_haystack_offset + haystack_offsets[haystack_index]; |
217 | |
|
218 | 0 | const auto* ans_now = searcher.search(haystack, haystack_end); |
219 | 0 | vec_res[res_index] = |
220 | 0 | ans_now >= haystack_end ? 0 : (Int32)std::distance(haystack, ans_now) + 1; |
221 | 0 | prev_haystack_offset = haystack_offsets[haystack_index]; |
222 | 0 | } |
223 | 0 | } |
224 | |
|
225 | 0 | size_t accum = needles_size; |
226 | 0 | for (size_t i = 0; i < haystack_size; ++i) { |
227 | 0 | offsets_res[i] = accum; |
228 | 0 | accum += needles_size; |
229 | 0 | } |
230 | |
|
231 | 0 | return Status::OK(); |
232 | 0 | } |
233 | | |
234 | | static Status vector_vector(const ColumnString::Chars& haystack_data, |
235 | | const ColumnString::Offsets& haystack_offsets, |
236 | | const IColumn& needles_data, |
237 | | const ColumnArray::Offsets64& needles_offsets, |
238 | | PaddedPODArray<Int32>& vec_res, |
239 | 0 | PaddedPODArray<UInt64>& offsets_res) { |
240 | 0 | size_t prev_haystack_offset = 0; |
241 | 0 | size_t prev_needles_offset = 0; |
242 | |
|
243 | 0 | offsets_res.reserve(haystack_data.size()); |
244 | 0 | uint64_t offset_now = 0; |
245 | |
|
246 | 0 | auto& nested_column = assert_cast<const ColumnNullable&>(needles_data).get_nested_column(); |
247 | 0 | const ColumnString* needles_data_string = assert_cast<const ColumnString*>(&nested_column); |
248 | |
|
249 | 0 | std::vector<StringRef> needles_for_row; |
250 | | // haystack first, row by row. |
251 | 0 | for (size_t haystack_index = 0; haystack_index < haystack_offsets.size(); |
252 | 0 | ++haystack_index) { |
253 | | // get haystack for this row. |
254 | 0 | const auto* haystack = &haystack_data[prev_haystack_offset]; |
255 | 0 | const auto* haystack_end = |
256 | 0 | haystack - prev_haystack_offset + haystack_offsets[haystack_index]; |
257 | | |
258 | | // build needles for this row. |
259 | 0 | needles_for_row.reserve(needles_offsets[haystack_index] - prev_needles_offset); |
260 | 0 | for (size_t j = prev_needles_offset; j < needles_offsets[haystack_index]; ++j) { |
261 | 0 | needles_for_row.emplace_back(needles_data_string->get_data_at(j)); |
262 | 0 | } |
263 | 0 | const size_t needles_row_size = needles_for_row.size(); |
264 | 0 | if (needles_row_size > std::numeric_limits<UInt8>::max()) { |
265 | 0 | return Status::InvalidArgument( |
266 | 0 | "number of arguments for function {} doesn't match: " |
267 | 0 | "passed {}, should be at most 255", |
268 | 0 | name, needles_row_size); |
269 | 0 | } |
270 | | |
271 | | // each searcher search for one needle. |
272 | 0 | std::vector<SingleSearcher> searchers; |
273 | 0 | searchers.clear(); |
274 | 0 | searchers.reserve(needles_row_size); |
275 | 0 | for (auto needle : needles_for_row) { |
276 | 0 | searchers.emplace_back(needle.data, needle.size); |
277 | 0 | } |
278 | | |
279 | | // search for first so that the ans's size is constant for each row. |
280 | 0 | auto ans_row_begin = vec_res.size(); |
281 | 0 | vec_res.resize(vec_res.size() + needles_row_size); |
282 | 0 | offset_now += searchers.size(); |
283 | 0 | offsets_res.emplace_back(offset_now); |
284 | | |
285 | | //for now haystack, apply needle to search, generator answer by order. |
286 | 0 | for (size_t ans_slot_in_row = 0; ans_slot_in_row < searchers.size(); |
287 | 0 | ans_slot_in_row++) { |
288 | | // is i.e. answer slot index in one Vector(row) of answer |
289 | 0 | auto& searcher = searchers[ans_slot_in_row]; |
290 | |
|
291 | 0 | auto ans_now = searcher.search(haystack, haystack_end); |
292 | 0 | vec_res[ans_row_begin + ans_slot_in_row] = |
293 | 0 | ans_now >= haystack_end ? 0 : (Int32)std::distance(haystack, ans_now) + 1; |
294 | 0 | } |
295 | |
|
296 | 0 | prev_haystack_offset = haystack_offsets[haystack_index]; |
297 | 0 | prev_needles_offset = needles_offsets[haystack_index]; |
298 | 0 | needles_for_row.clear(); |
299 | 0 | } |
300 | | |
301 | 0 | return Status::OK(); |
302 | 0 | } |
303 | | }; |
304 | | |
305 | | using FunctionMultiSearchAllPositions = |
306 | | FunctionMultiStringPosition<FunctionMultiSearchAllPositionsImpl>; |
307 | | |
308 | 1 | void register_function_multi_string_position(SimpleFunctionFactory& factory) { |
309 | 1 | factory.register_function<FunctionMultiSearchAllPositions>(); |
310 | 1 | } |
311 | | |
312 | | } // namespace doris |