Coverage Report

Created: 2026-06-12 17:35

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/functions_multi_string_position.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/FunctionsMultiStringPosition.h
19
// and modified by Doris
20
21
#include <stddef.h>
22
23
#include <algorithm>
24
#include <boost/iterator/iterator_facade.hpp>
25
#include <cstdint>
26
#include <iterator>
27
#include <limits>
28
#include <memory>
29
#include <utility>
30
#include <vector>
31
32
#include "common/status.h"
33
#include "core/block/block.h"
34
#include "core/block/column_numbers.h"
35
#include "core/block/column_with_type_and_name.h"
36
#include "core/column/column.h"
37
#include "core/column/column_array.h"
38
#include "core/column/column_const.h"
39
#include "core/column/column_nullable.h"
40
#include "core/column/column_string.h"
41
#include "core/column/column_vector.h"
42
#include "core/data_type/data_type.h"
43
#include "core/data_type/data_type_array.h"
44
#include "core/data_type/data_type_nullable.h"
45
#include "core/data_type/data_type_number.h"
46
#include "core/field.h"
47
#include "core/pod_array_fwd.h"
48
#include "core/string_ref.h"
49
#include "core/types.h"
50
#include "exec/common/string_searcher.h"
51
#include "exprs/aggregate/aggregate_function.h"
52
#include "exprs/function/function.h"
53
#include "exprs/function/function_helpers.h"
54
#include "exprs/function/simple_function_factory.h"
55
56
namespace doris {
57
class FunctionContext;
58
} // namespace doris
59
60
namespace doris {
61
62
template <typename Impl>
63
class FunctionMultiStringPosition : public IFunction {
64
public:
65
    static constexpr auto name = Impl::name;
66
67
2
    static FunctionPtr create() { return std::make_shared<FunctionMultiStringPosition>(); }
68
69
1
    String get_name() const override { return name; }
70
71
0
    size_t get_number_of_arguments() const override { return 2; }
72
73
0
    bool use_default_implementation_for_nulls() const override { return false; }
74
75
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
76
0
        return std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeInt32>()));
77
0
    }
78
79
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
80
0
                        uint32_t result, size_t input_rows_count) const override {
81
0
        auto haystack_column = block.get_by_position(arguments[0]).column;
82
0
        auto needles_column = block.get_by_position(arguments[1]).column;
83
84
0
        auto haystack_ptr = remove_nullable(haystack_column);
85
0
        auto needles_ptr = remove_nullable(needles_column);
86
87
0
        const auto* col_haystack_vector = check_and_get_column<ColumnString>(&*haystack_ptr);
88
0
        const ColumnConst* col_haystack_const =
89
0
                check_and_get_column_const<ColumnString>(&*haystack_ptr);
90
91
0
        const auto* col_needles_vector = check_and_get_column<ColumnArray>(needles_ptr.get());
92
0
        const ColumnConst* col_needles_const =
93
0
                check_and_get_column_const<ColumnArray>(needles_ptr.get());
94
95
0
        if (!col_needles_const && !col_needles_vector) {
96
0
            return Status::InvalidArgument(
97
0
                    "function '{}' encountered unsupported needles column, found {}", name,
98
0
                    needles_column->get_name());
99
0
        }
100
101
0
        if (col_haystack_const && col_needles_vector) {
102
0
            return Status::InvalidArgument(
103
0
                    "function '{}' doesn't support search with non-constant needles "
104
0
                    "in constant haystack",
105
0
                    name);
106
0
        }
107
108
0
        auto col_res = ColumnVector<Impl::ResultType>::create();
109
0
        auto col_offsets = ColumnArray::ColumnOffsets::create();
110
111
0
        auto& vec_res = col_res->get_data();
112
0
        auto& offsets_res = col_offsets->get_data();
113
114
0
        Status status;
115
0
        if (col_needles_const) {
116
0
            status = Impl::vector_constant(
117
0
                    col_haystack_vector->get_chars(), col_haystack_vector->get_offsets(),
118
0
                    col_needles_const->get_value<TYPE_ARRAY>(), vec_res, offsets_res);
119
0
        } else {
120
0
            status = Impl::vector_vector(col_haystack_vector->get_chars(),
121
0
                                         col_haystack_vector->get_offsets(),
122
0
                                         col_needles_vector->get_data(),
123
0
                                         col_needles_vector->get_offsets(), vec_res, offsets_res);
124
0
        }
125
126
0
        if (!status.ok()) {
127
0
            return status;
128
0
        }
129
130
0
        handle_nullable_column(haystack_column, vec_res, offsets_res, input_rows_count);
131
0
        handle_nullable_column(needles_column, vec_res, offsets_res, input_rows_count);
132
133
0
        auto nullable_col =
134
0
                ColumnNullable::create(std::move(col_res), ColumnUInt8::create(col_res->size(), 0));
135
0
        block.get_by_position(result).column =
136
0
                ColumnArray::create(std::move(nullable_col), std::move(col_offsets));
137
0
        return status;
138
0
    }
139
140
private:
141
    using ResultContainer = typename ColumnVector<Impl::ResultType>::Container;
142
143
    void fill_result_row_with_zero(ResultContainer& vec_res,
144
0
                                   const PaddedPODArray<UInt64>& offsets_res, size_t row) const {
145
0
        for (size_t offset = offsets_res[row - 1]; offset != offsets_res[row]; ++offset) {
146
0
            vec_res[offset] = 0;
147
0
        }
148
0
    }
149
150
    void handle_nullable_column(const ColumnPtr& column, ResultContainer& vec_res,
151
                                const PaddedPODArray<UInt64>& offsets_res,
152
0
                                size_t input_rows_count) const {
153
0
        if (const auto* nullable = check_and_get_column<ColumnNullable>(column.get())) {
154
0
            const auto& null_map = nullable->get_null_map_data();
155
0
            for (size_t i = 0; i != input_rows_count; ++i) {
156
0
                if (null_map[i]) {
157
0
                    fill_result_row_with_zero(vec_res, offsets_res, i);
158
0
                }
159
0
            }
160
0
        } else if (const auto* const_column = check_and_get_column<ColumnConst>(column.get());
161
0
                   const_column && is_column_nullable(const_column->get_data_column())) {
162
0
            const auto& const_nullable =
163
0
                    assert_cast<const ColumnNullable&>(const_column->get_data_column());
164
0
            if (const_nullable.get_null_map_data()[0]) {
165
0
                std::fill(vec_res.begin(), vec_res.end(), 0);
166
0
            }
167
0
        }
168
0
    }
169
};
170
171
struct FunctionMultiSearchAllPositionsImpl {
172
public:
173
    static constexpr PrimitiveType ResultType = TYPE_INT;
174
    using SingleSearcher = ASCIICaseSensitiveStringSearcher;
175
    static constexpr auto name = "multi_search_all_positions";
176
177
    static Status vector_constant(const ColumnString::Chars& haystack_data,
178
                                  const ColumnString::Offsets& haystack_offsets,
179
                                  const Array& needles_arr, PaddedPODArray<Int32>& vec_res,
180
0
                                  PaddedPODArray<UInt64>& offsets_res) {
181
0
        if (needles_arr.size() > std::numeric_limits<UInt8>::max()) {
182
0
            return Status::InvalidArgument(
183
0
                    "number of arguments for function {} doesn't match: "
184
0
                    "passed {}, should be at most 255",
185
0
                    name, needles_arr.size());
186
0
        }
187
188
0
        const size_t needles_size = needles_arr.size();
189
0
        std::vector<SingleSearcher> searchers;
190
0
        searchers.reserve(needles_size);
191
0
        for (const auto& needle : needles_arr) {
192
0
            if (!is_string_type(needle.get_type())) {
193
0
                return Status::InvalidArgument("invalid type of needle {}", needle.get_type_name());
194
0
            }
195
0
            searchers.emplace_back(needle.get<TYPE_STRING>().data(),
196
0
                                   needle.get<TYPE_STRING>().size());
197
0
        }
198
199
0
        const size_t haystack_size = haystack_offsets.size();
200
0
        vec_res.resize(haystack_size * needles_size);
201
0
        offsets_res.resize(haystack_size);
202
203
0
        std::fill(vec_res.begin(), vec_res.end(), 0);
204
205
        // we traverse to generator answer by Vector's slot of ColumnVector, not by Vector.
206
        // TODO: check if the order of loop is best. The large data may make us writing across the line which size out of L2 cache.
207
0
        for (size_t ans_slot_in_row = 0; ans_slot_in_row < searchers.size(); ans_slot_in_row++) {
208
            //  is i.e. answer slot index in one Vector(row) of answer
209
0
            auto& searcher = searchers[ans_slot_in_row];
210
0
            size_t prev_haystack_offset = 0;
211
212
0
            for (size_t haystack_index = 0, res_index = ans_slot_in_row;
213
0
                 haystack_index < haystack_size; ++haystack_index, res_index += needles_size) {
214
0
                const auto* haystack = &haystack_data[prev_haystack_offset];
215
0
                const auto* haystack_end =
216
0
                        haystack - prev_haystack_offset + haystack_offsets[haystack_index];
217
218
0
                const auto* ans_now = searcher.search(haystack, haystack_end);
219
0
                vec_res[res_index] =
220
0
                        ans_now >= haystack_end ? 0 : (Int32)std::distance(haystack, ans_now) + 1;
221
0
                prev_haystack_offset = haystack_offsets[haystack_index];
222
0
            }
223
0
        }
224
225
0
        size_t accum = needles_size;
226
0
        for (size_t i = 0; i < haystack_size; ++i) {
227
0
            offsets_res[i] = accum;
228
0
            accum += needles_size;
229
0
        }
230
231
0
        return Status::OK();
232
0
    }
233
234
    static Status vector_vector(const ColumnString::Chars& haystack_data,
235
                                const ColumnString::Offsets& haystack_offsets,
236
                                const IColumn& needles_data,
237
                                const ColumnArray::Offsets64& needles_offsets,
238
                                PaddedPODArray<Int32>& vec_res,
239
0
                                PaddedPODArray<UInt64>& offsets_res) {
240
0
        size_t prev_haystack_offset = 0;
241
0
        size_t prev_needles_offset = 0;
242
243
0
        offsets_res.reserve(haystack_data.size());
244
0
        uint64_t offset_now = 0;
245
246
0
        auto& nested_column = assert_cast<const ColumnNullable&>(needles_data).get_nested_column();
247
0
        const ColumnString* needles_data_string = assert_cast<const ColumnString*>(&nested_column);
248
249
0
        std::vector<StringRef> needles_for_row;
250
        // haystack first, row by row.
251
0
        for (size_t haystack_index = 0; haystack_index < haystack_offsets.size();
252
0
             ++haystack_index) {
253
            // get haystack for this row.
254
0
            const auto* haystack = &haystack_data[prev_haystack_offset];
255
0
            const auto* haystack_end =
256
0
                    haystack - prev_haystack_offset + haystack_offsets[haystack_index];
257
258
            // build needles for this row.
259
0
            needles_for_row.reserve(needles_offsets[haystack_index] - prev_needles_offset);
260
0
            for (size_t j = prev_needles_offset; j < needles_offsets[haystack_index]; ++j) {
261
0
                needles_for_row.emplace_back(needles_data_string->get_data_at(j));
262
0
            }
263
0
            const size_t needles_row_size = needles_for_row.size();
264
0
            if (needles_row_size > std::numeric_limits<UInt8>::max()) {
265
0
                return Status::InvalidArgument(
266
0
                        "number of arguments for function {} doesn't match: "
267
0
                        "passed {}, should be at most 255",
268
0
                        name, needles_row_size);
269
0
            }
270
271
            // each searcher search for one needle.
272
0
            std::vector<SingleSearcher> searchers;
273
0
            searchers.clear();
274
0
            searchers.reserve(needles_row_size);
275
0
            for (auto needle : needles_for_row) {
276
0
                searchers.emplace_back(needle.data, needle.size);
277
0
            }
278
279
            // search for first so that the ans's size is constant for each row.
280
0
            auto ans_row_begin = vec_res.size();
281
0
            vec_res.resize(vec_res.size() + needles_row_size);
282
0
            offset_now += searchers.size();
283
0
            offsets_res.emplace_back(offset_now);
284
285
            //for now haystack, apply needle to search, generator answer by order.
286
0
            for (size_t ans_slot_in_row = 0; ans_slot_in_row < searchers.size();
287
0
                 ans_slot_in_row++) {
288
                //  is i.e. answer slot index in one Vector(row) of answer
289
0
                auto& searcher = searchers[ans_slot_in_row];
290
291
0
                auto ans_now = searcher.search(haystack, haystack_end);
292
0
                vec_res[ans_row_begin + ans_slot_in_row] =
293
0
                        ans_now >= haystack_end ? 0 : (Int32)std::distance(haystack, ans_now) + 1;
294
0
            }
295
296
0
            prev_haystack_offset = haystack_offsets[haystack_index];
297
0
            prev_needles_offset = needles_offsets[haystack_index];
298
0
            needles_for_row.clear();
299
0
        }
300
301
0
        return Status::OK();
302
0
    }
303
};
304
305
using FunctionMultiSearchAllPositions =
306
        FunctionMultiStringPosition<FunctionMultiSearchAllPositionsImpl>;
307
308
1
/// Registers the functions defined in this file (currently only
/// FunctionMultiSearchAllPositions) with the given factory.
void register_function_multi_string_position(SimpleFunctionFactory& factory) {
    factory.register_function<FunctionMultiSearchAllPositions>();
}
311
312
} // namespace doris