Coverage Report

Created: 2026-06-12 20:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/functions_multi_string_search.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/FunctionsMultiStringSearch.h
19
// and modified by Doris
20
21
#include <hs/hs_common.h>
22
#include <hs/hs_runtime.h>
23
24
#include <algorithm>
25
#include <boost/iterator/iterator_facade.hpp>
26
#include <cstddef>
27
#include <limits>
28
#include <memory>
29
#include <optional>
30
#include <utility>
31
#include <vector>
32
33
#include "common/status.h"
34
#include "core/block/block.h"
35
#include "core/block/column_numbers.h"
36
#include "core/block/column_with_type_and_name.h"
37
#include "core/column/column.h"
38
#include "core/column/column_array.h"
39
#include "core/column/column_const.h"
40
#include "core/column/column_nullable.h"
41
#include "core/column/column_string.h"
42
#include "core/column/column_vector.h"
43
#include "core/data_type/data_type.h"
44
#include "core/data_type/data_type_number.h" // IWYU pragma: keep
45
#include "core/field.h"
46
#include "core/pod_array_fwd.h"
47
#include "core/string_ref.h"
48
#include "core/types.h"
49
#include "exprs/aggregate/aggregate_function.h"
50
#include "exprs/function/function.h"
51
#include "exprs/function/function_helpers.h"
52
#include "exprs/function/regexps.h"
53
#include "exprs/function/simple_function_factory.h"
54
55
namespace doris {
56
class FunctionContext;
57
} // namespace doris
58
59
namespace doris {
60
61
template <typename Impl>
62
class FunctionsMultiStringSearch : public IFunction {
63
public:
64
    static constexpr auto name = Impl::name;
65
66
2
    static FunctionPtr create() { return std::make_shared<FunctionsMultiStringSearch>(); }
67
68
1
    String get_name() const override { return name; }
69
70
0
    size_t get_number_of_arguments() const override { return 2; }
71
72
0
    bool use_default_implementation_for_nulls() const override { return false; }
73
74
0
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
75
0
        return Impl::get_return_type();
76
0
    }
77
78
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
79
0
                        uint32_t result, size_t input_rows_count) const override {
80
0
        auto haystack_column = block.get_by_position(arguments[0]).column;
81
0
        auto needles_column = block.get_by_position(arguments[1]).column;
82
83
0
        auto haystack_ptr = remove_nullable(haystack_column);
84
0
        auto needles_ptr = remove_nullable(needles_column);
85
86
0
        const auto* col_haystack_vector = check_and_get_column<ColumnString>(&*haystack_ptr);
87
0
        const ColumnConst* col_haystack_const =
88
0
                check_and_get_column_const<ColumnString>(&*haystack_ptr);
89
90
0
        const auto* col_needles_vector = check_and_get_column<ColumnArray>(needles_ptr.get());
91
0
        const ColumnConst* col_needles_const =
92
0
                check_and_get_column_const<ColumnArray>(needles_ptr.get());
93
94
0
        if (!col_needles_const && !col_needles_vector) {
95
0
            return Status::InvalidArgument(
96
0
                    "function '{}' encountered unsupported needles column, found {}", name,
97
0
                    needles_column->get_name());
98
0
        }
99
100
0
        if (col_haystack_const && col_needles_vector) {
101
0
            return Status::InvalidArgument(
102
0
                    "function '{}' doesn't support search with non-constant needles "
103
0
                    "in constant haystack",
104
0
                    name);
105
0
        }
106
107
0
        auto col_res = ColumnVector<Impl::ResultPType>::create();
108
0
        auto col_offsets = ColumnArray::ColumnOffsets::create();
109
110
0
        auto& vec_res = col_res->get_data();
111
0
        auto& offsets_res = col_offsets->get_data();
112
113
0
        Status status;
114
0
        if (col_needles_const) {
115
0
            status = Impl::vector_constant(
116
0
                    col_haystack_vector->get_chars(), col_haystack_vector->get_offsets(),
117
0
                    col_needles_const->get_value<TYPE_ARRAY>(), vec_res, offsets_res,
118
0
                    allow_hyperscan_, max_hyperscan_regexp_length_,
119
0
                    max_hyperscan_regexp_total_length_);
120
0
        } else {
121
0
            status = Impl::vector_vector(
122
0
                    col_haystack_vector->get_chars(), col_haystack_vector->get_offsets(),
123
0
                    col_needles_vector->get_data(), col_needles_vector->get_offsets(), vec_res,
124
0
                    offsets_res, allow_hyperscan_, max_hyperscan_regexp_length_,
125
0
                    max_hyperscan_regexp_total_length_);
126
0
        }
127
128
0
        if (!status.ok()) {
129
0
            return status;
130
0
        }
131
132
0
        handle_nullable_column(haystack_column, vec_res, input_rows_count);
133
0
        handle_nullable_column(needles_column, vec_res, input_rows_count);
134
135
0
        block.replace_by_position(result, std::move(col_res));
136
137
0
        return status;
138
0
    }
139
140
private:
141
    using ResultType = typename Impl::ResultType;
142
143
    constexpr static bool allow_hyperscan_ = true;
144
    constexpr static size_t max_hyperscan_regexp_length_ = 0;       // not limited
145
    constexpr static size_t max_hyperscan_regexp_total_length_ = 0; // not limited
146
147
    /// Handles nullable column by setting result to 0 if the input is null
148
    void handle_nullable_column(const ColumnPtr& column, PaddedPODArray<ResultType>& vec_res,
149
0
                                size_t input_rows_count) const {
150
0
        if (const auto* nullable = check_and_get_column<ColumnNullable>(column.get())) {
151
0
            const auto& null_map = nullable->get_null_map_data();
152
0
            for (size_t i = 0; i != input_rows_count; ++i) {
153
0
                if (null_map[i]) {
154
0
                    vec_res[i] = 0;
155
0
                }
156
0
            }
157
0
        } else if (const auto* const_column = check_and_get_column<ColumnConst>(column.get());
158
0
                   const_column && is_column_nullable(const_column->get_data_column())) {
159
0
            const auto& const_nullable =
160
0
                    assert_cast<const ColumnNullable&>(const_column->get_data_column());
161
0
            if (const_nullable.get_null_map_data()[0]) {
162
0
                std::fill(vec_res.begin(), vec_res.begin() + input_rows_count, 0);
163
0
            }
164
0
        }
165
0
    }
166
};
167
168
/// For more readable instantiations of MultiMatchAnyImpl<>
169
struct MultiMatchTraits {
170
    enum class Find { Any, AnyIndex };
171
};
172
173
template <PrimitiveType PType, MultiMatchTraits::Find Find, bool WithEditDistance>
174
struct FunctionMultiMatchAnyImpl {
175
    using ResultType = typename PrimitiveTypeTraits<PType>::CppType;
176
    static constexpr PrimitiveType ResultPType = PType;
177
178
    static constexpr bool FindAny = (Find == MultiMatchTraits::Find::Any);
179
    static constexpr bool FindAnyIndex = (Find == MultiMatchTraits::Find::AnyIndex);
180
181
    static constexpr auto name = "multi_match_any";
182
183
0
    static auto get_return_type() {
184
0
        return std::make_shared<typename PrimitiveTypeTraits<PType>::DataType>();
185
0
    }
186
187
    /**
188
     * Prepares the regular expressions and scratch space for Hyperscan.
189
     *
190
     * This function takes a vector of needles (substrings to search for) and initializes
191
     * the regular expressions and scratch space required for Hyperscan, a high-performance
192
     * regular expression matching library.
193
     *
194
     */
195
    static Status prepare_regexps_and_scratch(const std::vector<StringRef>& needles,
196
                                              multiregexps::Regexps*& regexps,
197
0
                                              multiregexps::ScratchPtr& smart_scratch) {
198
0
        multiregexps::DeferredConstructedRegexpsPtr deferred_constructed_regexps =
199
0
                multiregexps::getOrSet</*SaveIndices*/
200
0
                                       FindAnyIndex, WithEditDistance>(needles, std::nullopt);
201
0
        regexps = deferred_constructed_regexps->get();
202
203
0
        hs_scratch_t* scratch = nullptr;
204
0
        hs_error_t err = hs_clone_scratch(regexps->getScratch(), &scratch);
205
206
0
        if (err != HS_SUCCESS) {
207
0
            return Status::InternalError("could not clone scratch space for vectorscan");
208
0
        }
209
210
0
        smart_scratch.reset(scratch);
211
0
        return Status::OK();
212
0
    }
213
214
    /**
215
     * Static callback function to handle the match results of the hs_scan function.
216
     *
217
     * This function is called when a matching substring is found while scanning with
218
     * Hyperscan. It updates the result based on the match information.
219
     *
220
     */
221
    static int on_match([[maybe_unused]] unsigned int id, unsigned long long /* from */, // NOLINT
222
                        unsigned long long /* to */,                                     // NOLINT
223
0
                        unsigned int /* flags */, void* context) {
224
        if constexpr (FindAnyIndex) {
225
            *reinterpret_cast<ResultType*>(context) = id;
226
0
        } else if constexpr (FindAny) {
227
0
            *reinterpret_cast<ResultType*>(context) = 1;
228
0
        }
229
        /// Once we hit the callback, there is no need to search for others.
230
0
        return 1;
231
0
    }
232
233
    static Status vector_constant(const ColumnString::Chars& haystack_data,
234
                                  const ColumnString::Offsets& haystack_offsets,
235
                                  const Array& needles_arr, PaddedPODArray<ResultType>& res,
236
                                  PaddedPODArray<UInt64>& offsets, bool allow_hyperscan,
237
                                  size_t max_hyperscan_regexp_length,
238
0
                                  size_t max_hyperscan_regexp_total_length) {
239
0
        if (!allow_hyperscan) {
240
0
            return Status::InvalidArgument("Hyperscan functions are disabled");
241
0
        }
242
243
0
        std::vector<StringRef> needles;
244
0
        needles.reserve(needles_arr.size());
245
0
        for (const auto& needle : needles_arr) {
246
0
            const auto& tmp = needle.get<TYPE_STRING>();
247
0
            needles.emplace_back(StringRef {tmp.data(), tmp.size()});
248
0
        }
249
250
0
        res.resize(haystack_offsets.size());
251
252
0
        if (needles_arr.empty()) {
253
0
            std::fill(res.begin(), res.end(), 0);
254
0
            return Status::OK();
255
0
        }
256
257
0
        multiregexps::Regexps* regexps = nullptr;
258
0
        multiregexps::ScratchPtr smart_scratch;
259
0
        RETURN_IF_ERROR(prepare_regexps_and_scratch(needles, regexps, smart_scratch));
260
261
0
        const size_t haystack_offsets_size = haystack_offsets.size();
262
0
        UInt64 offset = 0;
263
0
        for (size_t i = 0; i < haystack_offsets_size; ++i) {
264
0
            UInt64 length = haystack_offsets[i] - offset;
265
            /// vectorscan restriction.
266
0
            if (length > std::numeric_limits<UInt32>::max()) {
267
0
                return Status::InternalError("too long string to search");
268
0
            }
269
            /// zero the result, scan, check, update the offset.
270
0
            res[i] = 0;
271
0
            hs_error_t err = hs_scan(
272
0
                    regexps->getDB(), reinterpret_cast<const char*>(haystack_data.data()) + offset,
273
0
                    static_cast<unsigned>(length), 0, smart_scratch.get(), on_match, &res[i]);
274
0
            if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED) {
275
0
                return Status::InternalError("failed to scan with vectorscan");
276
0
            }
277
0
            offset = haystack_offsets[i];
278
0
        }
279
280
0
        return Status::OK();
281
0
    }
282
283
    static Status vector_vector(const ColumnString::Chars& haystack_data,
284
                                const ColumnString::Offsets& haystack_offsets,
285
                                const IColumn& needles_data,
286
                                const ColumnArray::Offsets64& needles_offsets,
287
                                PaddedPODArray<ResultType>& res, PaddedPODArray<UInt64>& offsets,
288
                                bool allow_hyperscan, size_t max_hyperscan_regexp_length,
289
0
                                size_t max_hyperscan_regexp_total_length) {
290
0
        if (!allow_hyperscan) {
291
0
            return Status::InvalidArgument("Hyperscan functions are disabled");
292
0
        }
293
294
0
        res.resize(haystack_offsets.size());
295
296
0
        size_t prev_haystack_offset = 0;
297
0
        size_t prev_needles_offset = 0;
298
299
0
        const auto& nested_column =
300
0
                assert_cast<const ColumnNullable&>(needles_data).get_nested_column();
301
0
        const auto* needles_data_string = check_and_get_column<ColumnString>(nested_column);
302
303
0
        if (!needles_data_string) {
304
0
            return Status::InvalidArgument("needles should be string column");
305
0
        }
306
307
0
        std::vector<StringRef> needles;
308
0
        for (size_t i = 0; i < haystack_offsets.size(); ++i) {
309
0
            needles.reserve(needles_offsets[i] - prev_needles_offset);
310
311
0
            for (size_t j = prev_needles_offset; j < needles_offsets[i]; ++j) {
312
0
                needles.emplace_back(needles_data_string->get_data_at(j));
313
0
            }
314
0
            if (needles.empty()) {
315
0
                res[i] = 0;
316
0
                prev_haystack_offset = haystack_offsets[i];
317
0
                prev_needles_offset = needles_offsets[i];
318
0
                continue;
319
0
            }
320
321
0
            multiregexps::Regexps* regexps = nullptr;
322
0
            multiregexps::ScratchPtr smart_scratch;
323
0
            RETURN_IF_ERROR(prepare_regexps_and_scratch(needles, regexps, smart_scratch));
324
325
0
            const size_t cur_haystack_length = haystack_offsets[i] - prev_haystack_offset;
326
327
            /// vectorscan restriction.
328
0
            if (cur_haystack_length > std::numeric_limits<UInt32>::max()) {
329
0
                return Status::InternalError("too long string to search");
330
0
            }
331
332
            /// zero the result, scan, check, update the offset.
333
0
            res[i] = 0;
334
0
            hs_error_t err = hs_scan(
335
0
                    regexps->getDB(),
336
0
                    reinterpret_cast<const char*>(haystack_data.data()) + prev_haystack_offset,
337
0
                    static_cast<unsigned>(cur_haystack_length), 0, smart_scratch.get(), on_match,
338
0
                    &res[i]);
339
0
            if (err != HS_SUCCESS && err != HS_SCAN_TERMINATED) {
340
0
                return Status::InternalError("failed to scan with vectorscan");
341
0
            }
342
343
0
            prev_haystack_offset = haystack_offsets[i];
344
0
            prev_needles_offset = needles_offsets[i];
345
0
            needles.clear();
346
0
        }
347
348
0
        return Status::OK();
349
0
    }
350
};
351
352
using FunctionMultiMatchAny = FunctionsMultiStringSearch<FunctionMultiMatchAnyImpl<
353
        TYPE_TINYINT, MultiMatchTraits::Find::Any, /*WithEditDistance*/ false>>;
354
355
1
void register_function_multi_string_search(SimpleFunctionFactory& factory) {
356
1
    factory.register_function<FunctionMultiMatchAny>();
357
1
}
358
359
} // namespace doris