Coverage Report

Created: 2026-04-14 12:18

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/match.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/function/match.h"
19
20
#include <hs/hs.h>
21
22
#include "runtime/query_context.h"
23
#include "runtime/runtime_state.h"
24
#include "storage/index/index_reader_helper.h"
25
#include "storage/index/inverted/analyzer/analyzer.h"
26
#include "util/debug_points.h"
27
28
namespace doris {
29
30
namespace {
31
32
1.37k
// Looks up the analyzer context stored as function state on `context`.
//
// The THREAD_LOCAL state is consulted first; when it is null the FRAGMENT_LOCAL
// state is returned instead (which may itself be null). A null `context`
// yields nullptr.
const InvertedIndexAnalyzerCtx* get_match_analyzer_ctx(FunctionContext* context) {
    if (context == nullptr) {
        return nullptr;
    }
    const auto* thread_local_ctx = reinterpret_cast<const InvertedIndexAnalyzerCtx*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    if (thread_local_ctx != nullptr) {
        return thread_local_ctx;
    }
    return reinterpret_cast<const InvertedIndexAnalyzerCtx*>(
            context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
}
44
45
} // namespace
46
47
// Evaluates this match function through an inverted index iterator.
//
// Expects exactly one argument (the query value), one column name/type pair and
// one iterator (enforced with DCHECK only).
//
// Behaviour:
//  - a null iterator or a null query value returns OK without touching
//    `bitmap_result`;
//  - phrase-style functions fail with INDEX_INVALID_PARAMETERS when the
//    FULLTEXT reader exists but does not support phrases;
//  - a non-string query argument fails with INDEX_INVALID_PARAMETERS;
//  - otherwise the index is read, the null bitmap (if any) is attached, and
//    `bitmap_result` is assigned and has its null rows masked out.
Status FunctionMatchBase::evaluate_inverted_index(
        const ColumnsWithTypeAndName& arguments,
        const std::vector<IndexFieldNameAndTypePair>& data_type_with_names,
        std::vector<segment_v2::IndexIterator*> iterators, uint32_t num_rows,
        const InvertedIndexAnalyzerCtx* analyzer_ctx,
        segment_v2::InvertedIndexResultBitmap& bitmap_result) const {
    DCHECK(arguments.size() == 1);
    DCHECK(data_type_with_names.size() == 1);
    DCHECK(iterators.size() == 1);
    auto* iter = iterators[0];
    if (iter == nullptr) {
        return Status::OK();
    }
    // Bound by reference: the pair is only read, so there is no need to copy it.
    const auto& data_type_with_name = data_type_with_names[0];
    const std::string& function_name = get_name();

    if (function_name == MATCH_PHRASE_FUNCTION || function_name == MATCH_PHRASE_PREFIX_FUNCTION ||
        function_name == MATCH_PHRASE_EDGE_FUNCTION) {
        auto reader = iter->get_reader(InvertedIndexReaderType::FULLTEXT);
        if (reader && !segment_v2::IndexReaderHelper::is_support_phrase(reader)) {
            return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                    "phrase queries require setting support_phrase = true");
        }
    }

    Field param_value;
    arguments[0].column->get(0, param_value);
    if (param_value.is_null()) {
        // if query value is null, skip evaluate inverted index
        return Status::OK();
    }
    auto param_type = arguments[0].type->get_primitive_type();
    if (!is_string_type(param_type)) {
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                "arguments for match must be string");
    }
    std::unique_ptr<InvertedIndexQueryParamFactory> query_param = nullptr;
    RETURN_IF_ERROR(InvertedIndexQueryParamFactory::create_query_value(param_type, &param_value,
                                                                       query_param));

    InvertedIndexParam param;
    param.column_name = data_type_with_name.first;
    param.column_type = data_type_with_name.second;
    param.query_value = query_param->get_value();
    param.query_type = get_query_type_from_fn_name();
    param.num_rows = num_rows;
    param.roaring = std::make_shared<roaring::Roaring>();
    param.analyzer_ctx = analyzer_ctx;
    // param_type was already verified to be a string type above, so the index
    // can be read unconditionally here.
    RETURN_IF_ERROR(iter->read_from_index(&param));

    std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>();
    if (iter->has_null()) {
        segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
        RETURN_IF_ERROR(iter->read_null_bitmap(&null_bitmap_cache_handle));
        null_bitmap = null_bitmap_cache_handle.get_bitmap();
    }
    segment_v2::InvertedIndexResultBitmap result(param.roaring, null_bitmap);
    bitmap_result = result;
    bitmap_result.mask_out_null();

    return Status::OK();
}
113
// Executes the match function directly on block data (no inverted index).
//
// arguments[0] is the column being searched, arguments[1] the query value.
// The query value at row 0 is rendered to a string, the searched column is
// unwrapped to its underlying ColumnString (directly, through Nullable, or as
// the element column of an Array), and the per-function `execute_match` fills
// a UInt8 result column that replaces position `result` in `block`.
//
// Returns NotSupported for arrays whose elements are not strings and
// InternalError when no ColumnString can be obtained from the input.
Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block,
                                       const ColumnNumbers& arguments, uint32_t result,
                                       size_t input_rows_count) const {
    ColumnPtr& column_ptr = block.get_by_position(arguments[1]).column;
    DataTypePtr& type_ptr = block.get_by_position(arguments[1]).type;

    auto format_options = DataTypeSerDe::get_default_format_options();
    // Fallback time zone used when no runtime state is available; it must stay
    // alive for as long as format_options points at it.
    auto time_zone = cctz::utc_time_zone();
    format_options.timezone =
            (context && context->state()) ? &context->state()->timezone_obj() : &time_zone;

    auto match_query_str = type_ptr->to_string(*column_ptr, 0, format_options);
    std::string column_name = block.get_by_position(arguments[0]).name;
    VLOG_DEBUG << "begin to execute match directly, column_name=" << column_name
               << ", match_query_str=" << match_query_str;
    auto* analyzer_ctx = get_match_analyzer_ctx(context);
    const ColumnPtr source_col =
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
    const auto* values = check_and_get_column<ColumnString>(source_col.get());
    const ColumnArray* array_col = check_and_get_column<ColumnArray>(source_col.get());
    if (array_col != nullptr) {
        const auto& array_elements = array_col->get_data();
        if (!array_elements.is_column_string()) {
            return Status::NotSupported(
                    fmt::format("unsupported nested array of type {} for function {}",
                                array_elements.get_name(), get_name()));
        }

        // Checked cast instead of a blind reinterpret_cast on the element column.
        if (const auto* nullable_elements =
                    check_and_get_column<ColumnNullable>(&array_elements)) {
            values = check_and_get_column<ColumnString>(
                    *(nullable_elements->get_nested_column_ptr()));
        } else {
            // array column element is always set Nullable for now.
            values = check_and_get_column<ColumnString>(*(array_col->get_data_ptr()));
        }
    } else if (const auto* nullable = check_and_get_column<ColumnNullable>(source_col.get())) {
        values = check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
    }

    if (!values) {
        LOG(WARNING) << "Illegal column " << source_col->get_name();
        return Status::InternalError("Not supported input column types");
    }
    // result column
    auto res = ColumnUInt8::create();
    ColumnUInt8::Container& vec_res = res->get_data();
    // set default value to 0, and match functions only need to set 1/true
    vec_res.resize_fill(input_rows_count);
    RETURN_IF_ERROR(execute_match(context, column_name, match_query_str, input_rows_count, values,
                                  analyzer_ctx, (array_col ? &(array_col->get_offsets()) : nullptr),
                                  vec_res));
    block.replace_by_position(result, std::move(res));

    return Status::OK();
}
172
173
// Maps this function's name (as returned by get_name()) to the corresponding
// inverted index query type. Names that are not one of the known match
// functions map to UNKNOWN_QUERY.
inline doris::segment_v2::InvertedIndexQueryType FunctionMatchBase::get_query_type_from_fn_name()
        const {
    using QueryType = doris::segment_v2::InvertedIndexQueryType;

    std::string fn_name = get_name();
    if (fn_name == MATCH_ANY_FUNCTION) {
        return QueryType::MATCH_ANY_QUERY;
    }
    if (fn_name == MATCH_ALL_FUNCTION) {
        return QueryType::MATCH_ALL_QUERY;
    }
    if (fn_name == MATCH_PHRASE_FUNCTION) {
        return QueryType::MATCH_PHRASE_QUERY;
    }
    if (fn_name == MATCH_PHRASE_PREFIX_FUNCTION) {
        return QueryType::MATCH_PHRASE_PREFIX_QUERY;
    }
    if (fn_name == MATCH_PHRASE_REGEXP_FUNCTION) {
        return QueryType::MATCH_REGEXP_QUERY;
    }
    if (fn_name == MATCH_PHRASE_EDGE_FUNCTION) {
        return QueryType::MATCH_PHRASE_EDGE_QUERY;
    }
    return QueryType::UNKNOWN_QUERY;
}
191
192
std::vector<TermInfo> FunctionMatchBase::analyse_query_str_token(
193
        const InvertedIndexAnalyzerCtx* analyzer_ctx, const std::string& match_query_str,
194
1.36k
        const std::string& column_name) const {
195
1.36k
    std::vector<TermInfo> query_tokens;
196
1.36k
    if (analyzer_ctx == nullptr) {
197
3
        return query_tokens;
198
3
    }
199
200
1.36k
    VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: "
201
0
               << inverted_index_parser_type_to_string(analyzer_ctx->parser_type);
202
203
    // Decision is based on parser_type (from index properties):
204
    // - PARSER_NONE: no tokenization (keyword/exact match)
205
    // - Other parsers: tokenize using the analyzer
206
1.36k
    if (!analyzer_ctx->should_tokenize()) {
207
        // Keyword index: all strings (including empty) are valid tokens for exact match.
208
        // Empty string is a valid value in keyword index and should be matchable.
209
22
        query_tokens.emplace_back(match_query_str);
210
22
        return query_tokens;
211
22
    }
212
213
    // Safety check: if analyzer is nullptr but tokenization is expected, fall back to no tokenization
214
1.34k
    if (analyzer_ctx->analyzer == nullptr) {
215
0
        VLOG_DEBUG << "Analyzer is nullptr, falling back to no tokenization";
216
        // For fallback case, also allow empty strings to be matched
217
0
        query_tokens.emplace_back(match_query_str);
218
0
        return query_tokens;
219
0
    }
220
221
    // Tokenize using the analyzer
222
1.34k
    auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
223
1.34k
            analyzer_ctx->char_filter_map);
224
1.34k
    reader->init(match_query_str.data(), (int)match_query_str.size(), true);
225
1.34k
    query_tokens = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
226
1.34k
            reader, analyzer_ctx->analyzer.get());
227
1.34k
    return query_tokens;
228
1.34k
}
229
230
inline std::vector<TermInfo> FunctionMatchBase::analyse_data_token(
231
        const std::string& column_name, const InvertedIndexAnalyzerCtx* analyzer_ctx,
232
        const ColumnString* string_col, int32_t current_block_row_idx,
233
154k
        const ColumnArray::Offsets64* array_offsets, int32_t& current_src_array_offset) const {
234
154k
    std::vector<TermInfo> data_tokens;
235
154k
    if (analyzer_ctx == nullptr) {
236
0
        return data_tokens;
237
0
    }
238
239
    // Determine tokenization strategy based on parser_type
240
154k
    const bool should_tokenize =
241
154k
            analyzer_ctx->should_tokenize() && analyzer_ctx->analyzer != nullptr;
242
243
154k
    if (array_offsets) {
244
13
        for (auto next_src_array_offset = (*array_offsets)[current_block_row_idx];
245
28
             current_src_array_offset < next_src_array_offset; ++current_src_array_offset) {
246
15
            const auto& str_ref = string_col->get_data_at(current_src_array_offset);
247
15
            if (!should_tokenize) {
248
0
                data_tokens.emplace_back(str_ref.to_string());
249
0
                continue;
250
0
            }
251
15
            auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
252
15
                    analyzer_ctx->char_filter_map);
253
15
            reader->init(str_ref.data, (int)str_ref.size, true);
254
15
            data_tokens =
255
15
                    doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
256
15
                            reader, analyzer_ctx->analyzer.get());
257
15
        }
258
154k
    } else {
259
154k
        const auto& str_ref = string_col->get_data_at(current_block_row_idx);
260
154k
        if (!should_tokenize) {
261
35
            data_tokens.emplace_back(str_ref.to_string());
262
154k
        } else {
263
154k
            auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
264
154k
                    analyzer_ctx->char_filter_map);
265
154k
            reader->init(str_ref.data, (int)str_ref.size, true);
266
154k
            data_tokens =
267
154k
                    doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
268
154k
                            reader, analyzer_ctx->analyzer.get());
269
154k
        }
270
154k
    }
271
154k
    return data_tokens;
272
154k
}
273
274
1.37k
// Verifies that executing a match function without an inverted index is
// allowed for the current query.
//
// Returns INVERTED_INDEX_NOT_SUPPORTED when the query option
// `enable_match_without_inverted_index` is off, or when the debug point
// "match.invert_index_not_support_execute_match" is active; OK otherwise.
// NOTE(review): `context` and `context->state()` are dereferenced without a
// null check — confirm callers always supply both.
Status FunctionMatchBase::check(FunctionContext* context, const std::string& function_name) const {
    const auto& query_options = context->state()->query_options();
    if (!query_options.enable_match_without_inverted_index) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "{} not support execute_match", function_name);
    }

    DBUG_EXECUTE_IF("match.invert_index_not_support_execute_match", {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "debug point: {} not support execute_match", function_name);
    });

    return Status::OK();
}
287
288
// MATCH_ANY without an index: a row matches when at least one query term is
// equal to at least one of the row's terms.
//
// `result` is expected to be pre-filled with 0; only matching rows are set.
// An empty query term list leaves `result` untouched and returns OK.
Status FunctionMatchAny::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    // Running position inside the flat array element column (array input only).
    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; row++) {
        const auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                    array_offsets, current_src_array_offset);

        // TODO: more efficient impl
        const bool any_term_present = std::any_of(
                query_tokens.begin(), query_tokens.end(), [&](const TermInfo& wanted) {
                    return std::any_of(data_tokens.begin(), data_tokens.end(),
                                       [&](const TermInfo& present) {
                                           return present.get_single_term() ==
                                                  wanted.get_single_term();
                                       });
                });
        if (any_term_present) {
            result[row] = true;
        }
    }

    return Status::OK();
}
327
328
// MATCH_ALL without an index: a row matches when every query term is equal to
// at least one of the row's terms (order and adjacency are irrelevant).
//
// `result` is expected to be pre-filled with 0; only matching rows are set.
// An empty query term list leaves `result` untouched and returns OK.
Status FunctionMatchAll::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    // Running position inside the flat array element column (array input only).
    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; row++) {
        const auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                    array_offsets, current_src_array_offset);

        // TODO: more efficient impl
        // all_of stops at the first query term that is missing from the row.
        const bool every_term_present = std::all_of(
                query_tokens.begin(), query_tokens.end(), [&](const TermInfo& wanted) {
                    return std::any_of(data_tokens.begin(), data_tokens.end(),
                                       [&](const TermInfo& present) {
                                           return present.get_single_term() ==
                                                  wanted.get_single_term();
                                       });
                });
        if (every_term_present) {
            result[row] = true;
        }
    }

    return Status::OK();
}
373
374
// MATCH_PHRASE without an index: a row matches when the query terms occur in
// the row's term list as one contiguous run, in the same order.
//
// `result` is expected to be pre-filled with 0; only matching rows are set.
// An empty query term list leaves `result` untouched and returns OK.
Status FunctionMatchPhrase::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    const auto same_term = [](const TermInfo& data_term, const TermInfo& query_term) {
        return data_term.get_single_term() == query_term.get_single_term();
    };

    // Running position inside the flat array element column (array input only).
    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; row++) {
        const auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                    array_offsets, current_src_array_offset);

        // TODO: more efficient impl
        // Sub-sequence search: first position where the whole query run appears.
        const auto phrase_start = std::search(data_tokens.begin(), data_tokens.end(),
                                              query_tokens.begin(), query_tokens.end(), same_term);
        if (phrase_start != data_tokens.end()) {
            result[row] = true;
        }
    }

    return Status::OK();
}
435
436
// MATCH_PHRASE_PREFIX without an index: a row matches when some contiguous
// window of its terms equals the query terms exactly, except that the LAST
// query term only has to be a prefix of the corresponding row term.
//
// `result` is expected to be pre-filled with 0; only matching rows are set.
// An empty query term list leaves `result` untouched and returns OK.
Status FunctionMatchPhrasePrefix::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        const InvertedIndexAnalyzerCtx* analyzer_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    const size_t query_len = query_tokens.size();
    const size_t last_query_idx = query_len - 1;

    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; row++) {
        const auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                    array_offsets, current_src_array_offset);

        // Fewer row terms than query terms: no window can fit.
        if (data_tokens.size() < query_len) {
            continue;
        }

        // True when the window starting at `start` satisfies the phrase-prefix rule.
        const auto window_matches = [&](size_t start) {
            for (size_t k = 0; k < last_query_idx; ++k) {
                const std::string& data_token = data_tokens[start + k].get_single_term();
                const std::string& query_token = query_tokens[k].get_single_term();
                if (data_token != query_token) {
                    return false;
                }
            }
            const std::string& tail_data = data_tokens[start + last_query_idx].get_single_term();
            const std::string& tail_query = query_tokens[last_query_idx].get_single_term();
            return tail_data.starts_with(tail_query);
        };

        const size_t window_count = data_tokens.size() - query_len + 1;
        for (size_t start = 0; start < window_count; ++start) {
            if (window_matches(start)) {
                result[row] = true;
                break;
            }
        }
    }

    return Status::OK();
}
493
494
// MATCH_REGEXP without an index: the query string is compiled as a Hyperscan
// pattern and a row matches when the pattern matches at least one of the
// row's terms.
//
// Returns INDEX_INVALID_PARAMETERS when the pattern fails to compile or when
// scratch space cannot be allocated. A failing scan is logged and ends the
// scan of that row only. The database and scratch are released in the
// _CLFINALLY block once the row loop is done.
Status FunctionMatchRegexp::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match, parser_type: "
               << (analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                                : "unknown");

    hs_database_t* database = nullptr;
    hs_compile_error_t* compile_err = nullptr;
    hs_scratch_t* scratch = nullptr;

    const unsigned int compile_flags = HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8;
    const auto compile_rc = hs_compile(match_query_str.data(), compile_flags, HS_MODE_BLOCK,
                                       nullptr, &database, &compile_err);
    if (compile_rc != HS_SUCCESS) {
        std::string err_message = "hyperscan compilation failed: ";
        err_message.append(compile_err->message);
        LOG(ERROR) << err_message;
        hs_free_compile_error(compile_err);
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(err_message);
    }

    if (hs_alloc_scratch(database, &scratch) != HS_SUCCESS) {
        LOG(ERROR) << "hyperscan could not allocate scratch space.";
        hs_free_database(database);
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                "hyperscan could not allocate scratch space.");
    }

    // Match callback: flips the bool passed as user context; returning 0 lets
    // the scan continue.
    auto on_match = [](unsigned int /*id*/, unsigned long long /*from*/, unsigned long long /*to*/,
                       unsigned int /*flags*/, void* matched_flag) -> int {
        *static_cast<bool*>(matched_flag) = true;
        return 0;
    };

    try {
        int32_t current_src_array_offset = 0;
        for (int row = 0; row < input_rows_count; row++) {
            auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                  array_offsets, current_src_array_offset);

            for (auto& token : data_tokens) {
                bool is_match = false;
                const auto& input_str = token.get_single_term();
                const auto scan_rc =
                        hs_scan(database, input_str.data(), static_cast<uint32_t>(input_str.size()),
                                0, scratch, on_match, &is_match);
                if (scan_rc != HS_SUCCESS) {
                    LOG(ERROR) << "hyperscan match failed: " << input_str;
                    break;
                }

                if (is_match) {
                    result[row] = true;
                    break;
                }
            }
        }
    }
    _CLFINALLY({
        hs_free_scratch(scratch);
        hs_free_database(database);
    })

    return Status::OK();
}
563
564
// MATCH_PHRASE_EDGE without an index. A row matches when some contiguous
// window of its terms satisfies:
//  - single query term: the row term contains the query term as a substring;
//  - several query terms: the first row term ends with the first query term,
//    the last row term starts with the last query term, and every term in
//    between is equal.
//
// `result` is expected to be pre-filled with 0; only matching rows are set.
// An empty query term list leaves `result` untouched and returns OK.
Status FunctionMatchPhraseEdge::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        const InvertedIndexAnalyzerCtx* analyzer_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    const size_t query_len = query_tokens.size();
    const size_t last_query_idx = query_len - 1;

    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; row++) {
        const auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                    array_offsets, current_src_array_offset);

        // Fewer row terms than query terms: no window can fit.
        if (data_tokens.size() < query_len) {
            continue;
        }

        // True when the window starting at `start` satisfies the edge rule.
        const auto window_matches = [&](size_t start) {
            if (query_len == 1) {
                return data_tokens[start].get_single_term().find(
                               query_tokens[0].get_single_term()) != std::string::npos;
            }
            for (size_t k = 0; k < query_len; ++k) {
                const std::string& data_token = data_tokens[start + k].get_single_term();
                const std::string& query_token = query_tokens[k].get_single_term();
                bool position_ok = false;
                if (k == 0) {
                    position_ok = data_token.ends_with(query_token);
                } else if (k == last_query_idx) {
                    position_ok = data_token.starts_with(query_token);
                } else {
                    position_ok = (data_token == query_token);
                }
                if (!position_ok) {
                    return false;
                }
            }
            return true;
        };

        const size_t window_count = data_tokens.size() - query_len + 1;
        for (size_t start = 0; start < window_count; ++start) {
            if (window_matches(start)) {
                result[row] = true;
                break;
            }
        }
    }

    return Status::OK();
}
630
631
8
// Registers the six match function classes defined in this file with
// `factory`, in the order listed below.
void register_function_match(SimpleFunctionFactory& factory) {
632
8
    factory.register_function<FunctionMatchAny>();
633
8
    factory.register_function<FunctionMatchAll>();
634
8
    factory.register_function<FunctionMatchPhrase>();
635
8
    factory.register_function<FunctionMatchPhrasePrefix>();
636
8
    factory.register_function<FunctionMatchRegexp>();
637
8
    factory.register_function<FunctionMatchPhraseEdge>();
638
8
}
639
} // namespace doris