Coverage Report

Created: 2026-05-30 07:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/match.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/function/match.h"
19
20
#include <hs/hs.h>
21
22
#include "core/field.h"
23
#include "runtime/query_context.h"
24
#include "runtime/runtime_state.h"
25
#include "storage/index/index_reader_helper.h"
26
#include "storage/index/inverted/analyzer/analyzer.h"
27
#include "util/debug_points.h"
28
29
namespace doris {
30
31
namespace {
32
33
492
// Fetches the analyzer context stored as function state on `context`.
// The THREAD_LOCAL state is consulted first; the FRAGMENT_LOCAL state is used
// only when the thread-local one is absent. Returns nullptr when `context` is
// null or when neither scope holds a state.
const InvertedIndexAnalyzerCtx* get_match_analyzer_ctx(FunctionContext* context) {
    if (!context) {
        return nullptr;
    }

    const auto* thread_local_ctx = reinterpret_cast<const InvertedIndexAnalyzerCtx*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    if (thread_local_ctx != nullptr) {
        return thread_local_ctx;
    }

    return reinterpret_cast<const InvertedIndexAnalyzerCtx*>(
            context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
}
45
46
} // namespace
47
48
// Evaluates the match predicate through the inverted index.
//
// Expects exactly one argument (the query value), one column name/type pair
// and one index iterator. On success `bitmap_result` holds the matching row
// bitmap together with the null bitmap, with null rows masked out of the data
// bitmap.
//
// Returns OK without touching `bitmap_result` when the iterator is null or the
// query value is NULL. Returns INDEX_INVALID_PARAMETERS when a phrase-style
// query targets a reader that does not support phrases, or when the query
// argument is not a string type.
Status FunctionMatchBase::evaluate_inverted_index(
        const ColumnsWithTypeAndName& arguments,
        const std::vector<IndexFieldNameAndTypePair>& data_type_with_names,
        std::vector<segment_v2::IndexIterator*> iterators, uint32_t num_rows,
        const InvertedIndexAnalyzerCtx* analyzer_ctx,
        segment_v2::InvertedIndexResultBitmap& bitmap_result) const {
    DCHECK(arguments.size() == 1);
    DCHECK(data_type_with_names.size() == 1);
    DCHECK(iterators.size() == 1);
    auto* iter = iterators[0];
    if (iter == nullptr) {
        return Status::OK();
    }
    // Bind by reference: the pair is only read, so there is no need to copy it.
    const auto& data_type_with_name = data_type_with_names[0];
    const std::string& function_name = get_name();

    if (function_name == MATCH_PHRASE_FUNCTION || function_name == MATCH_PHRASE_PREFIX_FUNCTION ||
        function_name == MATCH_PHRASE_EDGE_FUNCTION) {
        auto reader = iter->get_reader(InvertedIndexReaderType::FULLTEXT);
        if (reader && !segment_v2::IndexReaderHelper::is_support_phrase(reader)) {
            return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                    "phrase queries require setting support_phrase = true");
        }
    }

    Field param_value;
    arguments[0].column->get(0, param_value);
    if (param_value.is_null()) {
        // if query value is null, skip evaluate inverted index
        return Status::OK();
    }

    // This is the only type gate needed: everything below runs with a string
    // query value, so no second check (and no unreachable error branch) follows.
    const auto param_type = arguments[0].type->get_primitive_type();
    if (!is_string_type(param_type)) {
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                "arguments for match must be string");
    }

    InvertedIndexParam param;
    param.column_name = data_type_with_name.first;
    param.column_type = data_type_with_name.second;
    param.query_value = param_value;
    param.query_type = get_query_type_from_fn_name();
    param.num_rows = num_rows;
    param.roaring = std::make_shared<roaring::Roaring>();
    param.analyzer_ctx = analyzer_ctx;
    RETURN_IF_ERROR(iter->read_from_index(&param));

    // Start from an empty null bitmap and swap in the indexed one when present.
    std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>();
    if (iter->has_null()) {
        segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
        RETURN_IF_ERROR(iter->read_null_bitmap(&null_bitmap_cache_handle));
        null_bitmap = null_bitmap_cache_handle.get_bitmap();
    }

    // Assign from a temporary instead of copying a named local.
    bitmap_result = segment_v2::InvertedIndexResultBitmap(param.roaring, null_bitmap);
    bitmap_result.mask_out_null();

    return Status::OK();
}
110
// Executes the match function directly on column data (no inverted index).
//
// arguments[0] is the column being searched and arguments[1] carries the query
// string (row 0 is rendered to text). The searched column may be a string
// column, a nullable string column, or an array whose elements are strings
// (optionally nullable). The UInt8 result column is zero-filled and the
// concrete execute_match() sets matching rows to 1; it replaces position
// `result` in `block`.
//
// Returns NotSupported for an array whose nested column is not a string column,
// InternalError when no string column can be resolved, and otherwise whatever
// execute_match() returns.
Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block,
                                       const ColumnNumbers& arguments, uint32_t result,
                                       size_t input_rows_count) const {
    ColumnPtr& column_ptr = block.get_by_position(arguments[1]).column;
    DataTypePtr& type_ptr = block.get_by_position(arguments[1]).type;

    auto format_options = DataTypeSerDe::get_default_format_options();
    // Fall back to UTC when there is no runtime state to take the time zone from.
    auto time_zone = cctz::utc_time_zone();
    format_options.timezone =
            (context && context->state()) ? &context->state()->timezone_obj() : &time_zone;

    auto match_query_str = type_ptr->to_string(*column_ptr, 0, format_options);
    std::string column_name = block.get_by_position(arguments[0]).name;
    VLOG_DEBUG << "begin to execute match directly, column_name=" << column_name
               << ", match_query_str=" << match_query_str;
    auto* analyzer_ctx = get_match_analyzer_ctx(context);
    const ColumnPtr source_col =
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
    const ColumnString* values = nullptr;
    const ColumnArray* array_col = nullptr;
    if (const auto array_col_guard = check_and_get_column<ColumnArray>(source_col.get())) {
        array_col = array_col_guard.get();
        if (!array_col->get_data().is_column_string()) {
            return Status::NotSupported(
                    fmt::format("unsupported nested array of type {} for function {}",
                                array_col->get_data().get_name(), get_name()));
        }

        if (is_column_nullable(array_col->get_data())) {
            const auto& array_nested_null_column =
                    reinterpret_cast<const ColumnNullable&>(array_col->get_data());
            const auto values_guard = check_and_get_column<ColumnString>(
                    *(array_nested_null_column.get_nested_column_ptr()));
            if (values_guard) {
                values = values_guard.get();
            }
        } else {
            // array column element is always set Nullable for now.
            const auto values_guard =
                    check_and_get_column<ColumnString>(*(array_col->get_data_ptr()));
            if (values_guard) {
                values = values_guard.get();
            }
        }
    } else if (const auto nullable = check_and_get_column<ColumnNullable>(source_col.get())) {
        const auto values_guard =
                check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
        if (values_guard) {
            values = values_guard.get();
        }
    } else if (const auto values_guard = check_and_get_column<ColumnString>(source_col.get())) {
        values = values_guard.get();
    }

    if (!values) {
        LOG(WARNING) << "Illegal column " << source_col->get_name();
        return Status::InternalError("Not supported input column types");
    }

    // result column
    auto res = ColumnUInt8::create();
    ColumnUInt8::Container& vec_res = res->get_data();
    // set default value to 0, and match functions only need to set 1/true
    vec_res.resize_fill(input_rows_count);
    RETURN_IF_ERROR(execute_match(context, column_name, match_query_str, input_rows_count, values,
                                  analyzer_ctx, (array_col ? &(array_col->get_offsets()) : nullptr),
                                  vec_res));
    block.replace_by_position(result, std::move(res));

    return Status::OK();
}
182
183
// Translates this function's name into the inverted index query type it
// stands for. Names that are not one of the known match functions map to
// UNKNOWN_QUERY.
inline doris::segment_v2::InvertedIndexQueryType FunctionMatchBase::get_query_type_from_fn_name()
        const {
    using QueryType = doris::segment_v2::InvertedIndexQueryType;

    const std::string own_name = get_name();
    if (own_name == MATCH_ANY_FUNCTION) {
        return QueryType::MATCH_ANY_QUERY;
    }
    if (own_name == MATCH_ALL_FUNCTION) {
        return QueryType::MATCH_ALL_QUERY;
    }
    if (own_name == MATCH_PHRASE_FUNCTION) {
        return QueryType::MATCH_PHRASE_QUERY;
    }
    if (own_name == MATCH_PHRASE_PREFIX_FUNCTION) {
        return QueryType::MATCH_PHRASE_PREFIX_QUERY;
    }
    if (own_name == MATCH_PHRASE_REGEXP_FUNCTION) {
        return QueryType::MATCH_REGEXP_QUERY;
    }
    if (own_name == MATCH_PHRASE_EDGE_FUNCTION) {
        return QueryType::MATCH_PHRASE_EDGE_QUERY;
    }
    return QueryType::UNKNOWN_QUERY;
}
201
202
std::vector<TermInfo> FunctionMatchBase::analyse_query_str_token(
203
        const InvertedIndexAnalyzerCtx* analyzer_ctx, const std::string& match_query_str,
204
492
        const std::string& column_name) const {
205
492
    std::vector<TermInfo> query_tokens;
206
492
    if (analyzer_ctx == nullptr) {
207
3
        return query_tokens;
208
3
    }
209
210
489
    VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: "
211
0
               << inverted_index_parser_type_to_string(analyzer_ctx->parser_type);
212
213
    // Decision is based on parser_type (from index properties):
214
    // - PARSER_NONE: no tokenization (keyword/exact match)
215
    // - Other parsers: tokenize using the analyzer
216
489
    if (!analyzer_ctx->should_tokenize()) {
217
        // Keyword index: all strings (including empty) are valid tokens for exact match.
218
        // Empty string is a valid value in keyword index and should be matchable.
219
4
        query_tokens.emplace_back(match_query_str);
220
4
        return query_tokens;
221
4
    }
222
223
    // Safety check: if analyzer is nullptr but tokenization is expected, fall back to no tokenization
224
485
    if (analyzer_ctx->analyzer == nullptr) {
225
0
        VLOG_DEBUG << "Analyzer is nullptr, falling back to no tokenization";
226
        // For fallback case, also allow empty strings to be matched
227
0
        query_tokens.emplace_back(match_query_str);
228
0
        return query_tokens;
229
0
    }
230
231
    // Tokenize using the analyzer
232
485
    auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
233
485
            analyzer_ctx->char_filter_map);
234
485
    reader->init(match_query_str.data(), (int)match_query_str.size(), true);
235
485
    query_tokens = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
236
485
            reader, analyzer_ctx->analyzer.get());
237
485
    return query_tokens;
238
485
}
239
240
inline std::vector<TermInfo> FunctionMatchBase::analyse_data_token(
241
        const std::string& column_name, const InvertedIndexAnalyzerCtx* analyzer_ctx,
242
        const ColumnString* string_col, int32_t current_block_row_idx,
243
25.5k
        const ColumnArray::Offsets64* array_offsets, int32_t& current_src_array_offset) const {
244
25.5k
    std::vector<TermInfo> data_tokens;
245
25.5k
    if (analyzer_ctx == nullptr) {
246
0
        return data_tokens;
247
0
    }
248
249
    // Determine tokenization strategy based on parser_type
250
25.5k
    const bool should_tokenize =
251
25.5k
            analyzer_ctx->should_tokenize() && analyzer_ctx->analyzer != nullptr;
252
253
25.5k
    if (array_offsets) {
254
13
        for (auto next_src_array_offset = (*array_offsets)[current_block_row_idx];
255
28
             current_src_array_offset < next_src_array_offset; ++current_src_array_offset) {
256
15
            const auto& str_ref = string_col->get_data_at(current_src_array_offset);
257
15
            if (!should_tokenize) {
258
0
                data_tokens.emplace_back(str_ref.to_string());
259
0
                continue;
260
0
            }
261
15
            auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
262
15
                    analyzer_ctx->char_filter_map);
263
15
            reader->init(str_ref.data, (int)str_ref.size, true);
264
15
            data_tokens =
265
15
                    doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
266
15
                            reader, analyzer_ctx->analyzer.get());
267
15
        }
268
25.4k
    } else {
269
25.4k
        const auto& str_ref = string_col->get_data_at(current_block_row_idx);
270
25.4k
        if (!should_tokenize) {
271
15
            data_tokens.emplace_back(str_ref.to_string());
272
25.4k
        } else {
273
25.4k
            auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
274
25.4k
                    analyzer_ctx->char_filter_map);
275
25.4k
            reader->init(str_ref.data, (int)str_ref.size, true);
276
25.4k
            data_tokens =
277
25.4k
                    doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
278
25.4k
                            reader, analyzer_ctx->analyzer.get());
279
25.4k
        }
280
25.4k
    }
281
25.5k
    return data_tokens;
282
25.5k
}
283
284
492
// Verifies that evaluating `function_name` without an inverted index is allowed.
//
// Returns InternalError when there is no function context or runtime state to
// read the query options from (instead of dereferencing a null pointer),
// INVERTED_INDEX_NOT_SUPPORTED when the query option
// enable_match_without_inverted_index is off or when the debug point
// "match.invert_index_not_support_execute_match" is active, and OK otherwise.
Status FunctionMatchBase::check(FunctionContext* context, const std::string& function_name) const {
    if (context == nullptr || context->state() == nullptr) {
        return Status::InternalError(
                fmt::format("{} can not check execute_match without a runtime state",
                            function_name));
    }

    if (!context->state()->query_options().enable_match_without_inverted_index) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "{} not support execute_match", function_name);
    }

    DBUG_EXECUTE_IF("match.invert_index_not_support_execute_match", {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "debug point: {} not support execute_match", function_name);
    });

    return Status::OK();
}
297
298
// MATCH_ANY without an index: a row matches when at least one query term
// equals one of the row's terms. `result` is expected to be zero-filled; only
// matching rows are written. An empty query term list matches nothing.
Status FunctionMatchAny::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    // Running element index for array columns; analyse_data_token advances it.
    int32_t array_cursor = 0;
    for (int row = 0; row < input_rows_count; ++row) {
        const auto row_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                   array_offsets, array_cursor);

        // TODO: more efficient impl
        const bool any_term_present = std::any_of(
                query_tokens.begin(), query_tokens.end(), [&](const TermInfo& wanted) {
                    return std::any_of(row_tokens.begin(), row_tokens.end(),
                                       [&](const TermInfo& seen) {
                                           return seen.get_single_term() ==
                                                  wanted.get_single_term();
                                       });
                });
        if (any_term_present) {
            result[row] = true;
        }
    }

    return Status::OK();
}
337
338
// MATCH_ALL without an index: a row matches when every query term equals one
// of the row's terms. `result` is expected to be zero-filled; only matching
// rows are written. An empty query term list matches nothing.
Status FunctionMatchAll::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    // Running element index for array columns; analyse_data_token advances it.
    int32_t array_cursor = 0;
    for (int row = 0; row < input_rows_count; ++row) {
        const auto row_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                   array_offsets, array_cursor);

        // TODO: more efficient impl
        // all_of stops at the first missing term, like the original early break.
        const bool every_term_present = std::all_of(
                query_tokens.begin(), query_tokens.end(), [&](const TermInfo& wanted) {
                    return std::any_of(row_tokens.begin(), row_tokens.end(),
                                       [&](const TermInfo& seen) {
                                           return seen.get_single_term() ==
                                                  wanted.get_single_term();
                                       });
                });
        if (every_term_present) {
            result[row] = true;
        }
    }

    return Status::OK();
}
383
384
// MATCH_PHRASE without an index: a row matches when the query terms appear in
// the row's term list as one contiguous run, in the same order. `result` is
// expected to be zero-filled; only matching rows are written. An empty query
// term list matches nothing.
Status FunctionMatchPhrase::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    const auto same_term = [](const TermInfo& data_term, const TermInfo& query_term) {
        return data_term.get_single_term() == query_term.get_single_term();
    };

    // Running element index for array columns; analyse_data_token advances it.
    int32_t array_cursor = 0;
    for (int row = 0; row < input_rows_count; ++row) {
        const auto row_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                   array_offsets, array_cursor);

        // TODO: more efficient impl
        // Subsequence search: locate the first position where the whole query
        // term sequence occurs consecutively in the row's terms.
        const auto phrase_pos = std::search(row_tokens.begin(), row_tokens.end(),
                                            query_tokens.begin(), query_tokens.end(), same_term);
        if (phrase_pos != row_tokens.end()) {
            result[row] = true;
        }
    }

    return Status::OK();
}
445
446
// MATCH_PHRASE_PREFIX without an index: a row matches when some window of
// consecutive row terms equals the query terms one by one, except that the
// last query term only has to be a prefix of the row term at that position.
// `result` is expected to be zero-filled; only matching rows are written.
Status FunctionMatchPhrasePrefix::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        const InvertedIndexAnalyzerCtx* analyzer_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    const size_t query_len = query_tokens.size();
    const size_t last_query_idx = query_len - 1;

    // Running element index for array columns; analyse_data_token advances it.
    int32_t array_cursor = 0;
    for (int row = 0; row < input_rows_count; ++row) {
        const auto row_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                   array_offsets, array_cursor);

        // Fewer row terms than query terms: no window can fit.
        if (row_tokens.size() < query_len) {
            continue;
        }

        const size_t last_window_start = row_tokens.size() - query_len;
        for (size_t start = 0; start <= last_window_start; ++start) {
            // With several query terms the window must open with an exact hit
            // on the first one; a single query term is prefix-tested everywhere.
            if (query_len != 1 &&
                row_tokens[start].get_single_term() != query_tokens[0].get_single_term()) {
                continue;
            }

            bool window_matches = true;
            for (size_t k = 0; k < query_len && window_matches; ++k) {
                const std::string& data_token = row_tokens[start + k].get_single_term();
                const std::string& query_token = query_tokens[k].get_single_term();
                window_matches =
                        (k == last_query_idx)
                                ? data_token.compare(0, query_token.size(), query_token) == 0
                                : data_token == query_token;
            }

            if (window_matches) {
                result[row] = true;
                break;
            }
        }
    }

    return Status::OK();
}
503
504
// MATCH_REGEXP without an index: compiles the query string as a Hyperscan
// pattern (block mode, DOTALL | ALLOWEMPTY | UTF8) and marks a row as matching
// when the pattern matches any of the row's terms. `result` is expected to be
// zero-filled; only matching rows are written.
//
// Returns INDEX_INVALID_PARAMETERS when the pattern does not compile or when
// scratch space cannot be allocated. A scan failure on one term is logged and
// ends the evaluation of that row only.
//
// The database and scratch space are owned by unique_ptr so that they are
// released on every exit path, including exceptions thrown while analysing
// the data.
Status FunctionMatchRegexp::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match, parser_type: "
               << (analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                                : "unknown");

    const std::string& pattern = match_query_str;

    hs_database_t* raw_database = nullptr;
    hs_compile_error_t* compile_err = nullptr;
    if (hs_compile(pattern.data(), HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
                   HS_MODE_BLOCK, nullptr, &raw_database, &compile_err) != HS_SUCCESS) {
        std::string err_message = "hyperscan compilation failed: ";
        if (compile_err != nullptr) {
            err_message.append(compile_err->message);
            hs_free_compile_error(compile_err);
        }
        LOG(ERROR) << err_message;
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(err_message);
    }
    const std::unique_ptr<hs_database_t, decltype(&hs_free_database)> database(
            raw_database, &hs_free_database);

    hs_scratch_t* raw_scratch = nullptr;
    if (hs_alloc_scratch(database.get(), &raw_scratch) != HS_SUCCESS) {
        LOG(ERROR) << "hyperscan could not allocate scratch space.";
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                "hyperscan could not allocate scratch space.");
    }
    const std::unique_ptr<hs_scratch_t, decltype(&hs_free_scratch)> scratch(raw_scratch,
                                                                            &hs_free_scratch);

    // Only "did anything match" is needed, so the callback asks Hyperscan to
    // stop at the first match by returning non-zero; hs_scan then reports
    // HS_SCAN_TERMINATED, which is treated as success below.
    auto on_match = [](unsigned int id, unsigned long long from, unsigned long long to,
                       unsigned int flags, void* context) -> int {
        *((bool*)context) = true;
        return 1;
    };

    // Running element index for array columns; analyse_data_token advances it.
    int32_t current_src_array_offset = 0;
    for (int i = 0; i < input_rows_count; i++) {
        auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, i,
                                              array_offsets, current_src_array_offset);

        for (auto& input : data_tokens) {
            bool is_match = false;
            const auto& input_str = input.get_single_term();
            const auto scan_status =
                    hs_scan(database.get(), input_str.data(), (uint32_t)input_str.size(), 0,
                            scratch.get(), on_match, (void*)&is_match);
            if (scan_status != HS_SUCCESS && scan_status != HS_SCAN_TERMINATED) {
                LOG(ERROR) << "hyperscan match failed: " << input_str;
                break;
            }

            if (is_match) {
                result[i] = true;
                break;
            }
        }
    }

    return Status::OK();
}
573
574
// MATCH_PHRASE_EDGE without an index: a row matches when some window of
// consecutive row terms satisfies the query terms with relaxed edges:
//   - a single query term only has to occur as a substring of a row term;
//   - otherwise the first query term must be a suffix of the window's first
//     term, the last query term a prefix of the window's last term, and every
//     term in between must be equal.
// `result` is expected to be zero-filled; only matching rows are written.
Status FunctionMatchPhraseEdge::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        const InvertedIndexAnalyzerCtx* analyzer_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    const size_t query_len = query_tokens.size();
    const size_t last_query_idx = query_len - 1;

    // Running element index for array columns; analyse_data_token advances it.
    int32_t array_cursor = 0;
    for (int row = 0; row < input_rows_count; ++row) {
        const auto row_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                   array_offsets, array_cursor);

        // Fewer row terms than query terms: no window can fit.
        if (row_tokens.size() < query_len) {
            continue;
        }

        // Decides whether the window opening at `start` satisfies the query.
        const auto window_matches = [&](size_t start) {
            if (query_len == 1) {
                return row_tokens[start].get_single_term().find(
                               query_tokens[0].get_single_term()) != std::string::npos;
            }
            for (size_t k = 0; k < query_len; ++k) {
                const std::string& data_token = row_tokens[start + k].get_single_term();
                const std::string& query_token = query_tokens[k].get_single_term();
                bool term_ok = false;
                if (k == 0) {
                    term_ok = data_token.ends_with(query_token);
                } else if (k == last_query_idx) {
                    term_ok = data_token.starts_with(query_token);
                } else {
                    term_ok = (data_token == query_token);
                }
                if (!term_ok) {
                    return false;
                }
            }
            return true;
        };

        const size_t last_window_start = row_tokens.size() - query_len;
        for (size_t start = 0; start <= last_window_start; ++start) {
            if (window_matches(start)) {
                result[row] = true;
                break;
            }
        }
    }

    return Status::OK();
}
640
641
8
// Registers every match function variant defined in this file with `factory`.
void register_function_match(SimpleFunctionFactory& factory) {
    // Term-set matching.
    factory.register_function<FunctionMatchAny>();
    factory.register_function<FunctionMatchAll>();

    // Phrase matching and its prefix variant.
    factory.register_function<FunctionMatchPhrase>();
    factory.register_function<FunctionMatchPhrasePrefix>();

    // Regular expression matching.
    factory.register_function<FunctionMatchRegexp>();

    // Phrase matching with relaxed first/last terms.
    factory.register_function<FunctionMatchPhraseEdge>();
}
649
} // namespace doris