Coverage Report

Created: 2026-06-15 14:04

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/exprs/function/match.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/function/match.h"
19
20
#include <hs/hs.h>
21
22
#include "core/field.h"
23
#include "runtime/query_context.h"
24
#include "runtime/runtime_state.h"
25
#include "storage/index/index_reader_helper.h"
26
#include "storage/index/inverted/analyzer/analyzer.h"
27
#include "util/debug_points.h"
28
29
namespace doris {
30
#include "common/compile_check_begin.h"
31
32
namespace {
33
34
0
// Returns the analyzer context stored in the function state of `context`.
// The THREAD_LOCAL state is consulted first; when it is null the FRAGMENT_LOCAL
// state is returned instead (which may itself be null). A null `context`
// yields nullptr.
const InvertedIndexAnalyzerCtx* get_match_analyzer_ctx(FunctionContext* context) {
    if (context == nullptr) {
        return nullptr;
    }
    // Both scopes store the state behind the same accessor; only the scope differs.
    const auto lookup = [context](auto scope) {
        return reinterpret_cast<const InvertedIndexAnalyzerCtx*>(
                context->get_function_state(scope));
    };
    if (const auto* thread_ctx = lookup(FunctionContext::THREAD_LOCAL); thread_ctx != nullptr) {
        return thread_ctx;
    }
    return lookup(FunctionContext::FRAGMENT_LOCAL);
}
46
47
} // namespace
48
49
// Evaluates this MATCH function through the inverted index and stores the
// matching rows in `bitmap_result`.
//
// Exactly one argument (the query value), one field name/type pair and one
// index iterator are expected (DCHECKed). The call returns OK without touching
// `bitmap_result` when the iterator is null or the query value is NULL.
//
// Errors:
//  - INDEX_INVALID_PARAMETERS when a phrase-style function is used on a
//    fulltext reader that does not support phrases;
//  - INDEX_INVALID_PARAMETERS when the query argument is not a string type;
//  - any error reported by the iterator while reading the index or the null
//    bitmap.
Status FunctionMatchBase::evaluate_inverted_index(
        const ColumnsWithTypeAndName& arguments,
        const std::vector<IndexFieldNameAndTypePair>& data_type_with_names,
        std::vector<segment_v2::IndexIterator*> iterators, uint32_t num_rows,
        const InvertedIndexAnalyzerCtx* analyzer_ctx,
        segment_v2::InvertedIndexResultBitmap& bitmap_result) const {
    DCHECK(arguments.size() == 1);
    DCHECK(data_type_with_names.size() == 1);
    DCHECK(iterators.size() == 1);
    auto* iter = iterators[0];
    if (iter == nullptr) {
        return Status::OK();
    }
    const std::string& function_name = get_name();

    if (function_name == MATCH_PHRASE_FUNCTION || function_name == MATCH_PHRASE_PREFIX_FUNCTION ||
        function_name == MATCH_PHRASE_EDGE_FUNCTION) {
        auto reader = iter->get_reader(InvertedIndexReaderType::FULLTEXT);
        if (reader && !segment_v2::IndexReaderHelper::is_support_phrase(reader)) {
            return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                    "phrase queries require setting support_phrase = true");
        }
    }

    Field param_value;
    arguments[0].column->get(0, param_value);
    if (param_value.is_null()) {
        // if query value is null, skip evaluate inverted index
        return Status::OK();
    }
    const auto param_type = arguments[0].type->get_primitive_type();
    if (!is_string_type(param_type)) {
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                "arguments for match must be string");
    }

    // Bind by reference: the pair is only read to fill the query parameters.
    const auto& data_type_with_name = data_type_with_names[0];
    InvertedIndexParam param;
    param.column_name = data_type_with_name.first;
    param.column_type = data_type_with_name.second;
    param.query_value = param_value;
    param.query_type = get_query_type_from_fn_name();
    param.num_rows = num_rows;
    param.roaring = std::make_shared<roaring::Roaring>();
    param.analyzer_ctx = analyzer_ctx;
    // The argument type was validated as a string type above, so the index can
    // be read unconditionally here.
    RETURN_IF_ERROR(iter->read_from_index(&param));

    std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>();
    if (iter->has_null()) {
        segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
        RETURN_IF_ERROR(iter->read_null_bitmap(&null_bitmap_cache_handle));
        null_bitmap = null_bitmap_cache_handle.get_bitmap();
    }
    segment_v2::InvertedIndexResultBitmap result(param.roaring, null_bitmap);
    bitmap_result = std::move(result);
    bitmap_result.mask_out_null();

    return Status::OK();
}
111
// Executes the MATCH function directly on column data (no inverted index).
//
// arguments[0] is the column being searched and arguments[1] the query value;
// the query string is rendered from row 0 of arguments[1]. The searched column
// may be a string column, a nullable string column, or an array whose nested
// data is a (nullable) string column. The UInt8 result column is zero-filled
// and the concrete `execute_match` implementation sets matching rows to 1.
//
// Errors:
//  - NotSupported when the array's nested column is not a string column;
//  - InternalError when no string column can be extracted from the input;
//  - any error returned by `execute_match`.
Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block,
                                       const ColumnNumbers& arguments, uint32_t result,
                                       size_t input_rows_count) const {
    ColumnPtr& column_ptr = block.get_by_position(arguments[1]).column;
    DataTypePtr& type_ptr = block.get_by_position(arguments[1]).type;

    auto format_options = DataTypeSerDe::get_default_format_options();
    // Fall back to UTC when no runtime state is attached to the context.
    auto time_zone = cctz::utc_time_zone();
    format_options.timezone =
            (context && context->state()) ? &context->state()->timezone_obj() : &time_zone;

    auto match_query_str = type_ptr->to_string(*column_ptr, 0, format_options);
    std::string column_name = block.get_by_position(arguments[0]).name;
    VLOG_DEBUG << "begin to execute match directly, column_name=" << column_name
               << ", match_query_str=" << match_query_str;
    auto* analyzer_ctx = get_match_analyzer_ctx(context);
    const ColumnPtr source_col =
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
    const auto* values = check_and_get_column<ColumnString>(source_col.get());
    const ColumnArray* array_col = nullptr;
    if (is_column<ColumnArray>(source_col.get())) {
        array_col = check_and_get_column<ColumnArray>(source_col.get());
        if (array_col && !array_col->get_data().is_column_string()) {
            // The nested column name is reported as-is whether or not it is nullable.
            return Status::NotSupported(
                    fmt::format("unsupported nested array of type {} for function {}",
                                array_col->get_data().get_name(), get_name()));
        }

        if (is_column_nullable(array_col->get_data())) {
            const auto& array_nested_null_column =
                    reinterpret_cast<const ColumnNullable&>(array_col->get_data());
            values = check_and_get_column<ColumnString>(
                    *(array_nested_null_column.get_nested_column_ptr()));
        } else {
            // array column element is always set Nullable for now.
            values = check_and_get_column<ColumnString>(*(array_col->get_data_ptr()));
        }
    } else if (const auto* nullable = check_and_get_column<ColumnNullable>(source_col.get())) {
        values = check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
    }

    if (!values) {
        LOG(WARNING) << "Illegal column " << source_col->get_name();
        return Status::InternalError("Not supported input column types");
    }
    // result column
    auto res = ColumnUInt8::create();
    ColumnUInt8::Container& vec_res = res->get_data();
    // set default value to 0, and match functions only need to set 1/true
    vec_res.resize_fill(input_rows_count);
    RETURN_IF_ERROR(execute_match(context, column_name, match_query_str, input_rows_count, values,
                                  analyzer_ctx, (array_col ? &(array_col->get_offsets()) : nullptr),
                                  vec_res));
    block.replace_by_position(result, std::move(res));

    return Status::OK();
}
170
171
// Maps the function name returned by get_name() to the corresponding
// inverted index query type. Names that match none of the known MATCH_*
// function constants map to UNKNOWN_QUERY.
inline doris::segment_v2::InvertedIndexQueryType FunctionMatchBase::get_query_type_from_fn_name()
        const {
    using QueryType = doris::segment_v2::InvertedIndexQueryType;

    const std::string fn_name = get_name();
    if (fn_name == MATCH_ANY_FUNCTION) {
        return QueryType::MATCH_ANY_QUERY;
    }
    if (fn_name == MATCH_ALL_FUNCTION) {
        return QueryType::MATCH_ALL_QUERY;
    }
    if (fn_name == MATCH_PHRASE_FUNCTION) {
        return QueryType::MATCH_PHRASE_QUERY;
    }
    if (fn_name == MATCH_PHRASE_PREFIX_FUNCTION) {
        return QueryType::MATCH_PHRASE_PREFIX_QUERY;
    }
    if (fn_name == MATCH_PHRASE_REGEXP_FUNCTION) {
        return QueryType::MATCH_REGEXP_QUERY;
    }
    if (fn_name == MATCH_PHRASE_EDGE_FUNCTION) {
        return QueryType::MATCH_PHRASE_EDGE_QUERY;
    }
    return QueryType::UNKNOWN_QUERY;
}
189
190
std::vector<TermInfo> FunctionMatchBase::analyse_query_str_token(
191
        const InvertedIndexAnalyzerCtx* analyzer_ctx, const std::string& match_query_str,
192
8
        const std::string& column_name) const {
193
8
    std::vector<TermInfo> query_tokens;
194
8
    if (analyzer_ctx == nullptr) {
195
3
        return query_tokens;
196
3
    }
197
198
5
    VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: "
199
0
               << inverted_index_parser_type_to_string(analyzer_ctx->parser_type);
200
201
    // Decision is based on parser_type (from index properties):
202
    // - PARSER_NONE: no tokenization (keyword/exact match)
203
    // - Other parsers: tokenize using the analyzer
204
5
    if (!analyzer_ctx->should_tokenize()) {
205
        // Keyword index: all strings (including empty) are valid tokens for exact match.
206
        // Empty string is a valid value in keyword index and should be matchable.
207
1
        query_tokens.emplace_back(match_query_str);
208
1
        return query_tokens;
209
1
    }
210
211
    // Safety check: if analyzer is nullptr but tokenization is expected, fall back to no tokenization
212
4
    if (analyzer_ctx->analyzer == nullptr) {
213
0
        VLOG_DEBUG << "Analyzer is nullptr, falling back to no tokenization";
214
        // For fallback case, also allow empty strings to be matched
215
0
        query_tokens.emplace_back(match_query_str);
216
0
        return query_tokens;
217
0
    }
218
219
    // Tokenize using the analyzer
220
4
    auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
221
4
            analyzer_ctx->char_filter_map);
222
4
    reader->init(match_query_str.data(), (int)match_query_str.size(), true);
223
4
    query_tokens = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
224
4
            reader, analyzer_ctx->analyzer.get());
225
4
    return query_tokens;
226
4
}
227
228
inline std::vector<TermInfo> FunctionMatchBase::analyse_data_token(
229
        const std::string& column_name, const InvertedIndexAnalyzerCtx* analyzer_ctx,
230
        const ColumnString* string_col, int32_t current_block_row_idx,
231
128
        const ColumnArray::Offsets64* array_offsets, int32_t& current_src_array_offset) const {
232
128
    std::vector<TermInfo> data_tokens;
233
128
    if (analyzer_ctx == nullptr) {
234
0
        return data_tokens;
235
0
    }
236
237
    // Determine tokenization strategy based on parser_type
238
128
    const bool should_tokenize =
239
128
            analyzer_ctx->should_tokenize() && analyzer_ctx->analyzer != nullptr;
240
241
128
    if (array_offsets) {
242
2
        for (auto next_src_array_offset = (*array_offsets)[current_block_row_idx];
243
6
             current_src_array_offset < next_src_array_offset; ++current_src_array_offset) {
244
4
            const auto& str_ref = string_col->get_data_at(current_src_array_offset);
245
4
            if (!should_tokenize) {
246
0
                data_tokens.emplace_back(str_ref.to_string());
247
0
                continue;
248
0
            }
249
4
            auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
250
4
                    analyzer_ctx->char_filter_map);
251
4
            reader->init(str_ref.data, (int)str_ref.size, true);
252
4
            data_tokens =
253
4
                    doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
254
4
                            reader, analyzer_ctx->analyzer.get());
255
4
        }
256
126
    } else {
257
126
        const auto& str_ref = string_col->get_data_at(current_block_row_idx);
258
126
        if (!should_tokenize) {
259
3
            data_tokens.emplace_back(str_ref.to_string());
260
123
        } else {
261
123
            auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
262
123
                    analyzer_ctx->char_filter_map);
263
123
            reader->init(str_ref.data, (int)str_ref.size, true);
264
123
            data_tokens =
265
123
                    doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
266
123
                            reader, analyzer_ctx->analyzer.get());
267
123
        }
268
126
    }
269
128
    return data_tokens;
270
128
}
271
272
0
// Gate for executing a MATCH function directly on column data.
// Returns INVERTED_INDEX_NOT_SUPPORTED when the query option
// `enable_match_without_inverted_index` is off, or when the debug point
// "match.invert_index_not_support_execute_match" is active; OK otherwise.
// `context` and its runtime state are dereferenced without a null check.
Status FunctionMatchBase::check(FunctionContext* context, const std::string& function_name) const {
    const bool fallback_enabled =
            context->state()->query_options().enable_match_without_inverted_index;
    if (!fallback_enabled) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "{} not support execute_match", function_name);
    }

    DBUG_EXECUTE_IF("match.invert_index_not_support_execute_match", {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "debug point: {} not support execute_match", function_name);
    });

    return Status::OK();
}
285
286
// MATCH_ANY without an index: a row matches when at least one query term
// equals at least one term of the row. `result` is expected to be pre-filled
// with zeros; only matching rows are written. An empty query term list leaves
// `result` untouched and returns OK.
Status FunctionMatchAny::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    // Running element offset, only meaningful for array columns.
    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; ++row) {
        const auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                    array_offsets, current_src_array_offset);

        // TODO: more efficient impl
        const bool any_term_present = std::any_of(
                query_tokens.begin(), query_tokens.end(), [&](const TermInfo& wanted) {
                    return std::any_of(data_tokens.begin(), data_tokens.end(),
                                       [&](const TermInfo& seen) {
                                           return seen.get_single_term() ==
                                                  wanted.get_single_term();
                                       });
                });
        if (any_term_present) {
            result[row] = true;
        }
    }

    return Status::OK();
}
325
326
// MATCH_ALL without an index: a row matches when every query term equals some
// term of the row (order and adjacency do not matter). `result` is expected to
// be pre-filled with zeros; only matching rows are written. An empty query
// term list leaves `result` untouched and returns OK.
Status FunctionMatchAll::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    // Running element offset, only meaningful for array columns.
    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; ++row) {
        const auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                    array_offsets, current_src_array_offset);

        // TODO: more efficient impl
        // all_of stops at the first query term that is missing from the row.
        const bool every_term_present = std::all_of(
                query_tokens.begin(), query_tokens.end(), [&](const TermInfo& wanted) {
                    return std::any_of(data_tokens.begin(), data_tokens.end(),
                                       [&](const TermInfo& seen) {
                                           return seen.get_single_term() ==
                                                  wanted.get_single_term();
                                       });
                });
        if (every_term_present) {
            result[row] = true;
        }
    }

    return Status::OK();
}
371
372
// MATCH_PHRASE without an index: a row matches when the query terms occur in
// the row's term list as one contiguous run, in query order. `result` is
// expected to be pre-filled with zeros; only matching rows are written. An
// empty query term list leaves `result` untouched and returns OK.
Status FunctionMatchPhrase::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    const auto same_term = [](const TermInfo& seen, const TermInfo& wanted) {
        return seen.get_single_term() == wanted.get_single_term();
    };

    // Running element offset, only meaningful for array columns.
    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; ++row) {
        const auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                    array_offsets, current_src_array_offset);

        // TODO: more efficient impl
        // Look for the first position where the whole query sequence appears
        // back to back inside the row's terms.
        const auto phrase_pos = std::search(data_tokens.begin(), data_tokens.end(),
                                            query_tokens.begin(), query_tokens.end(), same_term);
        if (phrase_pos != data_tokens.end()) {
            result[row] = true;
        }
    }

    return Status::OK();
}
433
434
// MATCH_PHRASE_PREFIX without an index: a row matches when some contiguous
// window of its terms equals the query terms one by one, except that the last
// query term only has to be a prefix of the window's last term. `result` is
// expected to be pre-filled with zeros; only matching rows are written. An
// empty query term list leaves `result` untouched and returns OK.
Status FunctionMatchPhrasePrefix::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        const InvertedIndexAnalyzerCtx* analyzer_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    const size_t query_len = query_tokens.size();
    const std::string& prefix_term = query_tokens[query_len - 1].get_single_term();

    // Running element offset, only meaningful for array columns.
    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; ++row) {
        const auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                    array_offsets, current_src_array_offset);

        // A row with fewer terms than the query cannot contain the phrase.
        if (data_tokens.size() < query_len) {
            continue;
        }

        const size_t last_start = data_tokens.size() - query_len;
        for (size_t start = 0; start <= last_start; ++start) {
            // Every query term but the last must match exactly.
            bool leading_terms_equal = true;
            for (size_t k = 0; k + 1 < query_len; ++k) {
                if (data_tokens[start + k].get_single_term() !=
                    query_tokens[k].get_single_term()) {
                    leading_terms_equal = false;
                    break;
                }
            }
            if (!leading_terms_equal) {
                continue;
            }

            // The last query term is compared as a prefix.
            const std::string& window_tail = data_tokens[start + query_len - 1].get_single_term();
            if (window_tail.compare(0, prefix_term.size(), prefix_term) == 0) {
                result[row] = true;
                break;
            }
        }
    }

    return Status::OK();
}
491
492
// MATCH_REGEXP without an index: the query string is compiled as a Hyperscan
// pattern (DOTALL | ALLOWEMPTY | UTF8, block mode) and a row matches when the
// pattern matches any of the row's terms. `result` is expected to be
// pre-filled with zeros; only matching rows are written.
//
// Errors: INDEX_INVALID_PARAMETERS when the pattern does not compile or the
// Hyperscan scratch space cannot be allocated. A failing scan is logged and
// ends the scan of that row's remaining terms; it does not fail the call.
Status FunctionMatchRegexp::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match, parser_type: "
               << (analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                                : "unknown");

    hs_database_t* database = nullptr;
    hs_compile_error_t* compile_err = nullptr;
    const auto compile_rc =
            hs_compile(match_query_str.data(), HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
                       HS_MODE_BLOCK, nullptr, &database, &compile_err);
    if (compile_rc != HS_SUCCESS) {
        std::string err_message = "hyperscan compilation failed: ";
        err_message.append(compile_err->message);
        LOG(ERROR) << err_message;
        hs_free_compile_error(compile_err);
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(err_message);
    }

    hs_scratch_t* scratch = nullptr;
    if (hs_alloc_scratch(database, &scratch) != HS_SUCCESS) {
        LOG(ERROR) << "hyperscan could not allocate scratch space.";
        hs_free_database(database);
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                "hyperscan could not allocate scratch space.");
    }

    // Match callback: flips the bool passed as user context and returns 0.
    auto on_match = [](unsigned int /*id*/, unsigned long long /*from*/,
                       unsigned long long /*to*/, unsigned int /*flags*/,
                       void* matched_flag) -> int {
        *static_cast<bool*>(matched_flag) = true;
        return 0;
    };

    try {
        // Running element offset, only meaningful for array columns.
        int32_t current_src_array_offset = 0;
        for (int row = 0; row < input_rows_count; ++row) {
            auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                  array_offsets, current_src_array_offset);

            for (auto& token : data_tokens) {
                bool is_match = false;
                const auto& term = token.get_single_term();
                const auto scan_rc =
                        hs_scan(database, term.data(), static_cast<uint32_t>(term.size()), 0,
                                scratch, on_match, static_cast<void*>(&is_match));
                if (scan_rc != HS_SUCCESS) {
                    LOG(ERROR) << "hyperscan match failed: " << term;
                    break;
                }

                if (is_match) {
                    result[row] = true;
                    break;
                }
            }
        }
    }
    // Release the Hyperscan resources whether or not the loop threw.
    _CLFINALLY({
        hs_free_scratch(scratch);
        hs_free_database(database);
    })

    return Status::OK();
}
561
562
// MATCH_PHRASE_EDGE without an index. A row matches when some contiguous
// window of its terms satisfies:
//  - single query term: the window's term contains the query term as a
//    substring;
//  - several query terms: the first window term ends with the first query
//    term, the last window term starts with the last query term, and all
//    terms in between are equal.
// `result` is expected to be pre-filled with zeros; only matching rows are
// written. An empty query term list leaves `result` untouched and returns OK.
Status FunctionMatchPhraseEdge::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        const InvertedIndexAnalyzerCtx* analyzer_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    const auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    const size_t query_len = query_tokens.size();

    // Running element offset, only meaningful for array columns.
    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; ++row) {
        const auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                                    array_offsets, current_src_array_offset);

        // A row with fewer terms than the query cannot contain the phrase.
        if (data_tokens.size() < query_len) {
            continue;
        }

        // Checks the window of `query_len` terms starting at `start`.
        const auto window_matches = [&](size_t start) {
            if (query_len == 1) {
                return data_tokens[start].get_single_term().find(
                               query_tokens[0].get_single_term()) != std::string::npos;
            }
            if (!data_tokens[start].get_single_term().ends_with(
                        query_tokens.front().get_single_term())) {
                return false;
            }
            for (size_t k = 1; k + 1 < query_len; ++k) {
                if (data_tokens[start + k].get_single_term() !=
                    query_tokens[k].get_single_term()) {
                    return false;
                }
            }
            return data_tokens[start + query_len - 1].get_single_term().starts_with(
                    query_tokens.back().get_single_term());
        };

        const size_t last_start = data_tokens.size() - query_len;
        for (size_t start = 0; start <= last_start; ++start) {
            if (window_matches(start)) {
                result[row] = true;
                break;
            }
        }
    }

    return Status::OK();
}
628
629
1
// Registers every MATCH function variant defined in this file with `factory`:
// any, all, phrase, phrase-prefix, regexp and phrase-edge, in that order.
void register_function_match(SimpleFunctionFactory& factory) {
    // Term-set matching.
    factory.register_function<FunctionMatchAny>();
    factory.register_function<FunctionMatchAll>();

    // Ordered (phrase) matching.
    factory.register_function<FunctionMatchPhrase>();
    factory.register_function<FunctionMatchPhrasePrefix>();

    // Pattern and edge matching.
    factory.register_function<FunctionMatchRegexp>();
    factory.register_function<FunctionMatchPhraseEdge>();
}
637
#include "common/compile_check_end.h"
638
} // namespace doris