Coverage Report

Created: 2026-03-15 15:38

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/match.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exprs/function/match.h"
19
20
#include <hs/hs.h>
21
22
#include "runtime/query_context.h"
23
#include "runtime/runtime_state.h"
24
#include "storage/index/index_reader_helper.h"
25
#include "storage/index/inverted/analyzer/analyzer.h"
26
#include "util/debug_points.h"
27
28
namespace doris {
29
#include "common/compile_check_begin.h"
30
31
namespace {
32
33
867
// Returns the analyzer context stored as function state on `context`.
// The THREAD_LOCAL state is consulted first; the FRAGMENT_LOCAL state is used
// only when no thread-local state is present. Yields nullptr when `context`
// itself is null or when neither scope holds a state.
const InvertedIndexAnalyzerCtx* get_match_analyzer_ctx(FunctionContext* context) {
    if (context == nullptr) {
        return nullptr;
    }
    const auto* thread_local_ctx = reinterpret_cast<const InvertedIndexAnalyzerCtx*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    if (thread_local_ctx != nullptr) {
        return thread_local_ctx;
    }
    return reinterpret_cast<const InvertedIndexAnalyzerCtx*>(
            context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
}
45
46
} // namespace
47
48
// Evaluates this match function through an inverted index.
//
// Expects exactly one argument (the query string), one indexed column
// descriptor and one index iterator (enforced by DCHECK only).
//
// Returns OK without touching `bitmap_result` when the iterator is null or the
// query value is null. Returns INDEX_INVALID_PARAMETERS when a phrase-style
// function is used on a full-text reader that does not support phrases, or
// when the query argument is not a string type. Otherwise `bitmap_result`
// receives the rows read from the index with null rows masked out.
Status FunctionMatchBase::evaluate_inverted_index(
        const ColumnsWithTypeAndName& arguments,
        const std::vector<IndexFieldNameAndTypePair>& data_type_with_names,
        std::vector<segment_v2::IndexIterator*> iterators, uint32_t num_rows,
        const InvertedIndexAnalyzerCtx* analyzer_ctx,
        segment_v2::InvertedIndexResultBitmap& bitmap_result) const {
    DCHECK(arguments.size() == 1);
    DCHECK(data_type_with_names.size() == 1);
    DCHECK(iterators.size() == 1);
    auto* iter = iterators[0];
    if (iter == nullptr) {
        return Status::OK();
    }
    // Bound by reference: the vector outlives this call, no need to copy the pair.
    const auto& data_type_with_name = data_type_with_names[0];
    const std::string& function_name = get_name();

    if (function_name == MATCH_PHRASE_FUNCTION || function_name == MATCH_PHRASE_PREFIX_FUNCTION ||
        function_name == MATCH_PHRASE_EDGE_FUNCTION) {
        auto reader = iter->get_reader(InvertedIndexReaderType::FULLTEXT);
        if (reader && !segment_v2::IndexReaderHelper::is_support_phrase(reader)) {
            return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                    "phrase queries require setting support_phrase = true");
        }
    }

    Field param_value;
    arguments[0].column->get(0, param_value);
    if (param_value.is_null()) {
        // if query value is null, skip evaluate inverted index
        return Status::OK();
    }

    // This is the single type gate: everything below may assume a string query.
    auto param_type = arguments[0].type->get_primitive_type();
    if (!is_string_type(param_type)) {
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                "arguments for match must be string");
    }

    std::unique_ptr<InvertedIndexQueryParamFactory> query_param = nullptr;
    RETURN_IF_ERROR(InvertedIndexQueryParamFactory::create_query_value(param_type, &param_value,
                                                                       query_param));

    InvertedIndexParam param;
    param.column_name = data_type_with_name.first;
    param.column_type = data_type_with_name.second;
    param.query_value = query_param->get_value();
    param.query_type = get_query_type_from_fn_name();
    param.num_rows = num_rows;
    param.roaring = std::make_shared<roaring::Roaring>();
    param.analyzer_ctx = analyzer_ctx;
    RETURN_IF_ERROR(iter->read_from_index(&param));

    std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>();
    if (iter->has_null()) {
        segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
        RETURN_IF_ERROR(iter->read_null_bitmap(&null_bitmap_cache_handle));
        null_bitmap = null_bitmap_cache_handle.get_bitmap();
    }

    // NOTE(review): copy-assignment is kept on purpose rather than a move;
    // presumably it gives `bitmap_result` bitmaps that are independent of the
    // cached null bitmap - confirm before changing.
    segment_v2::InvertedIndexResultBitmap result(param.roaring, null_bitmap);
    bitmap_result = result;
    bitmap_result.mask_out_null();

    return Status::OK();
}
114
// Executes the match function directly on column data (no inverted index).
//
// arguments[0] is the data column (String, Nullable(String) or Array of
// String); arguments[1] is the query, whose value is read from row 0 and
// rendered to a string. The per-function matching is delegated to
// execute_match(); on success a UInt8 column (0 = no match, 1 = match)
// replaces the column at position `result`.
//
// Returns NotSupported for arrays whose nested column is not a string column,
// InternalError when no string data can be extracted from the input column,
// and otherwise whatever execute_match() returns.
Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block,
                                       const ColumnNumbers& arguments, uint32_t result,
                                       size_t input_rows_count) const {
    ColumnPtr& column_ptr = block.get_by_position(arguments[1]).column;
    DataTypePtr& type_ptr = block.get_by_position(arguments[1]).type;

    auto format_options = DataTypeSerDe::get_default_format_options();
    // UTC is the fallback when no runtime state (and thus no session time zone) is available.
    auto time_zone = cctz::utc_time_zone();
    format_options.timezone =
            (context && context->state()) ? &context->state()->timezone_obj() : &time_zone;

    auto match_query_str = type_ptr->to_string(*column_ptr, 0, format_options);
    const std::string& column_name = block.get_by_position(arguments[0]).name;
    VLOG_DEBUG << "begin to execute match directly, column_name=" << column_name
               << ", match_query_str=" << match_query_str;
    auto* analyzer_ctx = get_match_analyzer_ctx(context);
    const ColumnPtr source_col =
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
    const auto* values = check_and_get_column<ColumnString>(source_col.get());
    const ColumnArray* array_col = check_and_get_column<ColumnArray>(source_col.get());
    if (array_col != nullptr) {
        if (!array_col->get_data().is_column_string()) {
            return Status::NotSupported(
                    fmt::format("unsupported nested array of type {} for function {}",
                                array_col->get_data().get_name(), get_name()));
        }

        if (const auto* nested_nullable =
                    check_and_get_column<ColumnNullable>(array_col->get_data())) {
            values = check_and_get_column<ColumnString>(
                    *(nested_nullable->get_nested_column_ptr()));
        } else {
            // array column element is always set Nullable for now.
            values = check_and_get_column<ColumnString>(*(array_col->get_data_ptr()));
        }
    } else if (const auto* nullable = check_and_get_column<ColumnNullable>(source_col.get())) {
        values = check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
    }

    if (!values) {
        LOG(WARNING) << "Illegal column " << source_col->get_name();
        return Status::InternalError("Not supported input column types");
    }
    // result column
    auto res = ColumnUInt8::create();
    ColumnUInt8::Container& vec_res = res->get_data();
    // set default value to 0, and match functions only need to set 1/true
    vec_res.resize_fill(input_rows_count);
    RETURN_IF_ERROR(execute_match(context, column_name, match_query_str, input_rows_count, values,
                                  analyzer_ctx, (array_col ? &(array_col->get_offsets()) : nullptr),
                                  vec_res));
    block.replace_by_position(result, std::move(res));

    return Status::OK();
}
173
174
// Maps this function's name to the corresponding inverted index query type.
// Names that match none of the known match functions yield UNKNOWN_QUERY.
inline doris::segment_v2::InvertedIndexQueryType FunctionMatchBase::get_query_type_from_fn_name()
        const {
    using QueryType = doris::segment_v2::InvertedIndexQueryType;

    const std::string fn_name = get_name();
    if (fn_name == MATCH_ANY_FUNCTION) {
        return QueryType::MATCH_ANY_QUERY;
    }
    if (fn_name == MATCH_ALL_FUNCTION) {
        return QueryType::MATCH_ALL_QUERY;
    }
    if (fn_name == MATCH_PHRASE_FUNCTION) {
        return QueryType::MATCH_PHRASE_QUERY;
    }
    if (fn_name == MATCH_PHRASE_PREFIX_FUNCTION) {
        return QueryType::MATCH_PHRASE_PREFIX_QUERY;
    }
    if (fn_name == MATCH_PHRASE_REGEXP_FUNCTION) {
        return QueryType::MATCH_REGEXP_QUERY;
    }
    if (fn_name == MATCH_PHRASE_EDGE_FUNCTION) {
        return QueryType::MATCH_PHRASE_EDGE_QUERY;
    }
    return QueryType::UNKNOWN_QUERY;
}
192
193
std::vector<TermInfo> FunctionMatchBase::analyse_query_str_token(
194
        const InvertedIndexAnalyzerCtx* analyzer_ctx, const std::string& match_query_str,
195
867
        const std::string& column_name) const {
196
867
    std::vector<TermInfo> query_tokens;
197
867
    if (analyzer_ctx == nullptr) {
198
3
        return query_tokens;
199
3
    }
200
201
864
    VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: "
202
0
               << inverted_index_parser_type_to_string(analyzer_ctx->parser_type);
203
204
    // Decision is based on parser_type (from index properties):
205
    // - PARSER_NONE: no tokenization (keyword/exact match)
206
    // - Other parsers: tokenize using the analyzer
207
864
    if (!analyzer_ctx->should_tokenize()) {
208
        // Keyword index: all strings (including empty) are valid tokens for exact match.
209
        // Empty string is a valid value in keyword index and should be matchable.
210
14
        query_tokens.emplace_back(match_query_str);
211
14
        return query_tokens;
212
14
    }
213
214
    // Safety check: if analyzer is nullptr but tokenization is expected, fall back to no tokenization
215
850
    if (analyzer_ctx->analyzer == nullptr) {
216
0
        VLOG_DEBUG << "Analyzer is nullptr, falling back to no tokenization";
217
        // For fallback case, also allow empty strings to be matched
218
0
        query_tokens.emplace_back(match_query_str);
219
0
        return query_tokens;
220
0
    }
221
222
    // Tokenize using the analyzer
223
850
    auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
224
850
            analyzer_ctx->char_filter_map);
225
850
    reader->init(match_query_str.data(), (int)match_query_str.size(), true);
226
850
    query_tokens = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
227
850
            reader, analyzer_ctx->analyzer.get());
228
850
    return query_tokens;
229
850
}
230
231
inline std::vector<TermInfo> FunctionMatchBase::analyse_data_token(
232
        const std::string& column_name, const InvertedIndexAnalyzerCtx* analyzer_ctx,
233
        const ColumnString* string_col, int32_t current_block_row_idx,
234
150k
        const ColumnArray::Offsets64* array_offsets, int32_t& current_src_array_offset) const {
235
150k
    std::vector<TermInfo> data_tokens;
236
150k
    if (analyzer_ctx == nullptr) {
237
0
        return data_tokens;
238
0
    }
239
240
    // Determine tokenization strategy based on parser_type
241
150k
    const bool should_tokenize =
242
150k
            analyzer_ctx->should_tokenize() && analyzer_ctx->analyzer != nullptr;
243
244
150k
    if (array_offsets) {
245
13
        for (auto next_src_array_offset = (*array_offsets)[current_block_row_idx];
246
28
             current_src_array_offset < next_src_array_offset; ++current_src_array_offset) {
247
15
            const auto& str_ref = string_col->get_data_at(current_src_array_offset);
248
15
            if (!should_tokenize) {
249
0
                data_tokens.emplace_back(str_ref.to_string());
250
0
                continue;
251
0
            }
252
15
            auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
253
15
                    analyzer_ctx->char_filter_map);
254
15
            reader->init(str_ref.data, (int)str_ref.size, true);
255
15
            data_tokens =
256
15
                    doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
257
15
                            reader, analyzer_ctx->analyzer.get());
258
15
        }
259
150k
    } else {
260
150k
        const auto& str_ref = string_col->get_data_at(current_block_row_idx);
261
150k
        if (!should_tokenize) {
262
27
            data_tokens.emplace_back(str_ref.to_string());
263
150k
        } else {
264
150k
            auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
265
150k
                    analyzer_ctx->char_filter_map);
266
150k
            reader->init(str_ref.data, (int)str_ref.size, true);
267
150k
            data_tokens =
268
150k
                    doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
269
150k
                            reader, analyzer_ctx->analyzer.get());
270
150k
        }
271
150k
    }
272
150k
    return data_tokens;
273
150k
}
274
275
867
// Gate for executing a match function without an inverted index.
// Returns INVERTED_INDEX_NOT_SUPPORTED when the query option
// `enable_match_without_inverted_index` is off, or when the debug point
// "match.invert_index_not_support_execute_match" is active; OK otherwise.
// `context` and its runtime state are dereferenced unconditionally.
Status FunctionMatchBase::check(FunctionContext* context, const std::string& function_name) const {
    const bool fallback_allowed =
            context->state()->query_options().enable_match_without_inverted_index;
    if (!fallback_allowed) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "{} not support execute_match", function_name);
    }

    DBUG_EXECUTE_IF("match.invert_index_not_support_execute_match", {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "debug point: {} not support execute_match", function_name);
    });

    return Status::OK();
}
288
289
// MATCH_ANY on raw column data: a row matches when at least one query term
// equals one of the row's terms. `result` is only ever set to true; rows that
// do not match keep the value they had on entry. When the query yields no
// terms, no row is marked and OK is returned.
Status FunctionMatchAny::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; row++) {
        auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                              array_offsets, current_src_array_offset);

        // TODO: more efficient impl
        const bool any_term_present = std::any_of(
                query_tokens.begin(), query_tokens.end(), [&](const TermInfo& wanted) {
                    return std::any_of(data_tokens.begin(), data_tokens.end(),
                                       [&](const TermInfo& present) {
                                           return present.get_single_term() ==
                                                  wanted.get_single_term();
                                       });
                });
        if (any_term_present) {
            result[row] = true;
        }
    }

    return Status::OK();
}
328
329
// MATCH_ALL on raw column data: a row matches when every query term equals
// some term of the row (order and adjacency are irrelevant). `result` is only
// ever set to true. When the query yields no terms, no row is marked and OK
// is returned.
Status FunctionMatchAll::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; row++) {
        auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                              array_offsets, current_src_array_offset);

        // TODO: more efficient impl
        // Stops at the first query term that is missing from the row.
        const bool every_term_present = std::all_of(
                query_tokens.begin(), query_tokens.end(), [&](const TermInfo& wanted) {
                    return std::any_of(data_tokens.begin(), data_tokens.end(),
                                       [&](const TermInfo& present) {
                                           return present.get_single_term() ==
                                                  wanted.get_single_term();
                                       });
                });
        if (every_term_present) {
            result[row] = true;
        }
    }

    return Status::OK();
}
374
375
// MATCH_PHRASE on raw column data: a row matches when the query terms occur
// in the row's term list as one contiguous run, in query order. `result` is
// only ever set to true. When the query yields no terms, no row is marked and
// OK is returned.
Status FunctionMatchPhrase::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; row++) {
        auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                              array_offsets, current_src_array_offset);

        // TODO: more efficient impl
        const auto data_end = data_tokens.end();
        auto cursor = data_tokens.begin();
        bool phrase_found = false;
        while (!phrase_found) {
            // Next occurrence of the first query term at or after the cursor.
            cursor = std::find_if(cursor, data_end, [&](const TermInfo& info) {
                return info.get_single_term() == query_tokens[0].get_single_term();
            });
            if (cursor == data_end) {
                break;
            }
            // Step past the hit: this is both where the rest of the phrase
            // must start and where the next search resumes on a mismatch.
            ++cursor;

            auto data_pos = cursor;
            auto query_pos = query_tokens.begin() + 1;
            while (query_pos != query_tokens.end() && data_pos != data_end &&
                   data_pos->get_single_term() == query_pos->get_single_term()) {
                ++query_pos;
                ++data_pos;
            }
            // The phrase matched iff every remaining query term was consumed.
            phrase_found = (query_pos == query_tokens.end());
        }

        if (phrase_found) {
            result[row] = true;
        }
    }

    return Status::OK();
}
436
437
// MATCH_PHRASE_PREFIX on raw column data: a row matches when some contiguous
// window of its terms equals the query terms one-for-one, except that the
// last query term only has to be a prefix of the corresponding row term.
// `result` is only ever set to true. When the query yields no terms, no row
// is marked and OK is returned.
Status FunctionMatchPhrasePrefix::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        const InvertedIndexAnalyzerCtx* analyzer_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    const size_t query_len = query_tokens.size();
    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; row++) {
        auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                              array_offsets, current_src_array_offset);

        // A row with fewer terms than the query cannot contain the phrase.
        if (data_tokens.size() < query_len) {
            continue;
        }
        const size_t last_start = data_tokens.size() - query_len;

        // True when the window of `query_len` row terms beginning at `start`
        // matches: exact equality for all but the last term, prefix for the last.
        auto window_matches = [&](size_t start) -> bool {
            for (size_t k = 0; k < query_len; k++) {
                const std::string& data_token = data_tokens[start + k].get_single_term();
                const std::string& query_token = query_tokens[k].get_single_term();
                const bool term_ok = (k + 1 == query_len) ? data_token.starts_with(query_token)
                                                          : (data_token == query_token);
                if (!term_ok) {
                    return false;
                }
            }
            return true;
        };

        for (size_t start = 0; start <= last_start; start++) {
            // A single-term query is tried at every position (prefix only);
            // otherwise a window is tried only where its first term is equal.
            const bool worth_trying = data_tokens[start].get_single_term() ==
                                              query_tokens[0].get_single_term() ||
                                      query_len == 1;
            if (worth_trying && window_matches(start)) {
                result[row] = true;
                break;
            }
        }
    }

    return Status::OK();
}
494
495
// MATCH_REGEXP on raw column data: the query string is compiled as a
// Hyperscan pattern (block mode, DOTALL | ALLOWEMPTY | UTF8) and a row matches
// when the pattern matches at least one of the row's terms. `result` is only
// ever set to true.
//
// Returns INDEX_INVALID_PARAMETERS when the pattern fails to compile or the
// scratch space cannot be allocated. A scan failure on a term is logged and
// ends the processing of that row only; it is not reported to the caller.
Status FunctionMatchRegexp::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          const InvertedIndexAnalyzerCtx* analyzer_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match, parser_type: "
               << (analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                                : "unknown");

    const std::string& pattern = match_query_str;

    hs_database_t* raw_database = nullptr;
    hs_compile_error_t* compile_err = nullptr;
    if (hs_compile(pattern.data(), HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
                   HS_MODE_BLOCK, nullptr, &raw_database, &compile_err) != HS_SUCCESS) {
        std::string err_message = "hyperscan compilation failed: ";
        if (compile_err != nullptr) {
            err_message.append(compile_err->message);
            hs_free_compile_error(compile_err);
        }
        LOG(ERROR) << err_message;
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(err_message);
    }
    // From here on the database is released on every exit path, including exceptions.
    const std::unique_ptr<hs_database_t, decltype(&hs_free_database)> database(
            raw_database, &hs_free_database);

    hs_scratch_t* raw_scratch = nullptr;
    if (hs_alloc_scratch(database.get(), &raw_scratch) != HS_SUCCESS) {
        LOG(ERROR) << "hyperscan could not allocate scratch space.";
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                "hyperscan could not allocate scratch space.");
    }
    // Declared after `database`, so the scratch is freed first.
    const std::unique_ptr<hs_scratch_t, decltype(&hs_free_scratch)> scratch(raw_scratch,
                                                                            &hs_free_scratch);

    // Only "did anything match" is needed, so the handler records the hit and
    // returns non-zero to make Hyperscan stop scanning the current term.
    auto on_match = [](unsigned int /*id*/, unsigned long long /*from*/,
                       unsigned long long /*to*/, unsigned int /*flags*/,
                       void* matched_flag) -> int {
        *static_cast<bool*>(matched_flag) = true;
        return 1;
    };

    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; row++) {
        auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                              array_offsets, current_src_array_offset);

        for (auto& input : data_tokens) {
            bool is_match = false;
            const auto& input_str = input.get_single_term();
            const hs_error_t scan_status =
                    hs_scan(database.get(), input_str.data(),
                            static_cast<uint32_t>(input_str.size()), 0, scratch.get(), on_match,
                            static_cast<void*>(&is_match));
            // HS_SCAN_TERMINATED is the expected outcome of the handler
            // stopping the scan after a hit; it is not an error.
            if (scan_status != HS_SUCCESS && scan_status != HS_SCAN_TERMINATED) {
                LOG(ERROR) << "hyperscan match failed: " << input_str;
                break;
            }

            if (is_match) {
                result[row] = true;
                break;
            }
        }
    }

    return Status::OK();
}
564
565
// MATCH_PHRASE_EDGE on raw column data. A row matches when some contiguous
// window of its terms lines up with the query terms as follows:
//   - single query term: the row term contains it as a substring;
//   - several query terms: the first row term ends with the first query term,
//     the last row term starts with the last query term, and every term in
//     between is equal.
// `result` is only ever set to true. When the query yields no terms, no row
// is marked and OK is returned.
Status FunctionMatchPhraseEdge::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        const InvertedIndexAnalyzerCtx* analyzer_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    auto query_tokens = analyse_query_str_token(analyzer_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                analyzer_ctx ? inverted_index_parser_type_to_string(analyzer_ctx->parser_type)
                             : "unknown");
        return Status::OK();
    }

    const size_t query_len = query_tokens.size();
    int32_t current_src_array_offset = 0;
    for (int row = 0; row < input_rows_count; row++) {
        auto data_tokens = analyse_data_token(column_name, analyzer_ctx, string_col, row,
                                              array_offsets, current_src_array_offset);

        // A row with fewer terms than the query cannot contain the phrase.
        if (data_tokens.size() < query_len) {
            continue;
        }
        const size_t last_start = data_tokens.size() - query_len;

        // True when the window of `query_len` row terms beginning at `start`
        // satisfies the edge rules described above.
        auto window_matches = [&](size_t start) -> bool {
            if (query_len == 1) {
                return data_tokens[start].get_single_term().find(
                               query_tokens[0].get_single_term()) != std::string::npos;
            }
            for (size_t k = 0; k < query_len; k++) {
                const std::string& data_token = data_tokens[start + k].get_single_term();
                const std::string& query_token = query_tokens[k].get_single_term();
                bool term_ok = false;
                if (k == 0) {
                    term_ok = data_token.ends_with(query_token);
                } else if (k + 1 == query_len) {
                    term_ok = data_token.starts_with(query_token);
                } else {
                    term_ok = (data_token == query_token);
                }
                if (!term_ok) {
                    return false;
                }
            }
            return true;
        };

        for (size_t start = 0; start <= last_start; start++) {
            if (window_matches(start)) {
                result[row] = true;
                break;
            }
        }
    }

    return Status::OK();
}
631
632
8
// Registers every match function implemented in this file with `factory`:
// any / all / phrase / phrase-prefix / regexp / phrase-edge.
void register_function_match(SimpleFunctionFactory& factory) {
    // Term-set matches.
    factory.register_function<FunctionMatchAny>();
    factory.register_function<FunctionMatchAll>();

    // Position-sensitive and pattern matches.
    factory.register_function<FunctionMatchPhrase>();
    factory.register_function<FunctionMatchPhrasePrefix>();
    factory.register_function<FunctionMatchRegexp>();
    factory.register_function<FunctionMatchPhraseEdge>();
}
640
#include "common/compile_check_end.h"
641
} // namespace doris