Coverage Report

Created: 2025-11-05 22:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/vec/functions/match.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "vec/functions/match.h"
19
20
#include <hs/hs.h>
21
22
#include "olap/rowset/segment_v2/index_reader_helper.h"
23
#include "olap/rowset/segment_v2/inverted_index/analyzer/analyzer.h"
24
#include "runtime/query_context.h"
25
#include "runtime/runtime_state.h"
26
#include "util/debug_points.h"
27
28
namespace doris::vectorized {
29
#include "common/compile_check_begin.h"
30
/// Evaluates a MATCH_* predicate through the inverted index of a single column.
///
/// Expects exactly one argument (the query value), one column name/type pair and one
/// index iterator. On success `bitmap_result` receives the rows reported by the index
/// with the rows recorded in the null bitmap masked out.
///
/// Returns OK without touching `bitmap_result` when the iterator is null or the query
/// value is NULL. Returns INDEX_INVALID_PARAMETERS when a phrase-style function is used
/// on a full-text reader that does not support phrases, or when the query argument is
/// not a string type. Errors from the index iterator are propagated.
Status FunctionMatchBase::evaluate_inverted_index(
        const ColumnsWithTypeAndName& arguments,
        const std::vector<vectorized::IndexFieldNameAndTypePair>& data_type_with_names,
        std::vector<segment_v2::IndexIterator*> iterators, uint32_t num_rows,
        segment_v2::InvertedIndexResultBitmap& bitmap_result) const {
    DCHECK(arguments.size() == 1);
    DCHECK(data_type_with_names.size() == 1);
    DCHECK(iterators.size() == 1);
    auto* iter = iterators[0];
    if (iter == nullptr) {
        return Status::OK();
    }
    // Bind by reference: only the two members are read below, no copy is needed.
    const auto& data_type_with_name = data_type_with_names[0];
    const std::string& function_name = get_name();

    if (function_name == MATCH_PHRASE_FUNCTION || function_name == MATCH_PHRASE_PREFIX_FUNCTION ||
        function_name == MATCH_PHRASE_EDGE_FUNCTION) {
        auto reader = iter->get_reader(InvertedIndexReaderType::FULLTEXT);
        if (reader && !segment_v2::IndexReaderHelper::is_support_phrase(reader)) {
            return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                    "phrase queries require setting support_phrase = true");
        }
    }

    Field param_value;
    arguments[0].column->get(0, param_value);
    if (param_value.is_null()) {
        // if query value is null, skip evaluate inverted index
        return Status::OK();
    }

    auto param_type = arguments[0].type->get_primitive_type();
    if (!is_string_type(param_type)) {
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                "arguments for match must be string");
    }

    std::unique_ptr<InvertedIndexQueryParamFactory> query_param = nullptr;
    RETURN_IF_ERROR(InvertedIndexQueryParamFactory::create_query_value(param_type, &param_value,
                                                                       query_param));

    InvertedIndexParam param;
    param.column_name = data_type_with_name.first;
    param.column_type = data_type_with_name.second;
    param.query_value = query_param->get_value();
    param.query_type = get_query_type_from_fn_name();
    param.num_rows = num_rows;
    param.roaring = std::make_shared<roaring::Roaring>();
    // param_type is known to be a string type here (non-string types returned above),
    // so the index can be read unconditionally.
    RETURN_IF_ERROR(iter->read_from_index(&param));

    std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>();
    if (iter->has_null()) {
        segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
        RETURN_IF_ERROR(iter->read_null_bitmap(&null_bitmap_cache_handle));
        null_bitmap = null_bitmap_cache_handle.get_bitmap();
    }

    segment_v2::InvertedIndexResultBitmap result(param.roaring, null_bitmap);
    bitmap_result = result;
    bitmap_result.mask_out_null();

    return Status::OK();
}
94
/// Evaluates a MATCH_* function directly on column data (no inverted index lookup).
///
/// arguments[0] is the column being searched; it may be a string column, a nullable
/// string column, or an array whose nested data is a (nullable) string column.
/// arguments[1] supplies the query text, taken from its row 0.
///
/// The result column is a UInt8 column of `input_rows_count` rows, default-filled and
/// then updated by execute_match, which sets matching rows to true.
///
/// Returns NotSupported for arrays whose nested data is not a string column and
/// InternalError when no string column can be extracted from arguments[0]. Errors
/// from execute_match are propagated.
Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block,
                                       const ColumnNumbers& arguments, uint32_t result,
                                       size_t input_rows_count) const {
    ColumnPtr& column_ptr = block.get_by_position(arguments[1]).column;
    DataTypePtr& type_ptr = block.get_by_position(arguments[1]).type;
    auto match_query_str = type_ptr->to_string(*column_ptr, 0);
    // The name is only read before the block is modified below, so a reference suffices.
    const std::string& column_name = block.get_by_position(arguments[0]).name;
    VLOG_DEBUG << "begin to execute match directly, column_name=" << column_name
               << ", match_query_str=" << match_query_str;

    // Prefer the thread-local function state and fall back to the fragment-local one.
    auto* inverted_index_ctx = reinterpret_cast<InvertedIndexCtx*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    if (inverted_index_ctx == nullptr) {
        inverted_index_ctx = reinterpret_cast<InvertedIndexCtx*>(
                context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
    }

    const ColumnPtr source_col =
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
    const auto* values = check_and_get_column<ColumnString>(source_col.get());
    // A single lookup decides whether the source is an array; every later use of
    // array_col is guarded by this one null test.
    const ColumnArray* array_col = check_and_get_column<ColumnArray>(source_col.get());
    if (array_col != nullptr) {
        if (!array_col->get_data().is_column_string()) {
            return Status::NotSupported(
                    fmt::format("unsupported nested array of type {} for function {}",
                                array_col->get_data().get_name(), get_name()));
        }

        if (is_column_nullable(array_col->get_data())) {
            const auto& array_nested_null_column =
                    reinterpret_cast<const ColumnNullable&>(array_col->get_data());
            values = check_and_get_column<ColumnString>(
                    *(array_nested_null_column.get_nested_column_ptr()));
        } else {
            // array column element is always set Nullable for now.
            values = check_and_get_column<ColumnString>(*(array_col->get_data_ptr()));
        }
    } else if (const auto* nullable = check_and_get_column<ColumnNullable>(source_col.get())) {
        values = check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
    }

    if (!values) {
        LOG(WARNING) << "Illegal column " << source_col->get_name();
        return Status::InternalError("Not supported input column types");
    }

    // result column
    auto res = ColumnUInt8::create();
    ColumnUInt8::Container& vec_res = res->get_data();
    // set default value to 0, and match functions only need to set 1/true
    vec_res.resize_fill(input_rows_count);
    RETURN_IF_ERROR(execute_match(context, column_name, match_query_str, input_rows_count, values,
                                  inverted_index_ctx,
                                  (array_col ? &(array_col->get_offsets()) : nullptr), vec_res));
    block.replace_by_position(result, std::move(res));

    return Status::OK();
}
153
154
/// Maps this function's name (as returned by get_name()) to the inverted index query
/// type it stands for. Names that are not one of the MATCH_* constants checked below
/// yield UNKNOWN_QUERY.
inline doris::segment_v2::InvertedIndexQueryType FunctionMatchBase::get_query_type_from_fn_name()
        const {
    using QueryType = doris::segment_v2::InvertedIndexQueryType;

    std::string current_name = get_name();
    if (current_name == MATCH_ANY_FUNCTION) {
        return QueryType::MATCH_ANY_QUERY;
    }
    if (current_name == MATCH_ALL_FUNCTION) {
        return QueryType::MATCH_ALL_QUERY;
    }
    if (current_name == MATCH_PHRASE_FUNCTION) {
        return QueryType::MATCH_PHRASE_QUERY;
    }
    if (current_name == MATCH_PHRASE_PREFIX_FUNCTION) {
        return QueryType::MATCH_PHRASE_PREFIX_QUERY;
    }
    if (current_name == MATCH_PHRASE_REGEXP_FUNCTION) {
        return QueryType::MATCH_REGEXP_QUERY;
    }
    if (current_name == MATCH_PHRASE_EDGE_FUNCTION) {
        return QueryType::MATCH_PHRASE_EDGE_QUERY;
    }
    return QueryType::UNKNOWN_QUERY;
}
172
173
std::vector<TermInfo> FunctionMatchBase::analyse_query_str_token(
174
        InvertedIndexCtx* inverted_index_ctx, const std::string& match_query_str,
175
8
        const std::string& column_name) const {
176
8
    VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: "
177
0
               << inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);
178
8
    std::vector<TermInfo> query_tokens;
179
8
    if (inverted_index_ctx == nullptr) {
180
3
        return query_tokens;
181
3
    }
182
    // parse is none and custom analyzer is empty mean no analyzer is set
183
5
    if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE &&
184
5
        inverted_index_ctx->custom_analyzer.empty()) {
185
1
        query_tokens.emplace_back(match_query_str);
186
1
        return query_tokens;
187
1
    }
188
4
    auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
189
4
            inverted_index_ctx->char_filter_map);
190
4
    reader->init(match_query_str.data(), (int)match_query_str.size(), true);
191
4
    query_tokens = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
192
4
            reader, inverted_index_ctx->analyzer);
193
4
    return query_tokens;
194
5
}
195
196
inline std::vector<TermInfo> FunctionMatchBase::analyse_data_token(
197
        const std::string& column_name, InvertedIndexCtx* inverted_index_ctx,
198
        const ColumnString* string_col, int32_t current_block_row_idx,
199
128
        const ColumnArray::Offsets64* array_offsets, int32_t& current_src_array_offset) const {
200
128
    std::vector<TermInfo> data_tokens;
201
128
    if (array_offsets) {
202
2
        for (auto next_src_array_offset = (*array_offsets)[current_block_row_idx];
203
6
             current_src_array_offset < next_src_array_offset; ++current_src_array_offset) {
204
4
            const auto& str_ref = string_col->get_data_at(current_src_array_offset);
205
            // parse is none and custom analyzer is empty mean no analyzer is set
206
4
            if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE &&
207
4
                inverted_index_ctx->custom_analyzer.empty()) {
208
0
                data_tokens.emplace_back(str_ref.to_string());
209
0
                continue;
210
0
            }
211
4
            auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
212
4
                    inverted_index_ctx->char_filter_map);
213
4
            reader->init(str_ref.data, (int)str_ref.size, true);
214
215
4
            data_tokens =
216
4
                    doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
217
4
                            reader, inverted_index_ctx->analyzer);
218
4
        }
219
126
    } else {
220
126
        const auto& str_ref = string_col->get_data_at(current_block_row_idx);
221
        // parse is none and custom analyzer is empty mean no analyzer is set
222
126
        if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE &&
223
126
            inverted_index_ctx->custom_analyzer.empty()) {
224
3
            data_tokens.emplace_back(str_ref.to_string());
225
123
        } else {
226
123
            auto reader = doris::segment_v2::inverted_index::InvertedIndexAnalyzer::create_reader(
227
123
                    inverted_index_ctx->char_filter_map);
228
123
            reader->init(str_ref.data, (int)str_ref.size, true);
229
123
            data_tokens =
230
123
                    doris::segment_v2::inverted_index::InvertedIndexAnalyzer::get_analyse_result(
231
123
                            reader, inverted_index_ctx->analyzer);
232
123
        }
233
126
    }
234
128
    return data_tokens;
235
128
}
236
237
0
/// Decides whether `function_name` may be evaluated without an inverted index.
///
/// Returns INVERTED_INDEX_NOT_SUPPORTED when the query option
/// enable_match_without_inverted_index is off, or when the debug point
/// "match.invert_index_not_support_execute_match" fires; OK otherwise.
Status FunctionMatchBase::check(FunctionContext* context, const std::string& function_name) const {
    const auto& query_options = context->state()->query_options();
    if (!query_options.enable_match_without_inverted_index) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "{} not support execute_match", function_name);
    }

    // Test hook: lets tests force the "not supported" outcome.
    DBUG_EXECUTE_IF("match.invert_index_not_support_execute_match", {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "debug point: {} not support execute_match", function_name);
    });

    return Status::OK();
}
250
251
/// MATCH_ANY without an index: a row matches when at least one query term equals one
/// of the row's terms.
///
/// `result` is expected to be pre-sized to `input_rows_count`; only matching rows are
/// written (set to true). Returns early with OK, leaving `result` untouched, when the
/// query produces no terms. Fails with the status from check() when this execution
/// path is not allowed.
Status FunctionMatchAny::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       InvertedIndexCtx* inverted_index_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    auto query_tokens = analyse_query_str_token(inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        // An empty token list is also what a null context produces, so the context
        // must be null-checked before its parser type is read for the message.
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_ctx == nullptr
                        ? std::string("<null ctx>")
                        : inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
        return Status::OK();
    }

    int32_t current_src_array_offset = 0;
    for (int32_t i = 0; static_cast<size_t>(i) < input_rows_count; ++i) {
        const auto data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i,
                                                    array_offsets, current_src_array_offset);

        // TODO: more efficient impl
        for (const auto& term_info : query_tokens) {
            const auto found =
                    std::find_if(data_tokens.begin(), data_tokens.end(), [&](const TermInfo& info) {
                        return info.get_single_term() == term_info.get_single_term();
                    });
            if (found != data_tokens.end()) {
                result[i] = true;
                break;
            }
        }
    }

    return Status::OK();
}
289
290
/// MATCH_ALL without an index: a row matches when every query term equals one of the
/// row's terms.
///
/// `result` is expected to be pre-sized to `input_rows_count`; only matching rows are
/// written (set to true). Returns early with OK, leaving `result` untouched, when the
/// query produces no terms. Fails with the status from check() when this execution
/// path is not allowed.
Status FunctionMatchAll::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       InvertedIndexCtx* inverted_index_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    auto query_tokens = analyse_query_str_token(inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        // An empty token list is also what a null context produces, so the context
        // must be null-checked before its parser type is read for the message.
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_ctx == nullptr
                        ? std::string("<null ctx>")
                        : inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
        return Status::OK();
    }

    int32_t current_src_array_offset = 0;
    for (int32_t i = 0; static_cast<size_t>(i) < input_rows_count; ++i) {
        const auto data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i,
                                                    array_offsets, current_src_array_offset);

        // TODO: more efficient impl
        // Stop at the first query term that the row does not contain.
        bool all_found = true;
        for (const auto& term_info : query_tokens) {
            const auto found =
                    std::find_if(data_tokens.begin(), data_tokens.end(), [&](const TermInfo& info) {
                        return info.get_single_term() == term_info.get_single_term();
                    });
            if (found == data_tokens.end()) {
                all_found = false;
                break;
            }
        }

        if (all_found) {
            result[i] = true;
        }
    }

    return Status::OK();
}
334
335
/// MATCH_PHRASE without an index: a row matches when the query terms occur in the
/// row's term list as one consecutive run, in query order.
///
/// `result` is expected to be pre-sized to `input_rows_count`; only matching rows are
/// written (set to true). Returns early with OK, leaving `result` untouched, when the
/// query produces no terms. Fails with the status from check() when this execution
/// path is not allowed.
Status FunctionMatchPhrase::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          InvertedIndexCtx* inverted_index_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    auto query_tokens = analyse_query_str_token(inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        // An empty token list is also what a null context produces, so the context
        // must be null-checked before its parser type is read for the message.
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_ctx == nullptr
                        ? std::string("<null ctx>")
                        : inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
        return Status::OK();
    }

    int32_t current_src_array_offset = 0;
    for (int32_t i = 0; static_cast<size_t>(i) < input_rows_count; ++i) {
        auto data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i,
                                              array_offsets, current_src_array_offset);

        // TODO: more efficient impl
        bool matched = false;
        auto data_it = data_tokens.begin();
        while (!matched && data_it != data_tokens.end()) {
            // find position of first token
            data_it = std::find_if(data_it, data_tokens.end(), [&](const TermInfo& info) {
                return info.get_single_term() == query_tokens[0].get_single_term();
            });
            if (data_it == data_tokens.end()) {
                break;
            }

            // Step past the candidate so a failed attempt resumes the search after it.
            ++data_it;
            auto data_it_next = data_it;

            // compare query_tokens after the first to data_tokens one by one
            matched = true;
            for (auto query_it = query_tokens.begin() + 1; query_it != query_tokens.end();
                 ++query_it, ++data_it_next) {
                if (data_it_next == data_tokens.end() ||
                    data_it_next->get_single_term() != query_it->get_single_term()) {
                    matched = false;
                    break;
                }
            }
        }

        // check matched
        if (matched) {
            result[i] = true;
        }
    }

    return Status::OK();
}
395
396
/// MATCH_PHRASE_PREFIX without an index: a row matches when its term list contains a
/// consecutive run in which every query term but the last is equal to the row term
/// and the last query term is a prefix of the corresponding row term.
///
/// `result` is expected to be pre-sized to `input_rows_count`; only matching rows are
/// written (set to true). Returns early with OK, leaving `result` untouched, when the
/// query produces no terms. Fails with the status from check() when this execution
/// path is not allowed.
Status FunctionMatchPhrasePrefix::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        InvertedIndexCtx* inverted_index_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    auto query_tokens = analyse_query_str_token(inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        // An empty token list is also what a null context produces, so the context
        // must be null-checked before its parser type is read for the message.
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_ctx == nullptr
                        ? std::string("<null ctx>")
                        : inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
        return Status::OK();
    }

    const size_t query_size = query_tokens.size();
    int32_t current_src_array_offset = 0;
    for (int32_t i = 0; static_cast<size_t>(i) < input_rows_count; ++i) {
        auto data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i,
                                              array_offsets, current_src_array_offset);

        // A row with fewer terms than the query cannot contain the phrase. Comparing
        // the sizes first keeps the subtraction below from wrapping around.
        if (data_tokens.size() < query_size) {
            continue;
        }
        const size_t last_start = data_tokens.size() - query_size;

        for (size_t j = 0; j <= last_start; ++j) {
            bool match = true;
            for (size_t k = 0; k < query_size; ++k) {
                const std::string& data_token = data_tokens[j + k].get_single_term();
                const std::string& query_token = query_tokens[k].get_single_term();
                if (k == query_size - 1) {
                    // last query term: prefix comparison
                    if (data_token.compare(0, query_token.size(), query_token) != 0) {
                        match = false;
                        break;
                    }
                } else if (data_token != query_token) {
                    match = false;
                    break;
                }
            }
            if (match) {
                result[i] = true;
                break;
            }
        }
    }

    return Status::OK();
}
452
453
/// MATCH_REGEXP without an index: a row matches when the query, compiled as a
/// hyperscan pattern (block mode, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8),
/// reports a match on at least one of the row's terms.
///
/// `result` is expected to be pre-sized to `input_rows_count`; only matching rows are
/// written (set to true). A null `inverted_index_ctx` yields OK with `result`
/// untouched. Returns INDEX_INVALID_PARAMETERS when the pattern does not compile or
/// scratch space cannot be allocated, and the status from check() when this execution
/// path is not allowed. A failing scan of one term is logged and ends the scan of that
/// row only.
Status FunctionMatchRegexp::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          InvertedIndexCtx* inverted_index_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    // Both the log statement below and analyse_data_token dereference the context.
    // The other match functions end up reporting "no rows match" for a null context
    // (their query token list is empty), so do the same here instead of crashing.
    if (inverted_index_ctx == nullptr) {
        return Status::OK();
    }

    VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match, parser_type: "
               << inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);

    const std::string& pattern = match_query_str;

    hs_database_t* database = nullptr;
    hs_compile_error_t* compile_err = nullptr;
    if (hs_compile(pattern.data(), HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
                   HS_MODE_BLOCK, nullptr, &database, &compile_err) != HS_SUCCESS) {
        std::string err_message = "hyperscan compilation failed: ";
        if (compile_err != nullptr) {
            err_message.append(compile_err->message);
            hs_free_compile_error(compile_err);
        }
        LOG(ERROR) << err_message;
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(err_message);
    }
    // From here on the database is released on every exit path, exceptions included.
    std::unique_ptr<hs_database_t, decltype(&hs_free_database)> database_guard(
            database, &hs_free_database);

    hs_scratch_t* scratch = nullptr;
    if (hs_alloc_scratch(database, &scratch) != HS_SUCCESS) {
        LOG(ERROR) << "hyperscan could not allocate scratch space.";
        return Status::Error<ErrorCode::INDEX_INVALID_PARAMETERS>(
                "hyperscan could not allocate scratch space.");
    }
    // Declared after database_guard, so the scratch space is freed before the database.
    std::unique_ptr<hs_scratch_t, decltype(&hs_free_scratch)> scratch_guard(scratch,
                                                                            &hs_free_scratch);

    // Match callback: records that a match happened in the bool passed as context.
    auto on_match = [](unsigned int id, unsigned long long from, unsigned long long to,
                       unsigned int flags, void* context) -> int {
        *static_cast<bool*>(context) = true;
        return 0;
    };

    int32_t current_src_array_offset = 0;
    for (int32_t i = 0; static_cast<size_t>(i) < input_rows_count; ++i) {
        auto data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i,
                                              array_offsets, current_src_array_offset);

        for (auto& input : data_tokens) {
            bool is_match = false;
            const auto& input_str = input.get_single_term();
            if (hs_scan(database, input_str.data(), static_cast<uint32_t>(input_str.size()), 0,
                        scratch, on_match, &is_match) != HS_SUCCESS) {
                LOG(ERROR) << "hyperscan match failed: " << input_str;
                break;
            }

            if (is_match) {
                result[i] = true;
                break;
            }
        }
    }

    return Status::OK();
}
521
522
/// MATCH_PHRASE_EDGE without an index.
///
/// With a single query term a row matches when any of its terms contains that term as
/// a substring. With several query terms a row matches when its term list contains a
/// consecutive run where the first row term ends with the first query term, the last
/// row term starts with the last query term, and all terms in between are equal.
///
/// `result` is expected to be pre-sized to `input_rows_count`; only matching rows are
/// written (set to true). Returns early with OK, leaving `result` untouched, when the
/// query produces no terms. Fails with the status from check() when this execution
/// path is not allowed.
Status FunctionMatchPhraseEdge::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        InvertedIndexCtx* inverted_index_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    auto query_tokens = analyse_query_str_token(inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        // An empty token list is also what a null context produces, so the context
        // must be null-checked before its parser type is read for the message.
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_ctx == nullptr
                        ? std::string("<null ctx>")
                        : inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
        return Status::OK();
    }

    const size_t query_size = query_tokens.size();
    int32_t current_src_array_offset = 0;
    for (int32_t i = 0; static_cast<size_t>(i) < input_rows_count; ++i) {
        auto data_tokens = analyse_data_token(column_name, inverted_index_ctx, string_col, i,
                                              array_offsets, current_src_array_offset);

        // A row with fewer terms than the query cannot contain the phrase. Comparing
        // the sizes first keeps the subtraction below from wrapping around.
        if (data_tokens.size() < query_size) {
            continue;
        }
        const size_t last_start = data_tokens.size() - query_size;

        for (size_t j = 0; j <= last_start; ++j) {
            bool match = true;
            if (query_size == 1) {
                // single term: substring search inside the row term
                if (data_tokens[j].get_single_term().find(query_tokens[0].get_single_term()) ==
                    std::string::npos) {
                    match = false;
                }
            } else {
                for (size_t k = 0; k < query_size; ++k) {
                    const std::string& data_token = data_tokens[j + k].get_single_term();
                    const std::string& query_token = query_tokens[k].get_single_term();
                    bool term_ok = false;
                    if (k == 0) {
                        term_ok = data_token.ends_with(query_token);
                    } else if (k == query_size - 1) {
                        term_ok = data_token.starts_with(query_token);
                    } else {
                        term_ok = (data_token == query_token);
                    }
                    if (!term_ok) {
                        match = false;
                        break;
                    }
                }
            }
            if (match) {
                result[i] = true;
                break;
            }
        }
    }

    return Status::OK();
}
587
588
1
/// Registers the six MATCH_* function classes defined in this file with `factory`.
void register_function_match(SimpleFunctionFactory& factory) {
    // term-set matching
    factory.register_function<FunctionMatchAny>();
    factory.register_function<FunctionMatchAll>();

    // phrase and pattern matching
    factory.register_function<FunctionMatchPhrase>();
    factory.register_function<FunctionMatchPhrasePrefix>();
    factory.register_function<FunctionMatchRegexp>();
    factory.register_function<FunctionMatchPhraseEdge>();
}
596
#include "common/compile_check_end.h"
597
} // namespace doris::vectorized