Coverage Report

Created: 2025-04-15 11:51

/root/doris/be/src/vec/functions/match.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "vec/functions/match.h"
19
20
#include <hs/hs.h>
21
22
#include "runtime/query_context.h"
23
#include "runtime/runtime_state.h"
24
#include "util/debug_points.h"
25
26
namespace doris::vectorized {
27
// Evaluates this MATCH function against an inverted index instead of raw column data.
//
// Exactly one query argument, one indexed field and one iterator are expected (DCHECKed).
// On success `bitmap_result` is assigned the rows returned by the index together with the
// index's null bitmap, and the null rows are masked out of the result.
// `bitmap_result` is left untouched (and OK is returned) when the iterator is null or the
// query value is NULL.
//
// Returns INVERTED_INDEX_INVALID_PARAMETERS when
//   * a phrase-style function is used on a FULLTEXT index whose properties report
//     phrase support as "no", or
//   * the query argument is not a string type.
// Errors from building the query value or reading the index are propagated unchanged.
Status FunctionMatchBase::evaluate_inverted_index(
        const ColumnsWithTypeAndName& arguments,
        const std::vector<vectorized::IndexFieldNameAndTypePair>& data_type_with_names,
        std::vector<segment_v2::InvertedIndexIterator*> iterators, uint32_t num_rows,
        segment_v2::InvertedIndexResultBitmap& bitmap_result) const {
    DCHECK(arguments.size() == 1);
    DCHECK(data_type_with_names.size() == 1);
    DCHECK(iterators.size() == 1);
    auto* iter = iterators[0];
    if (iter == nullptr) {
        return Status::OK();
    }
    // Only the field name is read below, so bind by reference instead of copying the pair.
    const auto& data_type_with_name = data_type_with_names[0];
    const std::string& function_name = get_name();

    if (function_name == MATCH_PHRASE_FUNCTION || function_name == MATCH_PHRASE_PREFIX_FUNCTION ||
        function_name == MATCH_PHRASE_EDGE_FUNCTION) {
        if (iter->get_inverted_index_reader_type() == InvertedIndexReaderType::FULLTEXT &&
            get_parser_phrase_support_string_from_properties(iter->get_index_properties()) ==
                    INVERTED_INDEX_PARSER_PHRASE_SUPPORT_NO) {
            return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
                    "phrase queries require setting support_phrase = true");
        }
    }

    Field param_value;
    arguments[0].column->get(0, param_value);
    if (param_value.is_null()) {
        // if query value is null, skip evaluate inverted index
        return Status::OK();
    }

    auto param_type = arguments[0].type->get_type_as_type_descriptor().type;
    if (!is_string_type(param_type)) {
        return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
                "arguments for match must be string");
    }

    // From here on param_type is known to be a string type, so the index can be queried
    // directly; no second type check is required.
    std::unique_ptr<segment_v2::InvertedIndexQueryParamFactory> query_param = nullptr;
    RETURN_IF_ERROR(segment_v2::InvertedIndexQueryParamFactory::create_query_value(
            param_type, &param_value, query_param));

    std::shared_ptr<roaring::Roaring> roaring = std::make_shared<roaring::Roaring>();
    auto inverted_index_query_type = get_query_type_from_fn_name();
    RETURN_IF_ERROR(iter->read_from_inverted_index(data_type_with_name.first,
                                                   query_param->get_value(),
                                                   inverted_index_query_type, num_rows, roaring));

    // Start with an empty null bitmap and replace it only if the index has null rows.
    std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>();
    if (iter->has_null()) {
        segment_v2::InvertedIndexQueryCacheHandle null_bitmap_cache_handle;
        RETURN_IF_ERROR(iter->read_null_bitmap(&null_bitmap_cache_handle));
        null_bitmap = null_bitmap_cache_handle.get_bitmap();
    }
    segment_v2::InvertedIndexResultBitmap result(roaring, null_bitmap);
    bitmap_result = result;
    bitmap_result.mask_out_null();

    return Status::OK();
}
88
// Executes the match function row by row on the block, without using an inverted index.
//
// arguments[0] is the source column (string, nullable string, or array of strings) and
// arguments[1] is the query; the query text is taken from row 0 of that column.
// The result column (UInt8, one entry per input row, default 0) replaces the column at
// position `result`; the concrete matching is delegated to execute_match().
//
// Returns NotSupported for arrays whose nested data is not a string column and
// InternalError when no string column can be extracted from the source column.
Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block,
                                       const ColumnNumbers& arguments, size_t result,
                                       size_t input_rows_count) const {
    ColumnPtr& column_ptr = block.get_by_position(arguments[1]).column;
    DataTypePtr& type_ptr = block.get_by_position(arguments[1]).type;
    auto match_query_str = type_ptr->to_string(*column_ptr, 0);
    // The name is only read below, so bind by reference instead of copying it.
    const std::string& column_name = block.get_by_position(arguments[0]).name;
    VLOG_DEBUG << "begin to execute match directly, column_name=" << column_name
               << ", match_query_str=" << match_query_str;

    // Prefer the thread-local function state and fall back to the fragment-local one.
    InvertedIndexCtx* inverted_index_ctx = reinterpret_cast<InvertedIndexCtx*>(
            context->get_function_state(FunctionContext::THREAD_LOCAL));
    if (inverted_index_ctx == nullptr) {
        inverted_index_ctx = reinterpret_cast<InvertedIndexCtx*>(
                context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
    }

    const ColumnPtr source_col =
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
    const auto* values = check_and_get_column<ColumnString>(source_col.get());
    const ColumnArray* array_col = nullptr;
    if (source_col->is_column_array()) {
        array_col = check_and_get_column<ColumnArray>(source_col.get());
        // Every access below dereferences array_col, so reject a failed cast up front
        // instead of guarding only the first use.
        if (array_col == nullptr) {
            LOG(WARNING) << "Illegal column " << source_col->get_name();
            return Status::InternalError("Not supported input column types");
        }
        if (!array_col->get_data().is_column_string()) {
            return Status::NotSupported(
                    fmt::format("unsupported nested array of type {} for function {}",
                                is_column_nullable(array_col->get_data())
                                        ? array_col->get_data().get_name()
                                        : array_col->get_data().get_family_name(),
                                get_name()));
        }

        if (is_column_nullable(array_col->get_data())) {
            const auto& array_nested_null_column =
                    reinterpret_cast<const ColumnNullable&>(array_col->get_data());
            values = check_and_get_column<ColumnString>(
                    *(array_nested_null_column.get_nested_column_ptr()));
        } else {
            // array column element is always set Nullable for now.
            values = check_and_get_column<ColumnString>(*(array_col->get_data_ptr()));
        }
    } else if (auto* nullable = check_and_get_column<ColumnNullable>(source_col.get())) {
        values = check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
    }

    if (!values) {
        LOG(WARNING) << "Illegal column " << source_col->get_name();
        return Status::InternalError("Not supported input column types");
    }
    // result column
    auto res = ColumnUInt8::create();
    ColumnUInt8::Container& vec_res = res->get_data();
    // set default value to 0, and match functions only need to set 1/true
    vec_res.resize_fill(input_rows_count);
    RETURN_IF_ERROR(execute_match(context, column_name, match_query_str, input_rows_count, values,
                                  inverted_index_ctx,
                                  (array_col ? &(array_col->get_offsets()) : nullptr), vec_res));
    block.replace_by_position(result, std::move(res));

    return Status::OK();
}
148
149
// Maps this function's name (get_name()) to the corresponding inverted index query type.
// Names that are not one of the known MATCH_* function names yield UNKNOWN_QUERY.
inline doris::segment_v2::InvertedIndexQueryType FunctionMatchBase::get_query_type_from_fn_name()
        const {
    using QueryType = doris::segment_v2::InvertedIndexQueryType;

    std::string current_name = get_name();
    if (current_name == MATCH_ANY_FUNCTION) {
        return QueryType::MATCH_ANY_QUERY;
    }
    if (current_name == MATCH_ALL_FUNCTION) {
        return QueryType::MATCH_ALL_QUERY;
    }
    if (current_name == MATCH_PHRASE_FUNCTION) {
        return QueryType::MATCH_PHRASE_QUERY;
    }
    if (current_name == MATCH_PHRASE_PREFIX_FUNCTION) {
        return QueryType::MATCH_PHRASE_PREFIX_QUERY;
    }
    if (current_name == MATCH_PHRASE_REGEXP_FUNCTION) {
        return QueryType::MATCH_REGEXP_QUERY;
    }
    if (current_name == MATCH_PHRASE_EDGE_FUNCTION) {
        return QueryType::MATCH_PHRASE_EDGE_QUERY;
    }
    return QueryType::UNKNOWN_QUERY;
}
167
168
void FunctionMatchBase::analyse_query_str_token(std::vector<std::string>* query_tokens,
169
                                                InvertedIndexCtx* inverted_index_ctx,
170
                                                const std::string& match_query_str,
171
3
                                                const std::string& column_name) const {
172
3
    VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: "
173
0
               << inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);
174
3
    if (inverted_index_ctx == nullptr) {
175
1
        return;
176
1
    }
177
2
    if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
178
1
        query_tokens->emplace_back(match_query_str);
179
1
        return;
180
1
    }
181
1
    auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
182
1
                                                                        match_query_str);
183
1
    doris::segment_v2::InvertedIndexReader::get_analyse_result(
184
1
            *query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
185
1
            get_query_type_from_fn_name());
186
1
}
187
188
inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
189
        const std::string& column_name, InvertedIndexCtx* inverted_index_ctx,
190
        const ColumnString* string_col, int32_t current_block_row_idx,
191
0
        const ColumnArray::Offsets64* array_offsets, int32_t& current_src_array_offset) const {
192
0
    std::vector<std::string> data_tokens;
193
0
    auto query_type = get_query_type_from_fn_name();
194
0
    if (array_offsets) {
195
0
        for (auto next_src_array_offset = (*array_offsets)[current_block_row_idx];
196
0
             current_src_array_offset < next_src_array_offset; ++current_src_array_offset) {
197
0
            const auto& str_ref = string_col->get_data_at(current_src_array_offset);
198
0
            if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
199
0
                data_tokens.emplace_back(str_ref.to_string());
200
0
                continue;
201
0
            }
202
0
            auto reader = doris::segment_v2::InvertedIndexReader::create_reader(
203
0
                    inverted_index_ctx, str_ref.to_string());
204
205
0
            std::vector<std::string> element_tokens;
206
207
0
            doris::segment_v2::InvertedIndexReader::get_analyse_result(
208
0
                    element_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
209
0
                    query_type, false);
210
0
            data_tokens.insert(data_tokens.end(), element_tokens.begin(), element_tokens.end());
211
0
        }
212
0
    } else {
213
0
        const auto& str_ref = string_col->get_data_at(current_block_row_idx);
214
0
        if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
215
0
            data_tokens.emplace_back(str_ref.to_string());
216
0
        } else {
217
0
            auto reader = doris::segment_v2::InvertedIndexReader::create_reader(
218
0
                    inverted_index_ctx, str_ref.to_string());
219
0
            doris::segment_v2::InvertedIndexReader::get_analyse_result(
220
0
                    data_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
221
0
                    query_type, false);
222
0
        }
223
0
    }
224
0
    return data_tokens;
225
0
}
226
227
0
// Verifies that matching without an inverted index is permitted for the current query.
// Returns INVERTED_INDEX_NOT_SUPPORTED when the query option
// enable_match_without_inverted_index is off, or when the debug point
// "match.invert_index_not_support_execute_match" fires; OK otherwise.
Status FunctionMatchBase::check(FunctionContext* context, const std::string& function_name) const {
    const bool fallback_allowed =
            context->state()->query_options().enable_match_without_inverted_index;
    if (!fallback_allowed) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "{} not support execute_match", function_name);
    }

    // Debug hook that forces the same error as above.
    DBUG_EXECUTE_IF("match.invert_index_not_support_execute_match", {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "{} not support execute_match", function_name);
    });

    return Status::OK();
}
240
241
// MATCH_ANY without an inverted index: result[i] is set to true when at least one query
// token occurs among the tokens of row i. Rows that do not match are left untouched.
// Returns early with OK (no row marked) when the query produces no tokens.
Status FunctionMatchAny::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       InvertedIndexCtx* inverted_index_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    std::vector<std::string> query_tokens;
    analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        // analyse_query_str_token() yields no tokens when inverted_index_ctx is null, so the
        // context must not be dereferenced unconditionally here.
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_ctx != nullptr
                        ? inverted_index_parser_type_to_string(inverted_index_ctx->parser_type)
                        : std::string("unknown"));
        return Status::OK();
    }

    int32_t current_src_array_offset = 0;
    for (size_t i = 0; i < input_rows_count; i++) {
        const std::vector<std::string> data_tokens =
                analyse_data_token(column_name, inverted_index_ctx, string_col,
                                   static_cast<int32_t>(i), array_offsets,
                                   current_src_array_offset);

        // TODO: more efficient impl
        const bool any_found = std::any_of(
                query_tokens.begin(), query_tokens.end(), [&data_tokens](const std::string& token) {
                    return std::find(data_tokens.begin(), data_tokens.end(), token) !=
                           data_tokens.end();
                });
        if (any_found) {
            result[i] = true;
        }
    }

    return Status::OK();
}
278
279
// MATCH_ALL without an inverted index: result[i] is set to true when every query token
// occurs among the tokens of row i. Rows that do not match are left untouched.
// Returns early with OK (no row marked) when the query produces no tokens.
Status FunctionMatchAll::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       InvertedIndexCtx* inverted_index_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    std::vector<std::string> query_tokens;
    analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        // analyse_query_str_token() yields no tokens when inverted_index_ctx is null, so the
        // context must not be dereferenced unconditionally here.
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_ctx != nullptr
                        ? inverted_index_parser_type_to_string(inverted_index_ctx->parser_type)
                        : std::string("unknown"));
        return Status::OK();
    }

    int32_t current_src_array_offset = 0;
    for (size_t i = 0; i < input_rows_count; i++) {
        const std::vector<std::string> data_tokens =
                analyse_data_token(column_name, inverted_index_ctx, string_col,
                                   static_cast<int32_t>(i), array_offsets,
                                   current_src_array_offset);

        // TODO: more efficient impl
        // all_of stops at the first missing token, like the original counting loop did.
        const bool all_found = std::all_of(
                query_tokens.begin(), query_tokens.end(), [&data_tokens](const std::string& token) {
                    return std::find(data_tokens.begin(), data_tokens.end(), token) !=
                           data_tokens.end();
                });
        if (all_found) {
            result[i] = true;
        }
    }

    return Status::OK();
}
322
323
// MATCH_PHRASE without an inverted index: result[i] is set to true when the query tokens
// appear in row i's tokens as one contiguous run, in the same order.
// Rows that do not match are left untouched.
// Returns early with OK (no row marked) when the query produces no tokens.
Status FunctionMatchPhrase::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          InvertedIndexCtx* inverted_index_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    std::vector<std::string> query_tokens;
    analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        // analyse_query_str_token() yields no tokens when inverted_index_ctx is null, so the
        // context must not be dereferenced unconditionally here.
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_ctx != nullptr
                        ? inverted_index_parser_type_to_string(inverted_index_ctx->parser_type)
                        : std::string("unknown"));
        return Status::OK();
    }

    int32_t current_src_array_offset = 0;
    for (size_t i = 0; i < input_rows_count; i++) {
        const std::vector<std::string> data_tokens =
                analyse_data_token(column_name, inverted_index_ctx, string_col,
                                   static_cast<int32_t>(i), array_offsets,
                                   current_src_array_offset);

        // TODO: more efficient impl
        // std::search finds the first position where the whole query token sequence occurs
        // consecutively; query_tokens is non-empty at this point.
        const auto phrase_pos = std::search(data_tokens.begin(), data_tokens.end(),
                                            query_tokens.begin(), query_tokens.end());
        if (phrase_pos != data_tokens.end()) {
            result[i] = true;
        }
    }

    return Status::OK();
}
382
383
// MATCH_PHRASE_PREFIX without an inverted index: result[i] is set to true when row i's
// tokens contain a contiguous window in which every query token but the last is equal to
// the data token, and the last query token is a prefix of the corresponding data token.
// Rows that do not match are left untouched.
// Returns early with OK (no row marked) when the query produces no tokens.
Status FunctionMatchPhrasePrefix::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        InvertedIndexCtx* inverted_index_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    std::vector<std::string> query_tokens;
    analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        // analyse_query_str_token() yields no tokens when inverted_index_ctx is null, so the
        // context must not be dereferenced unconditionally here.
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_ctx != nullptr
                        ? inverted_index_parser_type_to_string(inverted_index_ctx->parser_type)
                        : std::string("unknown"));
        return Status::OK();
    }

    const size_t last_query_idx = query_tokens.size() - 1;
    int32_t current_src_array_offset = 0;
    for (size_t i = 0; i < input_rows_count; i++) {
        const std::vector<std::string> data_tokens =
                analyse_data_token(column_name, inverted_index_ctx, string_col,
                                   static_cast<int32_t>(i), array_offsets,
                                   current_src_array_offset);

        // Compare sizes in size_t; narrowing the difference to a signed 32-bit value and
        // comparing it against a size_t index is avoided.
        if (data_tokens.size() < query_tokens.size()) {
            continue;
        }
        const size_t last_window_start = data_tokens.size() - query_tokens.size();

        for (size_t j = 0; j <= last_window_start; j++) {
            // Leading tokens must match exactly.
            bool match = true;
            for (size_t k = 0; k < last_query_idx; k++) {
                if (data_tokens[j + k] != query_tokens[k]) {
                    match = false;
                    break;
                }
            }
            // The final query token only needs to be a prefix of its data token.
            if (match) {
                const std::string& data_token = data_tokens[j + last_query_idx];
                const std::string& query_token = query_tokens[last_query_idx];
                match = data_token.compare(0, query_token.size(), query_token) == 0;
            }
            if (match) {
                result[i] = true;
                break;
            }
        }
    }

    return Status::OK();
}
440
441
// MATCH_REGEXP without an inverted index: the query string is compiled as a hyperscan
// pattern (block mode) and result[i] is set to true when any token of row i produces a
// match. Rows that do not match are left untouched.
//
// Returns INVERTED_INDEX_INVALID_PARAMETERS when the pattern does not compile or when
// hyperscan scratch space cannot be allocated. A failing scan of a single token is logged
// and ends the scan of that row only.
Status FunctionMatchRegexp::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          InvertedIndexCtx* inverted_index_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    // Both the log line below and analyse_data_token() dereference the context. Without a
    // context no row is marked, which is what the sibling execute_match implementations do
    // when their query yields no tokens.
    if (inverted_index_ctx == nullptr) {
        return Status::OK();
    }

    VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match, parser_type: "
               << inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);

    const std::string& pattern = match_query_str;

    hs_database_t* raw_database = nullptr;
    hs_compile_error_t* compile_err = nullptr;
    if (hs_compile(pattern.data(), HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
                   HS_MODE_BLOCK, nullptr, &raw_database, &compile_err) != HS_SUCCESS) {
        std::string err_message = "hyperscan compilation failed: ";
        err_message.append(compile_err->message);
        LOG(ERROR) << err_message;
        hs_free_compile_error(compile_err);
        return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(err_message);
    }
    // Owning handles: the scratch space is released before the database on every exit
    // path, including exceptions thrown while analysing the data tokens.
    std::unique_ptr<hs_database_t, decltype(&hs_free_database)> database(raw_database,
                                                                         &hs_free_database);

    hs_scratch_t* raw_scratch = nullptr;
    if (hs_alloc_scratch(database.get(), &raw_scratch) != HS_SUCCESS) {
        LOG(ERROR) << "hyperscan could not allocate scratch space.";
        return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
                "hyperscan could not allocate scratch space.");
    }
    std::unique_ptr<hs_scratch_t, decltype(&hs_free_scratch)> scratch(raw_scratch,
                                                                      &hs_free_scratch);

    // Match callback: flags the bool passed as user context and lets the scan continue.
    auto on_match = [](unsigned int /*id*/, unsigned long long /*from*/,
                       unsigned long long /*to*/, unsigned int /*flags*/,
                       void* matched_flag) -> int {
        *static_cast<bool*>(matched_flag) = true;
        return 0;
    };

    int32_t current_src_array_offset = 0;
    for (size_t i = 0; i < input_rows_count; i++) {
        const std::vector<std::string> data_tokens =
                analyse_data_token(column_name, inverted_index_ctx, string_col,
                                   static_cast<int32_t>(i), array_offsets,
                                   current_src_array_offset);

        for (const auto& input : data_tokens) {
            bool is_match = false;
            if (hs_scan(database.get(), input.data(), input.size(), 0, scratch.get(), on_match,
                        static_cast<void*>(&is_match)) != HS_SUCCESS) {
                LOG(ERROR) << "hyperscan match failed: " << input;
                break;
            }

            if (is_match) {
                result[i] = true;
                break;
            }
        }
    }

    return Status::OK();
}
509
510
// MATCH_PHRASE_EDGE without an inverted index. result[i] is set to true when row i's
// tokens contain a contiguous window that matches the query tokens as follows:
//   * single query token : the data token contains it as a substring;
//   * several tokens     : the first data token ends with the first query token, the last
//                          data token starts with the last query token, and every token
//                          in between is equal.
// Rows that do not match are left untouched.
// Returns early with OK (no row marked) when the query produces no tokens.
Status FunctionMatchPhraseEdge::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        InvertedIndexCtx* inverted_index_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) const {
    RETURN_IF_ERROR(check(context, name));

    std::vector<std::string> query_tokens;
    analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        // analyse_query_str_token() yields no tokens when inverted_index_ctx is null, so the
        // context must not be dereferenced unconditionally here.
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_ctx != nullptr
                        ? inverted_index_parser_type_to_string(inverted_index_ctx->parser_type)
                        : std::string("unknown"));
        return Status::OK();
    }

    int32_t current_src_array_offset = 0;
    for (size_t i = 0; i < input_rows_count; i++) {
        const auto data_tokens =
                analyse_data_token(column_name, inverted_index_ctx, string_col,
                                   static_cast<int32_t>(i), array_offsets,
                                   current_src_array_offset);

        // Compare sizes in size_t; narrowing the difference to a signed 32-bit value and
        // comparing it against a size_t index is avoided.
        if (data_tokens.size() < query_tokens.size()) {
            continue;
        }
        const size_t last_window_start = data_tokens.size() - query_tokens.size();

        for (size_t j = 0; j <= last_window_start; j++) {
            bool match = true;
            if (query_tokens.size() == 1) {
                if (data_tokens[j].find(query_tokens[0]) == std::string::npos) {
                    match = false;
                }
            } else {
                for (size_t k = 0; k < query_tokens.size(); k++) {
                    const std::string& data_token = data_tokens[j + k];
                    const std::string& query_token = query_tokens[k];
                    if (k == 0) {
                        // left edge: the phrase may start in the middle of a data token
                        if (!data_token.ends_with(query_token)) {
                            match = false;
                            break;
                        }
                    } else if (k == query_tokens.size() - 1) {
                        // right edge: the phrase may end in the middle of a data token
                        if (!data_token.starts_with(query_token)) {
                            match = false;
                            break;
                        }
                    } else {
                        if (data_token != query_token) {
                            match = false;
                            break;
                        }
                    }
                }
            }
            if (match) {
                result[i] = true;
                break;
            }
        }
    }

    return Status::OK();
}
575
576
1
void register_function_match(SimpleFunctionFactory& factory) {
577
1
    factory.register_function<FunctionMatchAny>();
578
1
    factory.register_function<FunctionMatchAll>();
579
1
    factory.register_function<FunctionMatchPhrase>();
580
1
    factory.register_function<FunctionMatchPhrasePrefix>();
581
1
    factory.register_function<FunctionMatchRegexp>();
582
1
    factory.register_function<FunctionMatchPhraseEdge>();
583
1
}
584
585
} // namespace doris::vectorized