Coverage Report

Created: 2024-11-20 12:56

/root/doris/be/src/vec/functions/match.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "vec/functions/match.h"
19
20
#include <hs/hs.h>
21
22
#include "runtime/query_context.h"
23
#include "runtime/runtime_state.h"
24
#include "util/debug_points.h"
25
26
namespace doris::vectorized {
27
28
// Evaluates the match predicate for every row of `block` and stores a UInt8 column
// (1 = matched, 0 = not matched) at position `result`.
//
// arguments[0] is the column being searched; arguments[1] carries the query string, which is
// rendered from row 0 only. If the block already contains a column named
// BLOCK_TEMP_COLUMN_PREFIX + <column name> + "_match_" + <query>, that column is reused as the
// result. Otherwise the input column is unwrapped to a ColumnString (directly, through a
// nullable wrapper, or as the element column of an array) and handed to execute_match().
//
// Returns NotSupported for arrays whose elements are not strings and InternalError when no
// string column can be extracted from the input.
Status FunctionMatchBase::execute_impl(FunctionContext* context, Block& block,
                                       const ColumnNumbers& arguments, size_t result,
                                       size_t input_rows_count) {
    ColumnPtr& column_ptr = block.get_by_position(arguments[1]).column;
    DataTypePtr& type_ptr = block.get_by_position(arguments[1]).type;
    auto match_query_str = type_ptr->to_string(*column_ptr, 0);
    std::string column_name = block.get_by_position(arguments[0]).name;
    auto match_pred_column_name =
            BeConsts::BLOCK_TEMP_COLUMN_PREFIX + column_name + "_match_" + match_query_str;
    if (!block.has(match_pred_column_name)) {
        VLOG_DEBUG << "begin to execute match directly, column_name=" << column_name
                   << ", match_query_str=" << match_query_str;
        // Prefer the thread-local function state and fall back to the fragment-local one.
        // NOTE(review): the context may still be null after the fallback; the execute_match()
        // implementations dereference it — confirm callers always install a state.
        InvertedIndexCtx* inverted_index_ctx = reinterpret_cast<InvertedIndexCtx*>(
                context->get_function_state(FunctionContext::THREAD_LOCAL));
        if (inverted_index_ctx == nullptr) {
            inverted_index_ctx = reinterpret_cast<InvertedIndexCtx*>(
                    context->get_function_state(FunctionContext::FRAGMENT_LOCAL));
        }

        const ColumnPtr source_col =
                block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
        const auto* values = check_and_get_column<ColumnString>(source_col.get());
        const ColumnArray* array_col = nullptr;
        if (source_col->is_column_array()) {
            if (source_col->is_nullable()) {
                const auto* nullable = check_and_get_column<ColumnNullable>(source_col.get());
                if (nullable != nullptr) {
                    array_col = check_and_get_column<ColumnArray>(
                            *nullable->get_nested_column_ptr());
                }
            } else {
                array_col = check_and_get_column<ColumnArray>(source_col.get());
            }
            // Either cast above can fail; report it the same way as any other unusable
            // input column instead of dereferencing a null pointer below.
            if (array_col == nullptr) {
                LOG(WARNING) << "Illegal column " << source_col->get_name();
                return Status::InternalError("Not supported input column types");
            }
            if (!array_col->get_data().is_column_string()) {
                return Status::NotSupported(
                        fmt::format("unsupported nested array of type {} for function {}",
                                    is_column_nullable(array_col->get_data())
                                            ? array_col->get_data().get_name()
                                            : array_col->get_data().get_family_name(),
                                    get_name()));
            }

            if (is_column_nullable(array_col->get_data())) {
                const auto& array_nested_null_column =
                        reinterpret_cast<const ColumnNullable&>(array_col->get_data());
                values = check_and_get_column<ColumnString>(
                        *(array_nested_null_column.get_nested_column_ptr()));
            } else {
                // array column element is always set Nullable for now.
                values = check_and_get_column<ColumnString>(*(array_col->get_data_ptr()));
            }
        } else if (auto* nullable = check_and_get_column<ColumnNullable>(source_col.get())) {
            // match null
            if (type_ptr->is_nullable()) {
                if (column_ptr->only_null()) {
                    // The query is NULL: the input's null map becomes the result column.
                    block.get_by_position(result).column = nullable->get_null_map_column_ptr();
                    return Status::OK();
                }
            } else {
                values = check_and_get_column<ColumnString>(*nullable->get_nested_column_ptr());
            }
        }

        if (!values) {
            LOG(WARNING) << "Illegal column " << source_col->get_name();
            return Status::InternalError("Not supported input column types");
        }
        // result column
        auto res = ColumnUInt8::create();
        ColumnUInt8::Container& vec_res = res->get_data();
        // set default value to 0, and match functions only need to set 1/true
        vec_res.resize_fill(input_rows_count);
        RETURN_IF_ERROR(execute_match(
                context, column_name, match_query_str, input_rows_count, values, inverted_index_ctx,
                (array_col ? &(array_col->get_offsets()) : nullptr), vec_res));
        block.replace_by_position(result, std::move(res));
    } else {
        // A column with the expected temp name is already present in the block; reuse it.
        auto match_pred_column =
                block.get_by_name(match_pred_column_name).column->convert_to_full_column_if_const();
        block.replace_by_position(result, std::move(match_pred_column));
    }

    return Status::OK();
}
109
110
0
// Maps this function's name (as returned by get_name()) to the inverted-index query type.
// Names other than the five MATCH_* constants yield UNKNOWN_QUERY.
inline doris::segment_v2::InvertedIndexQueryType FunctionMatchBase::get_query_type_from_fn_name() {
    using doris::segment_v2::InvertedIndexQueryType;

    const std::string current_fn = get_name();
    if (current_fn == MATCH_ANY_FUNCTION) {
        return InvertedIndexQueryType::MATCH_ANY_QUERY;
    }
    if (current_fn == MATCH_ALL_FUNCTION) {
        return InvertedIndexQueryType::MATCH_ALL_QUERY;
    }
    if (current_fn == MATCH_PHRASE_FUNCTION) {
        return InvertedIndexQueryType::MATCH_PHRASE_QUERY;
    }
    if (current_fn == MATCH_PHRASE_PREFIX_FUNCTION) {
        return InvertedIndexQueryType::MATCH_PHRASE_PREFIX_QUERY;
    }
    if (current_fn == MATCH_PHRASE_REGEXP_FUNCTION) {
        return InvertedIndexQueryType::MATCH_REGEXP_QUERY;
    }
    return InvertedIndexQueryType::UNKNOWN_QUERY;
}
125
126
void FunctionMatchBase::analyse_query_str_token(std::vector<std::string>* query_tokens,
127
                                                InvertedIndexCtx* inverted_index_ctx,
128
                                                const std::string& match_query_str,
129
0
                                                const std::string& column_name) {
130
0
    VLOG_DEBUG << "begin to run " << get_name() << ", parser_type: "
131
0
               << inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);
132
0
    if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
133
0
        query_tokens->emplace_back(match_query_str);
134
0
        return;
135
0
    }
136
0
    auto reader = doris::segment_v2::InvertedIndexReader::create_reader(inverted_index_ctx,
137
0
                                                                        match_query_str);
138
0
    doris::segment_v2::InvertedIndexReader::get_analyse_result(
139
0
            *query_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
140
0
            get_query_type_from_fn_name());
141
0
}
142
143
inline std::vector<std::string> FunctionMatchBase::analyse_data_token(
144
        const std::string& column_name, InvertedIndexCtx* inverted_index_ctx,
145
        const ColumnString* string_col, int32_t current_block_row_idx,
146
0
        const ColumnArray::Offsets64* array_offsets, int32_t& current_src_array_offset) {
147
0
    std::vector<std::string> data_tokens;
148
0
    auto query_type = get_query_type_from_fn_name();
149
0
    if (array_offsets) {
150
0
        for (auto next_src_array_offset = (*array_offsets)[current_block_row_idx];
151
0
             current_src_array_offset < next_src_array_offset; ++current_src_array_offset) {
152
0
            const auto& str_ref = string_col->get_data_at(current_src_array_offset);
153
0
            if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
154
0
                data_tokens.emplace_back(str_ref.to_string());
155
0
                continue;
156
0
            }
157
0
            auto reader = doris::segment_v2::InvertedIndexReader::create_reader(
158
0
                    inverted_index_ctx, str_ref.to_string());
159
160
0
            std::vector<std::string> element_tokens;
161
162
0
            doris::segment_v2::InvertedIndexReader::get_analyse_result(
163
0
                    element_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
164
0
                    query_type, false);
165
0
            data_tokens.insert(data_tokens.end(), element_tokens.begin(), element_tokens.end());
166
0
        }
167
0
    } else {
168
0
        const auto& str_ref = string_col->get_data_at(current_block_row_idx);
169
0
        if (inverted_index_ctx->parser_type == InvertedIndexParserType::PARSER_NONE) {
170
0
            data_tokens.emplace_back(str_ref.to_string());
171
0
        } else {
172
0
            auto reader = doris::segment_v2::InvertedIndexReader::create_reader(
173
0
                    inverted_index_ctx, str_ref.to_string());
174
0
            doris::segment_v2::InvertedIndexReader::get_analyse_result(
175
0
                    data_tokens, reader.get(), inverted_index_ctx->analyzer, column_name,
176
0
                    query_type, false);
177
0
        }
178
0
    }
179
0
    return data_tokens;
180
0
}
181
182
0
// Gate for evaluating a match function without an inverted index.
// Returns INVERTED_INDEX_NOT_SUPPORTED when the query option
// enable_match_without_inverted_index is off, or when the debug point
// "match.invert_index_not_support_execute_match" is active; otherwise OK.
Status FunctionMatchBase::check(FunctionContext* context, const std::string& function_name) const {
    const bool allowed_without_index =
            context->state()->query_options().enable_match_without_inverted_index;
    if (!allowed_without_index) {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "{} not support execute_match", function_name);
    }

    // Debug hook that forces the same rejection.
    DBUG_EXECUTE_IF("match.invert_index_not_support_execute_match", {
        return Status::Error<ErrorCode::INVERTED_INDEX_NOT_SUPPORTED>(
                "{} not support execute_match", function_name);
    });

    return Status::OK();
}
195
196
// MATCH_ANY: a row matches when at least one query token occurs among the row's tokens.
// `result` is expected to be pre-filled by the caller; only matching rows are set to true.
// An empty token list for the query is logged and leaves `result` untouched.
Status FunctionMatchAny::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       InvertedIndexCtx* inverted_index_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) {
    RETURN_IF_ERROR(check(context, name));

    std::vector<std::string> query_tokens;
    analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        LOG(WARNING) << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
        return Status::OK();
    }

    // Running position inside the flattened array elements; advanced by analyse_data_token.
    int32_t array_cursor = 0;
    for (int32_t row = 0; row < input_rows_count; ++row) {
        const std::vector<std::string> row_tokens = analyse_data_token(
                column_name, inverted_index_ctx, string_col, row, array_offsets, array_cursor);

        // TODO: more efficient impl
        const bool any_token_present = std::any_of(
                query_tokens.cbegin(), query_tokens.cend(),
                [&row_tokens](const std::string& wanted) {
                    return std::find(row_tokens.cbegin(), row_tokens.cend(), wanted) !=
                           row_tokens.cend();
                });
        if (any_token_present) {
            result[row] = true;
        }
    }

    return Status::OK();
}
233
234
// MATCH_ALL: a row matches when every query token occurs among the row's tokens
// (order and adjacency are not considered).
// `result` is expected to be pre-filled by the caller; only matching rows are set to true.
// An empty token list for the query is logged and leaves `result` untouched.
Status FunctionMatchAll::execute_match(FunctionContext* context, const std::string& column_name,
                                       const std::string& match_query_str, size_t input_rows_count,
                                       const ColumnString* string_col,
                                       InvertedIndexCtx* inverted_index_ctx,
                                       const ColumnArray::Offsets64* array_offsets,
                                       ColumnUInt8::Container& result) {
    RETURN_IF_ERROR(check(context, name));

    std::vector<std::string> query_tokens;
    analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        LOG(WARNING) << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
        return Status::OK();
    }

    // Running position inside the flattened array elements; advanced by analyse_data_token.
    int32_t array_cursor = 0;
    for (int32_t row = 0; row < input_rows_count; ++row) {
        const std::vector<std::string> row_tokens = analyse_data_token(
                column_name, inverted_index_ctx, string_col, row, array_offsets, array_cursor);

        // TODO: more efficient impl
        // all_of stops at the first missing token, like the original counting loop.
        const bool every_token_present = std::all_of(
                query_tokens.cbegin(), query_tokens.cend(),
                [&row_tokens](const std::string& wanted) {
                    return std::find(row_tokens.cbegin(), row_tokens.cend(), wanted) !=
                           row_tokens.cend();
                });
        if (every_token_present) {
            result[row] = true;
        }
    }

    return Status::OK();
}
277
278
// MATCH_PHRASE: a row matches when the query tokens appear in the row's tokens as one
// contiguous run, in the same order.
// `result` is expected to be pre-filled by the caller; only matching rows are set to true.
// An empty token list for the query is logged at debug level and leaves `result` untouched.
Status FunctionMatchPhrase::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          InvertedIndexCtx* inverted_index_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) {
    RETURN_IF_ERROR(check(context, name));

    std::vector<std::string> query_tokens;
    analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
        return Status::OK();
    }

    // Running position inside the flattened array elements; advanced by analyse_data_token.
    int32_t array_cursor = 0;
    for (int32_t row = 0; row < input_rows_count; ++row) {
        const std::vector<std::string> row_tokens = analyse_data_token(
                column_name, inverted_index_ctx, string_col, row, array_offsets, array_cursor);

        // TODO: more efficient impl
        // Look for the query token sequence as a contiguous sub-sequence of the row tokens.
        const auto phrase_pos = std::search(row_tokens.cbegin(), row_tokens.cend(),
                                            query_tokens.cbegin(), query_tokens.cend());
        if (phrase_pos != row_tokens.cend()) {
            result[row] = true;
        }
    }

    return Status::OK();
}
337
338
// MATCH_PHRASE_PREFIX: like a phrase match, except the last query token only has to be a
// prefix of the row token at that position; all earlier query tokens must match exactly.
// `result` is expected to be pre-filled by the caller; only matching rows are set to true.
// An empty token list for the query is logged at debug level and leaves `result` untouched.
Status FunctionMatchPhrasePrefix::execute_match(
        FunctionContext* context, const std::string& column_name,
        const std::string& match_query_str, size_t input_rows_count, const ColumnString* string_col,
        InvertedIndexCtx* inverted_index_ctx, const ColumnArray::Offsets64* array_offsets,
        ColumnUInt8::Container& result) {
    RETURN_IF_ERROR(check(context, name));

    std::vector<std::string> query_tokens;
    analyse_query_str_token(&query_tokens, inverted_index_ctx, match_query_str, column_name);
    if (query_tokens.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
        return Status::OK();
    }

    const size_t query_len = query_tokens.size();
    const std::string& prefix_token = query_tokens.back();
    const auto exact_begin = query_tokens.cbegin();
    const auto exact_end = query_tokens.cend() - 1; // every token except the prefix token

    // Running position inside the flattened array elements; advanced by analyse_data_token.
    int32_t array_cursor = 0;
    for (size_t row = 0; row < input_rows_count; ++row) {
        const auto row_tokens =
                analyse_data_token(column_name, inverted_index_ctx, string_col,
                                   static_cast<int32_t>(row), array_offsets, array_cursor);

        // A row with fewer tokens than the query cannot contain the phrase.
        if (row_tokens.size() < query_len) {
            continue;
        }

        // Slide a window of query_len tokens over the row.
        const size_t last_start = row_tokens.size() - query_len;
        for (size_t start = 0; start <= last_start; ++start) {
            const auto window = row_tokens.cbegin() + start;
            if (!std::equal(exact_begin, exact_end, window)) {
                continue;
            }
            const std::string& tail_token = *(window + (query_len - 1));
            if (tail_token.compare(0, prefix_token.size(), prefix_token) == 0) {
                result[row] = true;
                break;
            }
        }
    }

    return Status::OK();
}
394
395
// MATCH_REGEXP: a row matches when the query string, compiled as a Hyperscan pattern,
// matches at least one of the row's tokens.
// `result` is expected to be pre-filled by the caller; only matching rows are set to true.
// An empty query string leaves `result` untouched. Returns
// INVERTED_INDEX_INVALID_PARAMETERS when the pattern fails to compile or scratch space
// cannot be allocated.
Status FunctionMatchRegexp::execute_match(FunctionContext* context, const std::string& column_name,
                                          const std::string& match_query_str,
                                          size_t input_rows_count, const ColumnString* string_col,
                                          InvertedIndexCtx* inverted_index_ctx,
                                          const ColumnArray::Offsets64* array_offsets,
                                          ColumnUInt8::Container& result) {
    RETURN_IF_ERROR(check(context, name));

    VLOG_DEBUG << "begin to run FunctionMatchRegexp::execute_match, parser_type: "
               << inverted_index_parser_type_to_string(inverted_index_ctx->parser_type);

    if (match_query_str.empty()) {
        VLOG_DEBUG << fmt::format(
                "token parser result is empty for query, "
                "please check your query: '{}' and index parser: '{}'",
                match_query_str,
                inverted_index_parser_type_to_string(inverted_index_ctx->parser_type));
        return Status::OK();
    }

    const std::string& pattern = match_query_str;

    hs_database_t* database = nullptr;
    hs_compile_error_t* compile_err = nullptr;
    hs_scratch_t* scratch = nullptr;

    if (hs_compile(pattern.data(), HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
                   HS_MODE_BLOCK, nullptr, &database, &compile_err) != HS_SUCCESS) {
        // Copy the message before releasing the error object: the message is owned by
        // compile_err and must not be read after hs_free_compile_error().
        const std::string compile_msg =
                (compile_err != nullptr && compile_err->message != nullptr)
                        ? std::string(compile_err->message)
                        : std::string();
        hs_free_compile_error(compile_err);
        LOG(ERROR) << "hyperscan compilation failed: " << compile_msg;
        return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
                std::string("hyperscan compilation failed:") + compile_msg);
    }

    if (hs_alloc_scratch(database, &scratch) != HS_SUCCESS) {
        LOG(ERROR) << "hyperscan could not allocate scratch space.";
        hs_free_database(database);
        return Status::Error<ErrorCode::INVERTED_INDEX_INVALID_PARAMETERS>(
                "hyperscan could not allocate scratch space.");
    }

    // Match callback: flags the bool passed as user context and lets the scan continue.
    auto on_match = [](unsigned int id, unsigned long long from, unsigned long long to,
                       unsigned int flags, void* context) -> int {
        *static_cast<bool*>(context) = true;
        return 0;
    };

    try {
        // Running position inside the flattened array elements; advanced by
        // analyse_data_token.
        auto current_src_array_offset = 0;
        for (int i = 0; i < input_rows_count; i++) {
            std::vector<std::string> data_tokens =
                    analyse_data_token(column_name, inverted_index_ctx, string_col, i,
                                       array_offsets, current_src_array_offset);

            for (auto& input : data_tokens) {
                bool is_match = false;
                if (hs_scan(database, input.data(), input.size(), 0, scratch, on_match,
                            static_cast<void*>(&is_match)) != HS_SUCCESS) {
                    // A failed scan abandons the remaining tokens of this row only.
                    LOG(ERROR) << "hyperscan match failed: " << input;
                    break;
                }

                if (is_match) {
                    result[i] = true;
                    break;
                }
            }
        }
    }
    _CLFINALLY({
        hs_free_scratch(scratch);
        hs_free_database(database);
    })

    return Status::OK();
}
471
472
1
// Registers the five match functions defined in this file with `factory`.
void register_function_match(SimpleFunctionFactory& factory) {
    // Token-set matches.
    factory.register_function<FunctionMatchAny>();
    factory.register_function<FunctionMatchAll>();

    // Ordered phrase matches.
    factory.register_function<FunctionMatchPhrase>();
    factory.register_function<FunctionMatchPhrasePrefix>();

    // Regular-expression match.
    factory.register_function<FunctionMatchRegexp>();
}
479
480
} // namespace doris::vectorized