Coverage Report

Created: 2024-11-21 11:46

/root/doris/be/src/vec/functions/like.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "like.h"
19
20
#include <fmt/format.h>
21
#include <hs/hs_compile.h>
22
#include <re2/stringpiece.h>
23
24
#include <cstddef>
25
#include <ostream>
26
#include <utility>
27
#include <vector>
28
29
#include "common/logging.h"
30
#include "vec/columns/column.h"
31
#include "vec/columns/column_const.h"
32
#include "vec/columns/column_vector.h"
33
#include "vec/columns/columns_number.h"
34
#include "vec/common/string_ref.h"
35
#include "vec/core/block.h"
36
#include "vec/core/column_with_type_and_name.h"
37
#include "vec/functions/simple_function_factory.h"
38
39
namespace doris::vectorized {
40
// A regex to match any regex pattern is equivalent to a substring search.
41
static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
42
43
// A regex to match any regex pattern which is equivalent to matching a constant string
44
// at the end of the string values.
45
static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
46
47
// A regex to match any regex pattern which is equivalent to matching a constant string
48
// at the end of the string values.
49
static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
50
51
// A regex to match any regex pattern which is equivalent to a constant string match.
52
static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
53
// A regex to match .*
54
static const RE2 ALLPASS_RE(R"((\.\*)+)");
55
56
// Like patterns
57
static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
58
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
59
static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
60
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
61
static const re2::RE2 LIKE_ALLPASS_RE("%+");
62
63
struct VectorAllpassSearchState : public VectorPatternSearchState {
64
12
    VectorAllpassSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_allpass_fn) {}
65
66
12
    ~VectorAllpassSearchState() override = default;
67
68
1
    void like_pattern_match(const std::string& pattern_str) override {
69
1
        if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
70
0
            _search_strings->insert_default();
71
1
        } else {
72
1
            _pattern_matched = false;
73
1
        }
74
1
    }
75
76
11
    void regexp_pattern_match(const std::string& pattern_str) override {
77
11
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
78
0
            _search_strings->insert_default();
79
11
        } else {
80
11
            _pattern_matched = false;
81
11
        }
82
11
    }
83
};
84
85
struct VectorEqualSearchState : public VectorPatternSearchState {
86
12
    VectorEqualSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_equals_fn) {}
87
88
12
    ~VectorEqualSearchState() override = default;
89
90
1
    void like_pattern_match(const std::string& pattern_str) override {
91
1
        _search_string.clear();
92
1
        if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &_search_string)) {
93
0
            FunctionLike::remove_escape_character(&_search_string);
94
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
95
1
        } else {
96
1
            _pattern_matched = false;
97
1
        }
98
1
    }
99
100
11
    void regexp_pattern_match(const std::string& pattern_str) override {
101
11
        _search_string.clear();
102
11
        if (RE2::FullMatch(pattern_str, EQUALS_RE, &_search_string)) {
103
2
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
104
9
        } else {
105
9
            _pattern_matched = false;
106
9
        }
107
11
    }
108
};
109
110
struct VectorSubStringSearchState : public VectorPatternSearchState {
111
    VectorSubStringSearchState()
112
12
            : VectorPatternSearchState(FunctionLikeBase::vector_substring_fn) {}
113
114
12
    ~VectorSubStringSearchState() override = default;
115
116
3
    void like_pattern_match(const std::string& pattern_str) override {
117
3
        _search_string.clear();
118
3
        if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &_search_string)) {
119
2
            FunctionLike::remove_escape_character(&_search_string);
120
2
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
121
2
        } else {
122
1
            _pattern_matched = false;
123
1
        }
124
3
    }
125
126
11
    void regexp_pattern_match(const std::string& pattern_str) override {
127
11
        _search_string.clear();
128
11
        if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &_search_string)) {
129
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
130
11
        } else {
131
11
            _pattern_matched = false;
132
11
        }
133
11
    }
134
};
135
136
struct VectorStartsWithSearchState : public VectorPatternSearchState {
137
    VectorStartsWithSearchState()
138
12
            : VectorPatternSearchState(FunctionLikeBase::vector_starts_with_fn) {}
139
140
12
    ~VectorStartsWithSearchState() override = default;
141
142
1
    void like_pattern_match(const std::string& pattern_str) override {
143
1
        _search_string.clear();
144
1
        if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &_search_string)) {
145
0
            FunctionLike::remove_escape_character(&_search_string);
146
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
147
1
        } else {
148
1
            _pattern_matched = false;
149
1
        }
150
1
    }
151
152
11
    void regexp_pattern_match(const std::string& pattern_str) override {
153
11
        _search_string.clear();
154
11
        if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &_search_string)) {
155
2
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
156
9
        } else {
157
9
            _pattern_matched = false;
158
9
        }
159
11
    }
160
};
161
162
struct VectorEndsWithSearchState : public VectorPatternSearchState {
163
12
    VectorEndsWithSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_ends_with_fn) {}
164
165
12
    ~VectorEndsWithSearchState() override = default;
166
167
1
    void like_pattern_match(const std::string& pattern_str) override {
168
1
        _search_string.clear();
169
1
        if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &_search_string)) {
170
0
            FunctionLike::remove_escape_character(&_search_string);
171
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
172
1
        } else {
173
1
            _pattern_matched = false;
174
1
        }
175
1
    }
176
177
11
    void regexp_pattern_match(const std::string& pattern_str) override {
178
11
        _search_string.clear();
179
11
        if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &_search_string)) {
180
2
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
181
9
        } else {
182
9
            _pattern_matched = false;
183
9
        }
184
11
    }
185
};
186
187
0
Status LikeSearchState::clone(LikeSearchState& cloned) {
188
0
    cloned.escape_char = escape_char;
189
0
    cloned.set_search_string(search_string);
190
191
0
    std::string re_pattern;
192
0
    FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern);
193
0
    if (hs_database) { // use hyperscan
194
0
        hs_database_t* database = nullptr;
195
0
        hs_scratch_t* scratch = nullptr;
196
0
        RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));
197
198
0
        cloned.hs_database.reset(database);
199
0
        cloned.hs_scratch.reset(scratch);
200
0
    } else { // fallback to re2
201
0
        cloned.hs_database.reset();
202
0
        cloned.hs_scratch.reset();
203
204
0
        RE2::Options opts;
205
0
        opts.set_never_nl(false);
206
0
        opts.set_dot_nl(true);
207
0
        cloned.regex = std::make_unique<RE2>(re_pattern, opts);
208
0
        if (!cloned.regex->ok()) {
209
0
            return Status::InternalError("Invalid regex expression: {}", re_pattern);
210
0
        }
211
0
    }
212
213
0
    return Status::OK();
214
0
}
215
216
Status FunctionLikeBase::constant_allpass_fn(LikeSearchState* state, const ColumnString& vals,
217
                                             const StringRef& pattern,
218
0
                                             ColumnUInt8::Container& result) {
219
0
    memset(result.data(), 1, vals.size());
220
0
    return Status::OK();
221
0
}
222
223
Status FunctionLikeBase::constant_allpass_fn_scalar(LikeSearchState* state, const StringRef& val,
224
                                                    const StringRef& pattern,
225
0
                                                    unsigned char* result) {
226
0
    *result = 1;
227
0
    return Status::OK();
228
0
}
229
230
Status FunctionLikeBase::vector_allpass_fn(const ColumnString& vals,
231
                                           const ColumnString& search_strings,
232
0
                                           ColumnUInt8::Container& result) {
233
0
    DCHECK(vals.size() == search_strings.size());
234
0
    DCHECK(vals.size() == result.size());
235
0
    memset(result.data(), 1, vals.size());
236
0
    return Status::OK();
237
0
}
238
239
Status FunctionLikeBase::constant_starts_with_fn(LikeSearchState* state, const ColumnString& val,
240
                                                 const StringRef& pattern,
241
0
                                                 ColumnUInt8::Container& result) {
242
0
    auto sz = val.size();
243
0
    for (size_t i = 0; i < sz; i++) {
244
0
        const auto& str_ref = val.get_data_at(i);
245
0
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
246
0
                    str_ref.start_with(state->search_string_sv);
247
0
    }
248
0
    return Status::OK();
249
0
}
250
251
Status FunctionLikeBase::constant_starts_with_fn_scalar(LikeSearchState* state,
252
                                                        const StringRef& val,
253
                                                        const StringRef& pattern,
254
0
                                                        unsigned char* result) {
255
0
    *result = (val.size >= state->search_string_sv.size) &&
256
0
              (state->search_string_sv == val.substring(0, state->search_string_sv.size));
257
0
    return Status::OK();
258
0
}
259
260
Status FunctionLikeBase::vector_starts_with_fn(const ColumnString& vals,
261
                                               const ColumnString& search_strings,
262
2
                                               ColumnUInt8::Container& result) {
263
2
    DCHECK(vals.size() == search_strings.size());
264
2
    DCHECK(vals.size() == result.size());
265
2
    auto sz = vals.size();
266
4
    for (size_t i = 0; i < sz; ++i) {
267
2
        const auto& str_sv = vals.get_data_at(i);
268
2
        const auto& search_string_sv = search_strings.get_data_at(i);
269
2
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.start_with(search_string_sv);
270
2
    }
271
2
    return Status::OK();
272
2
}
273
274
Status FunctionLikeBase::constant_ends_with_fn(LikeSearchState* state, const ColumnString& val,
275
                                               const StringRef& pattern,
276
0
                                               ColumnUInt8::Container& result) {
277
0
    auto sz = val.size();
278
0
    for (size_t i = 0; i < sz; i++) {
279
0
        const auto& str_ref = val.get_data_at(i);
280
0
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
281
0
                    str_ref.end_with(state->search_string_sv);
282
0
    }
283
0
    return Status::OK();
284
0
}
285
286
Status FunctionLikeBase::constant_ends_with_fn_scalar(LikeSearchState* state, const StringRef& val,
287
                                                      const StringRef& pattern,
288
0
                                                      unsigned char* result) {
289
0
    *result = (val.size >= state->search_string_sv.size) &&
290
0
              (state->search_string_sv == val.substring(val.size - state->search_string_sv.size,
291
0
                                                        state->search_string_sv.size));
292
0
    return Status::OK();
293
0
}
294
295
Status FunctionLikeBase::vector_ends_with_fn(const ColumnString& vals,
296
                                             const ColumnString& search_strings,
297
2
                                             ColumnUInt8::Container& result) {
298
2
    DCHECK(vals.size() == search_strings.size());
299
2
    DCHECK(vals.size() == result.size());
300
2
    auto sz = vals.size();
301
4
    for (size_t i = 0; i < sz; ++i) {
302
2
        const auto& str_sv = vals.get_data_at(i);
303
2
        const auto& search_string_sv = search_strings.get_data_at(i);
304
2
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.end_with(search_string_sv);
305
2
    }
306
2
    return Status::OK();
307
2
}
308
309
Status FunctionLikeBase::constant_equals_fn(LikeSearchState* state, const ColumnString& val,
310
                                            const StringRef& pattern,
311
0
                                            ColumnUInt8::Container& result) {
312
0
    auto sz = val.size();
313
0
    for (size_t i = 0; i < sz; i++) {
314
0
        result[i] = (val.get_data_at(i) == state->search_string_sv);
315
0
    }
316
0
    return Status::OK();
317
0
}
318
319
Status FunctionLikeBase::constant_equals_fn_scalar(LikeSearchState* state, const StringRef& val,
320
                                                   const StringRef& pattern,
321
0
                                                   unsigned char* result) {
322
0
    *result = (val == state->search_string_sv);
323
0
    return Status::OK();
324
0
}
325
326
Status FunctionLikeBase::vector_equals_fn(const ColumnString& vals,
327
                                          const ColumnString& search_strings,
328
2
                                          ColumnUInt8::Container& result) {
329
2
    DCHECK(vals.size() == search_strings.size());
330
2
    DCHECK(vals.size() == result.size());
331
2
    auto sz = vals.size();
332
4
    for (size_t i = 0; i < sz; ++i) {
333
2
        const auto& str_sv = vals.get_data_at(i);
334
2
        const auto& search_string_sv = search_strings.get_data_at(i);
335
2
        result[i] = str_sv == search_string_sv;
336
2
    }
337
2
    return Status::OK();
338
2
}
339
340
Status FunctionLikeBase::constant_substring_fn(LikeSearchState* state, const ColumnString& val,
341
                                               const StringRef& pattern,
342
0
                                               ColumnUInt8::Container& result) {
343
0
    auto sz = val.size();
344
0
    for (size_t i = 0; i < sz; i++) {
345
0
        if (state->search_string_sv.size == 0) {
346
0
            result[i] = true;
347
0
            continue;
348
0
        }
349
0
        result[i] = state->substring_pattern.search(val.get_data_at(i)) != -1;
350
0
    }
351
0
    return Status::OK();
352
0
}
353
354
Status FunctionLikeBase::constant_substring_fn_scalar(LikeSearchState* state, const StringRef& val,
355
                                                      const StringRef& pattern,
356
0
                                                      unsigned char* result) {
357
0
    if (state->search_string_sv.size == 0) {
358
0
        *result = true;
359
0
        return Status::OK();
360
0
    }
361
0
    *result = state->substring_pattern.search(val) != -1;
362
0
    return Status::OK();
363
0
}
364
365
Status FunctionLikeBase::vector_substring_fn(const ColumnString& vals,
366
                                             const ColumnString& search_strings,
367
0
                                             ColumnUInt8::Container& result) {
368
0
    DCHECK(vals.size() == search_strings.size());
369
0
    DCHECK(vals.size() == result.size());
370
0
    auto sz = vals.size();
371
0
    for (size_t i = 0; i < sz; ++i) {
372
0
        const auto& str_sv = vals.get_data_at(i);
373
0
        const auto& search_string_sv = search_strings.get_data_at(i);
374
0
        if (search_string_sv.size == 0) {
375
0
            result[i] = true;
376
0
            continue;
377
0
        }
378
0
        doris::StringSearch substring_search(&search_string_sv);
379
0
        result[i] = substring_search.search(str_sv) != -1;
380
0
    }
381
0
    return Status::OK();
382
0
}
383
384
Status FunctionLikeBase::constant_regex_fn_scalar(LikeSearchState* state, const StringRef& val,
385
5
                                                  const StringRef& pattern, unsigned char* result) {
386
5
    if (state->hs_database) { // use hyperscan
387
5
        auto ret = hs_scan(state->hs_database.get(), val.data, val.size, 0, state->hs_scratch.get(),
388
5
                           doris::vectorized::LikeSearchState::hs_match_handler, (void*)result);
389
5
        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
390
0
            return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
391
0
        }
392
5
    } else { // fallback to re2
393
0
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex);
394
0
    }
395
396
5
    return Status::OK();
397
5
}
398
399
Status FunctionLikeBase::regexp_fn_scalar(LikeSearchState* state, const StringRef& val,
400
30
                                          const StringRef& pattern, unsigned char* result) {
401
30
    RE2::Options opts;
402
30
    opts.set_never_nl(false);
403
30
    opts.set_dot_nl(true);
404
30
    re2::RE2 re(re2::StringPiece(pattern.data, pattern.size), opts);
405
30
    if (re.ok()) {
406
30
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), re);
407
30
    } else {
408
0
        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
409
0
    }
410
411
30
    return Status::OK();
412
30
}
413
414
Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const ColumnString& val,
415
                                           const StringRef& pattern,
416
0
                                           ColumnUInt8::Container& result) {
417
0
    auto sz = val.size();
418
0
    if (state->hs_database) { // use hyperscan
419
0
        for (size_t i = 0; i < sz; i++) {
420
0
            const auto& str_ref = val.get_data_at(i);
421
0
            auto ret = hs_scan(state->hs_database.get(), str_ref.data, str_ref.size, 0,
422
0
                               state->hs_scratch.get(),
423
0
                               doris::vectorized::LikeSearchState::hs_match_handler,
424
0
                               (void*)(result.data() + i));
425
0
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
426
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
427
0
            }
428
0
        }
429
0
    } else { // fallback to re2
430
0
        for (size_t i = 0; i < sz; i++) {
431
0
            const auto& str_ref = val.get_data_at(i);
432
0
            *(result.data() + i) =
433
0
                    RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex);
434
0
        }
435
0
    }
436
437
0
    return Status::OK();
438
0
}
439
440
Status FunctionLikeBase::regexp_fn(LikeSearchState* state, const ColumnString& val,
441
0
                                   const StringRef& pattern, ColumnUInt8::Container& result) {
442
0
    std::string re_pattern(pattern.data, pattern.size);
443
444
0
    hs_database_t* database = nullptr;
445
0
    hs_scratch_t* scratch = nullptr;
446
0
    if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan
447
0
        auto sz = val.size();
448
0
        for (size_t i = 0; i < sz; i++) {
449
0
            const auto& str_ref = val.get_data_at(i);
450
0
            auto ret = hs_scan(database, str_ref.data, str_ref.size, 0, scratch,
451
0
                               doris::vectorized::LikeSearchState::hs_match_handler,
452
0
                               (void*)(result.data() + i));
453
0
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
454
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
455
0
            }
456
0
        }
457
458
0
        hs_free_scratch(scratch);
459
0
        hs_free_database(database);
460
0
    } else { // fallback to re2
461
0
        RE2::Options opts;
462
0
        opts.set_never_nl(false);
463
0
        opts.set_dot_nl(true);
464
0
        re2::RE2 re(re_pattern, opts);
465
0
        if (re.ok()) {
466
0
            auto sz = val.size();
467
0
            for (size_t i = 0; i < sz; i++) {
468
0
                const auto& str_ref = val.get_data_at(i);
469
0
                *(result.data() + i) =
470
0
                        RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), re);
471
0
            }
472
0
        } else {
473
0
            return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
474
0
        }
475
0
    }
476
477
0
    return Status::OK();
478
0
}
479
480
// hyperscan compile expression to database and allocate scratch space
481
Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression,
482
5
                                    hs_database_t** database, hs_scratch_t** scratch) {
483
5
    hs_compile_error_t* compile_err;
484
5
    auto res = hs_compile(expression, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
485
5
                          HS_MODE_BLOCK, nullptr, database, &compile_err);
486
487
5
    if (res != HS_SUCCESS) {
488
0
        *database = nullptr;
489
0
        if (context) {
490
0
            context->set_error("hs_compile regex pattern error");
491
0
        }
492
0
        return Status::RuntimeError("hs_compile regex pattern error:" +
493
0
                                    std::string(compile_err->message));
494
0
        hs_free_compile_error(compile_err);
495
0
    }
496
5
    hs_free_compile_error(compile_err);
497
498
5
    if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) {
499
0
        hs_free_database(*database);
500
0
        *database = nullptr;
501
0
        *scratch = nullptr;
502
0
        if (context) {
503
0
            context->set_error("hs_alloc_scratch allocate scratch space error");
504
0
        }
505
0
        return Status::RuntimeError("hs_alloc_scratch allocate scratch space error");
506
0
    }
507
508
5
    return Status::OK();
509
5
}
510
511
Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
512
                                      const ColumnNumbers& arguments, uint32_t result,
513
16
                                      size_t input_rows_count) const {
514
16
    const auto values_col =
515
16
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
516
16
    const auto* values = check_and_get_column<ColumnString>(values_col.get());
517
518
16
    if (!values) {
519
0
        return Status::InternalError("Not supported input arguments types");
520
0
    }
521
    // result column
522
16
    auto res = ColumnUInt8::create();
523
16
    ColumnUInt8::Container& vec_res = res->get_data();
524
    // set default value to 0, and match functions only need to set 1/true
525
16
    vec_res.resize_fill(input_rows_count);
526
16
    auto* state = reinterpret_cast<LikeState*>(
527
16
            context->get_function_state(FunctionContext::THREAD_LOCAL));
528
    // for constant_substring_fn, use long run length search for performance
529
16
    if (constant_substring_fn ==
530
16
        *(state->function.target<doris::Status (*)(LikeSearchState* state, const ColumnString&,
531
16
                                                   const StringRef&, ColumnUInt8::Container&)>())) {
532
4
        RETURN_IF_ERROR(execute_substring(values->get_chars(), values->get_offsets(), vec_res,
533
4
                                          &state->search_state));
534
12
    } else {
535
12
        const auto pattern_col = block.get_by_position(arguments[1]).column;
536
12
        if (const auto* str_patterns = check_and_get_column<ColumnString>(pattern_col.get())) {
537
12
            RETURN_IF_ERROR(
538
12
                    vector_non_const(*values, *str_patterns, vec_res, state, input_rows_count));
539
12
        } else if (const auto* const_patterns =
540
0
                           check_and_get_column<ColumnConst>(pattern_col.get())) {
541
0
            const auto& pattern_val = const_patterns->get_data_at(0);
542
0
            RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function,
543
0
                                         &state->search_state));
544
0
        } else {
545
0
            return Status::InternalError("Not supported input arguments types");
546
0
        }
547
12
    }
548
16
    block.replace_by_position(result, std::move(res));
549
16
    return Status::OK();
550
16
}
551
552
Status FunctionLikeBase::close(FunctionContext* context,
553
36
                               FunctionContext::FunctionStateScope scope) {
554
36
    return Status::OK();
555
36
}
556
557
Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values,
558
                                           const ColumnString::Offsets& value_offsets,
559
                                           ColumnUInt8::Container& result,
560
4
                                           LikeSearchState* search_state) const {
561
    // treat continuous multi string data as a long string data
562
4
    const UInt8* begin = values.data();
563
4
    const UInt8* end = begin + values.size();
564
4
    const UInt8* pos = begin;
565
566
    /// Current index in the array of strings.
567
4
    size_t i = 0;
568
4
    size_t needle_size = search_state->substring_pattern.get_pattern_length();
569
570
    /// We will search for the next occurrence in all strings at once.
571
7
    while (pos < end) {
572
        // search return matched substring start offset
573
4
        pos = (UInt8*)search_state->substring_pattern.search((char*)pos, end - pos);
574
4
        if (pos >= end) {
575
1
            break;
576
1
        }
577
578
        /// Determine which index it refers to.
579
        /// begin + value_offsets[i] is the start offset of string at i+1
580
3
        while (begin + value_offsets[i] < pos) {
581
0
            ++i;
582
0
        }
583
584
        /// We check that the entry does not pass through the boundaries of strings.
585
3
        if (pos + needle_size <= begin + value_offsets[i]) {
586
3
            result[i] = 1;
587
3
        }
588
589
        // move to next string offset
590
3
        pos = begin + value_offsets[i];
591
3
        ++i;
592
3
    }
593
594
4
    return Status::OK();
595
4
}
596
597
Status FunctionLikeBase::vector_const(const ColumnString& values, const StringRef* pattern_val,
598
                                      ColumnUInt8::Container& result, const LikeFn& function,
599
0
                                      LikeSearchState* search_state) const {
600
0
    RETURN_IF_ERROR((function)(search_state, values,
601
0
                               *reinterpret_cast<const StringRef*>(pattern_val), result));
602
0
    return Status::OK();
603
0
}
604
605
template <bool LIKE_PATTERN>
606
12
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
607
12
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
608
12
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
609
12
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
610
12
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
611
12
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
612
12
    size_t size = patterns.size();
613
614
26
    for (size_t i = 0; i < size; ++i) {
615
15
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
616
15
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
617
15
            !ends_with_state->_pattern_matched) {
618
1
            return nullptr;
619
1
        }
620
14
        std::string pattern_str = patterns.get_data_at(i).to_string();
621
14
        if (allpass_state->_pattern_matched) {
622
12
            if constexpr (LIKE_PATTERN) {
623
11
                allpass_state->like_pattern_match(pattern_str);
624
11
            } else {
625
11
                allpass_state->regexp_pattern_match(pattern_str);
626
11
            }
627
12
        }
628
14
        if (equal_state->_pattern_matched) {
629
12
            if constexpr (LIKE_PATTERN) {
630
11
                equal_state->like_pattern_match(pattern_str);
631
11
            } else {
632
11
                equal_state->regexp_pattern_match(pattern_str);
633
11
            }
634
12
        }
635
14
        if (substring_state->_pattern_matched) {
636
14
            if constexpr (LIKE_PATTERN) {
637
11
                substring_state->like_pattern_match(pattern_str);
638
11
            } else {
639
11
                substring_state->regexp_pattern_match(pattern_str);
640
11
            }
641
14
        }
642
14
        if (starts_with_state->_pattern_matched) {
643
12
            if constexpr (LIKE_PATTERN) {
644
11
                starts_with_state->like_pattern_match(pattern_str);
645
11
            } else {
646
11
                starts_with_state->regexp_pattern_match(pattern_str);
647
11
            }
648
12
        }
649
14
        if (ends_with_state->_pattern_matched) {
650
12
            if constexpr (LIKE_PATTERN) {
651
11
                ends_with_state->like_pattern_match(pattern_str);
652
11
            } else {
653
11
                ends_with_state->regexp_pattern_match(pattern_str);
654
11
            }
655
12
        }
656
14
    }
657
658
11
    if (allpass_state->_pattern_matched) {
659
0
        return allpass_state;
660
11
    } else if (equal_state->_pattern_matched) {
661
2
        return equal_state;
662
9
    } else if (substring_state->_pattern_matched) {
663
0
        return substring_state;
664
9
    } else if (starts_with_state->_pattern_matched) {
665
2
        return starts_with_state;
666
7
    } else if (ends_with_state->_pattern_matched) {
667
2
        return ends_with_state;
668
5
    } else {
669
5
        return nullptr;
670
5
    }
671
11
}
_ZN5doris10vectorized16FunctionLikeBase24pattern_type_recognitionILb1EEESt10shared_ptrINS0_24VectorPatternSearchStateEERKNS0_9ColumnStrIjEE
Line
Count
Source
606
1
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
607
1
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
608
1
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
609
1
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
610
1
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
611
1
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
612
1
    size_t size = patterns.size();
613
614
4
    for (size_t i = 0; i < size; ++i) {
615
4
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
616
4
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
617
4
            !ends_with_state->_pattern_matched) {
618
1
            return nullptr;
619
1
        }
620
3
        std::string pattern_str = patterns.get_data_at(i).to_string();
621
3
        if (allpass_state->_pattern_matched) {
622
1
            if constexpr (LIKE_PATTERN) {
623
1
                allpass_state->like_pattern_match(pattern_str);
624
1
            } else {
625
1
                allpass_state->regexp_pattern_match(pattern_str);
626
1
            }
627
1
        }
628
3
        if (equal_state->_pattern_matched) {
629
1
            if constexpr (LIKE_PATTERN) {
630
1
                equal_state->like_pattern_match(pattern_str);
631
1
            } else {
632
1
                equal_state->regexp_pattern_match(pattern_str);
633
1
            }
634
1
        }
635
3
        if (substring_state->_pattern_matched) {
636
3
            if constexpr (LIKE_PATTERN) {
637
3
                substring_state->like_pattern_match(pattern_str);
638
3
            } else {
639
3
                substring_state->regexp_pattern_match(pattern_str);
640
3
            }
641
3
        }
642
3
        if (starts_with_state->_pattern_matched) {
643
1
            if constexpr (LIKE_PATTERN) {
644
1
                starts_with_state->like_pattern_match(pattern_str);
645
1
            } else {
646
1
                starts_with_state->regexp_pattern_match(pattern_str);
647
1
            }
648
1
        }
649
3
        if (ends_with_state->_pattern_matched) {
650
1
            if constexpr (LIKE_PATTERN) {
651
1
                ends_with_state->like_pattern_match(pattern_str);
652
1
            } else {
653
1
                ends_with_state->regexp_pattern_match(pattern_str);
654
1
            }
655
1
        }
656
3
    }
657
658
0
    if (allpass_state->_pattern_matched) {
659
0
        return allpass_state;
660
0
    } else if (equal_state->_pattern_matched) {
661
0
        return equal_state;
662
0
    } else if (substring_state->_pattern_matched) {
663
0
        return substring_state;
664
0
    } else if (starts_with_state->_pattern_matched) {
665
0
        return starts_with_state;
666
0
    } else if (ends_with_state->_pattern_matched) {
667
0
        return ends_with_state;
668
0
    } else {
669
0
        return nullptr;
670
0
    }
671
0
}
_ZN5doris10vectorized16FunctionLikeBase24pattern_type_recognitionILb0EEESt10shared_ptrINS0_24VectorPatternSearchStateEERKNS0_9ColumnStrIjEE
Line
Count
Source
606
11
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
607
11
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
608
11
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
609
11
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
610
11
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
611
11
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
612
11
    size_t size = patterns.size();
613
614
22
    for (size_t i = 0; i < size; ++i) {
615
11
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
616
11
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
617
11
            !ends_with_state->_pattern_matched) {
618
0
            return nullptr;
619
0
        }
620
11
        std::string pattern_str = patterns.get_data_at(i).to_string();
621
11
        if (allpass_state->_pattern_matched) {
622
11
            if constexpr (LIKE_PATTERN) {
623
11
                allpass_state->like_pattern_match(pattern_str);
624
11
            } else {
625
11
                allpass_state->regexp_pattern_match(pattern_str);
626
11
            }
627
11
        }
628
11
        if (equal_state->_pattern_matched) {
629
11
            if constexpr (LIKE_PATTERN) {
630
11
                equal_state->like_pattern_match(pattern_str);
631
11
            } else {
632
11
                equal_state->regexp_pattern_match(pattern_str);
633
11
            }
634
11
        }
635
11
        if (substring_state->_pattern_matched) {
636
11
            if constexpr (LIKE_PATTERN) {
637
11
                substring_state->like_pattern_match(pattern_str);
638
11
            } else {
639
11
                substring_state->regexp_pattern_match(pattern_str);
640
11
            }
641
11
        }
642
11
        if (starts_with_state->_pattern_matched) {
643
11
            if constexpr (LIKE_PATTERN) {
644
11
                starts_with_state->like_pattern_match(pattern_str);
645
11
            } else {
646
11
                starts_with_state->regexp_pattern_match(pattern_str);
647
11
            }
648
11
        }
649
11
        if (ends_with_state->_pattern_matched) {
650
11
            if constexpr (LIKE_PATTERN) {
651
11
                ends_with_state->like_pattern_match(pattern_str);
652
11
            } else {
653
11
                ends_with_state->regexp_pattern_match(pattern_str);
654
11
            }
655
11
        }
656
11
    }
657
658
11
    if (allpass_state->_pattern_matched) {
659
0
        return allpass_state;
660
11
    } else if (equal_state->_pattern_matched) {
661
2
        return equal_state;
662
9
    } else if (substring_state->_pattern_matched) {
663
0
        return substring_state;
664
9
    } else if (starts_with_state->_pattern_matched) {
665
2
        return starts_with_state;
666
7
    } else if (ends_with_state->_pattern_matched) {
667
2
        return ends_with_state;
668
5
    } else {
669
5
        return nullptr;
670
5
    }
671
11
}
672
673
Status FunctionLikeBase::vector_non_const(const ColumnString& values, const ColumnString& patterns,
674
                                          ColumnUInt8::Container& result, LikeState* state,
675
12
                                          size_t input_rows_count) const {
676
12
    VPatternSearchStateSPtr vector_search_state;
677
12
    if (state->is_like_pattern) {
678
1
        vector_search_state = pattern_type_recognition<true>(patterns);
679
11
    } else {
680
11
        vector_search_state = pattern_type_recognition<false>(patterns);
681
11
    }
682
12
    if (vector_search_state == nullptr) {
683
        // pattern type recognition failed, use default case
684
41
        for (int i = 0; i < input_rows_count; ++i) {
685
35
            const auto pattern_val = patterns.get_data_at(i);
686
35
            const auto value_val = values.get_data_at(i);
687
35
            RETURN_IF_ERROR((state->scalar_function)(&state->search_state, value_val, pattern_val,
688
35
                                                     &result[i]));
689
35
        }
690
6
        return Status::OK();
691
6
    }
692
6
    const auto* search_strings =
693
6
            static_cast<const ColumnString*>(vector_search_state->_search_strings.get());
694
6
    return (vector_search_state->_vector_function)(values, *search_strings, result);
695
12
}
696
697
Status FunctionLike::like_fn(LikeSearchState* state, const ColumnString& val,
698
0
                             const StringRef& pattern, ColumnUInt8::Container& result) {
699
0
    std::string re_pattern;
700
0
    convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
701
0
    return regexp_fn(state, val, {re_pattern.c_str(), re_pattern.size()}, result);
702
0
}
703
704
Status FunctionLike::like_fn_scalar(LikeSearchState* state, const StringRef& val,
705
30
                                    const StringRef& pattern, unsigned char* result) {
706
30
    std::string re_pattern;
707
30
    convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
708
709
30
    return regexp_fn_scalar(state, StringRef(val.data, val.size),
710
30
                            {re_pattern.c_str(), re_pattern.size()}, result);
711
30
}
712
713
void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::string& pattern,
714
30
                                        std::string* re_pattern) {
715
30
    re_pattern->clear();
716
717
30
    if (pattern.empty()) {
718
4
        re_pattern->append("^$");
719
4
        return;
720
4
    }
721
722
    // add ^ to pattern head to match line head
723
26
    if (!pattern.empty() && pattern[0] != '%') {
724
17
        re_pattern->append("^");
725
17
    }
726
727
26
    bool is_escaped = false;
728
    // expect % and _, all chars should keep it literal means.
729
107
    for (char i : pattern) {
730
107
        if (is_escaped) { // last is \, this should be escape
731
4
            if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i == '}' || i == '-' ||
732
4
                i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' || i == ':' || i == '^' ||
733
4
                i == '.' || i == '$' || i == '?') {
734
0
                re_pattern->append(1, '\\');
735
4
            } else if (i != '%' && i != '_') {
736
4
                re_pattern->append(2, '\\');
737
4
            }
738
4
            re_pattern->append(1, i);
739
4
            is_escaped = false;
740
103
        } else {
741
103
            switch (i) {
742
22
            case '%':
743
22
                re_pattern->append(".*");
744
22
                break;
745
21
            case '_':
746
21
                re_pattern->append(".");
747
21
                break;
748
60
            default:
749
60
                is_escaped = i == state->escape_char;
750
60
                if (!is_escaped) {
751
                    // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ?
752
56
                    if (i == '[' || i == ']' || i == '(' || i == ')' || i == '{' || i == '}' ||
753
56
                        i == '-' || i == '*' || i == '+' || i == '\\' || i == '|' || i == '/' ||
754
56
                        i == ':' || i == '^' || i == '.' || i == '$' || i == '?') {
755
7
                        re_pattern->append(1, '\\');
756
7
                    }
757
56
                    re_pattern->append(1, i);
758
56
                }
759
60
                break;
760
103
            }
761
103
        }
762
107
    }
763
764
    // add $ to pattern tail to match line tail
765
26
    if (!pattern.empty() && re_pattern->back() != '*') {
766
13
        re_pattern->append("$");
767
13
    }
768
26
}
769
770
2
void FunctionLike::remove_escape_character(std::string* search_string) {
771
2
    std::string tmp_search_string;
772
2
    tmp_search_string.swap(*search_string);
773
2
    int len = tmp_search_string.length();
774
5
    for (int i = 0; i < len;) {
775
3
        if (tmp_search_string[i] == '\\' && i + 1 < len &&
776
3
            (tmp_search_string[i + 1] == '%' || tmp_search_string[i + 1] == '_' ||
777
0
             tmp_search_string[i + 1] == '\\')) {
778
0
            search_string->append(1, tmp_search_string[i + 1]);
779
0
            i += 2;
780
3
        } else {
781
3
            search_string->append(1, tmp_search_string[i]);
782
3
            i++;
783
3
        }
784
3
    }
785
2
}
786
787
0
bool re2_full_match(const std::string& str, const RE2& re, std::vector<std::string>& results) {
788
0
    if (!re.ok()) {
789
0
        return false;
790
0
    }
791
792
0
    std::vector<RE2::Arg> arguments;
793
0
    std::vector<RE2::Arg*> arguments_ptrs;
794
0
    std::size_t args_count = re.NumberOfCapturingGroups();
795
0
    arguments.resize(args_count);
796
0
    arguments_ptrs.resize(args_count);
797
0
    results.resize(args_count);
798
0
    for (std::size_t i = 0; i < args_count; ++i) {
799
0
        arguments[i] = &results[i];
800
0
        arguments_ptrs[i] = &arguments[i];
801
0
    }
802
803
0
    return RE2::FullMatchN(str, re, arguments_ptrs.data(), args_count);
804
0
}
805
806
0
void verbose_log_match(const std::string& str, const std::string& pattern_name, const RE2& re) {
807
0
    std::vector<std::string> results;
808
0
    VLOG_DEBUG << "arg str: " << str << ", size: " << str.size() << ", pattern " << pattern_name
809
0
               << ": " << re.pattern() << ", size: " << re.pattern().size();
810
0
    if (re2_full_match(str, re, results)) {
811
0
        for (int i = 0; i < results.size(); ++i) {
812
0
            VLOG_DEBUG << "match " << i << ": " << results[i] << ", size: " << results[i].size();
813
0
        }
814
0
    } else {
815
0
        VLOG_DEBUG << "no match";
816
0
    }
817
0
}
818
819
Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern,
820
                                                std::shared_ptr<LikeState>& state,
821
0
                                                bool try_hyperscan) {
822
0
    std::string pattern_str = pattern.to_string();
823
0
    state->search_state.pattern_str = pattern_str;
824
0
    std::string search_string;
825
826
0
    if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
827
0
        state->search_state.set_search_string("");
828
0
        state->function = constant_allpass_fn;
829
0
        state->scalar_function = constant_allpass_fn_scalar;
830
0
    } else if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) {
831
0
        if (VLOG_DEBUG_IS_ON) {
832
0
            verbose_log_match(pattern_str, "LIKE_EQUALS_RE", LIKE_EQUALS_RE);
833
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
834
0
        }
835
0
        remove_escape_character(&search_string);
836
0
        if (VLOG_DEBUG_IS_ON) {
837
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
838
0
                       << ", size: " << search_string.size();
839
0
        }
840
0
        state->search_state.set_search_string(search_string);
841
0
        state->function = constant_equals_fn;
842
0
        state->scalar_function = constant_equals_fn_scalar;
843
0
    } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) {
844
0
        if (VLOG_DEBUG_IS_ON) {
845
0
            verbose_log_match(pattern_str, "LIKE_STARTS_WITH_RE", LIKE_STARTS_WITH_RE);
846
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
847
0
        }
848
0
        remove_escape_character(&search_string);
849
0
        if (VLOG_DEBUG_IS_ON) {
850
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
851
0
                       << ", size: " << search_string.size();
852
0
        }
853
0
        state->search_state.set_search_string(search_string);
854
0
        state->function = constant_starts_with_fn;
855
0
        state->scalar_function = constant_starts_with_fn_scalar;
856
0
    } else if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) {
857
0
        if (VLOG_DEBUG_IS_ON) {
858
0
            verbose_log_match(pattern_str, "LIKE_ENDS_WITH_RE", LIKE_ENDS_WITH_RE);
859
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
860
0
        }
861
0
        remove_escape_character(&search_string);
862
0
        if (VLOG_DEBUG_IS_ON) {
863
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
864
0
                       << ", size: " << search_string.size();
865
0
        }
866
0
        state->search_state.set_search_string(search_string);
867
0
        state->function = constant_ends_with_fn;
868
0
        state->scalar_function = constant_ends_with_fn_scalar;
869
0
    } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) {
870
0
        if (VLOG_DEBUG_IS_ON) {
871
0
            verbose_log_match(pattern_str, "LIKE_SUBSTRING_RE", LIKE_SUBSTRING_RE);
872
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
873
0
        }
874
0
        remove_escape_character(&search_string);
875
0
        if (VLOG_DEBUG_IS_ON) {
876
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
877
0
                       << ", size: " << search_string.size();
878
0
        }
879
0
        state->search_state.set_search_string(search_string);
880
0
        state->function = constant_substring_fn;
881
0
        state->scalar_function = constant_substring_fn_scalar;
882
0
    } else {
883
0
        std::string re_pattern;
884
0
        convert_like_pattern(&state->search_state, pattern_str, &re_pattern);
885
0
        if (VLOG_DEBUG_IS_ON) {
886
0
            VLOG_DEBUG << "hyperscan, pattern str: " << pattern_str
887
0
                       << ", size: " << pattern_str.size() << ", re pattern: " << re_pattern
888
0
                       << ", size: " << re_pattern.size();
889
0
        }
890
891
0
        hs_database_t* database = nullptr;
892
0
        hs_scratch_t* scratch = nullptr;
893
0
        if (try_hyperscan && hs_prepare(context, re_pattern.c_str(), &database, &scratch).ok()) {
894
            // use hyperscan
895
0
            state->search_state.hs_database.reset(database);
896
0
            state->search_state.hs_scratch.reset(scratch);
897
0
        } else {
898
            // fallback to re2
899
            // reset hs_database to nullptr to indicate not use hyperscan
900
0
            state->search_state.hs_database.reset();
901
0
            state->search_state.hs_scratch.reset();
902
903
0
            RE2::Options opts;
904
0
            opts.set_never_nl(false);
905
0
            opts.set_dot_nl(true);
906
0
            state->search_state.regex = std::make_unique<RE2>(re_pattern, opts);
907
0
            if (!state->search_state.regex->ok()) {
908
0
                return Status::InternalError("Invalid regex expression: {}(origin: {})", re_pattern,
909
0
                                             pattern_str);
910
0
            }
911
0
        }
912
913
0
        state->function = constant_regex_fn;
914
0
        state->scalar_function = constant_regex_fn_scalar;
915
0
    }
916
0
    return Status::OK();
917
0
}
918
919
2
Status FunctionLike::open(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
920
2
    if (scope != FunctionContext::THREAD_LOCAL) {
921
1
        return Status::OK();
922
1
    }
923
1
    std::shared_ptr<LikeState> state = std::make_shared<LikeState>();
924
1
    state->is_like_pattern = true;
925
1
    state->function = like_fn;
926
1
    state->scalar_function = like_fn_scalar;
927
1
    if (context->is_col_constant(1)) {
928
0
        const auto pattern_col = context->get_constant_col(1)->column_ptr;
929
0
        const auto& pattern = pattern_col->get_data_at(0);
930
0
        RETURN_IF_ERROR(construct_like_const_state(context, pattern, state));
931
0
    }
932
1
    context->set_function_state(scope, state);
933
934
1
    return Status::OK();
935
1
}
936
937
34
Status FunctionRegexp::open(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
938
34
    if (scope != FunctionContext::THREAD_LOCAL) {
939
17
        return Status::OK();
940
17
    }
941
17
    std::shared_ptr<LikeState> state = std::make_shared<LikeState>();
942
17
    context->set_function_state(scope, state);
943
17
    state->is_like_pattern = false;
944
17
    state->function = regexp_fn;
945
17
    state->scalar_function = regexp_fn_scalar;
946
17
    if (context->is_col_constant(1)) {
947
17
        const auto pattern_col = context->get_constant_col(1)->column_ptr;
948
17
        const auto& pattern = pattern_col->get_data_at(0);
949
950
17
        std::string pattern_str = pattern.to_string();
951
17
        std::string search_string;
952
17
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
953
0
            state->search_state.set_search_string("");
954
0
            state->function = constant_allpass_fn;
955
0
            state->scalar_function = constant_allpass_fn_scalar;
956
17
        } else if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) {
957
2
            state->search_state.set_search_string(search_string);
958
2
            state->function = constant_equals_fn;
959
2
            state->scalar_function = constant_equals_fn_scalar;
960
15
        } else if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) {
961
2
            state->search_state.set_search_string(search_string);
962
2
            state->function = constant_starts_with_fn;
963
2
            state->scalar_function = constant_starts_with_fn_scalar;
964
13
        } else if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) {
965
2
            state->search_state.set_search_string(search_string);
966
2
            state->function = constant_ends_with_fn;
967
2
            state->scalar_function = constant_ends_with_fn_scalar;
968
11
        } else if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) {
969
6
            state->search_state.set_search_string(search_string);
970
6
            state->function = constant_substring_fn;
971
6
            state->scalar_function = constant_substring_fn_scalar;
972
6
        } else {
973
5
            hs_database_t* database = nullptr;
974
5
            hs_scratch_t* scratch = nullptr;
975
5
            if (hs_prepare(context, pattern_str.c_str(), &database, &scratch).ok()) {
976
                // use hyperscan
977
5
                state->search_state.hs_database.reset(database);
978
5
                state->search_state.hs_scratch.reset(scratch);
979
5
            } else {
980
                // fallback to re2
981
                // reset hs_database to nullptr to indicate not use hyperscan
982
0
                state->search_state.hs_database.reset();
983
0
                state->search_state.hs_scratch.reset();
984
0
                RE2::Options opts;
985
0
                opts.set_never_nl(false);
986
0
                opts.set_dot_nl(true);
987
0
                state->search_state.regex = std::make_unique<RE2>(pattern_str, opts);
988
0
                if (!state->search_state.regex->ok()) {
989
0
                    return Status::InternalError("Invalid regex expression: {}", pattern_str);
990
0
                }
991
0
            }
992
5
            state->function = constant_regex_fn;
993
5
            state->scalar_function = constant_regex_fn_scalar;
994
5
        }
995
17
    }
996
17
    return Status::OK();
997
17
}
998
999
1
void register_function_like(SimpleFunctionFactory& factory) {
1000
1
    factory.register_function<FunctionLike>();
1001
1
}
1002
1003
1
void register_function_regexp(SimpleFunctionFactory& factory) {
1004
1
    factory.register_function<FunctionRegexp>();
1005
1
    factory.register_alias(FunctionRegexp::name, FunctionRegexp::alias);
1006
1
}
1007
1008
} // namespace doris::vectorized