Coverage Report

Created: 2025-07-25 20:58

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/vec/functions/like.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "like.h"
19
20
#include <fmt/format.h>
21
#include <hs/hs_compile.h>
22
#include <re2/stringpiece.h>
23
24
#include <cstddef>
25
#include <ostream>
26
#include <utility>
27
#include <vector>
28
29
#include "common/logging.h"
30
#include "vec/columns/column.h"
31
#include "vec/columns/column_const.h"
32
#include "vec/columns/column_vector.h"
33
#include "vec/common/string_ref.h"
34
#include "vec/core/block.h"
35
#include "vec/core/column_with_type_and_name.h"
36
#include "vec/functions/simple_function_factory.h"
37
38
namespace doris::vectorized {
39
#include "common/compile_check_begin.h"
40
// A regex to match any regex pattern is equivalent to a substring search.
41
static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
42
43
// A regex to match any regex pattern which is equivalent to matching a constant string
44
// at the end of the string values.
45
static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
46
47
// A regex to match any regex pattern which is equivalent to matching a constant string
48
// at the end of the string values.
49
static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)");
50
51
// A regex to match any regex pattern which is equivalent to a constant string match.
52
static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)");
53
// A regex to match .*
54
static const RE2 ALLPASS_RE(R"((\.\*)+)");
55
56
// Like patterns
57
static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))");
58
static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)");
59
static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))");
60
static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)");
61
static const re2::RE2 LIKE_ALLPASS_RE("%+");
62
63
struct VectorAllpassSearchState : public VectorPatternSearchState {
64
135
    VectorAllpassSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_allpass_fn) {}
65
66
135
    ~VectorAllpassSearchState() override = default;
67
68
124
    void like_pattern_match(const std::string& pattern_str) override {
69
124
        if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
70
0
            _search_strings->insert_default();
71
124
        } else {
72
124
            _pattern_matched = false;
73
124
        }
74
124
    }
75
76
11
    void regexp_pattern_match(const std::string& pattern_str) override {
77
11
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
78
0
            _search_strings->insert_default();
79
11
        } else {
80
11
            _pattern_matched = false;
81
11
        }
82
11
    }
83
};
84
85
struct VectorEqualSearchState : public VectorPatternSearchState {
86
135
    VectorEqualSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_equals_fn) {}
87
88
135
    ~VectorEqualSearchState() override = default;
89
90
124
    void like_pattern_match(const std::string& pattern_str) override {
91
124
        _search_string.clear();
92
124
        if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &_search_string)) {
93
57
            FunctionLike::remove_escape_character(&_search_string);
94
57
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
95
67
        } else {
96
67
            _pattern_matched = false;
97
67
        }
98
124
    }
99
100
11
    void regexp_pattern_match(const std::string& pattern_str) override {
101
11
        _search_string.clear();
102
11
        if (RE2::FullMatch(pattern_str, EQUALS_RE, &_search_string)) {
103
2
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
104
9
        } else {
105
9
            _pattern_matched = false;
106
9
        }
107
11
    }
108
};
109
110
struct VectorSubStringSearchState : public VectorPatternSearchState {
111
    VectorSubStringSearchState()
112
135
            : VectorPatternSearchState(FunctionLikeBase::vector_substring_fn) {}
113
114
135
    ~VectorSubStringSearchState() override = default;
115
116
126
    void like_pattern_match(const std::string& pattern_str) override {
117
126
        _search_string.clear();
118
126
        if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &_search_string)) {
119
5
            FunctionLike::remove_escape_character(&_search_string);
120
5
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
121
121
        } else {
122
121
            _pattern_matched = false;
123
121
        }
124
126
    }
125
126
11
    void regexp_pattern_match(const std::string& pattern_str) override {
127
11
        _search_string.clear();
128
11
        if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &_search_string)) {
129
0
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
130
11
        } else {
131
11
            _pattern_matched = false;
132
11
        }
133
11
    }
134
};
135
136
struct VectorStartsWithSearchState : public VectorPatternSearchState {
137
    VectorStartsWithSearchState()
138
135
            : VectorPatternSearchState(FunctionLikeBase::vector_starts_with_fn) {}
139
140
135
    ~VectorStartsWithSearchState() override = default;
141
142
124
    void like_pattern_match(const std::string& pattern_str) override {
143
124
        _search_string.clear();
144
124
        if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &_search_string)) {
145
6
            FunctionLike::remove_escape_character(&_search_string);
146
6
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
147
118
        } else {
148
118
            _pattern_matched = false;
149
118
        }
150
124
    }
151
152
11
    void regexp_pattern_match(const std::string& pattern_str) override {
153
11
        _search_string.clear();
154
11
        if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &_search_string)) {
155
2
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
156
9
        } else {
157
9
            _pattern_matched = false;
158
9
        }
159
11
    }
160
};
161
162
struct VectorEndsWithSearchState : public VectorPatternSearchState {
163
135
    VectorEndsWithSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_ends_with_fn) {}
164
165
135
    ~VectorEndsWithSearchState() override = default;
166
167
124
    void like_pattern_match(const std::string& pattern_str) override {
168
124
        _search_string.clear();
169
124
        if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &_search_string)) {
170
6
            FunctionLike::remove_escape_character(&_search_string);
171
6
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
172
118
        } else {
173
118
            _pattern_matched = false;
174
118
        }
175
124
    }
176
177
11
    void regexp_pattern_match(const std::string& pattern_str) override {
178
11
        _search_string.clear();
179
11
        if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &_search_string)) {
180
2
            _search_strings->insert_data(_search_string.c_str(), _search_string.size());
181
9
        } else {
182
9
            _pattern_matched = false;
183
9
        }
184
11
    }
185
};
186
187
0
Status LikeSearchState::clone(LikeSearchState& cloned) {
188
0
    cloned.set_search_string(search_string);
189
190
0
    std::string re_pattern;
191
0
    FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern);
192
0
    if (hs_database) { // use hyperscan
193
0
        hs_database_t* database = nullptr;
194
0
        hs_scratch_t* scratch = nullptr;
195
0
        RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch));
196
197
0
        cloned.hs_database.reset(database);
198
0
        cloned.hs_scratch.reset(scratch);
199
0
    } else { // fallback to re2
200
0
        cloned.hs_database.reset();
201
0
        cloned.hs_scratch.reset();
202
203
0
        RE2::Options opts;
204
0
        opts.set_never_nl(false);
205
0
        opts.set_dot_nl(true);
206
0
        cloned.regex = std::make_unique<RE2>(re_pattern, opts);
207
0
        if (!cloned.regex->ok()) {
208
0
            return Status::InternalError("Invalid regex expression: {}", re_pattern);
209
0
        }
210
0
    }
211
212
0
    return Status::OK();
213
0
}
214
215
Status FunctionLikeBase::constant_allpass_fn(LikeSearchState* state, const ColumnString& vals,
216
                                             const StringRef& pattern,
217
0
                                             ColumnUInt8::Container& result) {
218
0
    memset(result.data(), 1, vals.size());
219
0
    return Status::OK();
220
0
}
221
222
Status FunctionLikeBase::constant_allpass_fn_scalar(LikeSearchState* state, const StringRef& val,
223
                                                    const StringRef& pattern,
224
0
                                                    unsigned char* result) {
225
0
    *result = 1;
226
0
    return Status::OK();
227
0
}
228
229
Status FunctionLikeBase::vector_allpass_fn(const ColumnString& vals,
230
                                           const ColumnString& search_strings,
231
0
                                           ColumnUInt8::Container& result) {
232
0
    DCHECK(vals.size() == search_strings.size());
233
0
    DCHECK(vals.size() == result.size());
234
0
    memset(result.data(), 1, vals.size());
235
0
    return Status::OK();
236
0
}
237
238
Status FunctionLikeBase::constant_starts_with_fn(LikeSearchState* state, const ColumnString& val,
239
                                                 const StringRef& pattern,
240
0
                                                 ColumnUInt8::Container& result) {
241
0
    auto sz = val.size();
242
0
    for (size_t i = 0; i < sz; i++) {
243
0
        const auto& str_ref = val.get_data_at(i);
244
0
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
245
0
                    str_ref.start_with(state->search_string_sv);
246
0
    }
247
0
    return Status::OK();
248
0
}
249
250
Status FunctionLikeBase::constant_starts_with_fn_scalar(LikeSearchState* state,
251
                                                        const StringRef& val,
252
                                                        const StringRef& pattern,
253
0
                                                        unsigned char* result) {
254
0
    *result = (val.size >= state->search_string_sv.size) &&
255
0
              (state->search_string_sv == val.substring(0, state->search_string_sv.size));
256
0
    return Status::OK();
257
0
}
258
259
Status FunctionLikeBase::vector_starts_with_fn(const ColumnString& vals,
260
                                               const ColumnString& search_strings,
261
8
                                               ColumnUInt8::Container& result) {
262
8
    DCHECK(vals.size() == search_strings.size());
263
8
    DCHECK(vals.size() == result.size());
264
8
    auto sz = vals.size();
265
16
    for (size_t i = 0; i < sz; ++i) {
266
8
        const auto& str_sv = vals.get_data_at(i);
267
8
        const auto& search_string_sv = search_strings.get_data_at(i);
268
8
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.start_with(search_string_sv);
269
8
    }
270
8
    return Status::OK();
271
8
}
272
273
Status FunctionLikeBase::constant_ends_with_fn(LikeSearchState* state, const ColumnString& val,
274
                                               const StringRef& pattern,
275
0
                                               ColumnUInt8::Container& result) {
276
0
    auto sz = val.size();
277
0
    for (size_t i = 0; i < sz; i++) {
278
0
        const auto& str_ref = val.get_data_at(i);
279
0
        result[i] = (str_ref.size >= state->search_string_sv.size) &&
280
0
                    str_ref.end_with(state->search_string_sv);
281
0
    }
282
0
    return Status::OK();
283
0
}
284
285
Status FunctionLikeBase::constant_ends_with_fn_scalar(LikeSearchState* state, const StringRef& val,
286
                                                      const StringRef& pattern,
287
0
                                                      unsigned char* result) {
288
0
    *result = (val.size >= state->search_string_sv.size) &&
289
0
              (state->search_string_sv == val.substring(val.size - state->search_string_sv.size,
290
0
                                                        state->search_string_sv.size));
291
0
    return Status::OK();
292
0
}
293
294
Status FunctionLikeBase::vector_ends_with_fn(const ColumnString& vals,
295
                                             const ColumnString& search_strings,
296
8
                                             ColumnUInt8::Container& result) {
297
8
    DCHECK(vals.size() == search_strings.size());
298
8
    DCHECK(vals.size() == result.size());
299
8
    auto sz = vals.size();
300
16
    for (size_t i = 0; i < sz; ++i) {
301
8
        const auto& str_sv = vals.get_data_at(i);
302
8
        const auto& search_string_sv = search_strings.get_data_at(i);
303
8
        result[i] = (str_sv.size >= search_string_sv.size) && str_sv.end_with(search_string_sv);
304
8
    }
305
8
    return Status::OK();
306
8
}
307
308
Status FunctionLikeBase::constant_equals_fn(LikeSearchState* state, const ColumnString& val,
309
                                            const StringRef& pattern,
310
0
                                            ColumnUInt8::Container& result) {
311
0
    auto sz = val.size();
312
0
    for (size_t i = 0; i < sz; i++) {
313
0
        result[i] = (val.get_data_at(i) == state->search_string_sv);
314
0
    }
315
0
    return Status::OK();
316
0
}
317
318
Status FunctionLikeBase::constant_equals_fn_scalar(LikeSearchState* state, const StringRef& val,
319
                                                   const StringRef& pattern,
320
0
                                                   unsigned char* result) {
321
0
    *result = (val == state->search_string_sv);
322
0
    return Status::OK();
323
0
}
324
325
Status FunctionLikeBase::vector_equals_fn(const ColumnString& vals,
326
                                          const ColumnString& search_strings,
327
59
                                          ColumnUInt8::Container& result) {
328
59
    DCHECK(vals.size() == search_strings.size());
329
59
    DCHECK(vals.size() == result.size());
330
59
    auto sz = vals.size();
331
118
    for (size_t i = 0; i < sz; ++i) {
332
59
        const auto& str_sv = vals.get_data_at(i);
333
59
        const auto& search_string_sv = search_strings.get_data_at(i);
334
59
        result[i] = str_sv == search_string_sv;
335
59
    }
336
59
    return Status::OK();
337
59
}
338
339
Status FunctionLikeBase::constant_substring_fn(LikeSearchState* state, const ColumnString& val,
340
                                               const StringRef& pattern,
341
0
                                               ColumnUInt8::Container& result) {
342
0
    auto sz = val.size();
343
0
    for (size_t i = 0; i < sz; i++) {
344
0
        if (state->search_string_sv.size == 0) {
345
0
            result[i] = true;
346
0
            continue;
347
0
        }
348
0
        result[i] = state->substring_pattern.search(val.get_data_at(i)) != -1;
349
0
    }
350
0
    return Status::OK();
351
0
}
352
353
Status FunctionLikeBase::constant_substring_fn_scalar(LikeSearchState* state, const StringRef& val,
354
                                                      const StringRef& pattern,
355
0
                                                      unsigned char* result) {
356
0
    if (state->search_string_sv.size == 0) {
357
0
        *result = true;
358
0
        return Status::OK();
359
0
    }
360
0
    *result = state->substring_pattern.search(val) != -1;
361
0
    return Status::OK();
362
0
}
363
364
Status FunctionLikeBase::vector_substring_fn(const ColumnString& vals,
365
                                             const ColumnString& search_strings,
366
3
                                             ColumnUInt8::Container& result) {
367
3
    DCHECK(vals.size() == search_strings.size());
368
3
    DCHECK(vals.size() == result.size());
369
3
    auto sz = vals.size();
370
6
    for (size_t i = 0; i < sz; ++i) {
371
3
        const auto& str_sv = vals.get_data_at(i);
372
3
        const auto& search_string_sv = search_strings.get_data_at(i);
373
3
        if (search_string_sv.size == 0) {
374
0
            result[i] = true;
375
0
            continue;
376
0
        }
377
3
        doris::StringSearch substring_search(&search_string_sv);
378
3
        result[i] = substring_search.search(str_sv) != -1;
379
3
    }
380
3
    return Status::OK();
381
3
}
382
383
Status FunctionLikeBase::constant_regex_fn_scalar(LikeSearchState* state, const StringRef& val,
384
39
                                                  const StringRef& pattern, unsigned char* result) {
385
39
    if (state->hs_database) { // use hyperscan
386
39
        auto ret = hs_scan(state->hs_database.get(), val.data, (int)val.size, 0,
387
39
                           state->hs_scratch.get(),
388
39
                           doris::vectorized::LikeSearchState::hs_match_handler, (void*)result);
389
39
        if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
390
0
            return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
391
0
        }
392
39
    } else { // fallback to re2
393
0
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex);
394
0
    }
395
396
39
    return Status::OK();
397
39
}
398
399
Status FunctionLikeBase::regexp_fn_scalar(LikeSearchState* state, const StringRef& val,
400
62
                                          const StringRef& pattern, unsigned char* result) {
401
62
    RE2::Options opts;
402
62
    opts.set_never_nl(false);
403
62
    opts.set_dot_nl(true);
404
62
    re2::RE2 re(re2::StringPiece(pattern.data, pattern.size), opts);
405
62
    if (re.ok()) {
406
62
        *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), re);
407
62
    } else {
408
0
        return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
409
0
    }
410
411
62
    return Status::OK();
412
62
}
413
414
Status FunctionLikeBase::constant_regex_fn(LikeSearchState* state, const ColumnString& val,
415
                                           const StringRef& pattern,
416
0
                                           ColumnUInt8::Container& result) {
417
0
    auto sz = val.size();
418
0
    if (state->hs_database) { // use hyperscan
419
0
        for (size_t i = 0; i < sz; i++) {
420
0
            const auto& str_ref = val.get_data_at(i);
421
0
            auto ret = hs_scan(state->hs_database.get(), str_ref.data, (int)str_ref.size, 0,
422
0
                               state->hs_scratch.get(),
423
0
                               doris::vectorized::LikeSearchState::hs_match_handler,
424
0
                               (void*)(result.data() + i));
425
0
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
426
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
427
0
            }
428
0
        }
429
0
    } else { // fallback to re2
430
0
        for (size_t i = 0; i < sz; i++) {
431
0
            const auto& str_ref = val.get_data_at(i);
432
0
            *(result.data() + i) =
433
0
                    RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex);
434
0
        }
435
0
    }
436
437
0
    return Status::OK();
438
0
}
439
440
Status FunctionLikeBase::regexp_fn(LikeSearchState* state, const ColumnString& val,
441
0
                                   const StringRef& pattern, ColumnUInt8::Container& result) {
442
0
    std::string re_pattern(pattern.data, pattern.size);
443
444
0
    hs_database_t* database = nullptr;
445
0
    hs_scratch_t* scratch = nullptr;
446
0
    if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan
447
0
        auto sz = val.size();
448
0
        for (size_t i = 0; i < sz; i++) {
449
0
            const auto& str_ref = val.get_data_at(i);
450
0
            auto ret = hs_scan(database, str_ref.data, (int)str_ref.size, 0, scratch,
451
0
                               doris::vectorized::LikeSearchState::hs_match_handler,
452
0
                               (void*)(result.data() + i));
453
0
            if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) {
454
0
                return Status::RuntimeError(fmt::format("hyperscan error: {}", ret));
455
0
            }
456
0
        }
457
458
0
        hs_free_scratch(scratch);
459
0
        hs_free_database(database);
460
0
    } else { // fallback to re2
461
0
        RE2::Options opts;
462
0
        opts.set_never_nl(false);
463
0
        opts.set_dot_nl(true);
464
0
        re2::RE2 re(re_pattern, opts);
465
0
        if (re.ok()) {
466
0
            auto sz = val.size();
467
0
            for (size_t i = 0; i < sz; i++) {
468
0
                const auto& str_ref = val.get_data_at(i);
469
0
                *(result.data() + i) =
470
0
                        RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), re);
471
0
            }
472
0
        } else {
473
0
            return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string());
474
0
        }
475
0
    }
476
477
0
    return Status::OK();
478
0
}
479
480
// hyperscan compile expression to database and allocate scratch space
481
Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression,
482
41
                                    hs_database_t** database, hs_scratch_t** scratch) {
483
41
    hs_compile_error_t* compile_err;
484
41
    auto res = hs_compile(expression, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8,
485
41
                          HS_MODE_BLOCK, nullptr, database, &compile_err);
486
487
41
    if (res != HS_SUCCESS) {
488
0
        *database = nullptr;
489
0
        std::string error_message = compile_err->message;
490
0
        hs_free_compile_error(compile_err);
491
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
492
0
        return Status::RuntimeError<false>("hs_compile regex pattern error:" + error_message);
493
0
    }
494
41
    hs_free_compile_error(compile_err);
495
496
41
    if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) {
497
0
        hs_free_database(*database);
498
0
        *database = nullptr;
499
0
        *scratch = nullptr;
500
        // Do not call FunctionContext::set_error here, since we do not want to cancel the query here.
501
0
        return Status::RuntimeError<false>("hs_alloc_scratch allocate scratch space error");
502
0
    }
503
504
41
    return Status::OK();
505
41
}
506
507
Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block,
508
                                      const ColumnNumbers& arguments, uint32_t result,
509
145
                                      size_t input_rows_count) const {
510
145
    const auto values_col =
511
145
            block.get_by_position(arguments[0]).column->convert_to_full_column_if_const();
512
145
    const auto* values = check_and_get_column<ColumnString>(values_col.get());
513
514
145
    if (!values) {
515
0
        return Status::InternalError("Not supported input arguments types");
516
0
    }
517
    // result column
518
145
    auto res = ColumnUInt8::create();
519
145
    ColumnUInt8::Container& vec_res = res->get_data();
520
    // set default value to 0, and match functions only need to set 1/true
521
145
    vec_res.resize_fill(input_rows_count);
522
145
    auto* state = reinterpret_cast<LikeState*>(
523
145
            context->get_function_state(FunctionContext::THREAD_LOCAL));
524
    // for constant_substring_fn, use long run length search for performance
525
145
    if (constant_substring_fn ==
526
145
        *(state->function.target<doris::Status (*)(LikeSearchState* state, const ColumnString&,
527
145
                                                   const StringRef&, ColumnUInt8::Container&)>())) {
528
10
        RETURN_IF_ERROR(execute_substring(values->get_chars(), values->get_offsets(), vec_res,
529
10
                                          &state->search_state));
530
135
    } else {
531
135
        const auto pattern_col = block.get_by_position(arguments[1]).column;
532
135
        if (const auto* str_patterns = check_and_get_column<ColumnString>(pattern_col.get())) {
533
135
            RETURN_IF_ERROR(
534
135
                    vector_non_const(*values, *str_patterns, vec_res, state, input_rows_count));
535
135
        } else if (const auto* const_patterns =
536
0
                           check_and_get_column<ColumnConst>(pattern_col.get())) {
537
0
            const auto& pattern_val = const_patterns->get_data_at(0);
538
0
            RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function,
539
0
                                         &state->search_state));
540
0
        } else {
541
0
            return Status::InternalError("Not supported input arguments types");
542
0
        }
543
135
    }
544
145
    block.replace_by_position(result, std::move(res));
545
145
    return Status::OK();
546
145
}
547
548
Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values,
549
                                           const ColumnString::Offsets& value_offsets,
550
                                           ColumnUInt8::Container& result,
551
10
                                           LikeSearchState* search_state) const {
552
    // treat continuous multi string data as a long string data
553
10
    const UInt8* begin = values.data();
554
10
    const UInt8* end = begin + values.size();
555
10
    const UInt8* pos = begin;
556
557
    /// Current index in the array of strings.
558
10
    size_t i = 0;
559
10
    size_t needle_size = search_state->substring_pattern.get_pattern_length();
560
561
    /// We will search for the next occurrence in all strings at once.
562
15
    while (pos < end) {
563
        // search return matched substring start offset
564
10
        pos = (UInt8*)search_state->substring_pattern.search((char*)pos, end - pos);
565
10
        if (pos >= end) {
566
5
            break;
567
5
        }
568
569
        /// Determine which index it refers to.
570
        /// begin + value_offsets[i] is the start offset of string at i+1
571
5
        while (i < value_offsets.size() && begin + value_offsets[i] < pos) {
572
0
            ++i;
573
0
        }
574
575
        /// We check that the entry does not pass through the boundaries of strings.
576
5
        if (pos + needle_size <= begin + value_offsets[i]) {
577
5
            result[i] = 1;
578
5
        }
579
580
        // move to next string offset
581
5
        pos = begin + value_offsets[i];
582
5
        ++i;
583
5
    }
584
585
10
    return Status::OK();
586
10
}
587
588
Status FunctionLikeBase::vector_const(const ColumnString& values, const StringRef* pattern_val,
589
                                      ColumnUInt8::Container& result, const LikeFn& function,
590
0
                                      LikeSearchState* search_state) const {
591
0
    RETURN_IF_ERROR((function)(search_state, values,
592
0
                               *reinterpret_cast<const StringRef*>(pattern_val), result));
593
0
    return Status::OK();
594
0
}
595
596
template <bool LIKE_PATTERN>
597
135
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
598
135
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
599
135
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
600
135
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
601
135
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
602
135
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
603
135
    size_t size = patterns.size();
604
605
272
    for (size_t i = 0; i < size; ++i) {
606
138
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
607
138
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
608
138
            !ends_with_state->_pattern_matched) {
609
1
            return nullptr;
610
1
        }
611
137
        std::string pattern_str = patterns.get_data_at(i).to_string();
612
137
        if (allpass_state->_pattern_matched) {
613
135
            if constexpr (LIKE_PATTERN) {
614
124
                allpass_state->like_pattern_match(pattern_str);
615
124
            } else {
616
11
                allpass_state->regexp_pattern_match(pattern_str);
617
11
            }
618
135
        }
619
137
        if (equal_state->_pattern_matched) {
620
135
            if constexpr (LIKE_PATTERN) {
621
124
                equal_state->like_pattern_match(pattern_str);
622
124
            } else {
623
11
                equal_state->regexp_pattern_match(pattern_str);
624
11
            }
625
135
        }
626
137
        if (substring_state->_pattern_matched) {
627
137
            if constexpr (LIKE_PATTERN) {
628
126
                substring_state->like_pattern_match(pattern_str);
629
126
            } else {
630
11
                substring_state->regexp_pattern_match(pattern_str);
631
11
            }
632
137
        }
633
137
        if (starts_with_state->_pattern_matched) {
634
135
            if constexpr (LIKE_PATTERN) {
635
124
                starts_with_state->like_pattern_match(pattern_str);
636
124
            } else {
637
11
                starts_with_state->regexp_pattern_match(pattern_str);
638
11
            }
639
135
        }
640
137
        if (ends_with_state->_pattern_matched) {
641
135
            if constexpr (LIKE_PATTERN) {
642
124
                ends_with_state->like_pattern_match(pattern_str);
643
124
            } else {
644
11
                ends_with_state->regexp_pattern_match(pattern_str);
645
11
            }
646
135
        }
647
137
    }
648
649
134
    if (allpass_state->_pattern_matched) {
650
0
        return allpass_state;
651
134
    } else if (equal_state->_pattern_matched) {
652
59
        return equal_state;
653
75
    } else if (substring_state->_pattern_matched) {
654
3
        return substring_state;
655
72
    } else if (starts_with_state->_pattern_matched) {
656
8
        return starts_with_state;
657
64
    } else if (ends_with_state->_pattern_matched) {
658
8
        return ends_with_state;
659
56
    } else {
660
56
        return nullptr;
661
56
    }
662
134
}
_ZN5doris10vectorized16FunctionLikeBase24pattern_type_recognitionILb1EEESt10shared_ptrINS0_24VectorPatternSearchStateEERKNS0_9ColumnStrIjEE
Line
Count
Source
597
124
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
598
124
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
599
124
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
600
124
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
601
124
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
602
124
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
603
124
    size_t size = patterns.size();
604
605
250
    for (size_t i = 0; i < size; ++i) {
606
127
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
607
127
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
608
127
            !ends_with_state->_pattern_matched) {
609
1
            return nullptr;
610
1
        }
611
126
        std::string pattern_str = patterns.get_data_at(i).to_string();
612
126
        if (allpass_state->_pattern_matched) {
613
124
            if constexpr (LIKE_PATTERN) {
614
124
                allpass_state->like_pattern_match(pattern_str);
615
            } else {
616
                allpass_state->regexp_pattern_match(pattern_str);
617
            }
618
124
        }
619
126
        if (equal_state->_pattern_matched) {
620
124
            if constexpr (LIKE_PATTERN) {
621
124
                equal_state->like_pattern_match(pattern_str);
622
            } else {
623
                equal_state->regexp_pattern_match(pattern_str);
624
            }
625
124
        }
626
126
        if (substring_state->_pattern_matched) {
627
126
            if constexpr (LIKE_PATTERN) {
628
126
                substring_state->like_pattern_match(pattern_str);
629
            } else {
630
                substring_state->regexp_pattern_match(pattern_str);
631
            }
632
126
        }
633
126
        if (starts_with_state->_pattern_matched) {
634
124
            if constexpr (LIKE_PATTERN) {
635
124
                starts_with_state->like_pattern_match(pattern_str);
636
            } else {
637
                starts_with_state->regexp_pattern_match(pattern_str);
638
            }
639
124
        }
640
126
        if (ends_with_state->_pattern_matched) {
641
124
            if constexpr (LIKE_PATTERN) {
642
124
                ends_with_state->like_pattern_match(pattern_str);
643
            } else {
644
                ends_with_state->regexp_pattern_match(pattern_str);
645
            }
646
124
        }
647
126
    }
648
649
123
    if (allpass_state->_pattern_matched) {
650
0
        return allpass_state;
651
123
    } else if (equal_state->_pattern_matched) {
652
57
        return equal_state;
653
66
    } else if (substring_state->_pattern_matched) {
654
3
        return substring_state;
655
63
    } else if (starts_with_state->_pattern_matched) {
656
6
        return starts_with_state;
657
57
    } else if (ends_with_state->_pattern_matched) {
658
6
        return ends_with_state;
659
51
    } else {
660
51
        return nullptr;
661
51
    }
662
123
}
_ZN5doris10vectorized16FunctionLikeBase24pattern_type_recognitionILb0EEESt10shared_ptrINS0_24VectorPatternSearchStateEERKNS0_9ColumnStrIjEE
Line
Count
Source
597
11
VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) {
598
11
    VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>();
599
11
    VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>();
600
11
    VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>();
601
11
    VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>();
602
11
    VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>();
603
11
    size_t size = patterns.size();
604
605
22
    for (size_t i = 0; i < size; ++i) {
606
11
        if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched &&
607
11
            !substring_state->_pattern_matched && !starts_with_state->_pattern_matched &&
608
11
            !ends_with_state->_pattern_matched) {
609
0
            return nullptr;
610
0
        }
611
11
        std::string pattern_str = patterns.get_data_at(i).to_string();
612
11
        if (allpass_state->_pattern_matched) {
613
            if constexpr (LIKE_PATTERN) {
614
                allpass_state->like_pattern_match(pattern_str);
615
11
            } else {
616
11
                allpass_state->regexp_pattern_match(pattern_str);
617
11
            }
618
11
        }
619
11
        if (equal_state->_pattern_matched) {
620
            if constexpr (LIKE_PATTERN) {
621
                equal_state->like_pattern_match(pattern_str);
622
11
            } else {
623
11
                equal_state->regexp_pattern_match(pattern_str);
624
11
            }
625
11
        }
626
11
        if (substring_state->_pattern_matched) {
627
            if constexpr (LIKE_PATTERN) {
628
                substring_state->like_pattern_match(pattern_str);
629
11
            } else {
630
11
                substring_state->regexp_pattern_match(pattern_str);
631
11
            }
632
11
        }
633
11
        if (starts_with_state->_pattern_matched) {
634
            if constexpr (LIKE_PATTERN) {
635
                starts_with_state->like_pattern_match(pattern_str);
636
11
            } else {
637
11
                starts_with_state->regexp_pattern_match(pattern_str);
638
11
            }
639
11
        }
640
11
        if (ends_with_state->_pattern_matched) {
641
            if constexpr (LIKE_PATTERN) {
642
                ends_with_state->like_pattern_match(pattern_str);
643
11
            } else {
644
11
                ends_with_state->regexp_pattern_match(pattern_str);
645
11
            }
646
11
        }
647
11
    }
648
649
11
    if (allpass_state->_pattern_matched) {
650
0
        return allpass_state;
651
11
    } else if (equal_state->_pattern_matched) {
652
2
        return equal_state;
653
9
    } else if (substring_state->_pattern_matched) {
654
0
        return substring_state;
655
9
    } else if (starts_with_state->_pattern_matched) {
656
2
        return starts_with_state;
657
7
    } else if (ends_with_state->_pattern_matched) {
658
2
        return ends_with_state;
659
5
    } else {
660
5
        return nullptr;
661
5
    }
662
11
}
663
664
Status FunctionLikeBase::vector_non_const(const ColumnString& values, const ColumnString& patterns,
665
                                          ColumnUInt8::Container& result, LikeState* state,
666
135
                                          size_t input_rows_count) const {
667
135
    ColumnString::MutablePtr replaced_patterns;
668
135
    VPatternSearchStateSPtr vector_search_state;
669
135
    if (state->is_like_pattern) {
670
124
        if (state->has_custom_escape) {
671
0
            replaced_patterns = ColumnString::create();
672
0
            for (int i = 0; i < input_rows_count; ++i) {
673
0
                std::string val =
674
0
                        replace_pattern_by_escape(patterns.get_data_at(i), state->escape_char);
675
0
                replaced_patterns->insert_data(val.c_str(), val.size());
676
0
            }
677
0
            vector_search_state = pattern_type_recognition<true>(*replaced_patterns);
678
124
        } else {
679
124
            vector_search_state = pattern_type_recognition<true>(patterns);
680
124
        }
681
124
    } else {
682
11
        vector_search_state = pattern_type_recognition<false>(patterns);
683
11
    }
684
685
135
    const ColumnString& real_pattern = state->has_custom_escape ? *replaced_patterns : patterns;
686
687
135
    if (vector_search_state == nullptr) {
688
        // pattern type recognition failed, use default case
689
158
        for (int i = 0; i < input_rows_count; ++i) {
690
101
            const auto pattern_val = real_pattern.get_data_at(i);
691
101
            const auto value_val = values.get_data_at(i);
692
101
            RETURN_IF_ERROR((state->scalar_function)(&state->search_state, value_val, pattern_val,
693
101
                                                     &result[i]));
694
101
        }
695
57
        return Status::OK();
696
57
    }
697
78
    const auto* search_strings =
698
78
            static_cast<const ColumnString*>(vector_search_state->_search_strings.get());
699
78
    return (vector_search_state->_vector_function)(values, *search_strings, result);
700
135
}
701
702
Status FunctionLike::like_fn(LikeSearchState* state, const ColumnString& val,
703
0
                             const StringRef& pattern, ColumnUInt8::Container& result) {
704
0
    std::string re_pattern;
705
0
    convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
706
0
    return regexp_fn(state, val, {re_pattern.c_str(), re_pattern.size()}, result);
707
0
}
708
709
Status FunctionLike::like_fn_scalar(LikeSearchState* state, const StringRef& val,
710
62
                                    const StringRef& pattern, unsigned char* result) {
711
62
    std::string re_pattern;
712
62
    convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern);
713
714
62
    return regexp_fn_scalar(state, StringRef(val.data, val.size),
715
62
                            {re_pattern.c_str(), re_pattern.size()}, result);
716
62
}
717
718
void FunctionLike::convert_like_pattern(LikeSearchState* state, const std::string& pattern,
719
98
                                        std::string* re_pattern) {
720
98
    re_pattern->clear();
721
722
98
    if (pattern.empty()) {
723
4
        re_pattern->append("^$");
724
4
        return;
725
4
    }
726
727
    // add ^ to pattern head to match line head
728
94
    if (!pattern.empty() && pattern[0] != '%') {
729
73
        re_pattern->append("^");
730
73
    }
731
732
    // expect % and _, all chars should keep it literal mean.
733
462
    for (size_t i = 0; i < pattern.size(); i++) {
734
368
        char c = pattern[i];
735
368
        if (c == '\\' && i + 1 < pattern.size()) {
736
37
            char next_c = pattern[i + 1];
737
37
            if (next_c == '%' || next_c == '_') {
738
                // convert "\%" and "\_" to literal "%" and "_"
739
12
                re_pattern->append(1, next_c);
740
12
                i++;
741
12
                continue;
742
25
            } else if (next_c == '\\') {
743
                // keep valid escape "\\"
744
7
                re_pattern->append("\\\\");
745
7
                i++;
746
7
                continue;
747
7
            }
748
37
        }
749
750
349
        if (c == '%') {
751
61
            re_pattern->append(".*");
752
288
        } else if (c == '_') {
753
81
            re_pattern->append(".");
754
207
        } else {
755
            // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ?
756
207
            if (c == '[' || c == ']' || c == '(' || c == ')' || c == '{' || c == '}' || c == '-' ||
757
207
                c == '*' || c == '+' || c == '\\' || c == '|' || c == '/' || c == ':' || c == '^' ||
758
207
                c == '.' || c == '$' || c == '?') {
759
37
                re_pattern->append(1, '\\');
760
37
            }
761
207
            re_pattern->append(1, c);
762
207
        }
763
349
    }
764
765
    // add $ to pattern tail to match line tail
766
94
    if (!pattern.empty() && re_pattern->back() != '*') {
767
54
        re_pattern->append("$");
768
54
    }
769
94
}
770
771
128
void FunctionLike::remove_escape_character(std::string* search_string) {
772
128
    std::string tmp_search_string;
773
128
    tmp_search_string.swap(*search_string);
774
128
    int64_t len = tmp_search_string.length();
775
    // sometime 'like' may allowed converted to 'equals/start_with/end_with/sub_with'
776
    // so we need to remove escape from pattern to construct search string and use to do 'equals/start_with/end_with/sub_with'
777
300
    for (int i = 0; i < len;) {
778
172
        if (tmp_search_string[i] == '\\' && i + 1 < len &&
779
172
            (tmp_search_string[i + 1] == '%' || tmp_search_string[i + 1] == '_' ||
780
25
             tmp_search_string[i + 1] == '\\')) {
781
15
            search_string->append(1, tmp_search_string[i + 1]);
782
15
            i += 2;
783
157
        } else {
784
157
            search_string->append(1, tmp_search_string[i]);
785
157
            i++;
786
157
        }
787
172
    }
788
128
}
789
790
0
bool re2_full_match(const std::string& str, const RE2& re, std::vector<std::string>& results) {
791
0
    if (!re.ok()) {
792
0
        return false;
793
0
    }
794
795
0
    std::vector<RE2::Arg> arguments;
796
0
    std::vector<RE2::Arg*> arguments_ptrs;
797
0
    std::size_t args_count = re.NumberOfCapturingGroups();
798
0
    arguments.resize(args_count);
799
0
    arguments_ptrs.resize(args_count);
800
0
    results.resize(args_count);
801
0
    for (std::size_t i = 0; i < args_count; ++i) {
802
0
        arguments[i] = &results[i];
803
0
        arguments_ptrs[i] = &arguments[i];
804
0
    }
805
806
0
    return RE2::FullMatchN(str, re, arguments_ptrs.data(), (int)args_count);
807
0
}
808
809
0
void verbose_log_match(const std::string& str, const std::string& pattern_name, const RE2& re) {
810
0
    std::vector<std::string> results;
811
0
    VLOG_DEBUG << "arg str: " << str << ", size: " << str.size() << ", pattern " << pattern_name
812
0
               << ": " << re.pattern() << ", size: " << re.pattern().size();
813
0
    if (re2_full_match(str, re, results)) {
814
0
        for (int i = 0; i < results.size(); ++i) {
815
0
            VLOG_DEBUG << "match " << i << ": " << results[i] << ", size: " << results[i].size();
816
0
        }
817
0
    } else {
818
0
        VLOG_DEBUG << "no match";
819
0
    }
820
0
}
821
822
Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern,
823
                                                std::shared_ptr<LikeState>& state,
824
90
                                                bool try_hyperscan) {
825
90
    std::string pattern_str;
826
90
    if (state->has_custom_escape) {
827
0
        pattern_str = replace_pattern_by_escape(pattern, state->escape_char);
828
90
    } else {
829
90
        pattern_str = pattern.to_string();
830
90
    }
831
90
    state->search_state.pattern_str = pattern_str;
832
90
    std::string search_string;
833
834
90
    if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) {
835
0
        state->search_state.set_search_string("");
836
0
        state->function = constant_allpass_fn;
837
0
        state->scalar_function = constant_allpass_fn_scalar;
838
90
    } else if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) {
839
40
        if (VLOG_DEBUG_IS_ON) {
840
0
            verbose_log_match(pattern_str, "LIKE_EQUALS_RE", LIKE_EQUALS_RE);
841
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
842
0
        }
843
40
        remove_escape_character(&search_string);
844
40
        if (VLOG_DEBUG_IS_ON) {
845
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
846
0
                       << ", size: " << search_string.size();
847
0
        }
848
40
        state->search_state.set_search_string(search_string);
849
40
        state->function = constant_equals_fn;
850
40
        state->scalar_function = constant_equals_fn_scalar;
851
50
    } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) {
852
4
        if (VLOG_DEBUG_IS_ON) {
853
0
            verbose_log_match(pattern_str, "LIKE_STARTS_WITH_RE", LIKE_STARTS_WITH_RE);
854
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
855
0
        }
856
4
        remove_escape_character(&search_string);
857
4
        if (VLOG_DEBUG_IS_ON) {
858
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
859
0
                       << ", size: " << search_string.size();
860
0
        }
861
4
        state->search_state.set_search_string(search_string);
862
4
        state->function = constant_starts_with_fn;
863
4
        state->scalar_function = constant_starts_with_fn_scalar;
864
46
    } else if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) {
865
4
        if (VLOG_DEBUG_IS_ON) {
866
0
            verbose_log_match(pattern_str, "LIKE_ENDS_WITH_RE", LIKE_ENDS_WITH_RE);
867
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
868
0
        }
869
4
        remove_escape_character(&search_string);
870
4
        if (VLOG_DEBUG_IS_ON) {
871
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
872
0
                       << ", size: " << search_string.size();
873
0
        }
874
4
        state->search_state.set_search_string(search_string);
875
4
        state->function = constant_ends_with_fn;
876
4
        state->scalar_function = constant_ends_with_fn_scalar;
877
42
    } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) {
878
6
        if (VLOG_DEBUG_IS_ON) {
879
0
            verbose_log_match(pattern_str, "LIKE_SUBSTRING_RE", LIKE_SUBSTRING_RE);
880
0
            VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size();
881
0
        }
882
6
        remove_escape_character(&search_string);
883
6
        if (VLOG_DEBUG_IS_ON) {
884
0
            VLOG_DEBUG << "search_string escape removed: " << search_string
885
0
                       << ", size: " << search_string.size();
886
0
        }
887
6
        state->search_state.set_search_string(search_string);
888
6
        state->function = constant_substring_fn;
889
6
        state->scalar_function = constant_substring_fn_scalar;
890
36
    } else {
891
36
        std::string re_pattern;
892
36
        convert_like_pattern(&state->search_state, pattern_str, &re_pattern);
893
36
        if (VLOG_DEBUG_IS_ON) {
894
0
            VLOG_DEBUG << "hyperscan, pattern str: " << pattern_str
895
0
                       << ", size: " << pattern_str.size() << ", re pattern: " << re_pattern
896
0
                       << ", size: " << re_pattern.size();
897
0
        }
898
899
36
        hs_database_t* database = nullptr;
900
36
        hs_scratch_t* scratch = nullptr;
901
36
        if (try_hyperscan && hs_prepare(context, re_pattern.c_str(), &database, &scratch).ok()) {
902
            // use hyperscan
903
36
            state->search_state.hs_database.reset(database);
904
36
            state->search_state.hs_scratch.reset(scratch);
905
36
        } else {
906
            // fallback to re2
907
            // reset hs_database to nullptr to indicate not use hyperscan
908
0
            state->search_state.hs_database.reset();
909
0
            state->search_state.hs_scratch.reset();
910
911
0
            RE2::Options opts;
912
0
            opts.set_never_nl(false);
913
0
            opts.set_dot_nl(true);
914
0
            state->search_state.regex = std::make_unique<RE2>(re_pattern, opts);
915
0
            if (!state->search_state.regex->ok()) {
916
0
                return Status::InternalError("Invalid regex expression: {}(origin: {})", re_pattern,
917
0
                                             pattern_str);
918
0
            }
919
0
        }
920
921
36
        state->function = constant_regex_fn;
922
36
        state->scalar_function = constant_regex_fn_scalar;
923
36
    }
924
90
    return Status::OK();
925
90
}
926
927
272
Status FunctionLike::open(FunctionContext* context, FunctionContext::FunctionStateScope scope) {
928
272
    if (scope != FunctionContext::THREAD_LOCAL) {
929
136
        return Status::OK();
930
136
    }
931
136
    std::shared_ptr<LikeState> state = std::make_shared<LikeState>();
932
136
    state->is_like_pattern = true;
933
136
    state->function = like_fn;
934
136
    state->scalar_function = like_fn_scalar;
935
136
    if (context->is_col_constant(2)) {
936
0
        state->has_custom_escape = true;
937
0
        const auto escape_col = context->get_constant_col(2)->column_ptr;
938
0
        const auto& escape = escape_col->get_data_at(0);
939
0
        if (escape.size != 1) {
940
0
            return Status::InternalError("Escape character must be a single character, got: {}",
941
0
                                         escape.to_string());
942
0
        }
943
0
        state->escape_char = escape.data[0];
944
0
    }
945
136
    if (context->is_col_constant(1)) {
946
90
        const auto pattern_col = context->get_constant_col(1)->column_ptr;
947
90
        const auto& pattern = pattern_col->get_data_at(0);
948
90
        RETURN_IF_ERROR(construct_like_const_state(context, pattern, state));
949
90
    }
950
136
    context->set_function_state(scope, state);
951
952
136
    return Status::OK();
953
136
}
954
955
Status FunctionRegexpLike::open(FunctionContext* context,
956
34
                                FunctionContext::FunctionStateScope scope) {
957
34
    if (scope != FunctionContext::THREAD_LOCAL) {
958
17
        return Status::OK();
959
17
    }
960
17
    std::shared_ptr<LikeState> state = std::make_shared<LikeState>();
961
17
    context->set_function_state(scope, state);
962
17
    state->is_like_pattern = false;
963
17
    state->function = regexp_fn;
964
17
    state->scalar_function = regexp_fn_scalar;
965
17
    if (context->is_col_constant(1)) {
966
17
        const auto pattern_col = context->get_constant_col(1)->column_ptr;
967
17
        const auto& pattern = pattern_col->get_data_at(0);
968
969
17
        std::string pattern_str = pattern.to_string();
970
17
        std::string search_string;
971
17
        if (RE2::FullMatch(pattern_str, ALLPASS_RE)) {
972
0
            state->search_state.set_search_string("");
973
0
            state->function = constant_allpass_fn;
974
0
            state->scalar_function = constant_allpass_fn_scalar;
975
17
        } else if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) {
976
2
            state->search_state.set_search_string(search_string);
977
2
            state->function = constant_equals_fn;
978
2
            state->scalar_function = constant_equals_fn_scalar;
979
15
        } else if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) {
980
2
            state->search_state.set_search_string(search_string);
981
2
            state->function = constant_starts_with_fn;
982
2
            state->scalar_function = constant_starts_with_fn_scalar;
983
13
        } else if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) {
984
2
            state->search_state.set_search_string(search_string);
985
2
            state->function = constant_ends_with_fn;
986
2
            state->scalar_function = constant_ends_with_fn_scalar;
987
11
        } else if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) {
988
6
            state->search_state.set_search_string(search_string);
989
6
            state->function = constant_substring_fn;
990
6
            state->scalar_function = constant_substring_fn_scalar;
991
6
        } else {
992
5
            hs_database_t* database = nullptr;
993
5
            hs_scratch_t* scratch = nullptr;
994
5
            if (hs_prepare(context, pattern_str.c_str(), &database, &scratch).ok()) {
995
                // use hyperscan
996
5
                state->search_state.hs_database.reset(database);
997
5
                state->search_state.hs_scratch.reset(scratch);
998
5
            } else {
999
                // fallback to re2
1000
                // reset hs_database to nullptr to indicate not use hyperscan
1001
0
                state->search_state.hs_database.reset();
1002
0
                state->search_state.hs_scratch.reset();
1003
0
                RE2::Options opts;
1004
0
                opts.set_never_nl(false);
1005
0
                opts.set_dot_nl(true);
1006
0
                state->search_state.regex = std::make_unique<RE2>(pattern_str, opts);
1007
0
                if (!state->search_state.regex->ok()) {
1008
0
                    return Status::InternalError("Invalid regex expression: {}", pattern_str);
1009
0
                }
1010
0
            }
1011
5
            state->function = constant_regex_fn;
1012
5
            state->scalar_function = constant_regex_fn_scalar;
1013
5
        }
1014
17
    }
1015
17
    return Status::OK();
1016
17
}
1017
1018
1
void register_function_like(SimpleFunctionFactory& factory) {
1019
1
    factory.register_function<FunctionLike>();
1020
1
}
1021
1022
1
void register_function_regexp(SimpleFunctionFactory& factory) {
1023
1
    factory.register_function<FunctionRegexpLike>();
1024
1
    factory.register_alias(FunctionRegexpLike::name, FunctionRegexpLike::alias);
1025
1
}
1026
#include "common/compile_check_end.h"
1027
} // namespace doris::vectorized