/root/doris/be/src/vec/functions/like.cpp
| Line | Count | Source | 
| 1 |  | // Licensed to the Apache Software Foundation (ASF) under one | 
| 2 |  | // or more contributor license agreements.  See the NOTICE file | 
| 3 |  | // distributed with this work for additional information | 
| 4 |  | // regarding copyright ownership.  The ASF licenses this file | 
| 5 |  | // to you under the Apache License, Version 2.0 (the | 
| 6 |  | // "License"); you may not use this file except in compliance | 
| 7 |  | // with the License.  You may obtain a copy of the License at | 
| 8 |  | // | 
| 9 |  | //   http://www.apache.org/licenses/LICENSE-2.0 | 
| 10 |  | // | 
| 11 |  | // Unless required by applicable law or agreed to in writing, | 
| 12 |  | // software distributed under the License is distributed on an | 
| 13 |  | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | 
| 14 |  | // KIND, either express or implied.  See the License for the | 
| 15 |  | // specific language governing permissions and limitations | 
| 16 |  | // under the License. | 
| 17 |  |  | 
| 18 |  | #include "like.h" | 
| 19 |  |  | 
| 20 |  | #include <fmt/format.h> | 
| 21 |  | #include <hs/hs_compile.h> | 
| 22 |  | #include <re2/stringpiece.h> | 
| 23 |  |  | 
| 24 |  | #include <cstddef> | 
| 25 |  | #include <ostream> | 
| 26 |  | #include <utility> | 
| 27 |  | #include <vector> | 
| 28 |  |  | 
| 29 |  | #include "common/logging.h" | 
| 30 |  | #include "vec/columns/column.h" | 
| 31 |  | #include "vec/columns/column_const.h" | 
| 32 |  | #include "vec/columns/column_vector.h" | 
| 33 |  | #include "vec/common/string_ref.h" | 
| 34 |  | #include "vec/core/block.h" | 
| 35 |  | #include "vec/core/column_with_type_and_name.h" | 
| 36 |  | #include "vec/functions/simple_function_factory.h" | 
| 37 |  |  | 
| 38 |  | namespace doris::vectorized { | 
| 39 |  | #include "common/compile_check_begin.h" | 
| 40 |  | // A regex to match any regex pattern which is equivalent to a substring search. | 
| 41 |  | static const RE2 SUBSTRING_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)"); | 
| 42 |  |  | 
| 43 |  | // A regex to match any regex pattern which is equivalent to matching a constant string | 
| 44 |  | // at the end of the string values. | 
| 45 |  | static const RE2 ENDS_WITH_RE(R"((?:\.\*)*([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)"); | 
| 46 |  |  | 
| 47 |  | // A regex to match any regex pattern which is equivalent to matching a constant string | 
| 48 |  | // at the start of the string values. | 
| 49 |  | static const RE2 STARTS_WITH_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)(?:\.\*)*)"); | 
| 50 |  |  | 
| 51 |  | // A regex to match any regex pattern which is equivalent to a constant string match. | 
| 52 |  | static const RE2 EQUALS_RE(R"(\^([^\.\^\{\[\(\|\)\]\}\+\*\?\$\\]*)\$)"); | 
| 53 |  | // A regex to match .* | 
| 54 |  | static const RE2 ALLPASS_RE(R"((\.\*)+)"); | 
| 55 |  |  | 
| 56 |  | // Like patterns | 
| 57 |  | static const re2::RE2 LIKE_SUBSTRING_RE(R"((?:%+)(((\\_)|([^%_\\]))+)(?:%+))"); | 
| 58 |  | static const re2::RE2 LIKE_ENDS_WITH_RE("(?:%+)(((\\\\_)|([^%_]))+)"); | 
| 59 |  | static const re2::RE2 LIKE_STARTS_WITH_RE(R"((((\\%)|(\\_)|([^%_\\]))+)(?:%+))"); | 
| 60 |  | static const re2::RE2 LIKE_EQUALS_RE("(((\\\\_)|([^%_]))+)"); | 
| 61 |  | static const re2::RE2 LIKE_ALLPASS_RE("%+"); | 
| 62 |  |  | 
| 63 |  | struct VectorAllpassSearchState : public VectorPatternSearchState { | 
| 64 | 307 |     VectorAllpassSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_allpass_fn) {} | 
| 65 |  |  | 
| 66 | 307 |     ~VectorAllpassSearchState() override = default; | 
| 67 |  |  | 
| 68 | 312 |     void like_pattern_match(const std::string& pattern_str) override { | 
| 69 | 312 |         if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) { | 
| 70 | 26 |             _search_strings->insert_default(); | 
| 71 | 286 |         } else { | 
| 72 | 286 |             _pattern_matched = false; | 
| 73 | 286 |         } | 
| 74 | 312 |     } | 
| 75 |  |  | 
| 76 | 0 |     void regexp_pattern_match(const std::string& pattern_str) override { | 
| 77 | 0 |         if (RE2::FullMatch(pattern_str, ALLPASS_RE)) { | 
| 78 | 0 |             _search_strings->insert_default(); | 
| 79 | 0 |         } else { | 
| 80 | 0 |             _pattern_matched = false; | 
| 81 | 0 |         } | 
| 82 | 0 |     } | 
| 83 |  | }; | 
| 84 |  |  | 
| 85 |  | struct VectorEqualSearchState : public VectorPatternSearchState { | 
| 86 | 307 |     VectorEqualSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_equals_fn) {} | 
| 87 |  |  | 
| 88 | 307 |     ~VectorEqualSearchState() override = default; | 
| 89 |  |  | 
| 90 | 315 |     void like_pattern_match(const std::string& pattern_str) override { | 
| 91 | 315 |         _search_string.clear(); | 
| 92 | 315 |         if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &_search_string)) { | 
| 93 | 86 |             FunctionLike::remove_escape_character(&_search_string); | 
| 94 | 86 |             _search_strings->insert_data(_search_string.c_str(), _search_string.size()); | 
| 95 | 229 |         } else { | 
| 96 | 229 |             _pattern_matched = false; | 
| 97 | 229 |         } | 
| 98 | 315 |     } | 
| 99 |  |  | 
| 100 | 0 |     void regexp_pattern_match(const std::string& pattern_str) override { | 
| 101 | 0 |         _search_string.clear(); | 
| 102 | 0 |         if (RE2::FullMatch(pattern_str, EQUALS_RE, &_search_string)) { | 
| 103 | 0 |             _search_strings->insert_data(_search_string.c_str(), _search_string.size()); | 
| 104 | 0 |         } else { | 
| 105 | 0 |             _pattern_matched = false; | 
| 106 | 0 |         } | 
| 107 | 0 |     } | 
| 108 |  | }; | 
| 109 |  |  | 
| 110 |  | struct VectorSubStringSearchState : public VectorPatternSearchState { | 
| 111 |  |     VectorSubStringSearchState() | 
| 112 | 307 |             : VectorPatternSearchState(FunctionLikeBase::vector_substring_fn) {} | 
| 113 |  |  | 
| 114 | 307 |     ~VectorSubStringSearchState() override = default; | 
| 115 |  |  | 
| 116 | 321 |     void like_pattern_match(const std::string& pattern_str) override { | 
| 117 | 321 |         _search_string.clear(); | 
| 118 | 321 |         if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &_search_string)) { | 
| 119 | 41 |             FunctionLike::remove_escape_character(&_search_string); | 
| 120 | 41 |             _search_strings->insert_data(_search_string.c_str(), _search_string.size()); | 
| 121 | 280 |         } else { | 
| 122 | 280 |             _pattern_matched = false; | 
| 123 | 280 |         } | 
| 124 | 321 |     } | 
| 125 |  |  | 
| 126 | 0 |     void regexp_pattern_match(const std::string& pattern_str) override { | 
| 127 | 0 |         _search_string.clear(); | 
| 128 | 0 |         if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &_search_string)) { | 
| 129 | 0 |             _search_strings->insert_data(_search_string.c_str(), _search_string.size()); | 
| 130 | 0 |         } else { | 
| 131 | 0 |             _pattern_matched = false; | 
| 132 | 0 |         } | 
| 133 | 0 |     } | 
| 134 |  | }; | 
| 135 |  |  | 
| 136 |  | struct VectorStartsWithSearchState : public VectorPatternSearchState { | 
| 137 |  |     VectorStartsWithSearchState() | 
| 138 | 307 |             : VectorPatternSearchState(FunctionLikeBase::vector_starts_with_fn) {} | 
| 139 |  |  | 
| 140 | 307 |     ~VectorStartsWithSearchState() override = default; | 
| 141 |  |  | 
| 142 | 313 |     void like_pattern_match(const std::string& pattern_str) override { | 
| 143 | 313 |         _search_string.clear(); | 
| 144 | 313 |         if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &_search_string)) { | 
| 145 | 37 |             FunctionLike::remove_escape_character(&_search_string); | 
| 146 | 37 |             _search_strings->insert_data(_search_string.c_str(), _search_string.size()); | 
| 147 | 276 |         } else { | 
| 148 | 276 |             _pattern_matched = false; | 
| 149 | 276 |         } | 
| 150 | 313 |     } | 
| 151 |  |  | 
| 152 | 0 |     void regexp_pattern_match(const std::string& pattern_str) override { | 
| 153 | 0 |         _search_string.clear(); | 
| 154 | 0 |         if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &_search_string)) { | 
| 155 | 0 |             _search_strings->insert_data(_search_string.c_str(), _search_string.size()); | 
| 156 | 0 |         } else { | 
| 157 | 0 |             _pattern_matched = false; | 
| 158 | 0 |         } | 
| 159 | 0 |     } | 
| 160 |  | }; | 
| 161 |  |  | 
| 162 |  | struct VectorEndsWithSearchState : public VectorPatternSearchState { | 
| 163 | 307 |     VectorEndsWithSearchState() : VectorPatternSearchState(FunctionLikeBase::vector_ends_with_fn) {} | 
| 164 |  |  | 
| 165 | 307 |     ~VectorEndsWithSearchState() override = default; | 
| 166 |  |  | 
| 167 | 312 |     void like_pattern_match(const std::string& pattern_str) override { | 
| 168 | 312 |         _search_string.clear(); | 
| 169 | 312 |         if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &_search_string)) { | 
| 170 | 36 |             FunctionLike::remove_escape_character(&_search_string); | 
| 171 | 36 |             _search_strings->insert_data(_search_string.c_str(), _search_string.size()); | 
| 172 | 276 |         } else { | 
| 173 | 276 |             _pattern_matched = false; | 
| 174 | 276 |         } | 
| 175 | 312 |     } | 
| 176 |  |  | 
| 177 | 0 |     void regexp_pattern_match(const std::string& pattern_str) override { | 
| 178 | 0 |         _search_string.clear(); | 
| 179 | 0 |         if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &_search_string)) { | 
| 180 | 0 |             _search_strings->insert_data(_search_string.c_str(), _search_string.size()); | 
| 181 | 0 |         } else { | 
| 182 | 0 |             _pattern_matched = false; | 
| 183 | 0 |         } | 
| 184 | 0 |     } | 
| 185 |  | }; | 
| 186 |  |  | 
| 187 | 0 | Status LikeSearchState::clone(LikeSearchState& cloned) { | 
| 188 | 0 |     cloned.set_search_string(search_string); | 
| 189 |  | 
 | 
| 190 | 0 |     std::string re_pattern; | 
| 191 | 0 |     FunctionLike::convert_like_pattern(this, pattern_str, &re_pattern); | 
| 192 | 0 |     if (hs_database) { // use hyperscan | 
| 193 | 0 |         hs_database_t* database = nullptr; | 
| 194 | 0 |         hs_scratch_t* scratch = nullptr; | 
| 195 | 0 |         RETURN_IF_ERROR(FunctionLike::hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch)); | 
| 196 |  |  | 
| 197 | 0 |         cloned.hs_database.reset(database); | 
| 198 | 0 |         cloned.hs_scratch.reset(scratch); | 
| 199 | 0 |     } else { // fallback to re2 | 
| 200 | 0 |         cloned.hs_database.reset(); | 
| 201 | 0 |         cloned.hs_scratch.reset(); | 
| 202 |  | 
 | 
| 203 | 0 |         RE2::Options opts; | 
| 204 | 0 |         opts.set_never_nl(false); | 
| 205 | 0 |         opts.set_dot_nl(true); | 
| 206 | 0 |         cloned.regex = std::make_unique<RE2>(re_pattern, opts); | 
| 207 | 0 |         if (!cloned.regex->ok()) { | 
| 208 | 0 |             return Status::InternalError("Invalid regex expression: {}", re_pattern); | 
| 209 | 0 |         } | 
| 210 | 0 |     } | 
| 211 |  |  | 
| 212 | 0 |     return Status::OK(); | 
| 213 | 0 | } | 
| 214 |  |  | 
| 215 |  | Status FunctionLikeBase::constant_allpass_fn(const LikeSearchState* state, const ColumnString& vals, | 
| 216 |  |                                              const StringRef& pattern, | 
| 217 | 14 |                                              ColumnUInt8::Container& result) { | 
| 218 | 14 |     memset(result.data(), 1, vals.size()); | 
| 219 | 14 |     return Status::OK(); | 
| 220 | 14 | } | 
| 221 |  |  | 
| 222 |  | Status FunctionLikeBase::constant_allpass_fn_scalar(const LikeSearchState* state, | 
| 223 |  |                                                     const StringRef& val, const StringRef& pattern, | 
| 224 | 0 |                                                     unsigned char* result) { | 
| 225 | 0 |     *result = 1; | 
| 226 | 0 |     return Status::OK(); | 
| 227 | 0 | } | 
| 228 |  |  | 
| 229 |  | Status FunctionLikeBase::vector_allpass_fn(const ColumnString& vals, | 
| 230 |  |                                            const ColumnString& search_strings, | 
| 231 | 21 |                                            ColumnUInt8::Container& result) { | 
| 232 | 21 |     DCHECK(vals.size() == search_strings.size()); | 
| 233 | 21 |     DCHECK(vals.size() == result.size()); | 
| 234 | 21 |     memset(result.data(), 1, vals.size()); | 
| 235 | 21 |     return Status::OK(); | 
| 236 | 21 | } | 
| 237 |  |  | 
| 238 |  | Status FunctionLikeBase::constant_starts_with_fn(const LikeSearchState* state, | 
| 239 |  |                                                  const ColumnString& val, const StringRef& pattern, | 
| 240 | 22 |                                                  ColumnUInt8::Container& result) { | 
| 241 | 22 |     auto sz = val.size(); | 
| 242 | 44 |     for (size_t i = 0; i < sz; i++) { | 
| 243 | 22 |         const auto& str_ref = val.get_data_at(i); | 
| 244 | 22 |         result[i] = (str_ref.size >= state->search_string_sv.size) && | 
| 245 | 22 |                     str_ref.start_with(state->search_string_sv); | 
| 246 | 22 |     } | 
| 247 | 22 |     return Status::OK(); | 
| 248 | 22 | } | 
| 249 |  |  | 
| 250 |  | Status FunctionLikeBase::constant_starts_with_fn_scalar(const LikeSearchState* state, | 
| 251 |  |                                                         const StringRef& val, | 
| 252 |  |                                                         const StringRef& pattern, | 
| 253 | 0 |                                                         unsigned char* result) { | 
| 254 | 0 |     *result = (val.size >= state->search_string_sv.size) && | 
| 255 | 0 |               (state->search_string_sv == val.substring(0, state->search_string_sv.size)); | 
| 256 | 0 |     return Status::OK(); | 
| 257 | 0 | } | 
| 258 |  |  | 
| 259 |  | Status FunctionLikeBase::vector_starts_with_fn(const ColumnString& vals, | 
| 260 |  |                                                const ColumnString& search_strings, | 
| 261 | 31 |                                                ColumnUInt8::Container& result) { | 
| 262 | 31 |     DCHECK(vals.size() == search_strings.size()); | 
| 263 | 31 |     DCHECK(vals.size() == result.size()); | 
| 264 | 31 |     auto sz = vals.size(); | 
| 265 | 67 |     for (size_t i = 0; i < sz; ++i) { | 
| 266 | 36 |         const auto& str_sv = vals.get_data_at(i); | 
| 267 | 36 |         const auto& search_string_sv = search_strings.get_data_at(i); | 
| 268 | 36 |         result[i] = (str_sv.size >= search_string_sv.size) && str_sv.start_with(search_string_sv); | 
| 269 | 36 |     } | 
| 270 | 31 |     return Status::OK(); | 
| 271 | 31 | } | 
| 272 |  |  | 
| 273 |  | Status FunctionLikeBase::constant_ends_with_fn(const LikeSearchState* state, | 
| 274 |  |                                                const ColumnString& val, const StringRef& pattern, | 
| 275 | 22 |                                                ColumnUInt8::Container& result) { | 
| 276 | 22 |     auto sz = val.size(); | 
| 277 | 44 |     for (size_t i = 0; i < sz; i++) { | 
| 278 | 22 |         const auto& str_ref = val.get_data_at(i); | 
| 279 | 22 |         result[i] = (str_ref.size >= state->search_string_sv.size) && | 
| 280 | 22 |                     str_ref.end_with(state->search_string_sv); | 
| 281 | 22 |     } | 
| 282 | 22 |     return Status::OK(); | 
| 283 | 22 | } | 
| 284 |  |  | 
| 285 |  | Status FunctionLikeBase::constant_ends_with_fn_scalar(const LikeSearchState* state, | 
| 286 |  |                                                       const StringRef& val, | 
| 287 |  |                                                       const StringRef& pattern, | 
| 288 | 0 |                                                       unsigned char* result) { | 
| 289 | 0 |     *result = (val.size >= state->search_string_sv.size) && | 
| 290 | 0 |               (state->search_string_sv == val.substring(val.size - state->search_string_sv.size, | 
| 291 | 0 |                                                         state->search_string_sv.size)); | 
| 292 | 0 |     return Status::OK(); | 
| 293 | 0 | } | 
| 294 |  |  | 
| 295 |  | Status FunctionLikeBase::vector_ends_with_fn(const ColumnString& vals, | 
| 296 |  |                                              const ColumnString& search_strings, | 
| 297 | 31 |                                              ColumnUInt8::Container& result) { | 
| 298 | 31 |     DCHECK(vals.size() == search_strings.size()); | 
| 299 | 31 |     DCHECK(vals.size() == result.size()); | 
| 300 | 31 |     auto sz = vals.size(); | 
| 301 | 67 |     for (size_t i = 0; i < sz; ++i) { | 
| 302 | 36 |         const auto& str_sv = vals.get_data_at(i); | 
| 303 | 36 |         const auto& search_string_sv = search_strings.get_data_at(i); | 
| 304 | 36 |         result[i] = (str_sv.size >= search_string_sv.size) && str_sv.end_with(search_string_sv); | 
| 305 | 36 |     } | 
| 306 | 31 |     return Status::OK(); | 
| 307 | 31 | } | 
| 308 |  |  | 
| 309 |  | Status FunctionLikeBase::constant_equals_fn(const LikeSearchState* state, const ColumnString& val, | 
| 310 |  |                                             const StringRef& pattern, | 
| 311 | 47 |                                             ColumnUInt8::Container& result) { | 
| 312 | 47 |     auto sz = val.size(); | 
| 313 | 94 |     for (size_t i = 0; i < sz; i++) { | 
| 314 | 47 |         result[i] = (val.get_data_at(i) == state->search_string_sv); | 
| 315 | 47 |     } | 
| 316 | 47 |     return Status::OK(); | 
| 317 | 47 | } | 
| 318 |  |  | 
| 319 |  | Status FunctionLikeBase::constant_equals_fn_scalar(const LikeSearchState* state, | 
| 320 |  |                                                    const StringRef& val, const StringRef& pattern, | 
| 321 | 0 |                                                    unsigned char* result) { | 
| 322 | 0 |     *result = (val == state->search_string_sv); | 
| 323 | 0 |     return Status::OK(); | 
| 324 | 0 | } | 
| 325 |  |  | 
| 326 |  | Status FunctionLikeBase::vector_equals_fn(const ColumnString& vals, | 
| 327 |  |                                           const ColumnString& search_strings, | 
| 328 | 78 |                                           ColumnUInt8::Container& result) { | 
| 329 | 78 |     DCHECK(vals.size() == search_strings.size()); | 
| 330 | 78 |     DCHECK(vals.size() == result.size()); | 
| 331 | 78 |     auto sz = vals.size(); | 
| 332 | 156 |     for (size_t i = 0; i < sz; ++i) { | 
| 333 | 78 |         const auto& str_sv = vals.get_data_at(i); | 
| 334 | 78 |         const auto& search_string_sv = search_strings.get_data_at(i); | 
| 335 | 78 |         result[i] = str_sv == search_string_sv; | 
| 336 | 78 |     } | 
| 337 | 78 |     return Status::OK(); | 
| 338 | 78 | } | 
| 339 |  |  | 
| 340 |  | Status FunctionLikeBase::constant_substring_fn(const LikeSearchState* state, | 
| 341 |  |                                                const ColumnString& val, const StringRef& pattern, | 
| 342 | 0 |                                                ColumnUInt8::Container& result) { | 
| 343 | 0 |     auto sz = val.size(); | 
| 344 | 0 |     for (size_t i = 0; i < sz; i++) { | 
| 345 | 0 |         if (state->search_string_sv.size == 0) { | 
| 346 | 0 |             result[i] = true; | 
| 347 | 0 |             continue; | 
| 348 | 0 |         } | 
| 349 | 0 |         result[i] = state->substring_pattern.search(val.get_data_at(i)) != -1; | 
| 350 | 0 |     } | 
| 351 | 0 |     return Status::OK(); | 
| 352 | 0 | } | 
| 353 |  |  | 
| 354 |  | Status FunctionLikeBase::constant_substring_fn_scalar(const LikeSearchState* state, | 
| 355 |  |                                                       const StringRef& val, | 
| 356 |  |                                                       const StringRef& pattern, | 
| 357 | 0 |                                                       unsigned char* result) { | 
| 358 | 0 |     if (state->search_string_sv.size == 0) { | 
| 359 | 0 |         *result = true; | 
| 360 | 0 |         return Status::OK(); | 
| 361 | 0 |     } | 
| 362 | 0 |     *result = state->substring_pattern.search(val) != -1; | 
| 363 | 0 |     return Status::OK(); | 
| 364 | 0 | } | 
| 365 |  |  | 
| 366 |  | Status FunctionLikeBase::vector_substring_fn(const ColumnString& vals, | 
| 367 |  |                                              const ColumnString& search_strings, | 
| 368 | 27 |                                              ColumnUInt8::Container& result) { | 
| 369 | 27 |     DCHECK(vals.size() == search_strings.size()); | 
| 370 | 27 |     DCHECK(vals.size() == result.size()); | 
| 371 | 27 |     auto sz = vals.size(); | 
| 372 | 54 |     for (size_t i = 0; i < sz; ++i) { | 
| 373 | 27 |         const auto& str_sv = vals.get_data_at(i); | 
| 374 | 27 |         const auto& search_string_sv = search_strings.get_data_at(i); | 
| 375 | 27 |         if (search_string_sv.size == 0) { | 
| 376 | 0 |             result[i] = true; | 
| 377 | 0 |             continue; | 
| 378 | 0 |         } | 
| 379 | 27 |         doris::StringSearch substring_search(&search_string_sv); | 
| 380 | 27 |         result[i] = substring_search.search(str_sv) != -1; | 
| 381 | 27 |     } | 
| 382 | 27 |     return Status::OK(); | 
| 383 | 27 | } | 
| 384 |  |  | 
| 385 |  | Status FunctionLikeBase::constant_regex_fn_scalar(const LikeSearchState* state, | 
| 386 |  |                                                   const StringRef& val, const StringRef& pattern, | 
| 387 | 53 |                                                   unsigned char* result) { | 
| 388 | 53 |     if (state->hs_database) { // use hyperscan | 
| 389 | 53 |         auto ret = hs_scan(state->hs_database.get(), val.data, (int)val.size, 0, | 
| 390 | 53 |                            state->hs_scratch.get(), | 
| 391 | 53 |                            doris::vectorized::LikeSearchState::hs_match_handler, (void*)result); | 
| 392 | 53 |         if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { | 
| 393 | 0 |             return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); | 
| 394 | 0 |         } | 
| 395 | 53 |     } else { // fallback to re2 | 
| 396 | 0 |         *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), *state->regex); | 
| 397 | 0 |     } | 
| 398 |  |  | 
| 399 | 53 |     return Status::OK(); | 
| 400 | 53 | } | 
| 401 |  |  | 
| 402 |  | Status FunctionLikeBase::regexp_fn_scalar(const LikeSearchState* state, const StringRef& val, | 
| 403 | 209 |                                           const StringRef& pattern, unsigned char* result) { | 
| 404 | 209 |     RE2::Options opts; | 
| 405 | 209 |     opts.set_never_nl(false); | 
| 406 | 209 |     opts.set_dot_nl(true); | 
| 407 | 209 |     re2::RE2 re(re2::StringPiece(pattern.data, pattern.size), opts); | 
| 408 | 209 |     if (re.ok()) { | 
| 409 | 209 |         *result = RE2::PartialMatch(re2::StringPiece(val.data, val.size), re); | 
| 410 | 209 |     } else { | 
| 411 | 0 |         return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string()); | 
| 412 | 0 |     } | 
| 413 |  |  | 
| 414 | 209 |     return Status::OK(); | 
| 415 | 209 | } | 
| 416 |  |  | 
| 417 |  | Status FunctionLikeBase::constant_regex_fn(const LikeSearchState* state, const ColumnString& val, | 
| 418 |  |                                            const StringRef& pattern, | 
| 419 | 75 |                                            ColumnUInt8::Container& result) { | 
| 420 | 75 |     auto sz = val.size(); | 
| 421 | 75 |     if (state->hs_database) { // use hyperscan | 
| 422 | 150 |         for (size_t i = 0; i < sz; i++) { | 
| 423 | 75 |             const auto& str_ref = val.get_data_at(i); | 
| 424 | 75 |             auto ret = hs_scan(state->hs_database.get(), str_ref.data, (int)str_ref.size, 0, | 
| 425 | 75 |                                state->hs_scratch.get(), | 
| 426 | 75 |                                doris::vectorized::LikeSearchState::hs_match_handler, | 
| 427 | 75 |                                (void*)(result.data() + i)); | 
| 428 | 75 |             if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { | 
| 429 | 0 |                 return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); | 
| 430 | 0 |             } | 
| 431 | 75 |         } | 
| 432 | 75 |     } else { // fallback to re2 | 
| 433 | 0 |         for (size_t i = 0; i < sz; i++) { | 
| 434 | 0 |             const auto& str_ref = val.get_data_at(i); | 
| 435 | 0 |             *(result.data() + i) = | 
| 436 | 0 |                     RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), *state->regex); | 
| 437 | 0 |         } | 
| 438 | 0 |     } | 
| 439 |  |  | 
| 440 | 75 |     return Status::OK(); | 
| 441 | 75 | } | 
| 442 |  |  | 
| 443 |  | Status FunctionLikeBase::regexp_fn(const LikeSearchState* state, const ColumnString& val, | 
| 444 | 0 |                                    const StringRef& pattern, ColumnUInt8::Container& result) { | 
| 445 | 0 |     std::string re_pattern(pattern.data, pattern.size); | 
| 446 |  | 
 | 
| 447 | 0 |     hs_database_t* database = nullptr; | 
| 448 | 0 |     hs_scratch_t* scratch = nullptr; | 
| 449 | 0 |     if (hs_prepare(nullptr, re_pattern.c_str(), &database, &scratch).ok()) { // use hyperscan | 
| 450 | 0 |         auto sz = val.size(); | 
| 451 | 0 |         for (size_t i = 0; i < sz; i++) { | 
| 452 | 0 |             const auto& str_ref = val.get_data_at(i); | 
| 453 | 0 |             auto ret = hs_scan(database, str_ref.data, (int)str_ref.size, 0, scratch, | 
| 454 | 0 |                                doris::vectorized::LikeSearchState::hs_match_handler, | 
| 455 | 0 |                                (void*)(result.data() + i)); | 
| 456 | 0 |             if (ret != HS_SUCCESS && ret != HS_SCAN_TERMINATED) { | 
| 457 | 0 |                 return Status::RuntimeError(fmt::format("hyperscan error: {}", ret)); | 
| 458 | 0 |             } | 
| 459 | 0 |         } | 
| 460 |  |  | 
| 461 | 0 |         hs_free_scratch(scratch); | 
| 462 | 0 |         hs_free_database(database); | 
| 463 | 0 |     } else { // fallback to re2 | 
| 464 | 0 |         RE2::Options opts; | 
| 465 | 0 |         opts.set_never_nl(false); | 
| 466 | 0 |         opts.set_dot_nl(true); | 
| 467 | 0 |         re2::RE2 re(re_pattern, opts); | 
| 468 | 0 |         if (re.ok()) { | 
| 469 | 0 |             auto sz = val.size(); | 
| 470 | 0 |             for (size_t i = 0; i < sz; i++) { | 
| 471 | 0 |                 const auto& str_ref = val.get_data_at(i); | 
| 472 | 0 |                 *(result.data() + i) = | 
| 473 | 0 |                         RE2::PartialMatch(re2::StringPiece(str_ref.data, str_ref.size), re); | 
| 474 | 0 |             } | 
| 475 | 0 |         } else { | 
| 476 | 0 |             return Status::RuntimeError("Invalid pattern: {}", pattern.debug_string()); | 
| 477 | 0 |         } | 
| 478 | 0 |     } | 
| 479 |  |  | 
| 480 | 0 |     return Status::OK(); | 
| 481 | 0 | } | 
| 482 |  |  | 
| 483 |  | // hyperscan compile expression to database and allocate scratch space | 
| 484 |  | Status FunctionLikeBase::hs_prepare(FunctionContext* context, const char* expression, | 
| 485 | 132 |                                     hs_database_t** database, hs_scratch_t** scratch) { | 
| 486 | 132 |     hs_compile_error_t* compile_err; | 
| 487 | 132 |     auto res = hs_compile(expression, HS_FLAG_DOTALL | HS_FLAG_ALLOWEMPTY | HS_FLAG_UTF8, | 
| 488 | 132 |                           HS_MODE_BLOCK, nullptr, database, &compile_err); | 
| 489 |  |  | 
| 490 | 132 |     if (res != HS_SUCCESS) { | 
| 491 | 0 |         *database = nullptr; | 
| 492 | 0 |         std::string error_message = compile_err->message; | 
| 493 | 0 |         hs_free_compile_error(compile_err); | 
| 494 |  |         // Do not call FunctionContext::set_error here, since we do not want to cancel the query here. | 
| 495 | 0 |         return Status::RuntimeError<false>("hs_compile regex pattern error:" + error_message); | 
| 496 | 0 |     } | 
| 497 | 132 |     hs_free_compile_error(compile_err); | 
| 498 |  |  | 
| 499 | 132 |     if (hs_alloc_scratch(*database, scratch) != HS_SUCCESS) { | 
| 500 | 0 |         hs_free_database(*database); | 
| 501 | 0 |         *database = nullptr; | 
| 502 | 0 |         *scratch = nullptr; | 
| 503 |  |         // Do not call FunctionContext::set_error here, since we do not want to cancel the query here. | 
| 504 | 0 |         return Status::RuntimeError<false>("hs_alloc_scratch allocate scratch space error"); | 
| 505 | 0 |     } | 
| 506 |  |  | 
| 507 | 132 |     return Status::OK(); | 
| 508 | 132 | } | 
| 509 |  |  | 
| 510 |  | Status FunctionLikeBase::execute_impl(FunctionContext* context, Block& block, | 
| 511 |  |                                       const ColumnNumbers& arguments, uint32_t result, | 
| 512 | 553 |                                       size_t input_rows_count) const { | 
| 513 | 553 |     const auto values_col = | 
| 514 | 553 |             block.get_by_position(arguments[0]).column->convert_to_full_column_if_const(); | 
| 515 | 553 |     const auto* values = check_and_get_column<ColumnString>(values_col.get()); | 
| 516 |  |  | 
| 517 | 553 |     if (!values) { | 
| 518 | 0 |         return Status::InternalError("Not supported input arguments types"); | 
| 519 | 0 |     } | 
| 520 |  |     // result column | 
| 521 | 553 |     auto res = ColumnUInt8::create(); | 
| 522 | 553 |     ColumnUInt8::Container& vec_res = res->get_data(); | 
| 523 |  |     // set default value to 0, and match functions only need to set 1/true | 
| 524 | 553 |     vec_res.resize_fill(input_rows_count); | 
| 525 | 553 |     auto* state = reinterpret_cast<LikeState*>( | 
| 526 | 553 |             context->get_function_state(FunctionContext::THREAD_LOCAL)); | 
| 527 |  |     // for constant_substring_fn, use long run length search for performance | 
| 528 | 553 |     if (constant_substring_fn == | 
| 529 | 553 |         *(state->function | 
| 530 | 553 |                   .target<doris::Status (*)(const LikeSearchState* state, const ColumnString&, | 
| 531 | 553 |                                             const StringRef&, ColumnUInt8::Container&)>())) { | 
| 532 | 66 |         RETURN_IF_ERROR(execute_substring(values->get_chars(), values->get_offsets(), vec_res, | 
| 533 | 66 |                                           &state->search_state)); | 
| 534 | 487 |     } else { | 
| 535 | 487 |         const auto pattern_col = block.get_by_position(arguments[1]).column; | 
| 536 | 487 |         if (const auto* str_patterns = check_and_get_column<ColumnString>(pattern_col.get())) { | 
| 537 | 307 |             RETURN_IF_ERROR( | 
| 538 | 307 |                     vector_non_const(*values, *str_patterns, vec_res, state, input_rows_count)); | 
| 539 | 307 |         } else if (const auto* const_patterns = | 
| 540 | 180 |                            check_and_get_column<ColumnConst>(pattern_col.get())) { | 
| 541 | 180 |             const auto& pattern_val = const_patterns->get_data_at(0); | 
| 542 | 180 |             RETURN_IF_ERROR(vector_const(*values, &pattern_val, vec_res, state->function, | 
| 543 | 180 |                                          &state->search_state)); | 
| 544 | 180 |         } else { | 
| 545 | 0 |             return Status::InternalError("Not supported input arguments types"); | 
| 546 | 0 |         } | 
| 547 | 487 |     } | 
| 548 | 553 |     block.replace_by_position(result, std::move(res)); | 
| 549 | 553 |     return Status::OK(); | 
| 550 | 553 | } | 
| 551 |  |  | 
| 552 |  | Status FunctionLikeBase::execute_substring(const ColumnString::Chars& values, | 
| 553 |  |                                            const ColumnString::Offsets& value_offsets, | 
| 554 |  |                                            ColumnUInt8::Container& result, | 
| 555 | 66 |                                            LikeSearchState* search_state) const { | 
| 556 |  |     // treat continuous multi string data as a long string data | 
| 557 | 66 |     const UInt8* begin = values.data(); | 
| 558 | 66 |     const UInt8* end = begin + values.size(); | 
| 559 | 66 |     const UInt8* pos = begin; | 
| 560 |  |  | 
| 561 |  |     /// Current index in the array of strings. | 
| 562 | 66 |     size_t i = 0; | 
| 563 | 66 |     size_t needle_size = search_state->substring_pattern.get_pattern_length(); | 
| 564 |  |  | 
| 565 |  |     /// We will search for the next occurrence in all strings at once. | 
| 566 | 108 |     while (pos < end) { | 
| 567 |  |         // search return matched substring start offset | 
| 568 | 64 |         pos = (UInt8*)search_state->substring_pattern.search((char*)pos, end - pos); | 
| 569 | 64 |         if (pos >= end) { | 
| 570 | 22 |             break; | 
| 571 | 22 |         } | 
| 572 |  |  | 
| 573 |  |         /// Determine which index it refers to. | 
| 574 |  |         /// begin + value_offsets[i] is the start offset of string at i+1 | 
| 575 | 42 |         while (i < value_offsets.size() && begin + value_offsets[i] < pos) { | 
| 576 | 0 |             ++i; | 
| 577 | 0 |         } | 
| 578 |  |  | 
| 579 |  |         /// We check that the entry does not pass through the boundaries of strings. | 
| 580 | 42 |         if (pos + needle_size <= begin + value_offsets[i]) { | 
| 581 | 42 |             result[i] = 1; | 
| 582 | 42 |         } | 
| 583 |  |  | 
| 584 |  |         // move to next string offset | 
| 585 | 42 |         pos = begin + value_offsets[i]; | 
| 586 | 42 |         ++i; | 
| 587 | 42 |     } | 
| 588 |  |  | 
| 589 | 66 |     return Status::OK(); | 
| 590 | 66 | } | 
| 591 |  |  | 
| 592 |  | Status FunctionLikeBase::vector_const(const ColumnString& values, const StringRef* pattern_val, | 
| 593 |  |                                       ColumnUInt8::Container& result, const LikeFn& function, | 
| 594 | 180 |                                       LikeSearchState* search_state) const { | 
| 595 | 180 |     RETURN_IF_ERROR((function)(search_state, values, | 
| 596 | 180 |                                *reinterpret_cast<const StringRef*>(pattern_val), result)); | 
| 597 | 180 |     return Status::OK(); | 
| 598 | 180 | } | 
| 599 |  |  | 
| 600 |  | template <bool LIKE_PATTERN> | 
| 601 | 307 | VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) { | 
| 602 | 307 |     VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>(); | 
| 603 | 307 |     VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>(); | 
| 604 | 307 |     VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>(); | 
| 605 | 307 |     VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>(); | 
| 606 | 307 |     VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>(); | 
| 607 | 307 |     size_t size = patterns.size(); | 
| 608 |  |  | 
| 609 | 652 |     for (size_t i = 0; i < size; ++i) { | 
| 610 | 357 |         if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched && | 
| 611 | 357 |             !substring_state->_pattern_matched && !starts_with_state->_pattern_matched && | 
| 612 | 357 |             !ends_with_state->_pattern_matched) { | 
| 613 | 12 |             return nullptr; | 
| 614 | 12 |         } | 
| 615 | 345 |         std::string pattern_str = patterns.get_data_at(i).to_string(); | 
| 616 | 345 |         if (allpass_state->_pattern_matched) { | 
| 617 | 312 |             if constexpr (LIKE_PATTERN) { | 
| 618 | 312 |                 allpass_state->like_pattern_match(pattern_str); | 
| 619 | 312 |             } else { | 
| 620 | 0 |                 allpass_state->regexp_pattern_match(pattern_str); | 
| 621 | 0 |             } | 
| 622 | 312 |         } | 
| 623 | 345 |         if (equal_state->_pattern_matched) { | 
| 624 | 315 |             if constexpr (LIKE_PATTERN) { | 
| 625 | 315 |                 equal_state->like_pattern_match(pattern_str); | 
| 626 | 315 |             } else { | 
| 627 | 0 |                 equal_state->regexp_pattern_match(pattern_str); | 
| 628 | 0 |             } | 
| 629 | 315 |         } | 
| 630 | 345 |         if (substring_state->_pattern_matched) { | 
| 631 | 321 |             if constexpr (LIKE_PATTERN) { | 
| 632 | 321 |                 substring_state->like_pattern_match(pattern_str); | 
| 633 | 321 |             } else { | 
| 634 | 0 |                 substring_state->regexp_pattern_match(pattern_str); | 
| 635 | 0 |             } | 
| 636 | 321 |         } | 
| 637 | 345 |         if (starts_with_state->_pattern_matched) { | 
| 638 | 313 |             if constexpr (LIKE_PATTERN) { | 
| 639 | 313 |                 starts_with_state->like_pattern_match(pattern_str); | 
| 640 | 313 |             } else { | 
| 641 | 0 |                 starts_with_state->regexp_pattern_match(pattern_str); | 
| 642 | 0 |             } | 
| 643 | 313 |         } | 
| 644 | 345 |         if (ends_with_state->_pattern_matched) { | 
| 645 | 312 |             if constexpr (LIKE_PATTERN) { | 
| 646 | 312 |                 ends_with_state->like_pattern_match(pattern_str); | 
| 647 | 312 |             } else { | 
| 648 | 0 |                 ends_with_state->regexp_pattern_match(pattern_str); | 
| 649 | 0 |             } | 
| 650 | 312 |         } | 
| 651 | 345 |     } | 
| 652 |  |  | 
| 653 | 295 |     if (allpass_state->_pattern_matched) { | 
| 654 | 21 |         return allpass_state; | 
| 655 | 274 |     } else if (equal_state->_pattern_matched) { | 
| 656 | 78 |         return equal_state; | 
| 657 | 196 |     } else if (substring_state->_pattern_matched) { | 
| 658 | 27 |         return substring_state; | 
| 659 | 169 |     } else if (starts_with_state->_pattern_matched) { | 
| 660 | 31 |         return starts_with_state; | 
| 661 | 138 |     } else if (ends_with_state->_pattern_matched) { | 
| 662 | 31 |         return ends_with_state; | 
| 663 | 107 |     } else { | 
| 664 | 107 |         return nullptr; | 
| 665 | 107 |     } | 
| 666 | 295 | } _ZN5doris10vectorized16FunctionLikeBase24pattern_type_recognitionILb1EEESt10shared_ptrINS0_24VectorPatternSearchStateEERKNS0_9ColumnStrIjEE| Line | Count | Source |  | 601 | 307 | VPatternSearchStateSPtr FunctionLikeBase::pattern_type_recognition(const ColumnString& patterns) { |  | 602 | 307 |     VPatternSearchStateSPtr allpass_state = std::make_shared<VectorAllpassSearchState>(); |  | 603 | 307 |     VPatternSearchStateSPtr equal_state = std::make_shared<VectorEqualSearchState>(); |  | 604 | 307 |     VPatternSearchStateSPtr substring_state = std::make_shared<VectorSubStringSearchState>(); |  | 605 | 307 |     VPatternSearchStateSPtr starts_with_state = std::make_shared<VectorStartsWithSearchState>(); |  | 606 | 307 |     VPatternSearchStateSPtr ends_with_state = std::make_shared<VectorEndsWithSearchState>(); |  | 607 | 307 |     size_t size = patterns.size(); |  | 608 |  |  |  | 609 | 652 |     for (size_t i = 0; i < size; ++i) { |  | 610 | 357 |         if (!allpass_state->_pattern_matched && !equal_state->_pattern_matched && |  | 611 | 357 |             !substring_state->_pattern_matched && !starts_with_state->_pattern_matched && |  | 612 | 357 |             !ends_with_state->_pattern_matched) { |  | 613 | 12 |             return nullptr; |  | 614 | 12 |         } |  | 615 | 345 |         std::string pattern_str = patterns.get_data_at(i).to_string(); |  | 616 | 345 |         if (allpass_state->_pattern_matched) { |  | 617 | 312 |             if constexpr (LIKE_PATTERN) { |  | 618 | 312 |                 allpass_state->like_pattern_match(pattern_str); |  | 619 |  |             } else { |  | 620 |  |                 allpass_state->regexp_pattern_match(pattern_str); |  | 621 |  |             } |  | 622 | 312 |         } |  | 623 | 345 |         if (equal_state->_pattern_matched) { |  | 624 | 315 |             if constexpr (LIKE_PATTERN) { |  | 625 | 315 |                 equal_state->like_pattern_match(pattern_str); |  | 626 |  |             } 
else { |  | 627 |  |                 equal_state->regexp_pattern_match(pattern_str); |  | 628 |  |             } |  | 629 | 315 |         } |  | 630 | 345 |         if (substring_state->_pattern_matched) { |  | 631 | 321 |             if constexpr (LIKE_PATTERN) { |  | 632 | 321 |                 substring_state->like_pattern_match(pattern_str); |  | 633 |  |             } else { |  | 634 |  |                 substring_state->regexp_pattern_match(pattern_str); |  | 635 |  |             } |  | 636 | 321 |         } |  | 637 | 345 |         if (starts_with_state->_pattern_matched) { |  | 638 | 313 |             if constexpr (LIKE_PATTERN) { |  | 639 | 313 |                 starts_with_state->like_pattern_match(pattern_str); |  | 640 |  |             } else { |  | 641 |  |                 starts_with_state->regexp_pattern_match(pattern_str); |  | 642 |  |             } |  | 643 | 313 |         } |  | 644 | 345 |         if (ends_with_state->_pattern_matched) { |  | 645 | 312 |             if constexpr (LIKE_PATTERN) { |  | 646 | 312 |                 ends_with_state->like_pattern_match(pattern_str); |  | 647 |  |             } else { |  | 648 |  |                 ends_with_state->regexp_pattern_match(pattern_str); |  | 649 |  |             } |  | 650 | 312 |         } |  | 651 | 345 |     } |  | 652 |  |  |  | 653 | 295 |     if (allpass_state->_pattern_matched) { |  | 654 | 21 |         return allpass_state; |  | 655 | 274 |     } else if (equal_state->_pattern_matched) { |  | 656 | 78 |         return equal_state; |  | 657 | 196 |     } else if (substring_state->_pattern_matched) { |  | 658 | 27 |         return substring_state; |  | 659 | 169 |     } else if (starts_with_state->_pattern_matched) { |  | 660 | 31 |         return starts_with_state; |  | 661 | 138 |     } else if (ends_with_state->_pattern_matched) { |  | 662 | 31 |         return ends_with_state; |  | 663 | 107 |     } else { |  | 664 | 107 |         return nullptr; |  | 665 | 107 |     } |  | 666 | 
295 | } | 
Unexecuted instantiation: _ZN5doris10vectorized16FunctionLikeBase24pattern_type_recognitionILb0EEESt10shared_ptrINS0_24VectorPatternSearchStateEERKNS0_9ColumnStrIjEE | 
| 667 |  |  | 
| 668 |  | Status FunctionLikeBase::vector_non_const(const ColumnString& values, const ColumnString& patterns, | 
| 669 |  |                                           ColumnUInt8::Container& result, LikeState* state, | 
| 670 | 307 |                                           size_t input_rows_count) const { | 
| 671 | 307 |     ColumnString::MutablePtr replaced_patterns; | 
| 672 | 307 |     VPatternSearchStateSPtr vector_search_state; | 
| 673 | 307 |     if (state->is_like_pattern) { | 
| 674 | 307 |         if (state->has_custom_escape) { | 
| 675 | 0 |             replaced_patterns = ColumnString::create(); | 
| 676 | 0 |             for (int i = 0; i < input_rows_count; ++i) { | 
| 677 | 0 |                 std::string val = | 
| 678 | 0 |                         replace_pattern_by_escape(patterns.get_data_at(i), state->escape_char); | 
| 679 | 0 |                 replaced_patterns->insert_data(val.c_str(), val.size()); | 
| 680 | 0 |             } | 
| 681 | 0 |             vector_search_state = pattern_type_recognition<true>(*replaced_patterns); | 
| 682 | 307 |         } else { | 
| 683 | 307 |             vector_search_state = pattern_type_recognition<true>(patterns); | 
| 684 | 307 |         } | 
| 685 | 307 |     } else { | 
| 686 | 0 |         vector_search_state = pattern_type_recognition<false>(patterns); | 
| 687 | 0 |     } | 
| 688 |  |  | 
| 689 | 307 |     const ColumnString& real_pattern = state->has_custom_escape ? *replaced_patterns : patterns; | 
| 690 |  |  | 
| 691 | 307 |     if (vector_search_state == nullptr) { | 
| 692 |  |         // pattern type recognition failed, use default case | 
| 693 | 381 |         for (int i = 0; i < input_rows_count; ++i) { | 
| 694 | 262 |             const auto pattern_val = real_pattern.get_data_at(i); | 
| 695 | 262 |             const auto value_val = values.get_data_at(i); | 
| 696 | 262 |             RETURN_IF_ERROR((state->scalar_function)(&state->search_state, value_val, pattern_val, | 
| 697 | 262 |                                                      &result[i])); | 
| 698 | 262 |         } | 
| 699 | 119 |         return Status::OK(); | 
| 700 | 119 |     } | 
| 701 | 188 |     const auto* search_strings = | 
| 702 | 188 |             static_cast<const ColumnString*>(vector_search_state->_search_strings.get()); | 
| 703 | 188 |     return (vector_search_state->_vector_function)(values, *search_strings, result); | 
| 704 | 307 | } | 
| 705 |  |  | 
| 706 |  | Status FunctionLike::like_fn(const LikeSearchState* state, const ColumnString& val, | 
| 707 | 0 |                              const StringRef& pattern, ColumnUInt8::Container& result) { | 
| 708 | 0 |     std::string re_pattern; | 
| 709 | 0 |     convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern); | 
| 710 | 0 |     return regexp_fn(state, val, {re_pattern.c_str(), re_pattern.size()}, result); | 
| 711 | 0 | } | 
| 712 |  |  | 
| 713 |  | Status FunctionLike::like_fn_scalar(const LikeSearchState* state, const StringRef& val, | 
| 714 | 209 |                                     const StringRef& pattern, unsigned char* result) { | 
| 715 | 209 |     std::string re_pattern; | 
| 716 | 209 |     convert_like_pattern(state, std::string(pattern.data, pattern.size), &re_pattern); | 
| 717 |  |  | 
| 718 | 209 |     return regexp_fn_scalar(state, StringRef(val.data, val.size), | 
| 719 | 209 |                             {re_pattern.c_str(), re_pattern.size()}, result); | 
| 720 | 209 | } | 
| 721 |  |  | 
| 722 |  | void FunctionLike::convert_like_pattern(const LikeSearchState* state, const std::string& pattern, | 
| 723 | 319 |                                         std::string* re_pattern) { | 
| 724 | 319 |     re_pattern->clear(); | 
| 725 |  |  | 
| 726 | 319 |     if (pattern.empty()) { | 
| 727 | 14 |         re_pattern->append("^$"); | 
| 728 | 14 |         return; | 
| 729 | 14 |     } | 
| 730 |  |  | 
| 731 |  |     // add ^ to pattern head to match line head | 
| 732 | 305 |     if (!pattern.empty() && pattern[0] != '%') { | 
| 733 | 229 |         re_pattern->append("^"); | 
| 734 | 229 |     } | 
| 735 |  |  | 
| 736 |  |     // expect % and _, all chars should keep it literal mean. | 
| 737 | 1.96k |     for (size_t i = 0; i < pattern.size(); i++) { | 
| 738 | 1.66k |         char c = pattern[i]; | 
| 739 | 1.66k |         if (c == '\\' && i + 1 < pattern.size()) { | 
| 740 | 71 |             char next_c = pattern[i + 1]; | 
| 741 | 71 |             if (next_c == '%' || next_c == '_') { | 
| 742 |  |                 // convert "\%" and "\_" to literal "%" and "_" | 
| 743 | 35 |                 re_pattern->append(1, next_c); | 
| 744 | 35 |                 i++; | 
| 745 | 35 |                 continue; | 
| 746 | 36 |             } else if (next_c == '\\') { | 
| 747 |  |                 // keep valid escape "\\" | 
| 748 | 18 |                 re_pattern->append("\\\\"); | 
| 749 | 18 |                 i++; | 
| 750 | 18 |                 continue; | 
| 751 | 18 |             } | 
| 752 | 71 |         } | 
| 753 |  |  | 
| 754 | 1.61k |         if (c == '%') { | 
| 755 | 283 |             re_pattern->append(".*"); | 
| 756 | 1.32k |         } else if (c == '_') { | 
| 757 | 256 |             re_pattern->append("."); | 
| 758 | 1.07k |         } else { | 
| 759 |  |             // special for hyperscan: [, ], (, ), {, }, -, *, +, \, |, /, :, ^, ., $, ? | 
| 760 | 1.07k |             if (c == '[' || c == ']' || c == '(' || c == ')' || c == '{' || c == '}' || c == '-' || | 
| 761 | 1.07k |                 c == '*' || c == '+' || c == '\\' || c == '|' || c == '/' || c == ':' || c == '^' || | 
| 762 | 1.07k |                 c == '.' || c == '$' || c == '?') { | 
| 763 | 37 |                 re_pattern->append(1, '\\'); | 
| 764 | 37 |             } | 
| 765 | 1.07k |             re_pattern->append(1, c); | 
| 766 | 1.07k |         } | 
| 767 | 1.61k |     } | 
| 768 |  |  | 
| 769 |  |     // add $ to pattern tail to match line tail | 
| 770 | 305 |     if (!pattern.empty() && re_pattern->back() != '*') { | 
| 771 | 195 |         re_pattern->append("$"); | 
| 772 | 195 |     } | 
| 773 | 305 | } | 
| 774 |  |  | 
| 775 | 415 | void FunctionLike::remove_escape_character(std::string* search_string) { | 
| 776 | 415 |     std::string tmp_search_string; | 
| 777 | 415 |     tmp_search_string.swap(*search_string); | 
| 778 | 415 |     int64_t len = tmp_search_string.length(); | 
| 779 |  |     // sometime 'like' may allowed converted to 'equals/start_with/end_with/sub_with' | 
| 780 |  |     // so we need to remove escape from pattern to construct search string and use to do 'equals/start_with/end_with/sub_with' | 
| 781 | 1.87k |     for (int i = 0; i < len;) { | 
| 782 | 1.46k |         if (tmp_search_string[i] == '\\' && i + 1 < len && | 
| 783 | 1.46k |             (tmp_search_string[i + 1] == '%' || tmp_search_string[i + 1] == '_' || | 
| 784 | 44 |              tmp_search_string[i + 1] == '\\')) { | 
| 785 | 36 |             search_string->append(1, tmp_search_string[i + 1]); | 
| 786 | 36 |             i += 2; | 
| 787 | 1.42k |         } else { | 
| 788 | 1.42k |             search_string->append(1, tmp_search_string[i]); | 
| 789 | 1.42k |             i++; | 
| 790 | 1.42k |         } | 
| 791 | 1.46k |     } | 
| 792 | 415 | } | 
| 793 |  |  | 
| 794 | 0 | bool re2_full_match(const std::string& str, const RE2& re, std::vector<std::string>& results) { | 
| 795 | 0 |     if (!re.ok()) { | 
| 796 | 0 |         return false; | 
| 797 | 0 |     } | 
| 798 |  |  | 
| 799 | 0 |     std::vector<RE2::Arg> arguments; | 
| 800 | 0 |     std::vector<RE2::Arg*> arguments_ptrs; | 
| 801 | 0 |     std::size_t args_count = re.NumberOfCapturingGroups(); | 
| 802 | 0 |     arguments.resize(args_count); | 
| 803 | 0 |     arguments_ptrs.resize(args_count); | 
| 804 | 0 |     results.resize(args_count); | 
| 805 | 0 |     for (std::size_t i = 0; i < args_count; ++i) { | 
| 806 | 0 |         arguments[i] = &results[i]; | 
| 807 | 0 |         arguments_ptrs[i] = &arguments[i]; | 
| 808 | 0 |     } | 
| 809 |  | 
 | 
| 810 | 0 |     return RE2::FullMatchN(str, re, arguments_ptrs.data(), (int)args_count); | 
| 811 | 0 | } | 
| 812 |  |  | 
| 813 | 0 | void verbose_log_match(const std::string& str, const std::string& pattern_name, const RE2& re) { | 
| 814 | 0 |     std::vector<std::string> results; | 
| 815 | 0 |     VLOG_DEBUG << "arg str: " << str << ", size: " << str.size() << ", pattern " << pattern_name | 
| 816 | 0 |                << ": " << re.pattern() << ", size: " << re.pattern().size(); | 
| 817 | 0 |     if (re2_full_match(str, re, results)) { | 
| 818 | 0 |         for (int i = 0; i < results.size(); ++i) { | 
| 819 | 0 |             VLOG_DEBUG << "match " << i << ": " << results[i] << ", size: " << results[i].size(); | 
| 820 | 0 |         } | 
| 821 | 0 |     } else { | 
| 822 | 0 |         VLOG_DEBUG << "no match"; | 
| 823 | 0 |     } | 
| 824 | 0 | } | 
| 825 |  |  | 
| 826 |  | Status FunctionLike::construct_like_const_state(FunctionContext* context, const StringRef& pattern, | 
| 827 |  |                                                 std::shared_ptr<LikeState>& state, | 
| 828 | 347 |                                                 bool try_hyperscan) { | 
| 829 | 347 |     std::string pattern_str; | 
| 830 | 347 |     if (state->has_custom_escape) { | 
| 831 | 1 |         pattern_str = replace_pattern_by_escape(pattern, state->escape_char); | 
| 832 | 346 |     } else { | 
| 833 | 346 |         pattern_str = pattern.to_string(); | 
| 834 | 346 |     } | 
| 835 | 347 |     state->search_state.pattern_str = pattern_str; | 
| 836 | 347 |     std::string search_string; | 
| 837 |  |  | 
| 838 | 347 |     if (!pattern_str.empty() && RE2::FullMatch(pattern_str, LIKE_ALLPASS_RE)) { | 
| 839 | 22 |         state->search_state.set_search_string(""); | 
| 840 | 22 |         state->function = constant_allpass_fn; | 
| 841 | 22 |         state->scalar_function = constant_allpass_fn_scalar; | 
| 842 | 325 |     } else if (pattern_str.empty() || RE2::FullMatch(pattern_str, LIKE_EQUALS_RE, &search_string)) { | 
| 843 | 95 |         if (VLOG_DEBUG_IS_ON) { | 
| 844 | 0 |             verbose_log_match(pattern_str, "LIKE_EQUALS_RE", LIKE_EQUALS_RE); | 
| 845 | 0 |             VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size(); | 
| 846 | 0 |         } | 
| 847 | 95 |         remove_escape_character(&search_string); | 
| 848 | 95 |         if (VLOG_DEBUG_IS_ON) { | 
| 849 | 0 |             VLOG_DEBUG << "search_string escape removed: " << search_string | 
| 850 | 0 |                        << ", size: " << search_string.size(); | 
| 851 | 0 |         } | 
| 852 | 95 |         state->search_state.set_search_string(search_string); | 
| 853 | 95 |         state->function = constant_equals_fn; | 
| 854 | 95 |         state->scalar_function = constant_equals_fn_scalar; | 
| 855 | 230 |     } else if (RE2::FullMatch(pattern_str, LIKE_STARTS_WITH_RE, &search_string)) { | 
| 856 | 32 |         if (VLOG_DEBUG_IS_ON) { | 
| 857 | 0 |             verbose_log_match(pattern_str, "LIKE_STARTS_WITH_RE", LIKE_STARTS_WITH_RE); | 
| 858 | 0 |             VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size(); | 
| 859 | 0 |         } | 
| 860 | 32 |         remove_escape_character(&search_string); | 
| 861 | 32 |         if (VLOG_DEBUG_IS_ON) { | 
| 862 | 0 |             VLOG_DEBUG << "search_string escape removed: " << search_string | 
| 863 | 0 |                        << ", size: " << search_string.size(); | 
| 864 | 0 |         } | 
| 865 | 32 |         state->search_state.set_search_string(search_string); | 
| 866 | 32 |         state->function = constant_starts_with_fn; | 
| 867 | 32 |         state->scalar_function = constant_starts_with_fn_scalar; | 
| 868 | 198 |     } else if (RE2::FullMatch(pattern_str, LIKE_ENDS_WITH_RE, &search_string)) { | 
| 869 | 32 |         if (VLOG_DEBUG_IS_ON) { | 
| 870 | 0 |             verbose_log_match(pattern_str, "LIKE_ENDS_WITH_RE", LIKE_ENDS_WITH_RE); | 
| 871 | 0 |             VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size(); | 
| 872 | 0 |         } | 
| 873 | 32 |         remove_escape_character(&search_string); | 
| 874 | 32 |         if (VLOG_DEBUG_IS_ON) { | 
| 875 | 0 |             VLOG_DEBUG << "search_string escape removed: " << search_string | 
| 876 | 0 |                        << ", size: " << search_string.size(); | 
| 877 | 0 |         } | 
| 878 | 32 |         state->search_state.set_search_string(search_string); | 
| 879 | 32 |         state->function = constant_ends_with_fn; | 
| 880 | 32 |         state->scalar_function = constant_ends_with_fn_scalar; | 
| 881 | 166 |     } else if (RE2::FullMatch(pattern_str, LIKE_SUBSTRING_RE, &search_string)) { | 
| 882 | 56 |         if (VLOG_DEBUG_IS_ON) { | 
| 883 | 0 |             verbose_log_match(pattern_str, "LIKE_SUBSTRING_RE", LIKE_SUBSTRING_RE); | 
| 884 | 0 |             VLOG_DEBUG << "search_string : " << search_string << ", size: " << search_string.size(); | 
| 885 | 0 |         } | 
| 886 | 56 |         remove_escape_character(&search_string); | 
| 887 | 56 |         if (VLOG_DEBUG_IS_ON) { | 
| 888 | 0 |             VLOG_DEBUG << "search_string escape removed: " << search_string | 
| 889 | 0 |                        << ", size: " << search_string.size(); | 
| 890 | 0 |         } | 
| 891 | 56 |         state->search_state.set_search_string(search_string); | 
| 892 | 56 |         state->function = constant_substring_fn; | 
| 893 | 56 |         state->scalar_function = constant_substring_fn_scalar; | 
| 894 | 110 |     } else { | 
| 895 | 110 |         std::string re_pattern; | 
| 896 | 110 |         convert_like_pattern(&state->search_state, pattern_str, &re_pattern); | 
| 897 | 110 |         if (VLOG_DEBUG_IS_ON) { | 
| 898 | 0 |             VLOG_DEBUG << "hyperscan, pattern str: " << pattern_str | 
| 899 | 0 |                        << ", size: " << pattern_str.size() << ", re pattern: " << re_pattern | 
| 900 | 0 |                        << ", size: " << re_pattern.size(); | 
| 901 | 0 |         } | 
| 902 |  |  | 
| 903 | 110 |         hs_database_t* database = nullptr; | 
| 904 | 110 |         hs_scratch_t* scratch = nullptr; | 
| 905 | 110 |         if (try_hyperscan && hs_prepare(context, re_pattern.c_str(), &database, &scratch).ok()) { | 
| 906 |  |             // use hyperscan | 
| 907 | 110 |             state->search_state.hs_database.reset(database); | 
| 908 | 110 |             state->search_state.hs_scratch.reset(scratch); | 
| 909 | 110 |         } else { | 
| 910 |  |             // fallback to re2 | 
| 911 |  |             // reset hs_database to nullptr to indicate not use hyperscan | 
| 912 | 0 |             state->search_state.hs_database.reset(); | 
| 913 | 0 |             state->search_state.hs_scratch.reset(); | 
| 914 |  | 
 | 
| 915 | 0 |             RE2::Options opts; | 
| 916 | 0 |             opts.set_never_nl(false); | 
| 917 | 0 |             opts.set_dot_nl(true); | 
| 918 | 0 |             state->search_state.regex = std::make_unique<RE2>(re_pattern, opts); | 
| 919 | 0 |             if (!state->search_state.regex->ok()) { | 
| 920 | 0 |                 return Status::InternalError("Invalid regex expression: {}(origin: {})", re_pattern, | 
| 921 | 0 |                                              pattern_str); | 
| 922 | 0 |             } | 
| 923 | 0 |         } | 
| 924 |  |  | 
| 925 | 110 |         state->function = constant_regex_fn; | 
| 926 | 110 |         state->scalar_function = constant_regex_fn_scalar; | 
| 927 | 110 |     } | 
| 928 | 347 |     return Status::OK(); | 
| 929 | 347 | } | 
| 930 |  |  | 
| 931 | 1.07k | Status FunctionLike::open(FunctionContext* context, FunctionContext::FunctionStateScope scope) { | 
| 932 | 1.07k |     if (scope != FunctionContext::THREAD_LOCAL) { | 
| 933 | 536 |         return Status::OK(); | 
| 934 | 536 |     } | 
| 935 | 536 |     std::shared_ptr<LikeState> state = std::make_shared<LikeState>(); | 
| 936 | 536 |     state->is_like_pattern = true; | 
| 937 | 536 |     state->function = like_fn; | 
| 938 | 536 |     state->scalar_function = like_fn_scalar; | 
| 939 | 536 |     if (context->is_col_constant(2)) { | 
| 940 | 1 |         state->has_custom_escape = true; | 
| 941 | 1 |         const auto escape_col = context->get_constant_col(2)->column_ptr; | 
| 942 | 1 |         const auto& escape = escape_col->get_data_at(0); | 
| 943 | 1 |         if (escape.size != 1) { | 
| 944 | 0 |             return Status::InternalError("Escape character must be a single character, got: {}", | 
| 945 | 0 |                                          escape.to_string()); | 
| 946 | 0 |         } | 
| 947 | 1 |         state->escape_char = escape.data[0]; | 
| 948 | 1 |     } | 
| 949 | 536 |     if (context->is_col_constant(1)) { | 
| 950 | 347 |         const auto pattern_col = context->get_constant_col(1)->column_ptr; | 
| 951 | 347 |         const auto& pattern = pattern_col->get_data_at(0); | 
| 952 | 347 |         RETURN_IF_ERROR(construct_like_const_state(context, pattern, state)); | 
| 953 | 347 |     } | 
| 954 | 536 |     context->set_function_state(scope, state); | 
| 955 |  |  | 
| 956 | 536 |     return Status::OK(); | 
| 957 | 536 | } | 
| 958 |  |  | 
| 959 |  | Status FunctionRegexpLike::open(FunctionContext* context, | 
| 960 | 122 |                                 FunctionContext::FunctionStateScope scope) { | 
| 961 | 122 |     if (scope != FunctionContext::THREAD_LOCAL) { | 
| 962 | 61 |         return Status::OK(); | 
| 963 | 61 |     } | 
| 964 | 61 |     std::shared_ptr<LikeState> state = std::make_shared<LikeState>(); | 
| 965 | 61 |     context->set_function_state(scope, state); | 
| 966 | 61 |     state->is_like_pattern = false; | 
| 967 | 61 |     state->function = regexp_fn; | 
| 968 | 61 |     state->scalar_function = regexp_fn_scalar; | 
| 969 | 61 |     if (context->is_col_constant(1)) { | 
| 970 | 61 |         const auto pattern_col = context->get_constant_col(1)->column_ptr; | 
| 971 | 61 |         const auto& pattern = pattern_col->get_data_at(0); | 
| 972 |  |  | 
| 973 | 61 |         std::string pattern_str = pattern.to_string(); | 
| 974 | 61 |         std::string search_string; | 
| 975 | 61 |         if (RE2::FullMatch(pattern_str, ALLPASS_RE)) { | 
| 976 | 4 |             state->search_state.set_search_string(""); | 
| 977 | 4 |             state->function = constant_allpass_fn; | 
| 978 | 4 |             state->scalar_function = constant_allpass_fn_scalar; | 
| 979 | 57 |         } else if (RE2::FullMatch(pattern_str, EQUALS_RE, &search_string)) { | 
| 980 | 7 |             state->search_state.set_search_string(search_string); | 
| 981 | 7 |             state->function = constant_equals_fn; | 
| 982 | 7 |             state->scalar_function = constant_equals_fn_scalar; | 
| 983 | 50 |         } else if (RE2::FullMatch(pattern_str, STARTS_WITH_RE, &search_string)) { | 
| 984 | 7 |             state->search_state.set_search_string(search_string); | 
| 985 | 7 |             state->function = constant_starts_with_fn; | 
| 986 | 7 |             state->scalar_function = constant_starts_with_fn_scalar; | 
| 987 | 43 |         } else if (RE2::FullMatch(pattern_str, ENDS_WITH_RE, &search_string)) { | 
| 988 | 7 |             state->search_state.set_search_string(search_string); | 
| 989 | 7 |             state->function = constant_ends_with_fn; | 
| 990 | 7 |             state->scalar_function = constant_ends_with_fn_scalar; | 
| 991 | 36 |         } else if (RE2::FullMatch(pattern_str, SUBSTRING_RE, &search_string)) { | 
| 992 | 14 |             state->search_state.set_search_string(search_string); | 
| 993 | 14 |             state->function = constant_substring_fn; | 
| 994 | 14 |             state->scalar_function = constant_substring_fn_scalar; | 
| 995 | 22 |         } else { | 
| 996 | 22 |             hs_database_t* database = nullptr; | 
| 997 | 22 |             hs_scratch_t* scratch = nullptr; | 
| 998 | 22 |             if (hs_prepare(context, pattern_str.c_str(), &database, &scratch).ok()) { | 
| 999 |  |                 // use hyperscan | 
| 1000 | 22 |                 state->search_state.hs_database.reset(database); | 
| 1001 | 22 |                 state->search_state.hs_scratch.reset(scratch); | 
| 1002 | 22 |             } else { | 
| 1003 |  |                 // fallback to re2 | 
| 1004 |  |                 // reset hs_database to nullptr to indicate not use hyperscan | 
| 1005 | 0 |                 state->search_state.hs_database.reset(); | 
| 1006 | 0 |                 state->search_state.hs_scratch.reset(); | 
| 1007 | 0 |                 RE2::Options opts; | 
| 1008 | 0 |                 opts.set_never_nl(false); | 
| 1009 | 0 |                 opts.set_dot_nl(true); | 
| 1010 | 0 |                 state->search_state.regex = std::make_unique<RE2>(pattern_str, opts); | 
| 1011 | 0 |                 if (!state->search_state.regex->ok()) { | 
| 1012 | 0 |                     return Status::InternalError("Invalid regex expression: {}", pattern_str); | 
| 1013 | 0 |                 } | 
| 1014 | 0 |             } | 
| 1015 | 22 |             state->function = constant_regex_fn; | 
| 1016 | 22 |             state->scalar_function = constant_regex_fn_scalar; | 
| 1017 | 22 |         } | 
| 1018 | 61 |     } | 
| 1019 | 61 |     return Status::OK(); | 
| 1020 | 61 | } | 
| 1021 |  |  | 
| 1022 | 1 | void register_function_like(SimpleFunctionFactory& factory) { | 
| 1023 | 1 |     factory.register_function<FunctionLike>(); | 
| 1024 | 1 | } | 
| 1025 |  |  | 
| 1026 | 1 | void register_function_regexp(SimpleFunctionFactory& factory) { | 
| 1027 | 1 |     factory.register_function<FunctionRegexpLike>(); | 
| 1028 | 1 |     factory.register_alias(FunctionRegexpLike::name, FunctionRegexpLike::alias); | 
| 1029 | 1 | } | 
| 1030 |  | #include "common/compile_check_end.h" | 
| 1031 |  | } // namespace doris::vectorized |