Coverage Report

Created: 2026-03-16 13:13

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/itoken_extractor.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "storage/itoken_extractor.h"
19
20
#include <stdint.h>
21
22
#include "util/simd/vstring_function.h"
23
24
namespace doris {
25
26
// Computes the byte range of the token that begins at byte offset *pos in
// data[0, length), then moves *pos forward by one character.
//
// data         - input bytes; presumably UTF-8 text (sizes come from
//                get_utf8_byte_length) -- TODO confirm with callers.
// length       - number of bytes available in data.
// pos          - in/out: token start on entry; on exit it has been advanced by
//                the byte length reported for the byte at the entry position.
// token_start  - out: set to the entry value of *pos.
// token_length - out: sum of the byte lengths of the characters consumed.
//
// Returns true only when exactly `n` characters were consumed; `n` is not
// declared in this block (presumably a member holding the n-gram size).
// The outputs are written even when the function returns false.
//
// NOTE(review): data[*pos] is read below without comparing *pos to length;
// presumably callers only call this with *pos < length -- verify.
bool NgramTokenExtractor::next_in_string(const char* data, size_t length, size_t* __restrict pos,
27
                                         size_t* __restrict token_start,
28
1.01M
                                         size_t* __restrict token_length) const {
29
1.01M
    *token_start = *pos;
30
1.01M
    *token_length = 0;
31
1.01M
    size_t code_points = 0;
32
9.59M
    // Consume up to n characters, stopping early once the running end offset
    // reaches length.
    for (; code_points < n && *token_start + *token_length < length; ++code_points) {
33
8.58M
        // The byte count is derived from the current character's first byte only.
        // NOTE(review): the bound is tested before sz is added, so the token end
        // can pass length if the helper reports more bytes than remain (e.g. a
        // truncated trailing sequence) -- confirm whether input is validated.
        size_t sz = get_utf8_byte_length(static_cast<uint8_t>(data[*token_start + *token_length]));
34
8.58M
        *token_length += sz;
35
8.58M
    }
36
1.01M
    // Step the cursor by a single character (not by the whole token), so the
    // next call starts one character later.
    *pos += get_utf8_byte_length(static_cast<uint8_t>(data[*pos]));
37
1.01M
    return code_points == n;
38
1.01M
}
39
40
// Builds the next token from data[0, length) into `token`, scanning from byte
// offset *pos. '%' and '_' are treated as separators unless preceded by a
// backslash (presumably SQL LIKE pattern syntax -- TODO confirm with callers).
//
// data   - input bytes.
// length - number of bytes available in data.
// pos    - in/out: scan start on entry; moved past any unescaped '%' / '_'
//          encountered, and advanced by one character when a token is returned.
// token  - out: cleared on entry, then filled with the unescaped token bytes.
//
// Returns true once `n` characters have been collected (`n` is not declared in
// this block; presumably a member holding the n-gram size). Returns false when
// the end of the input is reached first; in that case `token` is not cleared
// and may hold a partial token.
bool NgramTokenExtractor::next_in_string_like(const char* data, size_t length, size_t* pos,
41
419
                                              std::string& token) const {
42
419
    token.clear();
43
44
419
    size_t code_points = 0;
45
419
    bool escaped = false;
46
1.50k
    // i is advanced inside each branch, hence no increment in the loop header.
    for (size_t i = *pos; i < length;) {
47
1.45k
        // Escaped special character: append it literally as one character.
        if (escaped && (data[i] == '%' || data[i] == '_' || data[i] == '\\')) {
48
5
            token += data[i];
49
5
            ++code_points;
50
5
            escaped = false;
51
5
            ++i;
52
1.45k
        // Unescaped wildcard: discard what was collected and restart after it.
        } else if (!escaped && (data[i] == '%' || data[i] == '_')) {
53
            /// This token is too small, go to the next.
54
57
            token.clear();
55
57
            code_points = 0;
56
57
            escaped = false;
57
57
            *pos = ++i;
58
1.39k
        // Unescaped backslash: remember it and skip it; it is not added to token.
        } else if (!escaped && data[i] == '\\') {
59
5
            escaped = true;
60
5
            ++i;
61
1.39k
        // Any other character, including a non-special character that follows a
        // backslash: copy all of its bytes and count it as one character.
        } else {
62
1.39k
            const size_t sz = get_utf8_byte_length(static_cast<uint8_t>(data[i]));
63
2.82k
            // NOTE(review): i + j is not compared against length here; if the
            // helper reports more bytes than remain this reads past the input --
            // confirm whether input is validated upstream.
            for (size_t j = 0; j < sz; ++j) {
64
1.43k
                token += data[i + j];
65
1.43k
            }
66
1.39k
            i += sz;
67
1.39k
            ++code_points;
68
1.39k
            escaped = false;
69
1.39k
        }
70
71
1.45k
        if (code_points == n) {
72
369
            // Advance the start by the size of the single character at *pos, so
            // the next call begins one character later. That byte may be a
            // backslash, in which case the next call starts on the escaped
            // character with escaped == false.
            *pos += get_utf8_byte_length(static_cast<uint8_t>(data[*pos]));
73
369
            return true;
74
369
        }
75
1.45k
    }
76
77
50
    return false;
78
419
}
79
} // namespace doris