Coverage Report

Created: 2026-04-14 05:46

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/itoken_extractor.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#ifndef DORIS_ITOKEN_EXTRACTOR_H
19
#define DORIS_ITOKEN_EXTRACTOR_H
20
21
#include <stddef.h>
22
23
#include <string>
24
25
#include "storage/index/bloom_filter/bloom_filter.h"
26
27
namespace doris {
28
29
/// Interface for string parsers.
30
struct ITokenExtractor {
31
3.40k
    virtual ~ITokenExtractor() = default;
32
33
    /// Fast inplace implementation for regular use.
34
    /// Gets string (data ptr and len) and start position for extracting next token (state of extractor).
35
    /// Returns false if parsing is finished, otherwise returns true.
36
    virtual bool next_in_string(const char* data, size_t length, size_t* __restrict pos,
37
                                size_t* __restrict token_start,
38
                                size_t* __restrict token_length) const = 0;
39
40
    /// Special implementation for creating bloom filter for LIKE function.
41
    /// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight.
42
    virtual bool next_in_string_like(const char* data, size_t length, size_t* pos,
43
                                     std::string& out) const = 0;
44
45
    virtual void string_to_bloom_filter(const char* data, size_t length,
46
                                        segment_v2::BloomFilter& bloom_filter) const = 0;
47
48
    virtual bool string_like_to_bloom_filter(const char* data, size_t length,
49
                                             segment_v2::BloomFilter& bloom_filter) const = 0;
50
};
51
52
template <typename Derived>
53
class ITokenExtractorHelper : public ITokenExtractor {
54
public:
55
    void string_to_bloom_filter(const char* data, size_t length,
56
17.9k
                                segment_v2::BloomFilter& bloom_filter) const override {
57
17.9k
        size_t cur = 0;
58
17.9k
        size_t token_start = 0;
59
17.9k
        size_t token_len = 0;
60
61
997k
        while (cur < length && static_cast<const Derived*>(this)->next_in_string(
62
997k
                                       data, length, &cur, &token_start, &token_len)) {
63
979k
            bloom_filter.add_bytes(data + token_start, token_len);
64
979k
        }
65
17.9k
    }
66
67
    bool string_like_to_bloom_filter(const char* data, size_t length,
68
37
                                     segment_v2::BloomFilter& bloom_filter) const override {
69
37
        size_t cur = 0;
70
37
        bool added = false;
71
37
        std::string token;
72
319
        while (cur < length &&
73
319
               static_cast<const Derived*>(this)->next_in_string_like(data, length, &cur, token)) {
74
282
            bloom_filter.add_bytes(token.data(), token.size());
75
282
            added = true;
76
282
        }
77
78
37
        return added;
79
37
    }
80
};
81
82
/// Parser extracting all ngrams from string.
83
struct NgramTokenExtractor final : public ITokenExtractorHelper<NgramTokenExtractor> {
84
public:
85
3.43k
    explicit NgramTokenExtractor(size_t n_) : n(n_) {}
86
87
    bool next_in_string(const char* data, size_t length, size_t* __restrict pos,
88
                        size_t* __restrict token_start,
89
                        size_t* __restrict token_length) const override;
90
91
    bool next_in_string_like(const char* data, size_t length, size_t* pos,
92
                             std::string& token) const override;
93
94
private:
95
    size_t n;
96
};
97
} // namespace doris
98
#endif //DORIS_ITOKEN_EXTRACTOR_H