Coverage Report

Created: 2026-03-13 09:58

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/itoken_extractor.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#ifndef DORIS_ITOKEN_EXTRACTOR_H
19
#define DORIS_ITOKEN_EXTRACTOR_H
20
21
#include <stddef.h>
22
23
#include <string>
24
25
#include "storage/index/bloom_filter/bloom_filter.h"
26
27
namespace doris {
28
#include "common/compile_check_begin.h"
29
30
/// Interface for string parsers.
/// Declares token iteration (next_in_string / next_in_string_like) plus two entry points that
/// receive a segment_v2::BloomFilter. Every member function is pure virtual; the destructor is
/// virtual so an object can be destroyed through an ITokenExtractor pointer.
31
struct ITokenExtractor {
32
3.40k
    virtual ~ITokenExtractor() = default;
33
34
    /// Fast inplace implementation for regular use.
35
    /// Gets string (data ptr and len) and start position for extracting next token (state of extractor).
36
    /// Returns false if parsing is finished, otherwise returns true.
    /// The token is reported through `token_start` / `token_length`; ITokenExtractorHelper in this
    /// file consumes it as `*token_length` bytes starting at `data + *token_start`.
    /// `pos` is the in/out cursor. NOTE(review): implementations presumably advance it on every
    /// call (the helper loops on `*pos < length`) -- confirm against the concrete extractor.
    /// The three `__restrict` pointers must not alias one another.
37
    virtual bool next_in_string(const char* data, size_t length, size_t* __restrict pos,
38
                                size_t* __restrict token_start,
39
                                size_t* __restrict token_length) const = 0;
40
41
    /// Special implementation for creating bloom filter for LIKE function.
42
    /// It skips unescaped `%` and `_` and supports escaping symbols, but it is less lightweight.
    /// The token text is delivered in `out` instead of as an offset/length pair.
    /// ITokenExtractorHelper in this file treats a `true` return as "`out` holds a token" and stops
    /// iterating at the first `false`.
43
    virtual bool next_in_string_like(const char* data, size_t length, size_t* pos,
44
                                     std::string& out) const = 0;
45
    /// Processes `length` bytes at `data` into `bloom_filter`.
    /// ITokenExtractorHelper in this file implements it by adding every token produced by
    /// next_in_string().
46
    virtual void string_to_bloom_filter(const char* data, size_t length,
47
                                        segment_v2::BloomFilter& bloom_filter) const = 0;
48
    /// LIKE-pattern counterpart of string_to_bloom_filter().
    /// ITokenExtractorHelper in this file implements it with next_in_string_like() and returns
    /// true only if at least one token was added to `bloom_filter`.
49
    virtual bool string_like_to_bloom_filter(const char* data, size_t length,
50
                                             segment_v2::BloomFilter& bloom_filter) const = 0;
51
};
52
53
/// CRTP helper that implements the two bloom-filter entry points of ITokenExtractor on top of the
/// token iteration supplied by `Derived`.
/// `this` is cast to `const Derived*` before calling next_in_string / next_in_string_like, so the
/// call is made through the concrete type. NOTE(review): presumably this is to let the compiler
/// devirtualize the per-token call when `Derived` is final -- confirm intent.
template <typename Derived>
54
class ITokenExtractorHelper : public ITokenExtractor {
55
public:
56
    /// Adds every token found in the `length` bytes at `data` to `bloom_filter`.
    /// Tokens are passed to the filter in place as (data + token_start, token_len); nothing is
    /// added when `length` is 0 because the loop condition fails immediately.
    void string_to_bloom_filter(const char* data, size_t length,
57
17.9k
                                segment_v2::BloomFilter& bloom_filter) const override {
58
17.9k
        size_t cur = 0; // cursor handed to Derived::next_in_string as `pos`
59
17.9k
        size_t token_start = 0;
60
17.9k
        size_t token_len = 0;
61
        // Iterate until the cursor reaches the end of input or Derived reports no further token.
62
998k
        while (cur < length && static_cast<const Derived*>(this)->next_in_string(
63
998k
                                       data, length, &cur, &token_start, &token_len)) {
64
980k
            bloom_filter.add_bytes(data + token_start, token_len);
65
980k
        }
66
17.9k
    }
67
    /// LIKE-pattern variant: adds every token produced by Derived::next_in_string_like.
    /// Returns true if at least one token was added, false otherwise (including `length` == 0).
68
    bool string_like_to_bloom_filter(const char* data, size_t length,
69
37
                                     segment_v2::BloomFilter& bloom_filter) const override {
70
37
        size_t cur = 0; // cursor handed to Derived::next_in_string_like as `pos`
71
37
        bool added = false;
72
37
        std::string token; // reused across iterations; Derived writes each token into it
73
324
        while (cur < length &&
74
328
               static_cast<const Derived*>(this)->next_in_string_like(data, length, &cur, token)) {
75
287
            bloom_filter.add_bytes(token.data(), token.size());
76
287
            added = true;
77
287
        }
78
        // `added` is still false when the loop body never ran.
79
37
        return added;
80
37
    }
81
};
82
83
/// Parser extracting all ngrams from string.
/// Inherits the bloom-filter entry points from ITokenExtractorHelper<NgramTokenExtractor> and only
/// declares the two token-iteration overrides here; their definitions are out of line and not
/// visible in this header. The type is `final`, so it cannot be derived from.
84
struct NgramTokenExtractor final : public ITokenExtractorHelper<NgramTokenExtractor> {
85
public:
86
3.44k
    /// Stores `n_` in `n`. `explicit` prevents implicit conversion from size_t.
    explicit NgramTokenExtractor(size_t n_) : n(n_) {}
87
88
    /// Override of ITokenExtractor::next_in_string (declaration only).
    bool next_in_string(const char* data, size_t length, size_t* __restrict pos,
89
                        size_t* __restrict token_start,
90
                        size_t* __restrict token_length) const override;
91
92
    /// Override of ITokenExtractor::next_in_string_like (declaration only).
    bool next_in_string_like(const char* data, size_t length, size_t* pos,
93
                             std::string& token) const override;
94
95
private:
96
    // NOTE(review): presumably the n-gram size; whether it counts bytes or characters depends on
    // the out-of-line definitions -- confirm there.
    size_t n;
97
};
98
} // namespace doris
99
#include "common/compile_check_end.h"
100
#endif //DORIS_ITOKEN_EXTRACTOR_H