Coverage Report

Created: 2026-03-16 01:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/index/inverted/token_stream.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <unicode/utext.h>
21
22
#include <memory>
23
#include <string_view>
24
25
#include "CLucene.h"
26
#include "CLucene/analysis/AnalysisHeader.h"
27
#include "common/cast_set.h"
28
#include "storage/index/inverted/util/reader.h"
29
30
using namespace lucene::analysis;
31
32
namespace doris::segment_v2::inverted_index {
33
#include "common/compile_check_begin.h"
34
35
// Forward declaration only; the class is not defined in the visible part of this header.
class DorisTokenizer;
36
// Shared-ownership pointer to a DorisTokenizer.
using TokenizerPtr = std::shared_ptr<DorisTokenizer>;
37
38
// Shared-ownership pointer to a TokenStream (resolved through the
// `using namespace lucene::analysis` directive earlier in this header).
using TokenStreamPtr = std::shared_ptr<TokenStream>;
39
40
/**
41
 * All custom tokenizers and token_filters must use the following functions 
42
 * to set token information. Using these unified set methods helps avoid 
43
 * unnecessary data copying.
44
 * 
45
 * Note: Must not mix with other set methods
46
 */
47
class DorisTokenStream {
48
public:
49
815
    DorisTokenStream() = default;
50
815
    virtual ~DorisTokenStream() = default;
51
52
106k
    void set(Token* t, const std::string_view& term, int32_t pos = 1) {
53
106k
        t->setTextNoCopy(term.data(), cast_set<int32_t>(term.size()));
54
106k
        t->setPositionIncrement(pos);
55
106k
    }
56
57
95.7k
    void set_text(Token* t, const std::string_view& term) {
58
95.7k
        t->setTextNoCopy(term.data(), cast_set<int32_t>(term.size()));
59
95.7k
    }
60
61
88.4k
    int32_t get_position_increment(Token* t) { return t->getPositionIncrement(); }
62
86.9k
    void set_position_increment(Token* t, int32_t pos) { t->setPositionIncrement(pos); }
63
};
64
65
class TokenStreamWrapper : public TokenStream {
66
public:
67
63
    explicit TokenStreamWrapper(std::shared_ptr<TokenStream> ts) : _impl(std::move(ts)) {}
68
63
    ~TokenStreamWrapper() override = default;
69
70
110k
    Token* next(Token* token) override { return _impl->next(token); }
71
0
    void close() override { _impl->close(); }
72
1
    void reset() override { _impl->reset(); }
73
74
private:
75
    std::shared_ptr<TokenStream> _impl;
76
};
77
78
class TokenStreamComponents {
79
public:
80
    TokenStreamComponents(TokenizerPtr tokenizer, TokenStreamPtr result)
81
169
            : _source(std::move(tokenizer)), _sink(std::move(result)) {}
82
83
    void set_reader(const ReaderPtr& reader);
84
    TokenStreamPtr get_token_stream();
85
    TokenizerPtr get_source();
86
87
private:
88
    TokenizerPtr _source;
89
    TokenStreamPtr _sink;
90
};
91
// Shared-ownership pointer to a TokenStreamComponents.
using TokenStreamComponentsPtr = std::shared_ptr<TokenStreamComponents>;
92
93
}; // namespace doris::segment_v2::inverted_index
94
#include "common/compile_check_end.h"