be/src/storage/index/inverted/token_stream.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <unicode/utext.h> |
21 | | |
22 | | #include <memory> |
23 | | #include <string_view> |
24 | | |
25 | | #include "CLucene.h" |
26 | | #include "CLucene/analysis/AnalysisHeader.h" |
27 | | #include "common/cast_set.h" |
28 | | #include "storage/index/inverted/util/reader.h" |
29 | | |
30 | | using namespace lucene::analysis; |
31 | | |
32 | | namespace doris::segment_v2::inverted_index { |
33 | | #include "common/compile_check_begin.h" |
34 | | |
35 | | class DorisTokenizer; |
36 | | using TokenizerPtr = std::shared_ptr<DorisTokenizer>; |
37 | | |
38 | | using TokenStreamPtr = std::shared_ptr<TokenStream>; |
39 | | |
40 | | /** |
41 | | * All custom tokenizers and token_filters must use the following functions |
42 | | * to set token information. Using these unified set methods helps avoid |
43 | | * unnecessary data copying. |
44 | | * |
45 | | * Note: Must not mix with other set methods |
46 | | */ |
47 | | class DorisTokenStream { |
48 | | public: |
49 | 815 | DorisTokenStream() = default; |
50 | 815 | virtual ~DorisTokenStream() = default; |
51 | | |
52 | 106k | void set(Token* t, const std::string_view& term, int32_t pos = 1) { |
53 | 106k | t->setTextNoCopy(term.data(), cast_set<int32_t>(term.size())); |
54 | 106k | t->setPositionIncrement(pos); |
55 | 106k | } |
56 | | |
57 | 95.7k | void set_text(Token* t, const std::string_view& term) { |
58 | 95.7k | t->setTextNoCopy(term.data(), cast_set<int32_t>(term.size())); |
59 | 95.7k | } |
60 | | |
61 | 88.4k | int32_t get_position_increment(Token* t) { return t->getPositionIncrement(); } |
62 | 86.9k | void set_position_increment(Token* t, int32_t pos) { t->setPositionIncrement(pos); } |
63 | | }; |
64 | | |
65 | | class TokenStreamWrapper : public TokenStream { |
66 | | public: |
67 | 63 | explicit TokenStreamWrapper(std::shared_ptr<TokenStream> ts) : _impl(std::move(ts)) {} |
68 | 63 | ~TokenStreamWrapper() override = default; |
69 | | |
70 | 110k | Token* next(Token* token) override { return _impl->next(token); } |
71 | 0 | void close() override { _impl->close(); } |
72 | 1 | void reset() override { _impl->reset(); } |
73 | | |
74 | | private: |
75 | | std::shared_ptr<TokenStream> _impl; |
76 | | }; |
77 | | |
78 | | class TokenStreamComponents { |
79 | | public: |
80 | | TokenStreamComponents(TokenizerPtr tokenizer, TokenStreamPtr result) |
81 | 169 | : _source(std::move(tokenizer)), _sink(std::move(result)) {} |
82 | | |
83 | | void set_reader(const ReaderPtr& reader); |
84 | | TokenStreamPtr get_token_stream(); |
85 | | TokenizerPtr get_source(); |
86 | | |
87 | | private: |
88 | | TokenizerPtr _source; |
89 | | TokenStreamPtr _sink; |
90 | | }; |
91 | | using TokenStreamComponentsPtr = std::shared_ptr<TokenStreamComponents>; |
92 | | |
93 | | }; // namespace doris::segment_v2::inverted_index |
94 | | #include "common/compile_check_end.h" |