be/src/exprs/function/function_search.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
#include <CLucene.h>
#include <gen_cpp/Exprs_types.h>

#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <unordered_set>
#include <vector>

#include "core/block/block.h"
#include "core/data_type/data_type.h"
#include "core/data_type/data_type_number.h"
#include "core/types.h"
#include "exprs/function/function.h"
#include "storage/index/index_query_context.h"
#include "storage/index/inverted/inverted_index_cache.h"
#include "storage/index/inverted/query_v2/boolean_query/operator_boolean_query.h"

37 | | |
38 | | CL_NS_USE(index) |
39 | | namespace doris { |
40 | | |
41 | | using namespace doris::segment_v2; |
42 | | |
43 | | class IndexExecContext; |
44 | | |
// Plain value bundle describing one field's inverted-index reader binding.
// FieldReaderResolver::resolve() receives a pointer to one of these, and
// FieldReaderResolver keeps a string-keyed cache of them (see binding_cache()).
//
// NOTE(review): the per-member notes below are inferred from names and types
// only; the code that fills this struct lives outside this header. Confirm
// against FieldReaderResolver::resolve() before relying on them.
struct FieldReaderBinding {
    // Presumably the field name as the caller asked for it.
    std::string logical_field_name;
    // Presumably the field name as it is stored in the index.
    std::string stored_field_name;
    // Wide-string form of a field name (presumably of stored_field_name).
    std::wstring stored_field_wstr;
    DataTypePtr column_type;
    // No default member initializer: assuming InvertedIndexQueryType is an
    // enum/scalar type, this member is indeterminate after default-initialization
    // and must be assigned before it is read.
    InvertedIndexQueryType query_type;
    InvertedIndexReaderPtr inverted_reader;
    // Shared ownership of the underlying CLucene IndexReader.
    std::shared_ptr<lucene::index::IndexReader> lucene_reader;
    std::map<std::string, std::string> index_properties;
    // Presumably the cache key for this binding (FieldReaderResolver builds keys
    // of the form "<stored_field_name>#<query_type as int>") — TODO confirm.
    std::string binding_key;
    std::string analyzer_key;
};
57 | | |
58 | | class FieldReaderResolver { |
59 | | public: |
60 | | FieldReaderResolver( |
61 | | const std::unordered_map<std::string, IndexFieldNameAndTypePair>& data_type_with_names, |
62 | | const std::unordered_map<std::string, IndexIterator*>& iterators, |
63 | | std::shared_ptr<IndexQueryContext> context, |
64 | | const std::vector<TSearchFieldBinding>& field_bindings = {}) |
65 | 1.14k | : _data_type_with_names(data_type_with_names), |
66 | 1.14k | _iterators(iterators), |
67 | 1.14k | _context(std::move(context)), |
68 | 1.14k | _field_bindings(field_bindings) { |
69 | | // Build lookup maps for quick access |
70 | 1.62k | for (const auto& binding : _field_bindings) { |
71 | 1.63k | if (binding.__isset.is_variant_subcolumn && binding.is_variant_subcolumn) { |
72 | 65 | _variant_subcolumn_fields.insert(binding.field_name); |
73 | 65 | } |
74 | 1.62k | _field_binding_map[binding.field_name] = &binding; |
75 | 1.62k | } |
76 | 1.14k | } |
77 | | |
78 | | Status resolve(const std::string& field_name, InvertedIndexQueryType query_type, |
79 | | FieldReaderBinding* binding); |
80 | | |
81 | | // Check if a field is a variant subcolumn |
82 | 1.96k | bool is_variant_subcolumn(const std::string& field_name) const { |
83 | 1.96k | return _variant_subcolumn_fields.count(field_name) > 0; |
84 | 1.96k | } |
85 | | |
86 | 1.08k | const std::vector<std::shared_ptr<lucene::index::IndexReader>>& readers() const { |
87 | 1.08k | return _readers; |
88 | 1.08k | } |
89 | | |
90 | | const std::unordered_map<std::string, std::shared_ptr<lucene::index::IndexReader>>& |
91 | 1.08k | reader_bindings() const { |
92 | 1.08k | return _binding_readers; |
93 | 1.08k | } |
94 | | |
95 | | const std::unordered_map<std::wstring, std::shared_ptr<lucene::index::IndexReader>>& |
96 | 1.10k | field_readers() const { |
97 | 1.10k | return _field_readers; |
98 | 1.10k | } |
99 | | |
100 | 1.10k | const std::unordered_map<std::string, FieldReaderBinding>& binding_cache() const { |
101 | 1.10k | return _cache; |
102 | 1.10k | } |
103 | | |
104 | 1.59k | IndexIterator* get_iterator(const std::string& field_name) const { |
105 | 1.59k | auto it = _iterators.find(field_name); |
106 | 18.4E | return (it != _iterators.end()) ? it->second : nullptr; |
107 | 1.59k | } |
108 | | |
109 | | private: |
110 | | std::string binding_key_for(const std::string& stored_field_name, |
111 | 1.92k | InvertedIndexQueryType query_type) const { |
112 | 1.92k | return stored_field_name + "#" + std::to_string(static_cast<int>(query_type)); |
113 | 1.92k | } |
114 | | |
115 | | const std::unordered_map<std::string, IndexFieldNameAndTypePair>& _data_type_with_names; |
116 | | const std::unordered_map<std::string, IndexIterator*>& _iterators; |
117 | | std::shared_ptr<IndexQueryContext> _context; |
118 | | std::vector<TSearchFieldBinding> _field_bindings; |
119 | | std::unordered_map<std::string, const TSearchFieldBinding*> _field_binding_map; |
120 | | std::unordered_set<std::string> _variant_subcolumn_fields; |
121 | | std::unordered_map<std::string, FieldReaderBinding> _cache; |
122 | | std::vector<std::shared_ptr<lucene::index::IndexReader>> _readers; |
123 | | std::unordered_map<std::string, std::shared_ptr<lucene::index::IndexReader>> _binding_readers; |
124 | | std::unordered_map<std::wstring, std::shared_ptr<lucene::index::IndexReader>> _field_readers; |
125 | | // Keep searcher cache handles alive for the resolver's lifetime. |
126 | | // This pins cached IndexSearcher entries so extracted IndexReaders remain valid. |
127 | | std::vector<segment_v2::InvertedIndexCacheHandle> _searcher_cache_handles; |
128 | | }; |
129 | | |
// IFunction implementation registered under the name "search".
// It declares itself variadic, opts out of the default null/constant handling,
// reports that it can be pushed down to the index, and returns a UInt8 column
// type. All non-trivial members are only declared here; their definitions live
// in the corresponding .cpp.
class FunctionSearch : public IFunction {
public:
    // Function name as returned by get_name().
    static constexpr auto name = "search";

    // Factory: returns a newly allocated FunctionSearch behind a shared pointer.
    static FunctionPtr create() { return std::make_shared<FunctionSearch>(); }

    String get_name() const override { return name; }

    // Variadic function: the fixed argument count below is reported as 0.
    bool is_variadic() const override { return true; }

    size_t get_number_of_arguments() const override { return 0; }

    // We manage nulls explicitly for index pushdown only.
    bool use_default_implementation_for_nulls() const override { return false; }
    // Both constant-handling hooks are overridden to return false.
    bool is_use_default_implementation_for_constants() const override { return false; }

    bool use_default_implementation_for_constants() const override { return false; }

    // The result type is always UInt8, independent of the argument types.
    DataTypePtr get_return_type_impl(const DataTypes& /*arguments*/) const override {
        return std::make_shared<DataTypeUInt8>();
    }

    // Block-based execution entry point (defined out of line).
    Status execute_impl(FunctionContext* /*context*/, Block& /*block*/,
                        const ColumnNumbers& /*arguments*/, uint32_t /*result*/,
                        size_t /*input_rows_count*/) const override;

    bool can_push_down_to_index() const override { return true; }

    // IFunction hook for index-based evaluation; writes into `bitmap_result`.
    // NOTE(review): `iterators` is taken by value (the vector is copied on each
    // call); the signature is fixed by the base-class virtual.
    Status evaluate_inverted_index(
            const ColumnsWithTypeAndName& arguments,
            const std::vector<IndexFieldNameAndTypePair>& data_type_with_names,
            std::vector<IndexIterator*> iterators, uint32_t num_rows,
            const InvertedIndexAnalyzerCtx* /*analyzer_ctx*/,
            InvertedIndexResultBitmap& bitmap_result) const override;

    // Index evaluation driven by a TSearchParam, with inputs keyed by field name.
    // `enable_cache` defaults to true in this overload.
    // NOTE(review): `iterators` is passed by value, so the map is copied per call.
    Status evaluate_inverted_index_with_search_param(
            const TSearchParam& search_param,
            const std::unordered_map<std::string, IndexFieldNameAndTypePair>& data_type_with_names,
            std::unordered_map<std::string, IndexIterator*> iterators, uint32_t num_rows,
            InvertedIndexResultBitmap& bitmap_result, bool enable_cache = true) const;

    // Extended overload: same leading parameters, but `enable_cache` has no
    // default here and two extra inputs are required — an IndexExecContext
    // pointer and a field-name -> column-id map.
    Status evaluate_inverted_index_with_search_param(
            const TSearchParam& search_param,
            const std::unordered_map<std::string, IndexFieldNameAndTypePair>& data_type_with_names,
            std::unordered_map<std::string, IndexIterator*> iterators, uint32_t num_rows,
            InvertedIndexResultBitmap& bitmap_result, bool enable_cache,
            const IndexExecContext* index_exec_ctx,
            const std::unordered_map<std::string, int>& field_name_to_column_id) const;

    // Evaluates `nested_clause` of `search_param`; the result is delivered through
    // the `result_bitmap` reference. Defined out of line.
    Status evaluate_nested_query(
            const TSearchParam& search_param, const TSearchClause& nested_clause,
            const std::shared_ptr<IndexQueryContext>& context, FieldReaderResolver& resolver,
            uint32_t num_rows, const IndexExecContext* index_exec_ctx,
            const std::unordered_map<std::string, int>& field_name_to_column_id,
            std::shared_ptr<roaring::Roaring>& result_bitmap) const;

    // Public methods for testing
    enum class ClauseTypeCategory {
        NON_TOKENIZED, // TERM, PREFIX, WILDCARD, REGEXP, RANGE, LIST - no tokenization, use EQUAL_QUERY
        TOKENIZED,     // PHRASE, MATCH, ANY, ALL - need tokenization, use MATCH_ANY_QUERY
        COMPOUND       // AND, OR, NOT - boolean operations
    };

    // Classifies a clause_type string into one of the categories above.
    ClauseTypeCategory get_clause_type_category(const std::string& clause_type) const;

    // Analyze query type for a specific field in the search clause
    InvertedIndexQueryType analyze_field_query_type(const std::string& field_name,
                                                    const TSearchClause& clause) const;

    // Map clause_type string to InvertedIndexQueryType
    InvertedIndexQueryType clause_type_to_query_type(const std::string& clause_type) const;

    // Builds a query_v2 query for `clause`. `out` and `binding_key` are pointer
    // parameters, presumably used as outputs — TODO confirm against the .cpp.
    Status build_query_recursive(const TSearchClause& clause,
                                 const std::shared_ptr<IndexQueryContext>& context,
                                 FieldReaderResolver& resolver,
                                 inverted_index::query_v2::QueryPtr* out, std::string* binding_key,
                                 const std::string& default_operator,
                                 int32_t minimum_should_match) const;

    // Same parameter list as build_query_recursive; by its name this handles a
    // single leaf (non-compound) clause — TODO confirm against the .cpp.
    Status build_leaf_query(const TSearchClause& clause,
                            const std::shared_ptr<IndexQueryContext>& context,
                            FieldReaderResolver& resolver, inverted_index::query_v2::QueryPtr* out,
                            std::string* binding_key, const std::string& default_operator,
                            int32_t minimum_should_match) const;
};
215 | | |
216 | | } // namespace doris |