be/src/storage/predicate/like_column_predicate.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | #pragma once |
18 | | |
19 | | #include <glog/logging.h> |
20 | | #include <stdint.h> |
21 | | |
22 | | #include <boost/iterator/iterator_facade.hpp> |
23 | | #include <functional> |
24 | | #include <memory> |
25 | | #include <ostream> |
26 | | #include <string> |
27 | | #include <utility> |
28 | | |
29 | | #include "common/status.h" |
30 | | #include "core/column/column.h" |
31 | | #include "core/column/column_dictionary.h" |
32 | | #include "core/column/column_nullable.h" |
33 | | #include "core/string_ref.h" |
34 | | #include "core/types.h" |
35 | | #include "exprs/function/like.h" |
36 | | #include "storage/index/bloom_filter/bloom_filter.h" |
37 | | #include "storage/predicate/column_predicate.h" |
38 | | |
39 | | namespace roaring { |
40 | | class Roaring; |
41 | | } // namespace roaring |
42 | | |
43 | | namespace doris { |
44 | | class FunctionContext; |
45 | | |
46 | | template <PrimitiveType T> |
47 | | class LikeColumnPredicate final : public ColumnPredicate { |
48 | | public: |
49 | | ENABLE_FACTORY_CREATOR(LikeColumnPredicate); |
50 | | LikeColumnPredicate(bool opposite, uint32_t column_id, std::string col_name, |
51 | | doris::FunctionContext* fn_ctx, doris::StringRef val); |
52 | 0 | ~LikeColumnPredicate() override = default; Unexecuted instantiation: _ZN5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EED2Ev Unexecuted instantiation: _ZN5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EED2Ev |
53 | | LikeColumnPredicate(const LikeColumnPredicate<T>& other, uint32_t col_id) |
54 | 0 | : ColumnPredicate(other, col_id) { |
55 | 0 | _origin = other._origin; |
56 | 0 | pattern = other.pattern; |
57 | 0 | _state = other._state; |
58 | 0 | _opposite = other._opposite; |
59 | 0 | } Unexecuted instantiation: _ZN5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EEC2ERKS2_j Unexecuted instantiation: _ZN5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EEC2ERKS2_j |
60 | | LikeColumnPredicate(const LikeColumnPredicate<T>& other) = delete; |
61 | 0 | std::shared_ptr<ColumnPredicate> clone(uint32_t col_id) const override { |
62 | 0 | return LikeColumnPredicate<T>::create_shared(*this, col_id); |
63 | 0 | } Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EE5cloneEj Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EE5cloneEj |
64 | 0 | std::string debug_string() const override { |
65 | 0 | fmt::memory_buffer debug_string_buffer; |
66 | 0 | fmt::format_to(debug_string_buffer, "LikeColumnPredicate({}, pattern={}, origin={})", |
67 | 0 | ColumnPredicate::debug_string(), pattern, _origin); |
68 | 0 | return fmt::to_string(debug_string_buffer); |
69 | 0 | } Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EE12debug_stringB5cxx11Ev Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EE12debug_stringB5cxx11Ev |
70 | | |
71 | 0 | PredicateType type() const override { return PredicateType::EQ; }Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EE4typeEv Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EE4typeEv |
72 | | void evaluate_vec(const IColumn& column, uint16_t size, bool* flags) const override; |
73 | | |
74 | | void evaluate_and_vec(const IColumn& column, uint16_t size, bool* flags) const override; |
75 | | |
76 | 0 | std::string get_search_str() const override { |
77 | 0 | return std::string(reinterpret_cast<const char*>(pattern.data), pattern.size); |
78 | 0 | } Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EE14get_search_strB5cxx11Ev Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EE14get_search_strB5cxx11Ev |
79 | 0 | bool is_opposite() const { return _opposite; }Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EE11is_oppositeEv Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EE11is_oppositeEv |
80 | | |
81 | 0 | void set_page_ng_bf(std::unique_ptr<segment_v2::BloomFilter> src) override { |
82 | 0 | _page_ng_bf = std::move(src); |
83 | 0 | } Unexecuted instantiation: _ZN5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EE14set_page_ng_bfESt10unique_ptrINS_10segment_v211BloomFilterESt14default_deleteIS5_EE Unexecuted instantiation: _ZN5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EE14set_page_ng_bfESt10unique_ptrINS_10segment_v211BloomFilterESt14default_deleteIS5_EE |
84 | 0 | bool evaluate_and(const BloomFilter* bf) const override { |
85 | | // like predicate can not use normal bf, just return true to accept |
86 | 0 | if (!bf->is_ngram_bf()) return true; |
87 | 0 | if (_page_ng_bf) { |
88 | 0 | return bf->contains(*_page_ng_bf); |
89 | 0 | } |
90 | 0 | return true; |
91 | 0 | } Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EE12evaluate_andEPKNS_10segment_v211BloomFilterE Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EE12evaluate_andEPKNS_10segment_v211BloomFilterE |
92 | 0 | bool can_do_bloom_filter(bool ngram) const override { return ngram; }Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EE19can_do_bloom_filterEb Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EE19can_do_bloom_filterEb |
93 | | |
94 | | private: |
95 | | uint16_t _evaluate_inner(const IColumn& column, uint16_t* sel, uint16_t size) const override; |
96 | | |
97 | | template <bool is_and> |
98 | 0 | void _evaluate_vec(const IColumn& column, uint16_t size, bool* flags) const { |
99 | 0 | if (column.is_nullable()) { |
100 | 0 | auto* nullable_col = check_and_get_column<ColumnNullable>(column); |
101 | 0 | auto& null_map_data = nullable_col->get_null_map_column().get_data(); |
102 | 0 | auto& nested_col = nullable_col->get_nested_column(); |
103 | 0 | if (nested_col.is_column_dictionary()) { |
104 | 0 | auto* nested_col_ptr = check_and_get_column<ColumnDictI32>(nested_col); |
105 | 0 | const auto& dict_res = _find_code_from_dictionary_column(*nested_col_ptr); |
106 | 0 | auto& data_array = nested_col_ptr->get_data(); |
107 | 0 | for (uint16_t i = 0; i < size; i++) { |
108 | 0 | if (null_map_data[i]) { |
109 | 0 | if constexpr (is_and) { |
110 | 0 | flags[i] &= _opposite; |
111 | 0 | } else { |
112 | 0 | flags[i] = _opposite; |
113 | 0 | } |
114 | 0 | continue; |
115 | 0 | } |
116 | | |
117 | 0 | unsigned char flag = dict_res[data_array[i]]; |
118 | 0 | if constexpr (is_and) { |
119 | 0 | flags[i] &= _opposite ^ flag; |
120 | 0 | } else { |
121 | 0 | flags[i] = _opposite ^ flag; |
122 | 0 | } |
123 | 0 | } |
124 | 0 | } else { |
125 | 0 | throw Exception(Status::FatalError( |
126 | 0 | "vectorized (not) like predicates should be dict column")); |
127 | 0 | } |
128 | 0 | } else { |
129 | 0 | if (column.is_column_dictionary()) { |
130 | 0 | auto* nested_col_ptr = check_and_get_column<ColumnDictI32>(column); |
131 | 0 | auto& data_array = nested_col_ptr->get_data(); |
132 | 0 | const auto& dict_res = _find_code_from_dictionary_column(*nested_col_ptr); |
133 | 0 | for (uint16_t i = 0; i < size; i++) { |
134 | 0 | unsigned char flag = dict_res[data_array[i]]; |
135 | 0 | if constexpr (is_and) { |
136 | 0 | flags[i] &= _opposite ^ flag; |
137 | 0 | } else { |
138 | 0 | flags[i] = _opposite ^ flag; |
139 | 0 | } |
140 | 0 | } |
141 | 0 | } else { |
142 | 0 | throw Exception(Status::FatalError( |
143 | 0 | "vectorized (not) like predicates should be dict column")); |
144 | 0 | } |
145 | 0 | } |
146 | 0 | } Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EE13_evaluate_vecILb0EEEvRKNS_7IColumnEtPb Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EE13_evaluate_vecILb1EEEvRKNS_7IColumnEtPb Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EE13_evaluate_vecILb0EEEvRKNS_7IColumnEtPb Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EE13_evaluate_vecILb1EEEvRKNS_7IColumnEtPb |
147 | | std::vector<bool> __attribute__((flatten)) |
148 | 0 | _find_code_from_dictionary_column(const ColumnDictI32& column) const { |
149 | 0 | std::vector<bool> res; |
150 | 0 | if (_segment_id_to_cached_res_flags.if_contains( |
151 | 0 | column.get_rowset_segment_id(), |
152 | 0 | [&res](const auto& pair) { res = pair.second; })) {Unexecuted instantiation: _ZZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EE33_find_code_from_dictionary_columnERKNS_13ColumnDictI32EENKUlRKT_E_clISt4pairIKSB_INS_8RowsetIdEjESt6vectorIbSaIbEEEEEDaS8_ Unexecuted instantiation: _ZZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EE33_find_code_from_dictionary_columnERKNS_13ColumnDictI32EENKUlRKT_E_clISt4pairIKSB_INS_8RowsetIdEjESt6vectorIbSaIbEEEEEDaS8_ |
153 | 0 | return res; |
154 | 0 | } |
155 | | |
156 | 0 | std::vector<bool> tmp_res(column.dict_size(), false); |
157 | 0 | for (int i = 0; i < column.dict_size(); i++) { |
158 | 0 | StringRef cell_value = column.get_shrink_value(i); |
159 | 0 | unsigned char flag = 0; |
160 | 0 | THROW_IF_ERROR((_state->scalar_function)( |
161 | 0 | &_like_state, StringRef(cell_value.data, cell_value.size), pattern, &flag)); |
162 | 0 | tmp_res[i] = flag; |
163 | 0 | } |
164 | | // Sometimes the dict is not initialized when run comparison predicate here, for example, |
165 | | // the full page is null, then the reader will skip read, so that the dictionary is not |
166 | | // inited. The cached code is wrong during this case, because the following page maybe not |
167 | | // null, and the dict should have items in the future. |
168 | | // |
169 | | // Cached code may have problems, so that add a config here, if not opened, then |
170 | | // we will return the code and not cache it. |
171 | 0 | if (!column.is_dict_empty() && config::enable_low_cardinality_cache_code) { |
172 | 0 | _segment_id_to_cached_res_flags.emplace( |
173 | 0 | std::pair {column.get_rowset_segment_id(), tmp_res}); |
174 | 0 | } |
175 | |
|
176 | 0 | return tmp_res; |
177 | 0 | } Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE15EE33_find_code_from_dictionary_columnERKNS_13ColumnDictI32E Unexecuted instantiation: _ZNK5doris19LikeColumnPredicateILNS_13PrimitiveTypeE23EE33_find_code_from_dictionary_columnERKNS_13ColumnDictI32E |
178 | | |
179 | | mutable phmap::parallel_flat_hash_map< |
180 | | std::pair<RowsetId, uint32_t>, std::vector<bool>, |
181 | | phmap::priv::hash_default_hash<std::pair<RowsetId, uint32_t>>, |
182 | | phmap::priv::hash_default_eq<std::pair<RowsetId, uint32_t>>, |
183 | | std::allocator<std::pair<const std::pair<RowsetId, uint32_t>, int32_t>>, 4, |
184 | | std::shared_mutex> |
185 | | _segment_id_to_cached_res_flags; |
186 | | |
187 | | std::string _origin; |
188 | | // lifetime controlled by scan node |
189 | | using StateType = LikeState; |
190 | | StringRef pattern; |
191 | | |
192 | | StateType* _state = nullptr; |
193 | | |
194 | | // A separate scratch region is required for every concurrent caller of the |
195 | | // Hyperscan API. So here _like_state is separate for each instance of |
196 | | // LikeColumnPredicate. |
197 | | LikeSearchState _like_state; |
198 | | std::shared_ptr<segment_v2::BloomFilter> _page_ng_bf; // for ngram-bf index |
199 | | }; |
200 | | |
201 | | } // namespace doris |