be/src/storage/rowset/beta_rowset_reader.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "storage/rowset/beta_rowset_reader.h" |
19 | | |
20 | | #include <stddef.h> |
21 | | |
22 | | #include <algorithm> |
23 | | #include <memory> |
24 | | #include <ostream> |
25 | | #include <roaring/roaring.hh> |
26 | | #include <set> |
27 | | #include <string> |
28 | | #include <unordered_map> |
29 | | #include <utility> |
30 | | |
31 | | #include "common/logging.h" |
32 | | #include "common/status.h" |
33 | | #include "core/block/block.h" |
34 | | #include "io/io_common.h" |
35 | | #include "runtime/descriptors.h" |
36 | | #include "runtime/runtime_profile.h" |
37 | | #include "storage/cache/schema_cache.h" |
38 | | #include "storage/delete/delete_handler.h" |
39 | | #include "storage/iterator/vgeneric_iterators.h" |
40 | | #include "storage/olap_define.h" |
41 | | #include "storage/predicate/block_column_predicate.h" |
42 | | #include "storage/predicate/column_predicate.h" |
43 | | #include "storage/row_cursor.h" |
44 | | #include "storage/rowset/rowset_meta.h" |
45 | | #include "storage/rowset/rowset_reader_context.h" |
46 | | #include "storage/schema.h" |
47 | | #include "storage/segment/lazy_init_segment_iterator.h" |
48 | | #include "storage/segment/segment.h" |
49 | | #include "storage/tablet/tablet_meta.h" |
50 | | #include "storage/tablet/tablet_schema.h" |
51 | | |
52 | | namespace doris { |
53 | | #include "common/compile_check_begin.h" |
54 | | using namespace ErrorCode; |
55 | | |
56 | | BetaRowsetReader::BetaRowsetReader(BetaRowsetSharedPtr rowset) |
57 | 884 | : _read_context(nullptr), _rowset(std::move(rowset)), _stats(&_owned_stats) { |
58 | 884 | _rowset->acquire(); |
59 | 884 | } |
60 | | |
61 | 939 | void BetaRowsetReader::reset_read_options() { |
62 | 939 | _read_options.delete_condition_predicates = AndBlockColumnPredicate::create_shared(); |
63 | 939 | _read_options.column_predicates.clear(); |
64 | 939 | _read_options.col_id_to_predicates.clear(); |
65 | 939 | _read_options.del_predicates_for_zone_map.clear(); |
66 | 939 | _read_options.key_ranges.clear(); |
67 | 939 | } |
68 | | |
69 | 0 | RowsetReaderSharedPtr BetaRowsetReader::clone() { |
70 | 0 | return RowsetReaderSharedPtr(new BetaRowsetReader(_rowset)); |
71 | 0 | } |
72 | | |
73 | 0 | void BetaRowsetReader::update_profile(RuntimeProfile* profile) { |
74 | 0 | if (_iterator != nullptr) { |
75 | 0 | _iterator->update_profile(profile); |
76 | 0 | } |
77 | 0 | } |
78 | | |
79 | | Status BetaRowsetReader::get_segment_iterators(RowsetReaderContext* read_context, |
80 | | std::vector<RowwiseIteratorUPtr>* out_iters, |
81 | 1.18k | bool use_cache) { |
82 | 1.18k | _read_context = read_context; |
83 | | // The segment iterator is created with its own statistics, |
84 | | // and the member variable '_stats' is initialized by '_stats(&owned_stats)'. |
85 | | // The choice of statistics used depends on the workload of the rowset reader. |
86 | | // For instance, if it's for query, the get_segment_iterators function |
87 | | // will receive one valid read_context with corresponding valid statistics, |
88 | | // and we will use those statistics. |
89 | | // However, for compaction or schema change workloads, |
90 | | // the read_context passed to the function will have null statistics, |
91 | | // and in such cases we will try to use the beta rowset reader's own statistics. |
92 | 1.18k | if (_read_context->stats != nullptr) { |
93 | 1.09k | _stats = _read_context->stats; |
94 | 1.09k | } |
95 | 1.18k | SCOPED_RAW_TIMER(&_stats->rowset_reader_get_segment_iterators_timer_ns); |
96 | | |
97 | 1.18k | RETURN_IF_ERROR(_rowset->load()); |
98 | | |
99 | | // convert RowsetReaderContext to StorageReadOptions |
100 | 1.18k | _read_options.block_row_max = read_context->batch_size; |
101 | 1.18k | _read_options.stats = _stats; |
102 | 1.18k | _read_options.push_down_agg_type_opt = _read_context->push_down_agg_type_opt; |
103 | 1.18k | _read_options.remaining_conjunct_roots = _read_context->remaining_conjunct_roots; |
104 | 1.18k | _read_options.common_expr_ctxs_push_down = _read_context->common_expr_ctxs_push_down; |
105 | 1.18k | _read_options.virtual_column_exprs = _read_context->virtual_column_exprs; |
106 | | |
107 | 1.18k | _read_options.all_access_paths = _read_context->all_access_paths; |
108 | 1.18k | _read_options.predicate_access_paths = _read_context->predicate_access_paths; |
109 | | |
110 | 1.18k | _read_options.ann_topn_runtime = _read_context->ann_topn_runtime; |
111 | 1.18k | _read_options.vir_cid_to_idx_in_block = _read_context->vir_cid_to_idx_in_block; |
112 | 1.18k | _read_options.vir_col_idx_to_type = _read_context->vir_col_idx_to_type; |
113 | 1.18k | _read_options.score_runtime = _read_context->score_runtime; |
114 | 1.18k | _read_options.collection_statistics = _read_context->collection_statistics; |
115 | 1.18k | _read_options.rowset_id = _rowset->rowset_id(); |
116 | 1.18k | _read_options.version = _rowset->version(); |
117 | 1.18k | _read_options.tablet_id = _rowset->rowset_meta()->tablet_id(); |
118 | 1.18k | _read_options.topn_limit = _topn_limit; |
119 | 1.18k | if (_read_context->lower_bound_keys != nullptr) { |
120 | 1.08k | for (int i = 0; i < _read_context->lower_bound_keys->size(); ++i) { |
121 | 0 | _read_options.key_ranges.emplace_back(&_read_context->lower_bound_keys->at(i), |
122 | 0 | _read_context->is_lower_keys_included->at(i), |
123 | 0 | &_read_context->upper_bound_keys->at(i), |
124 | 0 | _read_context->is_upper_keys_included->at(i)); |
125 | 0 | } |
126 | 1.08k | } |
127 | | |
128 | | // delete_hanlder is always set, but it maybe not init, so that it will return empty conditions |
129 | | // or predicates when it is not inited. |
130 | 1.18k | if (_read_context->delete_handler != nullptr) { |
131 | 1.08k | _read_context->delete_handler->get_delete_conditions_after_version( |
132 | 1.08k | _rowset->end_version(), _read_options.delete_condition_predicates.get(), |
133 | 1.08k | &_read_options.del_predicates_for_zone_map); |
134 | 1.08k | } |
135 | | |
136 | 1.18k | std::vector<uint32_t> read_columns; |
137 | 1.18k | std::set<uint32_t> read_columns_set; |
138 | 1.18k | std::set<uint32_t> delete_columns_set; |
139 | 4.89k | for (int i = 0; i < _read_context->return_columns->size(); ++i) { |
140 | 3.70k | read_columns.push_back(_read_context->return_columns->at(i)); |
141 | 3.70k | read_columns_set.insert(_read_context->return_columns->at(i)); |
142 | 3.70k | } |
143 | 1.18k | _read_options.delete_condition_predicates->get_all_column_ids(delete_columns_set); |
144 | 1.18k | for (auto cid : delete_columns_set) { |
145 | 406 | if (read_columns_set.find(cid) == read_columns_set.end()) { |
146 | 254 | read_columns.push_back(cid); |
147 | 254 | } |
148 | 406 | } |
149 | | // disable condition cache if you have delete condition |
150 | 1.18k | _read_context->condition_cache_digest = |
151 | 1.18k | delete_columns_set.empty() ? _read_context->condition_cache_digest : 0; |
152 | | // create segment iterators |
153 | 1.18k | VLOG_NOTICE << "read columns size: " << read_columns.size(); |
154 | 1.18k | _input_schema = std::make_shared<Schema>(_read_context->tablet_schema->columns(), read_columns); |
155 | | // output_schema only contains return_columns (excludes extra columns like delete-predicate columns). |
156 | | // It is used by merge/union iterators to determine how many columns to copy to the output block. |
157 | 1.18k | _output_schema = std::make_shared<Schema>(_read_context->tablet_schema->columns(), |
158 | 1.18k | *(_read_context->return_columns)); |
159 | 1.18k | if (_read_context->predicates != nullptr) { |
160 | 1.08k | _read_options.column_predicates.insert(_read_options.column_predicates.end(), |
161 | 1.08k | _read_context->predicates->begin(), |
162 | 1.08k | _read_context->predicates->end()); |
163 | 1.08k | for (auto pred : *(_read_context->predicates)) { |
164 | 0 | if (_read_options.col_id_to_predicates.count(pred->column_id()) < 1) { |
165 | 0 | _read_options.col_id_to_predicates.insert( |
166 | 0 | {pred->column_id(), AndBlockColumnPredicate::create_shared()}); |
167 | 0 | } |
168 | 0 | _read_options.col_id_to_predicates[pred->column_id()]->add_column_predicate( |
169 | 0 | SingleColumnBlockPredicate::create_unique(pred)); |
170 | 0 | } |
171 | 1.08k | } |
172 | | |
173 | | // Take a delete-bitmap for each segment, the bitmap contains all deletes |
174 | | // until the max read version, which is read_context->version.second |
175 | 1.18k | if (_read_context->delete_bitmap != nullptr) { |
176 | 5 | { |
177 | 5 | SCOPED_RAW_TIMER(&_stats->delete_bitmap_get_agg_ns); |
178 | 5 | RowsetId rowset_id = rowset()->rowset_id(); |
179 | 39 | for (uint32_t seg_id = 0; seg_id < rowset()->num_segments(); ++seg_id) { |
180 | 34 | auto d = _read_context->delete_bitmap->get_agg( |
181 | 34 | {rowset_id, seg_id, _read_context->version.second}); |
182 | 34 | if (d->isEmpty()) { |
183 | 11 | continue; // Empty delete bitmap for the segment |
184 | 11 | } |
185 | 23 | VLOG_TRACE << "Get the delete bitmap for rowset: " << rowset_id.to_string() |
186 | 0 | << ", segment id:" << seg_id << ", size:" << d->cardinality(); |
187 | 23 | _read_options.delete_bitmap.emplace(seg_id, std::move(d)); |
188 | 23 | } |
189 | 5 | } |
190 | 5 | } |
191 | | |
192 | 1.18k | if (_should_push_down_value_predicates()) { |
193 | | // sequence mapping currently only support merge on read, so can not push down value predicates |
194 | 603 | if (_read_context->value_predicates != nullptr && |
195 | 603 | !read_context->tablet_schema->has_seq_map()) { |
196 | 538 | _read_options.column_predicates.insert(_read_options.column_predicates.end(), |
197 | 538 | _read_context->value_predicates->begin(), |
198 | 538 | _read_context->value_predicates->end()); |
199 | 538 | for (auto pred : *(_read_context->value_predicates)) { |
200 | 0 | if (_read_options.col_id_to_predicates.count(pred->column_id()) < 1) { |
201 | 0 | _read_options.col_id_to_predicates.insert( |
202 | 0 | {pred->column_id(), AndBlockColumnPredicate::create_shared()}); |
203 | 0 | } |
204 | 0 | _read_options.col_id_to_predicates[pred->column_id()]->add_column_predicate( |
205 | 0 | SingleColumnBlockPredicate::create_unique(pred)); |
206 | 0 | } |
207 | 538 | } |
208 | 603 | } |
209 | 1.18k | _read_options.use_page_cache = _read_context->use_page_cache; |
210 | 1.18k | _read_options.tablet_schema = _read_context->tablet_schema; |
211 | 1.18k | _read_options.enable_unique_key_merge_on_write = |
212 | 1.18k | _read_context->enable_unique_key_merge_on_write; |
213 | 1.18k | _read_options.record_rowids = _read_context->record_rowids; |
214 | 1.18k | _read_options.topn_filter_source_node_ids = _read_context->topn_filter_source_node_ids; |
215 | 1.18k | _read_options.topn_filter_target_node_id = _read_context->topn_filter_target_node_id; |
216 | 1.18k | _read_options.read_orderby_key_reverse = _read_context->read_orderby_key_reverse; |
217 | 1.18k | _read_options.read_orderby_key_columns = _read_context->read_orderby_key_columns; |
218 | 1.18k | _read_options.io_ctx.reader_type = _read_context->reader_type; |
219 | 1.18k | _read_options.io_ctx.file_cache_stats = &_stats->file_cache_stats; |
220 | 1.18k | _read_options.runtime_state = _read_context->runtime_state; |
221 | 1.18k | _read_options.output_columns = _read_context->output_columns; |
222 | 1.18k | _read_options.io_ctx.reader_type = _read_context->reader_type; |
223 | 1.18k | _read_options.io_ctx.is_disposable = _read_context->reader_type != ReaderType::READER_QUERY; |
224 | 1.18k | _read_options.target_cast_type_for_variants = _read_context->target_cast_type_for_variants; |
225 | 1.18k | if (_read_context->runtime_state != nullptr) { |
226 | 0 | _read_options.io_ctx.query_id = &_read_context->runtime_state->query_id(); |
227 | 0 | _read_options.io_ctx.read_file_cache = |
228 | 0 | _read_context->runtime_state->query_options().enable_file_cache; |
229 | 0 | _read_options.io_ctx.is_disposable = |
230 | 0 | _read_context->runtime_state->query_options().disable_file_cache; |
231 | 0 | } |
232 | | |
233 | 1.18k | if (_read_context->condition_cache_digest) { |
234 | 0 | for (const auto& key_range : _read_options.key_ranges) { |
235 | 0 | _read_context->condition_cache_digest = |
236 | 0 | key_range.get_digest(_read_context->condition_cache_digest); |
237 | 0 | } |
238 | 0 | _read_options.condition_cache_digest = _read_context->condition_cache_digest; |
239 | 0 | } |
240 | | |
241 | 1.18k | _read_options.io_ctx.expiration_time = read_context->ttl_seconds; |
242 | | |
243 | 1.18k | bool enable_segment_cache = true; |
244 | 1.18k | auto* state = read_context->runtime_state; |
245 | 1.18k | if (state != nullptr) { |
246 | 0 | enable_segment_cache = state->query_options().__isset.enable_segment_cache |
247 | 0 | ? state->query_options().enable_segment_cache |
248 | 0 | : true; |
249 | 0 | } |
250 | | // When reader type is for query, session variable `enable_segment_cache` should be respected. |
251 | 1.18k | bool should_use_cache = use_cache || (_read_context->reader_type == ReaderType::READER_QUERY && |
252 | 1.18k | enable_segment_cache); |
253 | | |
254 | 1.18k | auto segment_count = _rowset->num_segments(); |
255 | 1.18k | auto [seg_start, seg_end] = _segment_offsets; |
256 | | // If seg_start == seg_end, it means that the segments of a rowset is not |
257 | | // split scanned by multiple scanners, and the rowset reader is used to read the whole rowset. |
258 | 1.18k | if (seg_start == seg_end) { |
259 | 1.18k | seg_start = 0; |
260 | 1.18k | seg_end = segment_count; |
261 | 1.18k | } |
262 | 1.18k | if (_read_context->record_rowids && _read_context->rowid_conversion) { |
263 | | // init segment rowid map for rowid conversion |
264 | 394 | std::vector<uint32_t> segment_rows; |
265 | 394 | RETURN_IF_ERROR(_rowset->get_segment_num_rows(&segment_rows, should_use_cache, _stats)); |
266 | 394 | RETURN_IF_ERROR(_read_context->rowid_conversion->init_segment_map(rowset()->rowset_id(), |
267 | 394 | segment_rows)); |
268 | 394 | } |
269 | | |
270 | 6.77k | for (int64_t i = seg_start; i < seg_end; i++) { |
271 | 5.59k | SCOPED_RAW_TIMER(&_stats->rowset_reader_create_iterators_timer_ns); |
272 | 5.59k | std::unique_ptr<RowwiseIterator> iter; |
273 | | |
274 | | /// For iterators, we don't need to initialize them all at once when creating them. |
275 | | /// Instead, we should initialize each iterator separately when really using them. |
276 | | /// This optimization minimizes the lifecycle of resources like column readers |
277 | | /// and prevents excessive memory consumption, especially for wide tables. |
278 | 5.59k | if (_segment_row_ranges.empty()) { |
279 | 5.59k | _read_options.row_ranges.clear(); |
280 | 5.59k | iter = std::make_unique<LazyInitSegmentIterator>(_rowset, i, should_use_cache, |
281 | 5.59k | _input_schema, _read_options); |
282 | 5.59k | } else { |
283 | 0 | DCHECK_EQ(seg_end - seg_start, _segment_row_ranges.size()); |
284 | 0 | auto local_options = _read_options; |
285 | 0 | local_options.row_ranges = _segment_row_ranges[i - seg_start]; |
286 | 0 | if (local_options.condition_cache_digest) { |
287 | 0 | local_options.condition_cache_digest = |
288 | 0 | local_options.row_ranges.get_digest(local_options.condition_cache_digest); |
289 | 0 | } |
290 | 0 | iter = std::make_unique<LazyInitSegmentIterator>(_rowset, i, should_use_cache, |
291 | 0 | _input_schema, local_options); |
292 | 0 | } |
293 | | |
294 | 5.59k | if (iter->empty()) { |
295 | 0 | continue; |
296 | 0 | } |
297 | 5.59k | out_iters->push_back(std::move(iter)); |
298 | 5.59k | } |
299 | | |
300 | 1.18k | return Status::OK(); |
301 | 1.18k | } |
302 | | |
303 | 247 | Status BetaRowsetReader::init(RowsetReaderContext* read_context, const RowSetSplits& rs_splits) { |
304 | 247 | _read_context = read_context; |
305 | 247 | _read_context->rowset_id = _rowset->rowset_id(); |
306 | 247 | _segment_offsets = rs_splits.segment_offsets; |
307 | 247 | _segment_row_ranges = rs_splits.segment_row_ranges; |
308 | 247 | return Status::OK(); |
309 | 247 | } |
310 | | |
311 | 6.74k | Status BetaRowsetReader::_init_iterator_once() { |
312 | 6.74k | return _init_iter_once.call([this] { return _init_iterator(); }); |
313 | 6.74k | } |
314 | | |
315 | 247 | Status BetaRowsetReader::_init_iterator() { |
316 | 247 | std::vector<RowwiseIteratorUPtr> iterators; |
317 | 247 | RETURN_IF_ERROR(get_segment_iterators(_read_context, &iterators)); |
318 | | |
319 | 247 | SCOPED_RAW_TIMER(&_stats->rowset_reader_init_iterators_timer_ns); |
320 | | |
321 | 247 | if (_read_context->merged_rows == nullptr) { |
322 | 103 | _read_context->merged_rows = &_merged_rows; |
323 | 103 | } |
324 | | // merge or union segment iterator |
325 | 247 | if (is_merge_iterator()) { |
326 | 8 | auto sequence_loc = -1; |
327 | 8 | if (_read_context->sequence_id_idx != -1) { |
328 | 0 | for (int loc = 0; loc < _read_context->return_columns->size(); loc++) { |
329 | 0 | if (_read_context->return_columns->at(loc) == _read_context->sequence_id_idx) { |
330 | 0 | sequence_loc = loc; |
331 | 0 | break; |
332 | 0 | } |
333 | 0 | } |
334 | 0 | } |
335 | 8 | _iterator = new_merge_iterator(std::move(iterators), sequence_loc, _read_context->is_unique, |
336 | 8 | _read_context->read_orderby_key_reverse, |
337 | 8 | _read_context->merged_rows, _output_schema); |
338 | 239 | } else { |
339 | 239 | if (_read_context->read_orderby_key_reverse) { |
340 | | // reverse iterators to read backward for ORDER BY key DESC |
341 | 0 | std::reverse(iterators.begin(), iterators.end()); |
342 | 0 | } |
343 | 239 | _iterator = new_union_iterator(std::move(iterators), _output_schema); |
344 | 239 | } |
345 | | |
346 | 247 | auto s = _iterator->init(_read_options); |
347 | 247 | if (!s.ok()) { |
348 | 0 | LOG(WARNING) << "failed to init iterator: " << s.to_string(); |
349 | 0 | _iterator.reset(); |
350 | 0 | return Status::Error<ROWSET_READER_INIT>(s.to_string()); |
351 | 0 | } |
352 | 247 | return Status::OK(); |
353 | 247 | } |
354 | | |
355 | 1.18k | bool BetaRowsetReader::_should_push_down_value_predicates() const { |
356 | | // if unique table with rowset [0-x] or [0-1] [2-y] [...], |
357 | | // value column predicates can be pushdown on rowset [0-x] or [2-y], [2-y] |
358 | | // must be compaction, not overlapping and don't have sequence column |
359 | 1.18k | return _rowset->keys_type() == UNIQUE_KEYS && |
360 | 1.18k | (((_rowset->start_version() == 0 || _rowset->start_version() == 2) && |
361 | 673 | !_rowset->_rowset_meta->is_segments_overlapping() && |
362 | 673 | _read_context->sequence_id_idx == -1) || |
363 | 673 | _read_context->enable_unique_key_merge_on_write); |
364 | 1.18k | } |
365 | | #include "common/compile_check_end.h" |
366 | | } // namespace doris |