be/src/exec/scan/scanner_scheduler.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "exec/scan/scanner_scheduler.h" |
19 | | |
20 | | #include <algorithm> |
21 | | #include <cstdint> |
22 | | #include <functional> |
23 | | #include <list> |
24 | | #include <memory> |
25 | | #include <ostream> |
26 | | #include <string> |
27 | | #include <utility> |
28 | | |
29 | | #include "common/compiler_util.h" // IWYU pragma: keep |
30 | | #include "common/config.h" |
31 | | #include "common/exception.h" |
32 | | #include "common/logging.h" |
33 | | #include "common/status.h" |
34 | | #include "core/block/block.h" |
35 | | #include "exec/pipeline/pipeline_task.h" |
36 | | #include "exec/scan/file_scanner.h" |
37 | | #include "exec/scan/olap_scanner.h" // IWYU pragma: keep |
38 | | #include "exec/scan/scan_node.h" |
39 | | #include "exec/scan/scanner.h" |
40 | | #include "exec/scan/scanner_context.h" |
41 | | #include "runtime/exec_env.h" |
42 | | #include "runtime/runtime_state.h" |
43 | | #include "runtime/thread_context.h" |
44 | | #include "runtime/workload_group/workload_group_manager.h" |
45 | | #include "storage/tablet/tablet.h" |
46 | | #include "util/async_io.h" // IWYU pragma: keep |
47 | | #include "util/cpu_info.h" |
48 | | #include "util/defer_op.h" |
49 | | #include "util/thread.h" |
50 | | #include "util/threadpool.h" |
51 | | |
52 | | namespace doris { |
53 | | |
54 | | Status ScannerScheduler::submit(std::shared_ptr<ScannerContext> ctx, |
55 | 964k | std::shared_ptr<ScanTask> scan_task) { |
56 | 964k | if (ctx->done()) { |
57 | 0 | return Status::OK(); |
58 | 0 | } |
59 | 964k | auto task_lock = ctx->task_exec_ctx(); |
60 | 964k | if (task_lock == nullptr) { |
61 | 18 | LOG(INFO) << "could not lock task execution context, query " << ctx->debug_string() |
62 | 18 | << " maybe finished"; |
63 | 18 | return Status::OK(); |
64 | 18 | } |
65 | 964k | std::shared_ptr<ScannerDelegate> scanner_delegate = scan_task->scanner.lock(); |
66 | 964k | if (scanner_delegate == nullptr) { |
67 | 0 | return Status::OK(); |
68 | 0 | } |
69 | | |
70 | 964k | scanner_delegate->_scanner->start_wait_worker_timer(); |
71 | 964k | TabletStorageType type = scanner_delegate->_scanner->get_storage_type(); |
72 | 964k | auto sumbit_task = [&]() { |
73 | 964k | auto work_func = [scanner_ref = scan_task, ctx]() { |
74 | 963k | auto status = [&] { |
75 | 963k | RETURN_IF_CATCH_EXCEPTION(_scanner_scan(ctx, scanner_ref)); |
76 | 962k | return Status::OK(); |
77 | 963k | }(); |
78 | | |
79 | 963k | if (!status.ok()) { |
80 | 0 | scanner_ref->set_status(status); |
81 | 0 | ctx->push_back_scan_task(scanner_ref); |
82 | 0 | return true; |
83 | 0 | } |
84 | 963k | return scanner_ref->is_eos(); |
85 | 963k | }; |
86 | 964k | SimplifiedScanTask simple_scan_task = {work_func, ctx, scan_task}; |
87 | 964k | return this->submit_scan_task(simple_scan_task); |
88 | 964k | }; |
89 | | |
90 | 964k | Status submit_status = sumbit_task(); |
91 | 964k | if (!submit_status.ok()) { |
92 | | // User will see TooManyTasks error. It looks like a more reasonable error. |
93 | 0 | Status scan_task_status = Status::TooManyTasks( |
94 | 0 | "Failed to submit scanner to scanner pool reason:" + |
95 | 0 | std::string(submit_status.msg()) + "|type:" + std::to_string(type)); |
96 | 0 | scan_task->set_status(scan_task_status); |
97 | 0 | return scan_task_status; |
98 | 0 | } |
99 | | |
100 | 964k | return Status::OK(); |
101 | 964k | } |
102 | | |
103 | | void handle_reserve_memory_failure(RuntimeState* state, std::shared_ptr<ScannerContext> ctx, |
104 | 0 | const Status& st, size_t reserve_size) { |
105 | 0 | ctx->clear_free_blocks(); |
106 | 0 | auto* local_state = ctx->local_state(); |
107 | |
|
108 | 0 | auto debug_msg = fmt::format( |
109 | 0 | "Query: {} , scanner try to reserve: {}, operator name {}, " |
110 | 0 | "operator " |
111 | 0 | "id: {}, " |
112 | 0 | "task id: " |
113 | 0 | "{}, failed: {}", |
114 | 0 | print_id(state->query_id()), PrettyPrinter::print_bytes(reserve_size), |
115 | 0 | local_state->get_name(), local_state->parent()->node_id(), state->task_id(), |
116 | 0 | st.to_string()); |
117 | | // PROCESS_MEMORY_EXCEEDED error msg already contains process_mem_log_str |
118 | 0 | if (!st.is<ErrorCode::PROCESS_MEMORY_EXCEEDED>()) { |
119 | 0 | debug_msg += fmt::format(", debug info: {}", GlobalMemoryArbitrator::process_mem_log_str()); |
120 | 0 | } |
121 | 0 | VLOG_DEBUG << debug_msg; |
122 | |
|
123 | 0 | state->get_query_ctx()->set_low_memory_mode(); |
124 | 0 | } |
125 | | |
126 | | void ScannerScheduler::_scanner_scan(std::shared_ptr<ScannerContext> ctx, |
127 | 963k | std::shared_ptr<ScanTask> scan_task) { |
128 | 963k | auto task_lock = ctx->task_exec_ctx(); |
129 | 963k | if (task_lock == nullptr) { |
130 | 0 | return; |
131 | 0 | } |
132 | 963k | SCOPED_ATTACH_TASK(ctx->state()); |
133 | | |
134 | 963k | ctx->update_peak_running_scanner(1); |
135 | 963k | Defer defer([&] { ctx->update_peak_running_scanner(-1); }); |
136 | | |
137 | 963k | std::shared_ptr<ScannerDelegate> scanner_delegate = scan_task->scanner.lock(); |
138 | 963k | if (scanner_delegate == nullptr) { |
139 | 2 | return; |
140 | 2 | } |
141 | | |
142 | 963k | ScannerSPtr& scanner = scanner_delegate->_scanner; |
143 | | // for cpu hard limit, thread name should not be reset |
144 | 963k | if (ctx->_should_reset_thread_name) { |
145 | 0 | Thread::set_self_name("_scanner_scan"); |
146 | 0 | } |
147 | | |
148 | 963k | #ifndef __APPLE__ |
149 | | // The configuration item is used to lower the priority of the scanner thread, |
150 | | // typically employed to ensure CPU scheduling for write operations. |
151 | 963k | if (config::scan_thread_nice_value != 0 && scanner->get_name() != FileScanner::NAME) { |
152 | 0 | Thread::set_thread_nice_value(); |
153 | 0 | } |
154 | 963k | #endif |
155 | 963k | MonotonicStopWatch max_run_time_watch; |
156 | 963k | max_run_time_watch.start(); |
157 | 963k | scanner->update_wait_worker_timer(); |
158 | 963k | scanner->start_scan_cpu_timer(); |
159 | | |
160 | | // Counters can only be updated after prepare succeeds, or it may core dump. For example, olap scanner |
161 | | // will open tablet reader during prepare, if prepare did not succeed, tablet reader == nullptr. |
162 | 963k | bool need_update_profile = scanner->has_prepared(); |
163 | 1.91M | auto update_scanner_profile = [&]() { |
164 | 1.91M | if (need_update_profile) { |
165 | 5.40k | scanner->update_scan_cpu_timer(); |
166 | 5.40k | scanner->update_realtime_counters(); |
167 | 5.40k | need_update_profile = false; |
168 | 5.40k | } |
169 | 1.91M | }; |
170 | 963k | Defer defer_scanner([&] { |
171 | | // WorkloadGroup Policy checks cpu time in real time, so the counter should be updated |
172 | | // as soon as possible, it can not wait to be updated on close. |
173 | 962k | update_scanner_profile(); |
174 | 962k | }); |
175 | 963k | Status status = Status::OK(); |
176 | 963k | bool eos = false; |
177 | 963k | ASSIGN_STATUS_IF_CATCH_EXCEPTION( |
178 | 963k | RuntimeState* state = ctx->state(); DCHECK(nullptr != state); |
179 | | // scanner->open may allocate a large amount of memory (read blocks of data), |
180 | | // so it is better to also check low memory mode and clear free blocks here. |
181 | 963k | if (ctx->low_memory_mode()) { ctx->clear_free_blocks(); } |
182 | | |
183 | 963k | if (!scanner->has_prepared()) { |
184 | 963k | status = scanner->prepare(); |
185 | 963k | if (!status.ok()) { |
186 | 963k | eos = true; |
187 | 963k | } |
188 | 963k | } |
189 | | |
190 | 963k | if (!eos && !scanner->is_open()) { |
191 | 963k | status = scanner->open(state); |
192 | 963k | if (!status.ok()) { |
193 | 963k | eos = true; |
194 | 963k | } |
195 | 963k | scanner->set_opened(); |
196 | 963k | } |
197 | | |
198 | 963k | Status rf_status = scanner->try_append_late_arrival_runtime_filter(); |
199 | 963k | if (!rf_status.ok()) { |
200 | 963k | LOG(WARNING) << "Failed to append late arrival runtime filter: " |
201 | 963k | << rf_status.to_string(); |
202 | 963k | } |
203 | | |
204 | 963k | size_t raw_bytes_threshold = config::doris_scanner_row_bytes; |
205 | 963k | if (ctx->low_memory_mode()) { |
206 | 963k | ctx->clear_free_blocks(); |
207 | 963k | if (raw_bytes_threshold > ctx->low_memory_mode_scan_bytes_per_scanner()) { |
208 | 963k | raw_bytes_threshold = ctx->low_memory_mode_scan_bytes_per_scanner(); |
209 | 963k | } |
210 | 963k | } |
211 | | |
212 | 963k | size_t raw_bytes_read = 0; |
213 | 963k | bool first_read = true; int64_t limit = scanner->limit(); |
214 | | // Set to true when a newly read block cannot be merged into the last cached block, i.e. their total rows > batch_size |
215 | 963k | bool has_first_full_block = false; |
216 | | |
217 | | // During low memory mode, every scan task will return at most 2 blocks to reduce memory usage. |
218 | 963k | while (!eos && raw_bytes_read < raw_bytes_threshold && |
219 | 963k | (!ctx->low_memory_mode() || !has_first_full_block) && |
220 | 963k | (!has_first_full_block || doris::thread_context() |
221 | 963k | ->thread_mem_tracker_mgr->limiter_mem_tracker() |
222 | 963k | ->check_limit(1))) { |
223 | 963k | if (UNLIKELY(ctx->done())) { |
224 | 963k | eos = true; |
225 | 963k | break; |
226 | 963k | } |
227 | 963k | if (max_run_time_watch.elapsed_time() > |
228 | 963k | config::doris_scanner_max_run_time_ms * 1e6) { |
229 | 963k | break; |
230 | 963k | } |
231 | 963k | DEFER_RELEASE_RESERVED(); |
232 | 963k | BlockUPtr free_block; |
233 | 963k | if (first_read) { |
234 | 963k | free_block = ctx->get_free_block(first_read); |
235 | 963k | } else { |
236 | 963k | if (state->get_query_ctx() |
237 | 963k | ->resource_ctx() |
238 | 963k | ->task_controller() |
239 | 963k | ->is_enable_reserve_memory()) { |
240 | 963k | size_t block_avg_bytes = scanner->get_block_avg_bytes(); |
241 | 963k | auto st = thread_context()->thread_mem_tracker_mgr->try_reserve( |
242 | 963k | block_avg_bytes); |
243 | 963k | if (!st.ok()) { |
244 | 963k | handle_reserve_memory_failure(state, ctx, st, block_avg_bytes); |
245 | 963k | break; |
246 | 963k | } |
247 | 963k | } |
248 | 963k | free_block = ctx->get_free_block(first_read); |
249 | 963k | } |
250 | 963k | if (free_block == nullptr) { |
251 | 963k | break; |
252 | 963k | } |
253 | | // We got a newly created block or a reused block. |
254 | 963k | status = scanner->get_block_after_projects(state, free_block.get(), &eos); |
255 | 963k | first_read = false; |
256 | 963k | if (!status.ok()) { |
257 | 963k | LOG(WARNING) << "Scan thread read Scanner failed: " << status.to_string(); |
258 | 963k | break; |
259 | 963k | } |
260 | | // Check column type only after the block is read successfully, |
261 | | // otherwise it may cause a crash when the block is not normal. |
262 | 963k | _make_sure_virtual_col_is_materialized(scanner, free_block.get()); |
263 | | // Projection will truncate useless columns, makes block size change. |
264 | 963k | auto free_block_bytes = free_block->allocated_bytes(); |
265 | 963k | raw_bytes_read += free_block_bytes; |
266 | 963k | if (!scan_task->cached_blocks.empty() && |
267 | 963k | scan_task->cached_blocks.back().first->rows() + free_block->rows() <= |
268 | 963k | ctx->batch_size()) { |
269 | 963k | size_t block_size = scan_task->cached_blocks.back().first->allocated_bytes(); |
270 | 963k | MutableBlock mutable_block(scan_task->cached_blocks.back().first.get()); |
271 | 963k | status = mutable_block.merge(*free_block); |
272 | 963k | if (!status.ok()) { |
273 | 963k | LOG(WARNING) << "Block merge failed: " << status.to_string(); |
274 | 963k | break; |
275 | 963k | } |
276 | 963k | scan_task->cached_blocks.back().second = mutable_block.allocated_bytes(); |
277 | 963k | scan_task->cached_blocks.back().first.get()->set_columns( |
278 | 963k | std::move(mutable_block.mutable_columns())); |
279 | | |
280 | | // Whether returning the block succeeds or not, this free_block is not used by this scan task any more. |
281 | | // If block can be reused, its memory usage will be added back. |
282 | 963k | ctx->return_free_block(std::move(free_block)); |
283 | 963k | ctx->inc_block_usage(scan_task->cached_blocks.back().first->allocated_bytes() - |
284 | 963k | block_size); |
285 | 963k | } else { |
286 | 963k | if (!scan_task->cached_blocks.empty()) { |
287 | 963k | has_first_full_block = true; |
288 | 963k | } |
289 | 963k | ctx->inc_block_usage(free_block->allocated_bytes()); |
290 | 963k | scan_task->cached_blocks.emplace_back(std::move(free_block), free_block_bytes); |
291 | 963k | } |
292 | | |
293 | 963k | if (limit > 0 && limit < ctx->batch_size()) { |
294 | | // If this scanner has limit, and less than batch size, |
295 | | // return immediately and no need to wait raw_bytes_threshold. |
296 | | // This can save time that each scanner may only return a small number of rows, |
297 | | // but rows are enough from all scanners. |
298 | | // Without this break, a query like "select * from tbl where id=1 limit 10" |
299 | | // may scan a lot of data when the filter ratio of "id=1" is high. |
300 | | // If limit is larger than batch size, this rule is skipped, |
301 | | // to avoid producing too many small blocks when the user specifies a large limit. |
302 | 963k | break; |
303 | 963k | } |
304 | | |
305 | 963k | if (scan_task->cached_blocks.back().first->rows() > 0) { |
306 | 963k | auto block_avg_bytes = (scan_task->cached_blocks.back().first->bytes() + |
307 | 963k | scan_task->cached_blocks.back().first->rows() - 1) / |
308 | 963k | scan_task->cached_blocks.back().first->rows() * |
309 | 963k | ctx->batch_size(); |
310 | 963k | scanner->update_block_avg_bytes(block_avg_bytes); |
311 | 963k | } |
312 | 963k | if (ctx->low_memory_mode()) { |
313 | 963k | ctx->clear_free_blocks(); |
314 | 963k | if (raw_bytes_threshold > ctx->low_memory_mode_scan_bytes_per_scanner()) { |
315 | 963k | raw_bytes_threshold = ctx->low_memory_mode_scan_bytes_per_scanner(); |
316 | 963k | } |
317 | 963k | } |
318 | 963k | } // end for while |
319 | | |
320 | 963k | if (UNLIKELY(!status.ok())) { |
321 | 963k | scan_task->set_status(status); |
322 | 963k | eos = true; |
323 | 963k | }, |
324 | 963k | status); |
325 | | |
326 | 960k | if (UNLIKELY(!status.ok())) { |
327 | 1.33k | scan_task->set_status(status); |
328 | 1.33k | eos = true; |
329 | 1.33k | } |
330 | | |
331 | 960k | if (eos) { |
332 | | // If eos, scanner will call _collect_profile_before_close to update profile, |
333 | | // so we need to call update_scanner_profile here |
334 | 955k | update_scanner_profile(); |
335 | 955k | scanner->mark_to_need_to_close(); |
336 | 955k | } |
337 | 960k | scan_task->set_eos(eos); |
338 | | |
339 | 18.4E | VLOG_DEBUG << fmt::format( |
340 | 18.4E | "Scanner context {} has finished task, cached_block {} current scheduled task is " |
341 | 18.4E | "{}, eos: {}, status: {}", |
342 | 18.4E | ctx->ctx_id, scan_task->cached_blocks.size(), ctx->num_scheduled_scanners(), eos, |
343 | 18.4E | status.to_string()); |
344 | | |
345 | 960k | ctx->push_back_scan_task(scan_task); |
346 | 960k | } |
347 | 17.9k | int ScannerScheduler::default_local_scan_thread_num() { |
348 | 17.9k | return config::doris_scanner_thread_pool_thread_num > 0 |
349 | 17.9k | ? config::doris_scanner_thread_pool_thread_num |
350 | 17.9k | : std::max(48, CpuInfo::num_cores() * 2); |
351 | 17.9k | } |
352 | 11.6k | int ScannerScheduler::default_remote_scan_thread_num() { |
353 | 11.6k | int num = config::doris_max_remote_scanner_thread_pool_thread_num > 0 |
354 | 11.6k | ? config::doris_max_remote_scanner_thread_pool_thread_num |
355 | 11.6k | : std::max(512, CpuInfo::num_cores() * 10); |
356 | 11.6k | return std::max(num, default_local_scan_thread_num()); |
357 | 11.6k | } |
358 | | |
359 | 37 | int ScannerScheduler::get_remote_scan_thread_queue_size() { |
360 | 37 | return config::doris_remote_scanner_thread_pool_queue_size; |
361 | 37 | } |
362 | | |
363 | 6.32k | int ScannerScheduler::default_min_active_scan_threads() { |
364 | 6.32k | return config::min_active_scan_threads > 0 |
365 | 6.32k | ? config::min_active_scan_threads |
366 | 6.32k | : config::min_active_scan_threads = CpuInfo::num_cores() * 2; |
367 | 6.32k | } |
368 | | |
369 | 6.32k | int ScannerScheduler::default_min_active_file_scan_threads() { |
370 | 6.32k | return config::min_active_file_scan_threads > 0 |
371 | 6.32k | ? config::min_active_file_scan_threads |
372 | 6.32k | : config::min_active_file_scan_threads = CpuInfo::num_cores() * 8; |
373 | 6.32k | } |
374 | | |
375 | | void ScannerScheduler::_make_sure_virtual_col_is_materialized( |
376 | 1.00M | const std::shared_ptr<Scanner>& scanner, Block* free_block) { |
377 | 1.00M | #ifndef NDEBUG |
378 | | // Currently, virtual column can only be used on olap table. |
379 | 1.00M | std::shared_ptr<OlapScanner> olap_scanner = std::dynamic_pointer_cast<OlapScanner>(scanner); |
380 | 1.00M | if (olap_scanner == nullptr) { |
381 | 13.6k | return; |
382 | 13.6k | } |
383 | | |
384 | 986k | if (free_block->rows() == 0) { |
385 | 758k | return; |
386 | 758k | } |
387 | | |
388 | 227k | size_t idx = 0; |
389 | 695k | for (const auto& entry : *free_block) { |
390 | | // Virtual column must be materialized by the end of SegmentIterator's next batch method. |
391 | 695k | const ColumnNothing* column_nothing = |
392 | 695k | check_and_get_column<ColumnNothing>(entry.column.get()); |
393 | 695k | if (column_nothing == nullptr) { |
394 | 695k | idx++; |
395 | 695k | continue; |
396 | 695k | } |
397 | | |
398 | 18.4E | std::vector<std::string> vcid_to_idx; |
399 | | |
400 | 18.4E | for (const auto& pair : olap_scanner->_vir_cid_to_idx_in_block) { |
401 | 0 | vcid_to_idx.push_back(fmt::format("{}-{}", pair.first, pair.second)); |
402 | 0 | } |
403 | | |
404 | 18.4E | std::string error_msg = fmt::format( |
405 | 18.4E | "Column in idx {} is nothing, block columns {}, normal_columns " |
406 | 18.4E | "{}, " |
407 | 18.4E | "vir_cid_to_idx_in_block_msg {}", |
408 | 18.4E | idx, free_block->columns(), olap_scanner->_return_columns.size(), |
409 | 18.4E | fmt::format("_vir_cid_to_idx_in_block:[{}]", fmt::join(vcid_to_idx, ","))); |
410 | 18.4E | throw doris::Exception(ErrorCode::INTERNAL_ERROR, error_msg); |
411 | 695k | } |
412 | 227k | #endif |
413 | 227k | } |
414 | | |
415 | 964k | Result<SharedListenableFuture<Void>> ScannerSplitRunner::process_for(std::chrono::nanoseconds) { |
416 | 964k | _started = true; |
417 | 964k | bool is_completed = _scan_func(); |
418 | 964k | if (is_completed) { |
419 | 958k | _completion_future.set_value(Void {}); |
420 | 958k | } |
421 | 964k | return SharedListenableFuture<Void>::create_ready(Void {}); |
422 | 964k | } |
423 | | |
424 | 962k | bool ScannerSplitRunner::is_finished() { |
425 | 962k | return _completion_future.is_done(); |
426 | 962k | } |
427 | | |
428 | 957k | Status ScannerSplitRunner::finished_status() { |
429 | 957k | return _completion_future.get_status(); |
430 | 957k | } |
431 | | |
432 | 0 | bool ScannerSplitRunner::is_started() const { |
433 | 0 | return _started.load(); |
434 | 0 | } |
435 | | |
436 | 5.41k | bool ScannerSplitRunner::is_auto_reschedule() const { |
437 | 5.41k | return false; |
438 | 5.41k | } |
439 | | |
440 | | } // namespace doris |