Coverage Report

Created: 2026-04-01 07:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/scan/scanner_scheduler.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/scan/scanner_scheduler.h"
19
20
#include <algorithm>
21
#include <cstdint>
22
#include <functional>
23
#include <list>
24
#include <memory>
25
#include <ostream>
26
#include <string>
27
#include <utility>
28
29
#include "common/compiler_util.h" // IWYU pragma: keep
30
#include "common/config.h"
31
#include "common/exception.h"
32
#include "common/logging.h"
33
#include "common/status.h"
34
#include "core/block/block.h"
35
#include "exec/pipeline/pipeline_task.h"
36
#include "exec/scan/file_scanner.h"
37
#include "exec/scan/olap_scanner.h" // IWYU pragma: keep
38
#include "exec/scan/scan_node.h"
39
#include "exec/scan/scanner.h"
40
#include "exec/scan/scanner_context.h"
41
#include "runtime/exec_env.h"
42
#include "runtime/runtime_state.h"
43
#include "runtime/thread_context.h"
44
#include "runtime/workload_group/workload_group_manager.h"
45
#include "storage/tablet/tablet.h"
46
#include "util/async_io.h" // IWYU pragma: keep
47
#include "util/cpu_info.h"
48
#include "util/defer_op.h"
49
#include "util/thread.h"
50
#include "util/threadpool.h"
51
52
namespace doris {
53
54
// Hands one scan task over to the scanner thread pool.
//
// Nothing is submitted, and OK is returned, when
//   - the scanner context is already done,
//   - the task execution context can no longer be obtained (the query has probably finished), or
//   - the scanner referenced by the task has already been released.
// Otherwise the task is marked IN_FLIGHT, its scanner is paused, and a SimplifiedScanTask whose
// callable runs _scanner_scan() is passed to submit_scan_task(). If that submission fails, the
// failure is stored on the scan task and returned as a TooManyTasks status.
Status ScannerScheduler::submit(std::shared_ptr<ScannerContext> ctx,
                                std::shared_ptr<ScanTask> scan_task) {
    if (ctx->done()) {
        return Status::OK();
    }

    // Kept alive as a local until this function returns.
    auto task_lock = ctx->task_exec_ctx();
    if (task_lock == nullptr) {
        LOG(INFO) << "could not lock task execution context, query " << ctx->debug_string()
                  << " maybe finished";
        return Status::OK();
    }

    std::shared_ptr<ScannerDelegate> scanner_delegate = scan_task->scanner.lock();
    if (scanner_delegate == nullptr) {
        return Status::OK();
    }

    scan_task->set_state(ScanTask::State::IN_FLIGHT);
    scanner_delegate->_scanner->pause();
    // Read before the task is handed over; only used for the error message below.
    TabletStorageType type = scanner_delegate->_scanner->get_storage_type();

    // Callable executed by the pool. Its return value tells the caller whether the split is
    // finished: true on error (after pushing the task back with the error status), otherwise
    // whatever the scan task reports through is_eos().
    auto run_scan = [scanner_ref = scan_task, ctx]() {
        auto scan_status = [&] {
            RETURN_IF_CATCH_EXCEPTION(_scanner_scan(ctx, scanner_ref));
            return Status::OK();
        }();

        if (scan_status.ok()) {
            return scanner_ref->is_eos();
        }
        scanner_ref->set_status(scan_status);
        ctx->push_back_scan_task(scanner_ref);
        return true;
    };

    SimplifiedScanTask simple_scan_task = {run_scan, ctx, scan_task};
    Status submit_status = submit_scan_task(simple_scan_task);
    if (submit_status.ok()) {
        return Status::OK();
    }

    // User will see TooManyTasks error. It looks like a more reasonable error.
    Status scan_task_status = Status::TooManyTasks(
            "Failed to submit scanner to scanner pool reason:" +
            std::string(submit_status.msg()) + "|type:" + std::to_string(type));
    scan_task->set_status(scan_task_status);
    return scan_task_status;
}
103
104
0
void handle_reserve_memory_failure(RuntimeState* state, std::shared_ptr<ScannerContext> ctx,
105
0
                                   const Status& st, size_t reserve_size) {
106
0
    ctx->clear_free_blocks();
107
    auto* local_state = ctx->local_state();
108
0
109
0
    auto debug_msg = fmt::format(
110
0
            "Query: {} , scanner try to reserve: {}, operator name {}, "
111
0
            "operator "
112
0
            "id: {}, "
113
0
            "task id: "
114
0
            "{}, failed: {}",
115
0
            print_id(state->query_id()), PrettyPrinter::print_bytes(reserve_size),
116
0
            local_state->get_name(), local_state->parent()->node_id(), state->task_id(),
117
            st.to_string());
118
0
    // PROCESS_MEMORY_EXCEEDED error msg alread contains process_mem_log_str
119
0
    if (!st.is<ErrorCode::PROCESS_MEMORY_EXCEEDED>()) {
120
0
        debug_msg += fmt::format(", debug info: {}", GlobalMemoryArbitrator::process_mem_log_str());
121
0
    }
122
    VLOG_DEBUG << debug_msg;
123
0
124
0
    state->get_query_ctx()->set_low_memory_mode();
125
}
126
127
1.26M
void ScannerScheduler::_scanner_scan(std::shared_ptr<ScannerContext> ctx,
128
1.26M
                                     std::shared_ptr<ScanTask> scan_task) {
129
1.26M
    auto task_lock = ctx->task_exec_ctx();
130
0
    if (task_lock == nullptr) {
131
0
        return;
132
1.26M
    }
133
    SCOPED_ATTACH_TASK(ctx->state());
134
1.26M
135
1.26M
    ctx->update_peak_running_scanner(1);
136
    Defer defer([&] { ctx->update_peak_running_scanner(-1); });
137
1.26M
138
1.26M
    std::shared_ptr<ScannerDelegate> scanner_delegate = scan_task->scanner.lock();
139
0
    if (scanner_delegate == nullptr) {
140
0
        return;
141
    }
142
1.26M
143
    ScannerSPtr& scanner = scanner_delegate->_scanner;
144
1.26M
    // for cpu hard limit, thread name should not be reset
145
0
    if (ctx->_should_reset_thread_name) {
146
0
        Thread::set_self_name("_scanner_scan");
147
    }
148
1.26M
149
#ifndef __APPLE__
150
    // The configuration item is used to lower the priority of the scanner thread,
151
1.26M
    // typically employed to ensure CPU scheduling for write operations.
152
0
    if (config::scan_thread_nice_value != 0 && scanner->get_name() != FileScanner::NAME) {
153
0
        Thread::set_thread_nice_value();
154
1.26M
    }
155
#endif
156
157
    // we set and get counter according below order, to make sure the counter is updated before get_block, and the time of get_block is recorded in the counter.
158
    // 1. update_wait_worker_timer to make sure the time of waiting for worker thread is recorded in the timer
159
    // 2. start_scan_cpu_timer to make sure the cpu timer include the time of open and get_block, which is the real cpu time of scanner
160
    // 3. update_scan_cpu_timer when defer, to make sure the cpu timer include the time of open and get_block, which is the real cpu time of scanner
161
    // 4. start_wait_worker_timer when defer, to make sure the time of waiting for worker thread is recorded in the timer
162
1.26M
163
1.26M
    MonotonicStopWatch max_run_time_watch;
164
1.26M
    max_run_time_watch.start();
165
1.26M
    scanner->resume();
166
167
1.26M
    bool need_update_profile = true;
168
1.26M
    auto update_scanner_profile = [&]() {
169
1.24M
        if (need_update_profile) {
170
1.24M
            scanner->pause();
171
1.24M
            scanner->update_realtime_counters();
172
1.24M
            need_update_profile = false;
173
1.24M
        }
174
1.24M
    };
175
176
1.26M
    Status status = Status::OK();
177
1.26M
    bool eos = false;
178
1.26M
179
1.26M
    ASSIGN_STATUS_IF_CATCH_EXCEPTION(
180
            RuntimeState* state = ctx->state(); DCHECK(nullptr != state);
181
8.94k
            // scanner->open may alloc plenty amount of memory(read blocks of data),
182
8.94k
            // so better to also check low memory and clear free blocks here.
183
1.26M
            if (ctx->low_memory_mode()) { ctx->clear_free_blocks(); }
184
185
1.26M
            if (!scanner->has_prepared()) {
186
1.26M
                status = scanner->prepare();
187
                if (!status.ok()) {
188
                    eos = true;
189
1.26M
                }
190
            }
191
1.26M
192
1.26M
            if (!eos && !scanner->is_open()) {
193
1.26M
                status = scanner->open(state);
194
1.26M
                if (!status.ok()) {
195
1.26M
                    eos = true;
196
1.26M
                }
197
                scanner->set_opened();
198
1.26M
            }
199
1.26M
200
1.26M
            Status rf_status = scanner->try_append_late_arrival_runtime_filter();
201
1.26M
            if (!rf_status.ok()) {
202
1.26M
                LOG(WARNING) << "Failed to append late arrival runtime filter: "
203
1.26M
                             << rf_status.to_string();
204
1.26M
            }
205
206
1.26M
            size_t raw_bytes_threshold = config::doris_scanner_row_bytes;
207
1.26M
            if (ctx->low_memory_mode()) {
208
1.26M
                ctx->clear_free_blocks();
209
1.26M
                if (raw_bytes_threshold > ctx->low_memory_mode_scan_bytes_per_scanner()) {
210
1.26M
                    raw_bytes_threshold = ctx->low_memory_mode_scan_bytes_per_scanner();
211
                }
212
1.26M
            }
213
1.26M
214
1.26M
            bool first_read = true;
215
1.26M
            int64_t limit = scanner->limit(); if (UNLIKELY(ctx->done())) {
216
1.26M
                eos = true;
217
1.26M
            } else if (ctx->remaining_limit() == 0) { eos = true; } else if (!eos) {
218
1.26M
                do {
219
                    DEFER_RELEASE_RESERVED();
220
1.26M
                    BlockUPtr free_block;
221
1.26M
                    if (first_read) {
222
                        free_block = ctx->get_free_block(first_read);
223
1.26M
                    } else {
224
                        if (state->get_query_ctx()
225
                                    ->resource_ctx()
226
1.26M
                                    ->task_controller()
227
1.26M
                                    ->is_enable_reserve_memory()) {
228
1.26M
                            size_t block_avg_bytes = scanner->get_block_avg_bytes();
229
1.26M
                            auto st = thread_context()->thread_mem_tracker_mgr->try_reserve(
230
1.26M
                                    block_avg_bytes);
231
1.26M
                            if (!st.ok()) {
232
1.26M
                                handle_reserve_memory_failure(state, ctx, st, block_avg_bytes);
233
1.26M
                                break;
234
1.26M
                            }
235
                        }
236
1.26M
                        free_block = ctx->get_free_block(first_read);
237
1.26M
                    }
238
1.26M
                    if (free_block == nullptr) {
239
1.26M
                        break;
240
1.26M
                    }
241
1.26M
                    // We got a new created block or a reused block.
242
1.26M
                    status = scanner->get_block_after_projects(state, free_block.get(), &eos);
243
1.26M
                    first_read = false;
244
1.26M
                    if (!status.ok()) {
245
1.26M
                        LOG(WARNING) << "Scan thread read Scanner failed: " << status.to_string();
246
1.26M
                        break;
247
1.26M
                    }
248
1.26M
                    // Check column type only after block is read successfully.
249
1.26M
                    // Or it may cause a crash when the block is not normal.
250
1.26M
                    _make_sure_virtual_col_is_materialized(scanner, free_block.get());
251
1.26M
252
1.26M
                    // Shared limit quota: acquire rows from the context's shared pool.
253
1.26M
                    // Discard or truncate the block if quota is exhausted.
254
1.26M
                    if (free_block->rows() > 0) {
255
1.26M
                        int64_t block_rows = free_block->rows();
256
1.26M
                        int64_t granted = ctx->acquire_limit_quota(block_rows);
257
1.26M
                        if (granted == 0) {
258
1.26M
                            // No quota remaining, discard this block and mark eos.
259
1.26M
                            ctx->return_free_block(std::move(free_block));
260
1.26M
                            eos = true;
261
1.26M
                            break;
262
1.26M
                        } else if (granted < block_rows) {
263
1.26M
                            // Partial quota: truncate block to granted rows and mark eos.
264
1.26M
                            free_block->set_num_rows(granted);
265
1.26M
                            eos = true;
266
                        }
267
1.26M
                    }
268
1.26M
                    // Projection will truncate useless columns, makes block size change.
269
1.26M
                    auto free_block_bytes = free_block->allocated_bytes();
270
1.26M
                    ctx->reestimated_block_mem_bytes(cast_set<int64_t>(free_block_bytes));
271
1.26M
                    DCHECK(scan_task->cached_block == nullptr);
272
1.26M
                    ctx->inc_block_usage(free_block->allocated_bytes());
273
                    scan_task->cached_block = std::move(free_block);
274
275
1.26M
                    // Per-scanner small-limit optimization: if limit is small (< batch_size),
276
                    // return immediately instead of accumulating to raw_bytes_threshold.
277
                    if (limit > 0 && limit < ctx->batch_size()) {
278
                        break;
279
1.26M
                    }
280
1.26M
281
1.26M
                    if (scan_task->cached_block->rows() > 0) {
282
1.26M
                        auto block_avg_bytes = (scan_task->cached_block->bytes() +
283
                                                scan_task->cached_block->rows() - 1) /
284
1.26M
                                               scan_task->cached_block->rows() * ctx->batch_size();
285
1.26M
                        scanner->update_block_avg_bytes(block_avg_bytes);
286
1.26M
                    }
287
1.26M
                    if (ctx->low_memory_mode()) {
288
                        ctx->clear_free_blocks();
289
1.26M
                    }
290
1.26M
                } while (false);
291
1.26M
            }
292
1.26M
293
                                              if (UNLIKELY(!status.ok())) {
294
1.26M
                                                  scan_task->set_status(status);
295
1.26M
                                                  eos = true;
296
1.26M
                                              },
297
1.26M
                                              status);
298
1.26M
299
1.26M
    if (UNLIKELY(!status.ok())) {
300
1.26M
        scan_task->set_status(status);
301
1.26M
        eos = true;
302
1.26M
    }
303
1.26M
304
1.26M
    if (eos) {
305
1.26M
        // If eos, scanner will call _collect_profile_before_close to update profile,
306
1.26M
        // so we need update_scanner_profile here
307
1.26M
        update_scanner_profile();
308
1.26M
        scanner->mark_to_need_to_close();
309
        scan_task->set_state(ScanTask::State::EOS);
310
    } else {
311
        scan_task->set_state(ScanTask::State::COMPLETED);
312
1.26M
    }
313
1.26M
314
1.26M
    VLOG_DEBUG << fmt::format(
315
1.26M
            "Scanner context {} has finished task, current scheduled task is "
316
1.26M
            "{}, eos: {}, status: {}",
317
1.26M
            ctx->ctx_id, ctx->num_scheduled_scanners(), eos, status.to_string());
318
1.26M
319
1.26M
    ctx->push_back_scan_task(scan_task);
320
1.26M
}
321
1.26M
int ScannerScheduler::default_local_scan_thread_num() {
322
    return config::doris_scanner_thread_pool_thread_num > 0
323
                   ? config::doris_scanner_thread_pool_thread_num
324
                   : std::max(48, CpuInfo::num_cores() * 2);
325
1.26M
}
326
1.26M
int ScannerScheduler::default_remote_scan_thread_num() {
327
1.26M
    int num = config::doris_max_remote_scanner_thread_pool_thread_num > 0
328
                      ? config::doris_max_remote_scanner_thread_pool_thread_num
329
1.26M
                      : std::max(512, CpuInfo::num_cores() * 10);
330
1.26M
    return std::max(num, default_local_scan_thread_num());
331
1.26M
}
332
1.26M
333
1.26M
int ScannerScheduler::get_remote_scan_thread_queue_size() {
334
1.26M
    return config::doris_remote_scanner_thread_pool_queue_size;
335
1.26M
}
336
1.26M
337
1.26M
int ScannerScheduler::default_min_active_scan_threads() {
338
1.26M
    return config::min_active_scan_threads > 0
339
1.26M
                   ? config::min_active_scan_threads
340
1.26M
                   : config::min_active_scan_threads = CpuInfo::num_cores() * 2;
341
1.26M
}
342
1.26M
343
int ScannerScheduler::default_min_active_file_scan_threads() {
344
1.26M
    return config::min_active_file_scan_threads > 0
345
1.26M
                   ? config::min_active_file_scan_threads
346
1.26M
                   : config::min_active_file_scan_threads = CpuInfo::num_cores() * 8;
347
1.26M
}
348
1.26M
349
void ScannerScheduler::_make_sure_virtual_col_is_materialized(
350
1.25M
        const std::shared_ptr<Scanner>& scanner, Block* free_block) {
351
1.33k
#ifndef NDEBUG
352
1.33k
    // Currently, virtual column can only be used on olap table.
353
1.33k
    std::shared_ptr<OlapScanner> olap_scanner = std::dynamic_pointer_cast<OlapScanner>(scanner);
354
    if (olap_scanner == nullptr) {
355
1.25M
        return;
356
    }
357
358
1.24M
    if (free_block->rows() == 0) {
359
1.24M
        return;
360
1.24M
    }
361
1.25M
362
    size_t idx = 0;
363
18.4E
    for (const auto& entry : *free_block) {
364
18.4E
        // Virtual column must be materialized on the end of SegmentIterator's next batch method.
365
18.4E
        const ColumnNothing* column_nothing =
366
18.4E
                check_and_get_column<ColumnNothing>(entry.column.get());
367
18.4E
        if (column_nothing == nullptr) {
368
            idx++;
369
1.25M
            continue;
370
1.25M
        }
371
72.3k
372
72.3k
        std::vector<std::string> vcid_to_idx;
373
72.3k
374
72.3k
        for (const auto& pair : olap_scanner->_vir_cid_to_idx_in_block) {
375
72.3k
            vcid_to_idx.push_back(fmt::format("{}-{}", pair.first, pair.second));
376
66.2k
        }
377
66.2k
378
66.2k
        std::string error_msg = fmt::format(
379
66.2k
                "Column in idx {} is nothing, block columns {}, normal_columns "
380
66.2k
                "{}, "
381
66.2k
                "vir_cid_to_idx_in_block_msg {}",
382
                idx, free_block->columns(), olap_scanner->_return_columns.size(),
383
38
                fmt::format("_vir_cid_to_idx_in_block:[{}]", fmt::join(vcid_to_idx, ",")));
384
38
        throw doris::Exception(ErrorCode::INTERNAL_ERROR, error_msg);
385
38
    }
386
#endif
387
6.12k
}
388
6.12k
389
6.12k
// Runs the scan function once. The duration argument is ignored.
//
// Marks the runner as started, invokes _scan_func(), and fulfils the completion future when the
// scan function reports completion. The returned future is always already ready.
Result<SharedListenableFuture<Void>> ScannerSplitRunner::process_for(std::chrono::nanoseconds) {
    _started = true;
    if (_scan_func()) {
        _completion_future.set_value(Void {});
    }
    return SharedListenableFuture<Void>::create_ready(Void {});
}
397
6.12k
398
bool ScannerSplitRunner::is_finished() {
399
    return _completion_future.is_done();
400
1.35M
}
401
1.35M
402
// Returns the status carried by the completion future.
Status ScannerSplitRunner::finished_status() {
    Status completion_status = _completion_future.get_status();
    return completion_status;
}
405
150k
406
150k
bool ScannerSplitRunner::is_started() const {
407
    return _started.load();
408
1.20M
}
409
975k
410
975k
// This runner never asks to be rescheduled automatically.
bool ScannerSplitRunner::is_auto_reschedule() const {
    constexpr bool kAutoReschedule = false;
    return kAutoReschedule;
}
413
712k
414
} // namespace doris