Coverage Report

Created: 2026-03-12 15:09

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/scan/scanner_scheduler.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/scan/scanner_scheduler.h"
19
20
#include <algorithm>
21
#include <cstdint>
22
#include <functional>
23
#include <list>
24
#include <memory>
25
#include <ostream>
26
#include <string>
27
#include <utility>
28
29
#include "common/compiler_util.h" // IWYU pragma: keep
30
#include "common/config.h"
31
#include "common/exception.h"
32
#include "common/logging.h"
33
#include "common/status.h"
34
#include "core/block/block.h"
35
#include "exec/pipeline/pipeline_task.h"
36
#include "exec/scan/file_scanner.h"
37
#include "exec/scan/olap_scanner.h" // IWYU pragma: keep
38
#include "exec/scan/scan_node.h"
39
#include "exec/scan/scanner.h"
40
#include "exec/scan/scanner_context.h"
41
#include "runtime/exec_env.h"
42
#include "runtime/runtime_state.h"
43
#include "runtime/thread_context.h"
44
#include "runtime/workload_group/workload_group_manager.h"
45
#include "storage/tablet/tablet.h"
46
#include "util/async_io.h" // IWYU pragma: keep
47
#include "util/cpu_info.h"
48
#include "util/defer_op.h"
49
#include "util/thread.h"
50
#include "util/threadpool.h"
51
52
namespace doris {
53
54
// Submits `scan_task` to the scanner pool on behalf of `ctx`.
//
// Returns OK without submitting anything when:
//   - the context is already done,
//   - the task execution context can no longer be obtained, or
//   - the scanner referenced by `scan_task` has already been released.
// When the pool rejects the task, a TooManyTasks status is stored on `scan_task`
// and returned to the caller.
Status ScannerScheduler::submit(std::shared_ptr<ScannerContext> ctx,
                                std::shared_ptr<ScanTask> scan_task) {
    if (ctx->done()) {
        return Status::OK();
    }

    // The handle stays alive in this local until the function returns.
    auto exec_ctx_handle = ctx->task_exec_ctx();
    if (exec_ctx_handle == nullptr) {
        LOG(INFO) << "could not lock task execution context, query " << ctx->debug_string()
                  << " maybe finished";
        return Status::OK();
    }

    std::shared_ptr<ScannerDelegate> delegate = scan_task->scanner.lock();
    if (delegate == nullptr) {
        return Status::OK();
    }

    delegate->_scanner->start_wait_worker_timer();
    TabletStorageType storage_type = delegate->_scanner->get_storage_type();

    // Work item executed by the pool. It returns true when the scan failed (the
    // failure is stored on the task and the task is handed back to the context),
    // otherwise it reports whether the task has reached eos.
    auto run_scan = [task = scan_task, ctx]() {
        auto scan_status = [&] {
            RETURN_IF_CATCH_EXCEPTION(_scanner_scan(ctx, task));
            return Status::OK();
        }();

        if (scan_status.ok()) {
            return task->is_eos();
        }
        task->set_status(scan_status);
        ctx->push_back_scan_task(task);
        return true;
    };

    SimplifiedScanTask simple_scan_task = {run_scan, ctx, scan_task};
    Status submit_status = submit_scan_task(simple_scan_task);
    if (submit_status.ok()) {
        return Status::OK();
    }

    // User will see TooManyTasks error. It looks like a more reasonable error.
    Status rejected_status = Status::TooManyTasks(
            "Failed to submit scanner to scanner pool reason:" +
            std::string(submit_status.msg()) + "|type:" + std::to_string(storage_type));
    scan_task->set_status(rejected_status);
    return rejected_status;
}
102
103
void handle_reserve_memory_failure(RuntimeState* state, std::shared_ptr<ScannerContext> ctx,
104
0
                                   const Status& st, size_t reserve_size) {
105
0
    ctx->clear_free_blocks();
106
0
    auto* local_state = ctx->local_state();
107
108
0
    auto debug_msg = fmt::format(
109
0
            "Query: {} , scanner try to reserve: {}, operator name {}, "
110
0
            "operator "
111
0
            "id: {}, "
112
0
            "task id: "
113
0
            "{}, failed: {}",
114
0
            print_id(state->query_id()), PrettyPrinter::print_bytes(reserve_size),
115
0
            local_state->get_name(), local_state->parent()->node_id(), state->task_id(),
116
0
            st.to_string());
117
    // PROCESS_MEMORY_EXCEEDED error msg alread contains process_mem_log_str
118
0
    if (!st.is<ErrorCode::PROCESS_MEMORY_EXCEEDED>()) {
119
0
        debug_msg += fmt::format(", debug info: {}", GlobalMemoryArbitrator::process_mem_log_str());
120
0
    }
121
0
    VLOG_DEBUG << debug_msg;
122
123
0
    state->get_query_ctx()->set_low_memory_mode();
124
0
}
125
126
// Runs one scheduling slice of `scan_task` for the scanner context `ctx`.
//
// Outline of what the body does:
//   1. Returns immediately if the task execution context or the scanner behind the
//      task can no longer be obtained (the task is NOT pushed back in those cases).
//   2. Prepares and opens the scanner if that has not happened yet.
//   3. Reads blocks from the scanner into `scan_task->cached_blocks` until one of the
//      loop's stop conditions is hit (eos, byte threshold, run-time limit, memory
//      limits, failed reservation, no free block, read/merge error, small LIMIT).
//   4. Stores the final status / eos flag on the task and hands the task back to
//      `ctx` via push_back_scan_task().
void ScannerScheduler::_scanner_scan(std::shared_ptr<ScannerContext> ctx,
127
963k
                                     std::shared_ptr<ScanTask> scan_task) {
// Nothing to do if the task execution context is no longer available.
128
963k
    auto task_lock = ctx->task_exec_ctx();
129
963k
    if (task_lock == nullptr) {
130
0
        return;
131
0
    }
132
963k
    SCOPED_ATTACH_TASK(ctx->state());
133
// Count this scanner as running for the duration of the call; the Defer below
// applies the matching -1 when the function exits.
134
963k
    ctx->update_peak_running_scanner(1);
135
963k
    Defer defer([&] { ctx->update_peak_running_scanner(-1); });
136
// The task refers to its scanner through `scanner.lock()`; give up if it is gone.
137
963k
    std::shared_ptr<ScannerDelegate> scanner_delegate = scan_task->scanner.lock();
138
963k
    if (scanner_delegate == nullptr) {
139
2
        return;
140
2
    }
141
142
963k
    ScannerSPtr& scanner = scanner_delegate->_scanner;
143
    // for cpu hard limit, thread name should not be reset
144
963k
    if (ctx->_should_reset_thread_name) {
145
0
        Thread::set_self_name("_scanner_scan");
146
0
    }
147
148
963k
#ifndef __APPLE__
149
    // The configuration item is used to lower the priority of the scanner thread,
150
    // typically employed to ensure CPU scheduling for write operations.
151
963k
    if (config::scan_thread_nice_value != 0 && scanner->get_name() != FileScanner::NAME) {
152
0
        Thread::set_thread_nice_value();
153
0
    }
154
963k
#endif
// Stopwatch used by the read loop to bound how long this call keeps running.
155
963k
    MonotonicStopWatch max_run_time_watch;
156
963k
    max_run_time_watch.start();
157
963k
    scanner->update_wait_worker_timer();
158
963k
    scanner->start_scan_cpu_timer();
159
160
    // Counter update need prepare successfully, or it maybe core. For example, olap scanner
161
    // will open tablet reader during prepare, if not prepare successfully, tablet reader == nullptr.
// Note: the flag is sampled here, before prepare() may be called further down, and
// is cleared after the first update so the counters are refreshed at most once.
162
963k
    bool need_update_profile = scanner->has_prepared();
163
1.91M
    auto update_scanner_profile = [&]() {
164
1.91M
        if (need_update_profile) {
165
5.40k
            scanner->update_scan_cpu_timer();
166
5.40k
            scanner->update_realtime_counters();
167
5.40k
            need_update_profile = false;
168
5.40k
        }
169
1.91M
    };
170
963k
    Defer defer_scanner([&] {
171
        // WorkloadGroup Policy will check cputime realtime, so that should update the counter
172
        // as soon as possible, could not update it on close.
173
962k
        update_scanner_profile();
174
962k
    });
175
963k
    Status status = Status::OK();
176
963k
    bool eos = false;
// Everything up to the `,` before `status);` is the first macro argument.
// NOTE(review): presumably the macro stores a caught exception into `status` —
// confirm against the macro definition.
177
963k
    ASSIGN_STATUS_IF_CATCH_EXCEPTION(
178
963k
            RuntimeState* state = ctx->state(); DCHECK(nullptr != state);
179
            // scanner->open may alloc plenty amount of memory(read blocks of data),
180
            // so better to also check low memory and clear free blocks here.
181
963k
            if (ctx->low_memory_mode()) { ctx->clear_free_blocks(); }
182
// A failed prepare() marks the task as eos; `status` carries the error.
183
963k
            if (!scanner->has_prepared()) {
184
963k
                status = scanner->prepare();
185
963k
                if (!status.ok()) {
186
963k
                    eos = true;
187
963k
                }
188
963k
            }
189
// set_opened() is called whether or not open() succeeded.
190
963k
            if (!eos && !scanner->is_open()) {
191
963k
                status = scanner->open(state);
192
963k
                if (!status.ok()) {
193
963k
                    eos = true;
194
963k
                }
195
963k
                scanner->set_opened();
196
963k
            }
197
// A failure here is only logged; it does not change `status` or `eos`.
198
963k
            Status rf_status = scanner->try_append_late_arrival_runtime_filter();
199
963k
            if (!rf_status.ok()) {
200
963k
                LOG(WARNING) << "Failed to append late arrival runtime filter: "
201
963k
                             << rf_status.to_string();
202
963k
            }
203
// Byte budget for this call; capped by the per-scanner limit in low memory mode.
204
963k
            size_t raw_bytes_threshold = config::doris_scanner_row_bytes;
205
963k
            if (ctx->low_memory_mode()) {
206
963k
                ctx->clear_free_blocks();
207
963k
                if (raw_bytes_threshold > ctx->low_memory_mode_scan_bytes_per_scanner()) {
208
963k
                    raw_bytes_threshold = ctx->low_memory_mode_scan_bytes_per_scanner();
209
963k
                }
210
963k
            }
211
212
963k
            size_t raw_bytes_read = 0;
213
963k
            bool first_read = true; int64_t limit = scanner->limit();
214
            // If the first block is full, then it is true. Or the first block + second block > batch_size
215
963k
            bool has_first_full_block = false;
216
217
            // During low memory mode, every scan task will return at most 2 block to reduce memory usage.
218
963k
            while (!eos && raw_bytes_read < raw_bytes_threshold &&
219
963k
                   (!ctx->low_memory_mode() || !has_first_full_block) &&
220
963k
                   (!has_first_full_block || doris::thread_context()
221
963k
                                                     ->thread_mem_tracker_mgr->limiter_mem_tracker()
222
963k
                                                     ->check_limit(1))) {
223
963k
                if (UNLIKELY(ctx->done())) {
224
963k
                    eos = true;
225
963k
                    break;
226
963k
                }
// Leave the loop once the stopwatch exceeds doris_scanner_max_run_time_ms
// (the config value is scaled by 1e6 before the comparison).
227
963k
                if (max_run_time_watch.elapsed_time() >
228
963k
                    config::doris_scanner_max_run_time_ms * 1e6) {
229
963k
                    break;
230
963k
                }
231
963k
                DEFER_RELEASE_RESERVED();
232
963k
                BlockUPtr free_block;
// The first read takes a free block directly; later reads first try to reserve
// the scanner's average block size when memory reservation is enabled, and stop
// the loop if that reservation fails.
233
963k
                if (first_read) {
234
963k
                    free_block = ctx->get_free_block(first_read);
235
963k
                } else {
236
963k
                    if (state->get_query_ctx()
237
963k
                                ->resource_ctx()
238
963k
                                ->task_controller()
239
963k
                                ->is_enable_reserve_memory()) {
240
963k
                        size_t block_avg_bytes = scanner->get_block_avg_bytes();
241
963k
                        auto st = thread_context()->thread_mem_tracker_mgr->try_reserve(
242
963k
                                block_avg_bytes);
243
963k
                        if (!st.ok()) {
244
963k
                            handle_reserve_memory_failure(state, ctx, st, block_avg_bytes);
245
963k
                            break;
246
963k
                        }
247
963k
                    }
248
963k
                    free_block = ctx->get_free_block(first_read);
249
963k
                }
250
963k
                if (free_block == nullptr) {
251
963k
                    break;
252
963k
                }
253
                // We got a new created block or a reused block.
254
963k
                status = scanner->get_block_after_projects(state, free_block.get(), &eos);
255
963k
                first_read = false;
256
963k
                if (!status.ok()) {
257
963k
                    LOG(WARNING) << "Scan thread read Scanner failed: " << status.to_string();
258
963k
                    break;
259
963k
                }
260
                // Check column type only after block is read successfully.
261
                // Or it may cause a crash when the block is not normal.
262
963k
                _make_sure_virtual_col_is_materialized(scanner, free_block.get());
263
                // Projection will truncate useless columns, makes block size change.
264
963k
                auto free_block_bytes = free_block->allocated_bytes();
265
963k
                raw_bytes_read += free_block_bytes;
// If the last cached block plus the new block still fit into one batch, merge the
// new block into the cached one and return the now-unused block to the context.
266
963k
                if (!scan_task->cached_blocks.empty() &&
267
963k
                    scan_task->cached_blocks.back().first->rows() + free_block->rows() <=
268
963k
                            ctx->batch_size()) {
269
963k
                    size_t block_size = scan_task->cached_blocks.back().first->allocated_bytes();
270
963k
                    MutableBlock mutable_block(scan_task->cached_blocks.back().first.get());
271
963k
                    status = mutable_block.merge(*free_block);
272
963k
                    if (!status.ok()) {
273
963k
                        LOG(WARNING) << "Block merge failed: " << status.to_string();
274
963k
                        break;
275
963k
                    }
276
963k
                    scan_task->cached_blocks.back().second = mutable_block.allocated_bytes();
277
963k
                    scan_task->cached_blocks.back().first.get()->set_columns(
278
963k
                            std::move(mutable_block.mutable_columns()));
279
280
                    // Return block succeed or not, this free_block is not used by this scan task any more.
281
                    // If block can be reused, its memory usage will be added back.
282
963k
                    ctx->return_free_block(std::move(free_block));
283
963k
                    ctx->inc_block_usage(scan_task->cached_blocks.back().first->allocated_bytes() -
284
963k
                                         block_size);
285
963k
                } else {
// Not mergeable: cache the block as a new entry. If an earlier block is already
// cached, that earlier block is treated as "full" from here on.
286
963k
                    if (!scan_task->cached_blocks.empty()) {
287
963k
                        has_first_full_block = true;
288
963k
                    }
289
963k
                    ctx->inc_block_usage(free_block->allocated_bytes());
290
963k
                    scan_task->cached_blocks.emplace_back(std::move(free_block), free_block_bytes);
291
963k
                }
292
293
963k
                if (limit > 0 && limit < ctx->batch_size()) {
294
                    // If this scanner has limit, and less than batch size,
295
                    // return immediately and no need to wait raw_bytes_threshold.
296
                    // This can save time that each scanner may only return a small number of rows,
297
                    // but rows are enough from all scanners.
298
                    // If not break, the query like "select * from tbl where id=1 limit 10"
299
                    // may scan a lot data when the "id=1"'s filter ratio is high.
300
                    // If limit is larger than batch size, this rule is skipped,
301
                    // to avoid user specify a large limit and causing too much small blocks.
302
963k
                    break;
303
963k
                }
304
// Estimated bytes of a batch_size-row block: ceil(bytes / rows) * batch_size,
// computed from the last cached block.
305
963k
                if (scan_task->cached_blocks.back().first->rows() > 0) {
306
963k
                    auto block_avg_bytes = (scan_task->cached_blocks.back().first->bytes() +
307
963k
                                            scan_task->cached_blocks.back().first->rows() - 1) /
308
963k
                                           scan_task->cached_blocks.back().first->rows() *
309
963k
                                           ctx->batch_size();
310
963k
                    scanner->update_block_avg_bytes(block_avg_bytes);
311
963k
                }
// Low memory mode is re-checked after every block so the byte budget can shrink
// while the loop is running.
312
963k
                if (ctx->low_memory_mode()) {
313
963k
                    ctx->clear_free_blocks();
314
963k
                    if (raw_bytes_threshold > ctx->low_memory_mode_scan_bytes_per_scanner()) {
315
963k
                        raw_bytes_threshold = ctx->low_memory_mode_scan_bytes_per_scanner();
316
963k
                    }
317
963k
                }
318
963k
            } // end for while
319
320
963k
            if (UNLIKELY(!status.ok())) {
321
963k
                scan_task->set_status(status);
322
963k
                eos = true;
323
963k
            },
324
963k
            status);
325
// Checked again outside the macro invocation: any non-OK `status` at this point is
// recorded on the task and ends it.
326
960k
    if (UNLIKELY(!status.ok())) {
327
1.33k
        scan_task->set_status(status);
328
1.33k
        eos = true;
329
1.33k
    }
330
331
960k
    if (eos) {
332
        // If eos, scanner will call _collect_profile_before_close to update profile,
333
        // so we need update_scanner_profile here
334
955k
        update_scanner_profile();
335
955k
        scanner->mark_to_need_to_close();
336
955k
    }
337
960k
    scan_task->set_eos(eos);
338
339
18.4E
    VLOG_DEBUG << fmt::format(
340
18.4E
            "Scanner context {} has finished task, cached_block {} current scheduled task is "
341
18.4E
            "{}, eos: {}, status: {}",
342
18.4E
            ctx->ctx_id, scan_task->cached_blocks.size(), ctx->num_scheduled_scanners(), eos,
343
18.4E
            status.to_string());
344
// Hand the task (with its cached blocks, status and eos flag) back to the context.
345
960k
    ctx->push_back_scan_task(scan_task);
346
960k
}
347
17.9k
int ScannerScheduler::default_local_scan_thread_num() {
348
17.9k
    return config::doris_scanner_thread_pool_thread_num > 0
349
17.9k
                   ? config::doris_scanner_thread_pool_thread_num
350
17.9k
                   : std::max(48, CpuInfo::num_cores() * 2);
351
17.9k
}
352
11.6k
int ScannerScheduler::default_remote_scan_thread_num() {
353
11.6k
    int num = config::doris_max_remote_scanner_thread_pool_thread_num > 0
354
11.6k
                      ? config::doris_max_remote_scanner_thread_pool_thread_num
355
11.6k
                      : std::max(512, CpuInfo::num_cores() * 10);
356
11.6k
    return std::max(num, default_local_scan_thread_num());
357
11.6k
}
358
359
37
int ScannerScheduler::get_remote_scan_thread_queue_size() {
360
37
    return config::doris_remote_scanner_thread_pool_queue_size;
361
37
}
362
363
6.32k
int ScannerScheduler::default_min_active_scan_threads() {
364
6.32k
    return config::min_active_scan_threads > 0
365
6.32k
                   ? config::min_active_scan_threads
366
6.32k
                   : config::min_active_scan_threads = CpuInfo::num_cores() * 2;
367
6.32k
}
368
369
6.32k
int ScannerScheduler::default_min_active_file_scan_threads() {
370
6.32k
    return config::min_active_file_scan_threads > 0
371
6.32k
                   ? config::min_active_file_scan_threads
372
6.32k
                   : config::min_active_file_scan_threads = CpuInfo::num_cores() * 8;
373
6.32k
}
374
375
void ScannerScheduler::_make_sure_virtual_col_is_materialized(
376
1.00M
        const std::shared_ptr<Scanner>& scanner, Block* free_block) {
377
1.00M
#ifndef NDEBUG
378
    // Currently, virtual column can only be used on olap table.
379
1.00M
    std::shared_ptr<OlapScanner> olap_scanner = std::dynamic_pointer_cast<OlapScanner>(scanner);
380
1.00M
    if (olap_scanner == nullptr) {
381
13.6k
        return;
382
13.6k
    }
383
384
986k
    if (free_block->rows() == 0) {
385
758k
        return;
386
758k
    }
387
388
227k
    size_t idx = 0;
389
695k
    for (const auto& entry : *free_block) {
390
        // Virtual column must be materialized on the end of SegmentIterator's next batch method.
391
695k
        const ColumnNothing* column_nothing =
392
695k
                check_and_get_column<ColumnNothing>(entry.column.get());
393
695k
        if (column_nothing == nullptr) {
394
695k
            idx++;
395
695k
            continue;
396
695k
        }
397
398
18.4E
        std::vector<std::string> vcid_to_idx;
399
400
18.4E
        for (const auto& pair : olap_scanner->_vir_cid_to_idx_in_block) {
401
0
            vcid_to_idx.push_back(fmt::format("{}-{}", pair.first, pair.second));
402
0
        }
403
404
18.4E
        std::string error_msg = fmt::format(
405
18.4E
                "Column in idx {} is nothing, block columns {}, normal_columns "
406
18.4E
                "{}, "
407
18.4E
                "vir_cid_to_idx_in_block_msg {}",
408
18.4E
                idx, free_block->columns(), olap_scanner->_return_columns.size(),
409
18.4E
                fmt::format("_vir_cid_to_idx_in_block:[{}]", fmt::join(vcid_to_idx, ",")));
410
18.4E
        throw doris::Exception(ErrorCode::INTERNAL_ERROR, error_msg);
411
695k
    }
412
227k
#endif
413
227k
}
414
415
964k
// Runs the wrapped scan function once. The duration argument is unused.
//
// Marks the runner as started, invokes _scan_func(), and fulfils the completion
// future when the scan function returns true. Always returns an already-ready future.
Result<SharedListenableFuture<Void>> ScannerSplitRunner::process_for(std::chrono::nanoseconds) {
    _started = true;
    if (_scan_func()) {
        _completion_future.set_value(Void {});
    }
    return SharedListenableFuture<Void>::create_ready(Void {});
}
423
424
962k
bool ScannerSplitRunner::is_finished() {
425
962k
    return _completion_future.is_done();
426
962k
}
427
428
957k
// Returns the status reported by the completion future.
Status ScannerSplitRunner::finished_status() {
    Status completion_status = _completion_future.get_status();
    return completion_status;
}
431
432
0
bool ScannerSplitRunner::is_started() const {
433
0
    return _started.load();
434
0
}
435
436
5.41k
// This runner never asks to be rescheduled automatically; always returns false.
bool ScannerSplitRunner::is_auto_reschedule() const {
    constexpr bool kAutoReschedule = false;
    return kAutoReschedule;
}
439
440
} // namespace doris