Coverage Report

Created: 2026-06-26 20:21

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/pipeline/pipeline_task.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/pipeline/pipeline_task.h"
19
20
#include <fmt/core.h>
21
#include <fmt/format.h>
22
#include <gen_cpp/Metrics_types.h>
23
#include <glog/logging.h>
24
25
#include <algorithm>
26
#include <memory>
27
#include <ostream>
28
#include <vector>
29
30
#include "common/exception.h"
31
#include "common/logging.h"
32
#include "common/status.h"
33
#include "core/block/block.h"
34
#include "exec/operator/exchange_source_operator.h"
35
#include "exec/operator/operator.h"
36
#include "exec/operator/rec_cte_source_operator.h"
37
#include "exec/operator/scan_operator.h"
38
#include "exec/pipeline/dependency.h"
39
#include "exec/pipeline/pipeline.h"
40
#include "exec/pipeline/pipeline_fragment_context.h"
41
#include "exec/pipeline/revokable_task.h"
42
#include "exec/pipeline/task_queue.h"
43
#include "exec/pipeline/task_scheduler.h"
44
#include "exec/spill/spill_file.h"
45
#include "runtime/descriptors.h"
46
#include "runtime/exec_env.h"
47
#include "runtime/query_context.h"
48
#include "runtime/runtime_profile.h"
49
#include "runtime/runtime_profile_counter_names.h"
50
#include "runtime/thread_context.h"
51
#include "runtime/workload_group/workload_group_manager.h"
52
#include "util/defer_op.h"
53
#include "util/mem_info.h"
54
#include "util/uid_util.h"
55
56
namespace doris {
57
class RuntimeState;
58
} // namespace doris
59
60
namespace doris {
61
62
/**
 * Builds one task of `pipeline`.
 *
 * Caches the pipeline's operator list (first operator as `_source`, last as `_root`) and its
 * sink, takes over `shared_state_map`, and registers the query-level execution dependency.
 * When `shared_state_map` has no entry keyed by the sink's first destination id, the sink is
 * asked to create a shared state, which is kept only if it is non-null.
 *
 * @param pipeline pipeline this task belongs to; `front()`/`back()` are taken on its operator
 *        list unconditionally, so it must hold at least one operator.
 * @param task_id stored as `_index`.
 * @param state runtime state; its query context supplies the memory-sufficient and execution
 *        dependencies.
 * @param fragment_context owning fragment; dereferenced unconditionally outside BE_TEST builds.
 * @param parent_profile profile that `_init_profile()` attaches the task profile to.
 * @param shared_state_map shared states with their dependencies, keyed by an int id.
 * @param task_idx stored as `_task_idx`.
 */
PipelineTask::PipelineTask(PipelinePtr& pipeline, uint32_t task_id, RuntimeState* state,
                           std::shared_ptr<PipelineFragmentContext> fragment_context,
                           RuntimeProfile* parent_profile,
                           std::map<int, std::pair<std::shared_ptr<BasicSharedState>,
                                                   std::vector<std::shared_ptr<Dependency>>>>
                                   shared_state_map,
                           int task_idx)
        :
#ifdef BE_TEST
          _query_id(fragment_context ? fragment_context->get_query_id() : TUniqueId()),
#else
          _query_id(fragment_context->get_query_id()),
#endif
          _index(task_id),
          _pipeline(pipeline),
          _opened(false),
          _state(state),
          _fragment_context(fragment_context),
          _parent_profile(parent_profile),
          _operators(pipeline->operators()),
          _source(_operators.front().get()),
          _root(_operators.back().get()),
          _sink(pipeline->sink_shared_pointer()),
          _shared_state_map(std::move(shared_state_map)),
          _task_idx(task_idx),
          _memory_sufficient_dependency(state->get_query_ctx()->get_memory_sufficient_dependency()),
          _pipeline_name(_pipeline->name()) {
#ifndef BE_TEST
    _query_mem_tracker = fragment_context->get_query_ctx()->query_mem_tracker();
#endif
    _execution_dependencies.push_back(state->get_query_ctx()->get_execution_dependency());

    // A shared state already registered for the sink's first destination id is reused as-is.
    if (_shared_state_map.contains(_sink->dests_id().front())) {
        return;
    }
    // Otherwise let the sink build one; a null result means there is nothing to keep.
    if (auto created_state = _sink->create_shared_state(); created_state) {
        _sink_shared_state = created_state;
    }
}
100
101
72.1k
/**
 * Explicitly drops every object the task owns (shared states, sink, operators, block, pipeline).
 *
 * Outside BE_TEST builds, when a query memory tracker is present, the release runs with the
 * thread's memory tracker switched to it.
 */
PipelineTask::~PipelineTask() {
    // PipelineTask is also held by the task queue (https://github.com/apache/doris/pull/49753),
    // so it may be the last one to be destructed. Because the task holds operators, shared
    // states, etc., that memory is released manually here.
    const auto release_owned_members = [this]() {
        _shared_state_map.clear();
        _sink_shared_state.reset();
        _op_shared_states.clear();
        _sink.reset();
        _operators.clear();
        _block.reset();
        _pipeline.reset();
    };
#ifndef BE_TEST
    if (_query_mem_tracker) {
        SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(_query_mem_tracker);
        release_owned_members();
        return;
    }
#endif
    release_owned_members();
}
124
125
/**
 * Prepares the task for scheduling.
 *
 * Steps, in order: create the task profile; set up the sink's local state; set up every
 * operator's local state from the last operator to the first, chaining each operator profile
 * under the previously set-up one; append the source operator's execution dependencies to the
 * task's list; verify that the fragment is still alive and not cancelled; allocate the working
 * block and move the task to `State::RUNNABLE`.
 *
 * @param scan_range scan ranges copied into the task and handed to every operator.
 * @param sender_id forwarded to the sink's local state info.
 * @param tsink forwarded to the sink's local state info.
 * @return the first error from local-state setup, the query's exec status when the query is
 *         already cancelled, InternalError when the fragment context has expired, otherwise the
 *         result of the state transition.
 */
Status PipelineTask::prepare(const std::vector<TScanRangeParams>& scan_range, const int sender_id,
                             const TDataSink& tsink) {
    DCHECK(_sink);
    _init_profile();
    SCOPED_TIMER(_task_profile->total_time_counter());
    SCOPED_CPU_TIMER(_task_cpu_timer);
    SCOPED_TIMER(_prepare_timer);
    DBUG_EXECUTE_IF("fault_inject::PipelineXTask::prepare", {
        Status status = Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task prepare failed");
        return status;
    });

    // Sink local state first: operator profiles are chained below the sink's profile.
    {
        LocalSinkStateInfo sink_info {_task_idx,         _task_profile.get(),
                                      sender_id,         get_sink_shared_state().get(),
                                      _shared_state_map, tsink};
        RETURN_IF_ERROR(_sink->setup_local_state(_state, sink_info));
    }

    _scan_ranges = scan_range;

    // Walk the operators back to front so that each one hangs under the profile of the
    // operator that was set up just before it.
    auto* upper_profile = _state->get_sink_local_state()->operator_profile();
    for (int idx = cast_set<int>(_operators.size() - 1); idx >= 0; idx--) {
        auto& current_op = _operators[idx];
        LocalStateInfo op_info {upper_profile, _scan_ranges,
                                get_op_shared_state(current_op->operator_id()), _shared_state_map,
                                _task_idx};
        RETURN_IF_ERROR(current_op->setup_local_state(_state, op_info));
        upper_profile = _state->get_local_state(current_op->operator_id())->operator_profile();
    }

    // The source operator may contribute additional execution dependencies.
    {
        const auto& source_deps =
                _state->get_local_state(_source->operator_id())->execution_dependencies();
        std::unique_lock<std::mutex> guard(_dependency_lifecycle_lock);
        _execution_dependencies.insert(_execution_dependencies.end(), source_deps.begin(),
                                       source_deps.end());
    }

    // Scoped so that the fragment reference is dropped before the block is allocated.
    {
        auto fragment = _fragment_context.lock();
        if (!fragment) {
            return Status::InternalError("Fragment already finished! Query: {}",
                                         print_id(_query_id));
        }
        if (fragment->get_query_ctx()->is_cancelled()) {
            unblock_all_dependencies();
            return fragment->get_query_ctx()->exec_status();
        }
    }

    _block = doris::Block::create_unique();
    return _state_transition(State::RUNNABLE);
}
172
173
14
/**
 * Collects the read, write and finish dependencies from the operators' and the sink's local
 * states and publishes them to the task members under `_dependency_lifecycle_lock`.
 *
 * Read dependencies are stored per operator, in operator order. Finish dependencies hold every
 * non-null finish dependency of the operators followed by the sink's.
 *
 * @return Status::OK(), or the injected error when the fault-injection debug point is active.
 */
Status PipelineTask::_extract_dependencies() {
    std::vector<std::vector<Dependency*>> collected_read;
    std::vector<Dependency*> collected_write;
    std::vector<Dependency*> collected_finish;

    collected_read.reserve(_operators.size());
    for (auto& op : _operators) {
        auto* op_local_state = _state->get_local_state(op->operator_id());
        DCHECK(op_local_state);
        collected_read.push_back(op_local_state->dependencies());
        if (auto* op_finish_dep = op_local_state->finishdependency(); op_finish_dep) {
            collected_finish.push_back(op_finish_dep);
        }
    }

    DBUG_EXECUTE_IF("fault_inject::PipelineXTask::_extract_dependencies", {
        Status status = Status::Error<INTERNAL_ERROR>(
                "fault_inject pipeline_task _extract_dependencies failed");
        return status;
    });

    {
        auto* sink_local_state = _state->get_sink_local_state();
        collected_write = sink_local_state->dependencies();
        if (auto* sink_finish_dep = sink_local_state->finishdependency(); sink_finish_dep) {
            collected_finish.push_back(sink_finish_dep);
        }
    }

    // Publish all three lists together while holding the lifecycle lock.
    {
        std::unique_lock<std::mutex> guard(_dependency_lifecycle_lock);
        collected_read.swap(_read_dependencies);
        collected_write.swap(_write_dependencies);
        collected_finish.swap(_finish_dependencies);
    }
    return Status::OK();
}
210
211
6
bool PipelineTask::inject_shared_state(std::shared_ptr<BasicSharedState> shared_state) {
212
6
    if (!shared_state) {
213
1
        return false;
214
1
    }
215
    // Shared state is created by upstream task's sink operator and shared by source operator of
216
    // this task.
217
7
    for (auto& op : _operators) {
218
7
        if (shared_state->related_op_ids.contains(op->operator_id())) {
219
3
            _op_shared_states.insert({op->operator_id(), shared_state});
220
3
            return true;
221
3
        }
222
7
    }
223
    // Shared state is created by the first sink operator and shared by sink operator of this task.
224
    // For example, Set operations.
225
2
    if (shared_state->related_op_ids.contains(_sink->dests_id().front())) {
226
1
        DCHECK_EQ(_sink_shared_state, nullptr)
227
0
                << " Sink: " << _sink->get_name() << " dest id: " << _sink->dests_id().front();
228
1
        _sink_shared_state = shared_state;
229
1
        return true;
230
1
    }
231
1
    return false;
232
2
}
233
234
23
void PipelineTask::_init_profile() {
235
23
    _task_profile = std::make_unique<RuntimeProfile>(fmt::format("PipelineTask(index={})", _index));
236
23
    _parent_profile->add_child(_task_profile.get(), true, nullptr);
237
23
    _task_cpu_timer = ADD_TIMER(_task_profile, profile::TASK_CPU_TIME);
238
239
23
    static const char* exec_time = profile::EXECUTE_TIME;
240
23
    _exec_timer = ADD_TIMER(_task_profile, exec_time);
241
23
    _prepare_timer = ADD_CHILD_TIMER(_task_profile, profile::PREPARE_TIME, exec_time);
242
23
    _open_timer = ADD_CHILD_TIMER(_task_profile, profile::OPEN_TIME, exec_time);
243
23
    _get_block_timer = ADD_CHILD_TIMER(_task_profile, profile::GET_BLOCK_TIME, exec_time);
244
23
    _get_block_counter = ADD_COUNTER(_task_profile, profile::GET_BLOCK_COUNTER, TUnit::UNIT);
245
23
    _sink_timer = ADD_CHILD_TIMER(_task_profile, profile::SINK_TIME, exec_time);
246
23
    _close_timer = ADD_CHILD_TIMER(_task_profile, profile::CLOSE_TIME, exec_time);
247
248
23
    _wait_worker_timer = ADD_TIMER_WITH_LEVEL(_task_profile, profile::WAIT_WORKER_TIME, 1);
249
250
23
    _schedule_counts = ADD_COUNTER(_task_profile, profile::NUM_SCHEDULE_TIMES, TUnit::UNIT);
251
23
    _yield_counts = ADD_COUNTER(_task_profile, profile::NUM_YIELD_TIMES, TUnit::UNIT);
252
23
    _core_change_times = ADD_COUNTER(_task_profile, profile::CORE_CHANGE_TIMES, TUnit::UNIT);
253
23
    _memory_reserve_times = ADD_COUNTER(_task_profile, profile::MEMORY_RESERVE_TIMES, TUnit::UNIT);
254
23
    _memory_reserve_failed_times =
255
23
            ADD_COUNTER(_task_profile, profile::MEMORY_RESERVE_FAILED_TIMES, TUnit::UNIT);
256
23
}
257
258
6
void PipelineTask::_fresh_profile_counter() {
259
6
    COUNTER_SET(_schedule_counts, (int64_t)_schedule_time);
260
6
    COUNTER_SET(_wait_worker_timer, (int64_t)_wait_worker_watcher.elapsed_time());
261
6
}
262
263
14
/**
 * Opens the local state of every operator (in operator order) and then of the sink, extracts
 * the task's dependencies, and marks the task as opened.
 *
 * Also caches in `_dry_run` whether the sink asks for a dry run.
 *
 * @return the first error encountered, otherwise Status::OK().
 */
Status PipelineTask::_open() {
    SCOPED_TIMER(_task_profile->total_time_counter());
    SCOPED_CPU_TIMER(_task_cpu_timer);
    SCOPED_TIMER(_open_timer);
    _dry_run = _sink->should_dry_run(_state);
    for (auto& op : _operators) {
        auto* op_local_state = _state->get_local_state(op->operator_id());
        RETURN_IF_ERROR(op_local_state->open(_state));
    }
    RETURN_IF_ERROR(_state->get_sink_local_state()->open(_state));
    RETURN_IF_ERROR(_extract_dependencies());
    DBUG_EXECUTE_IF("fault_inject::PipelineXTask::open", {
        Status status = Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task open failed");
        return status;
    });
    _opened = true;
    return Status::OK();
}
280
281
68
/**
 * Calls `prepare(_state)` on the local state of every operator (in operator order) and then on
 * the sink's local state.
 *
 * @return the first error encountered, otherwise Status::OK().
 */
Status PipelineTask::_prepare() {
    SCOPED_TIMER(_task_profile->total_time_counter());
    SCOPED_CPU_TIMER(_task_cpu_timer);
    for (auto& op : _operators) {
        auto* op_local_state = _state->get_local_state(op->operator_id());
        RETURN_IF_ERROR(op_local_state->prepare(_state));
    }
    auto* sink_local_state = _state->get_sink_local_state();
    RETURN_IF_ERROR(sink_local_state->prepare(_state));
    return Status::OK();
}
290
291
45
bool PipelineTask::_wait_to_start() {
292
    // Before task starting, we should make sure
293
    // 1. Execution dependency is ready (which is controlled by FE 2-phase commit)
294
    // 2. Runtime filter dependencies are ready
295
    // 3. All tablets are loaded into local storage
296
45
    return std::any_of(
297
45
            _execution_dependencies.begin(), _execution_dependencies.end(),
298
62
            [&](Dependency* dep) -> bool { return dep->is_blocked_by(shared_from_this()); });
299
45
}
300
301
18
bool PipelineTask::_is_pending_finish() {
302
    // Spilling may be in progress if eos is true.
303
25
    return std::ranges::any_of(_finish_dependencies, [&](Dependency* dep) -> bool {
304
25
        return dep->is_blocked_by(shared_from_this());
305
25
    });
306
18
}
307
308
1
bool PipelineTask::is_blockable() const {
309
    // Before task starting, we should make sure
310
    // 1. Execution dependency is ready (which is controlled by FE 2-phase commit)
311
    // 2. Runtime filter dependencies are ready
312
    // 3. All tablets are loaded into local storage
313
314
1
    if (_state->enable_fuzzy_blockable_task()) {
315
0
        if ((_schedule_time + _task_idx) % 2 == 0) {
316
0
            return true;
317
0
        }
318
0
    }
319
320
1
    return std::ranges::any_of(_operators,
321
1
                               [&](OperatorPtr op) -> bool { return op->is_blockable(_state); }) ||
322
1
           _sink->is_blockable(_state);
323
1
}
324
325
11
void PipelineTask::_stop_accepting_submit() {
326
11
    std::unique_lock<std::mutex> lock(_blockable_check_lock);
327
11
    _accept_submit = false;
328
11
}
329
330
1.51M
bool PipelineTask::_is_blocked() {
331
    // `_dry_run = true` means we do not need data from source operator.
332
1.51M
    if (!_dry_run) {
333
3.02M
        for (int i = cast_set<int>(_read_dependencies.size() - 1); i >= 0; i--) {
334
            // `_read_dependencies` is organized according to operators. For each operator, running condition is met iff all dependencies are ready.
335
1.51M
            for (auto* dep : _read_dependencies[i]) {
336
1.51M
                if (dep->is_blocked_by(shared_from_this())) {
337
15
                    return true;
338
15
                }
339
1.51M
            }
340
            // If all dependencies are ready for this operator, we can execute this task if no datum is needed from upstream operators.
341
1.51M
            if (!_operators[i]->need_more_input_data(_state)) {
342
2
                break;
343
2
            }
344
1.51M
        }
345
1.51M
    }
346
1.51M
    return _memory_sufficient_dependency->is_blocked_by(shared_from_this()) ||
347
1.51M
           std::ranges::any_of(_write_dependencies, [&](Dependency* dep) -> bool {
348
1.51M
               return dep->is_blocked_by(shared_from_this());
349
1.51M
           });
350
1.51M
}
351
352
5
void PipelineTask::unblock_all_dependencies() {
353
    // Keep dependency pointers and task-owned operator/shared state stable because set_ready() may
354
    // synchronously call wake_up() and submit this task.
355
5
    std::unique_lock<std::mutex> lock(_dependency_lifecycle_lock);
356
5
    auto fragment = _fragment_context.lock();
357
5
    if (!is_finalized() && fragment) {
358
5
        try {
359
5
            DCHECK(_wake_up_early || fragment->is_canceled());
360
5
            DBUG_EXECUTE_IF("PipelineTask::unblock_all_dependencies.before_set_ready", {
361
5
                if (dp->callback.has_value()) {
362
5
                    DBUG_RUN_CALLBACK();
363
5
                }
364
5
            });
365
5
            std::ranges::for_each(_write_dependencies,
366
5
                                  [&](Dependency* dep) { dep->set_always_ready(); });
367
5
            std::ranges::for_each(_finish_dependencies,
368
6
                                  [&](Dependency* dep) { dep->set_always_ready(); });
369
5
            std::ranges::for_each(_read_dependencies, [&](std::vector<Dependency*>& deps) {
370
3
                std::ranges::for_each(deps, [&](Dependency* dep) { dep->set_always_ready(); });
371
3
            });
372
            // All `_execution_deps` will never be set blocking from ready. So we just set ready here.
373
5
            std::ranges::for_each(_execution_dependencies,
374
11
                                  [&](Dependency* dep) { dep->set_ready(); });
375
5
            _memory_sufficient_dependency->set_ready();
376
5
        } catch (const doris::Exception& e) {
377
0
            LOG(WARNING) << "unblock_all_dependencies failed: " << e.code() << ", "
378
0
                         << e.to_string();
379
0
        }
380
5
    }
381
5
}
382
383
// When current memory pressure is low, memory usage may increase significantly in the next
384
// operator run, while there is no revocable memory available for spilling.
385
// Trigger memory revoking when pressure is high and revocable memory is significant.
386
// Memory pressure is evaluated using two signals:
387
// 1. Query memory usage exceeds a threshold ratio of the query memory limit.
388
// 2. Workload group memory usage reaches the workload group low-watermark threshold.
389
2.29k
bool PipelineTask::_should_trigger_revoking(const size_t reserve_size) const {
390
2.29k
    if (!_state->enable_spill()) {
391
3
        return false;
392
3
    }
393
394
2.28k
    auto query_mem_tracker = _state->get_query_ctx()->query_mem_tracker();
395
2.28k
    auto wg = _state->get_query_ctx()->workload_group();
396
2.28k
    if (!query_mem_tracker || !wg) {
397
2.27k
        return false;
398
2.27k
    }
399
400
8
    const auto parallelism = std::max(1, _pipeline->num_tasks());
401
8
    const auto query_water_mark = 90; // 90%
402
8
    const auto group_mem_limit = wg->memory_limit();
403
8
    auto query_limit = query_mem_tracker->limit();
404
8
    if (query_limit <= 0) {
405
1
        query_limit = group_mem_limit;
406
7
    } else if (query_limit > group_mem_limit && group_mem_limit > 0) {
407
1
        query_limit = group_mem_limit;
408
1
    }
409
410
8
    if (query_limit <= 0) {
411
1
        return false;
412
1
    }
413
414
7
    if ((reserve_size * parallelism) <= (query_limit / 5)) {
415
1
        return false;
416
1
    }
417
418
6
    bool is_high_memory_pressure = false;
419
6
    const auto used_mem = query_mem_tracker->consumption() + reserve_size * parallelism;
420
6
    if (used_mem >= int64_t((double(query_limit) * query_water_mark / 100))) {
421
2
        is_high_memory_pressure = true;
422
2
    }
423
424
6
    if (!is_high_memory_pressure) {
425
4
        bool is_low_watermark;
426
4
        bool is_high_watermark;
427
4
        wg->check_mem_used(&is_low_watermark, &is_high_watermark);
428
4
        is_high_memory_pressure = is_low_watermark || is_high_watermark;
429
4
    }
430
431
6
    if (is_high_memory_pressure) {
432
4
        const auto revocable_size = _get_revocable_size();
433
4
        const auto total_estimated_revocable = revocable_size * parallelism;
434
4
        return total_estimated_revocable >= int64_t(double(query_limit) * 0.2);
435
4
    }
436
437
2
    return false;
438
6
}
439
440
/**
441
 * `_eos` indicates whether the execution phase is done. `done` indicates whether we could close
442
 * this task.
443
 *
444
 * For example,
445
 * 1. if `_eos` is false which means we should continue to get next block so we cannot close (i.e.
446
 *    `done` is false)
447
 * 2. if `_eos` is true which means all blocks from source are exhausted but `_is_pending_finish()`
448
 *    is true which means we should wait for a pending dependency ready (maybe a running rpc), so we
449
 *    cannot close (i.e. `done` is false)
450
 * 3. if `_eos` is true which means all blocks from source are exhausted and `_is_pending_finish()`
451
 *    is false which means we can close immediately (i.e. `done` is true)
452
 * @param done
453
 * @return
454
 */
455
39
Status PipelineTask::execute(bool* done) {
456
39
    if (_exec_state != State::RUNNABLE || _blocked_dep != nullptr) [[unlikely]] {
457
1
#ifdef BE_TEST
458
1
        return Status::InternalError("Pipeline task is not runnable! Task info: {}",
459
1
                                     debug_string());
460
#else
461
        return Status::FatalError("Pipeline task is not runnable! Task info: {}", debug_string());
462
#endif
463
1
    }
464
465
38
    auto fragment_context = _fragment_context.lock();
466
38
    if (!fragment_context) {
467
0
        return Status::InternalError("Fragment already finished! Query: {}", print_id(_query_id));
468
0
    }
469
38
    int64_t time_spent = 0;
470
38
    ThreadCpuStopWatch cpu_time_stop_watch;
471
38
    cpu_time_stop_watch.start();
472
38
    SCOPED_ATTACH_TASK(_state);
473
38
    Defer running_defer {[&]() {
474
38
        int64_t delta_cpu_time = cpu_time_stop_watch.elapsed_time();
475
38
        _task_cpu_timer->update(delta_cpu_time);
476
38
        fragment_context->get_query_ctx()->resource_ctx()->cpu_context()->update_cpu_cost_ms(
477
38
                delta_cpu_time);
478
479
        // If the task was woken up early, we should terminate all operators, and this task can be closed immediately.
480
38
        if (_wake_up_early) {
481
3
            _eos = true;
482
3
            *done = true;
483
35
        } else if (_eos && !_spilling &&
484
35
                   (fragment_context->is_canceled() || !_is_pending_finish())) {
485
            // Debug point for testing the race condition fix: inject set_wake_up_early() +
486
            // unblock_all_dependencies() here to simulate Thread B writing A then B between
487
            // Thread A's two reads of _wake_up_early.
488
11
            DBUG_EXECUTE_IF("PipelineTask::execute.wake_up_early_in_else_if", {
489
11
                set_wake_up_early();
490
11
                unblock_all_dependencies();
491
11
            });
492
11
            *done = true;
493
11
        }
494
495
        // NOTE: The operator terminate() call is intentionally placed AFTER the
496
        // _is_pending_finish() check above, not before. This ordering is critical to avoid a race
497
        // condition with the seq_cst memory ordering guarantee:
498
        //
499
        // Pipeline::make_all_runnable() writes in this order:
500
        //   (A) set_wake_up_early()  ->  (B) unblock_all_dependencies() [sets finish_dep._always_ready]
501
        //
502
        // If we checked _wake_up_early (A) before _is_pending_finish() (B), there would be a
503
        // window where Thread A reads _wake_up_early=false, then Thread B writes both A and B,
504
        // then Thread A reads _is_pending_finish()=false (due to _always_ready). Thread A would
505
        // then set *done=true without ever calling operator terminate(), causing close() to run
506
        // on operators that were never properly terminated (e.g. RuntimeFilterProducer still in
507
        // WAITING_FOR_SYNCED_SIZE state when insert() is called).
508
        //
509
        // By reading _is_pending_finish() (B) before the second read of _wake_up_early (A),
510
        // if Thread A observes B's effect (_always_ready=true), it is guaranteed to also observe
511
        // A's effect (_wake_up_early=true) on this second read, ensuring operator terminate() is
512
        // called. This relies on _wake_up_early and _always_ready both being std::atomic with the
513
        // default seq_cst ordering — do not weaken them to relaxed or acq/rel.
514
38
        if (_wake_up_early) {
515
4
            THROW_IF_ERROR(_root->terminate(_state));
516
4
            THROW_IF_ERROR(_sink->terminate(_state));
517
4
        }
518
38
    }};
519
38
    const auto query_id = _state->query_id();
520
    // If this task is already EOS and block is empty (which means we already output all blocks),
521
    // just return here.
522
38
    if (_eos && !_spilling) {
523
3
        return Status::OK();
524
3
    }
525
    // If this task is blocked by a spilling request and woken up immediately, the spilling
526
    // dependency will not block this task and we should just run here.
527
35
    if (!_block->empty()) {
528
0
        LOG(INFO) << "Query: " << print_id(query_id) << " has pending block, size: "
529
0
                  << PrettyPrinter::print_bytes(_block->allocated_bytes());
530
0
        DCHECK(_spilling);
531
0
    }
532
533
35
    SCOPED_TIMER(_task_profile->total_time_counter());
534
35
    SCOPED_TIMER(_exec_timer);
535
536
35
    if (!_wake_up_early) {
537
35
        RETURN_IF_ERROR(_prepare());
538
35
    }
539
35
    DBUG_EXECUTE_IF("fault_inject::PipelineXTask::execute", {
540
35
        Status status = Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task execute failed");
541
35
        return status;
542
35
    });
543
    // `_wake_up_early` must be after `_wait_to_start()`
544
35
    if (_wait_to_start() || _wake_up_early) {
545
2
        return Status::OK();
546
2
    }
547
33
    RETURN_IF_ERROR(_prepare());
548
549
    // The status must be runnable
550
33
    if (!_opened && !fragment_context->is_canceled()) {
551
13
        DBUG_EXECUTE_IF("PipelineTask::execute.open_sleep", {
552
13
            auto required_pipeline_id =
553
13
                    DebugPoints::instance()->get_debug_param_or_default<int32_t>(
554
13
                            "PipelineTask::execute.open_sleep", "pipeline_id", -1);
555
13
            auto required_task_id = DebugPoints::instance()->get_debug_param_or_default<int32_t>(
556
13
                    "PipelineTask::execute.open_sleep", "task_id", -1);
557
13
            if (required_pipeline_id == pipeline_id() && required_task_id == task_id()) {
558
13
                LOG(WARNING) << "PipelineTask::execute.open_sleep sleep 5s";
559
13
                sleep(5);
560
13
            }
561
13
        });
562
563
13
        SCOPED_RAW_TIMER(&time_spent);
564
13
        RETURN_IF_ERROR(_open());
565
13
    }
566
567
1.51M
    while (!fragment_context->is_canceled()) {
568
1.51M
        SCOPED_RAW_TIMER(&time_spent);
569
1.51M
        Defer defer {[&]() {
570
            // If this run is pended by a spilling request, the block will be output in next run.
571
1.51M
            if (!_spilling) {
572
1.50M
                _block->clear_column_data(_root->row_desc().num_materialized_slots());
573
1.50M
            }
574
1.51M
        }};
575
        // `_wake_up_early` must be after `_is_blocked()`
576
1.51M
        if (_is_blocked() || _wake_up_early) {
577
17
            return Status::OK();
578
17
        }
579
580
        /// When a task is cancelled,
581
        /// its blocking state will be cleared and it will transition to a ready state (though it is not truly ready).
582
        /// Here, checking whether it is cancelled to prevent tasks in a blocking state from being re-executed.
583
1.51M
        if (fragment_context->is_canceled()) {
584
0
            break;
585
0
        }
586
587
1.51M
        if (time_spent > _exec_time_slice) {
588
4
            COUNTER_UPDATE(_yield_counts, 1);
589
4
            break;
590
4
        }
591
1.51M
        auto* block = _block.get();
592
593
1.51M
        DBUG_EXECUTE_IF("fault_inject::PipelineXTask::executing", {
594
1.51M
            Status status =
595
1.51M
                    Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task executing failed");
596
1.51M
            return status;
597
1.51M
        });
598
599
        // `_sink->is_finished(_state)` means sink operator should be finished
600
1.51M
        if (_sink->is_finished(_state)) {
601
1
            set_wake_up_early();
602
1
            return Status::OK();
603
1
        }
604
605
        // `_dry_run` means sink operator need no more data
606
1.51M
        _eos = _dry_run || _eos;
607
1.51M
        _spilling = false;
608
1.51M
        auto workload_group = _state->workload_group();
609
        // If last run is pended by a spilling request, `_block` is produced with some rows in last
610
        // run, so we will resume execution using the block.
611
1.51M
        if (!_eos && _block->empty()) {
612
1.51M
            SCOPED_TIMER(_get_block_timer);
613
1.51M
            if (_state->low_memory_mode()) {
614
0
                _sink->set_low_memory_mode(_state);
615
0
                for (auto& op : _operators) {
616
0
                    op->set_low_memory_mode(_state);
617
0
                }
618
0
            }
619
1.51M
            DEFER_RELEASE_RESERVED();
620
1.51M
            _get_block_counter->update(1);
621
            // Sum reserve sizes across all operators in this pipeline.
622
            // Each operator reports only its own requirement (non-recursive).
623
1.51M
            size_t reserve_size = 0;
624
1.51M
            for (auto& op : _operators) {
625
1.51M
                reserve_size += op->get_reserve_mem_size(_state);
626
1.51M
                op->reset_reserve_mem_size(_state);
627
1.51M
            }
628
1.51M
            if (workload_group &&
629
1.51M
                _state->get_query_ctx()
630
49.5k
                        ->resource_ctx()
631
49.5k
                        ->task_controller()
632
49.5k
                        ->is_enable_reserve_memory() &&
633
1.51M
                reserve_size > 0) {
634
1.10k
                if (_should_trigger_revoking(reserve_size)) {
635
0
                    LOG(INFO) << fmt::format(
636
0
                            "Query: {} sink: {}, node id: {}, task id: {}, reserve size: {} when "
637
0
                            "high memory pressure, try to spill",
638
0
                            print_id(_query_id), _sink->get_name(), _sink->node_id(),
639
0
                            _state->task_id(), reserve_size);
640
0
                    ExecEnv::GetInstance()->workload_group_mgr()->add_paused_query(
641
0
                            _state->get_query_ctx()->resource_ctx()->shared_from_this(),
642
0
                            reserve_size,
643
0
                            Status::Error<ErrorCode::QUERY_MEMORY_EXCEEDED>(
644
0
                                    "high memory pressure, try to spill"));
645
0
                    _spilling = true;
646
0
                    continue;
647
0
                }
648
1.10k
                if (!_try_to_reserve_memory(reserve_size, _root)) {
649
1.10k
                    continue;
650
1.10k
                }
651
1.10k
            }
652
653
1.50M
            bool eos = false;
654
1.50M
            RETURN_IF_ERROR(_root->get_block_after_projects(_state, block, &eos));
655
1.50M
            _eos = eos;
656
1.50M
        }
657
658
1.51M
        if (!_block->empty() || _eos) {
659
1.19k
            SCOPED_TIMER(_sink_timer);
660
1.19k
            Status status = Status::OK();
661
1.19k
            DEFER_RELEASE_RESERVED();
662
1.19k
            if (_state->get_query_ctx()
663
1.19k
                        ->resource_ctx()
664
1.19k
                        ->task_controller()
665
1.19k
                        ->is_enable_reserve_memory() &&
666
1.19k
                workload_group && !(_wake_up_early || _dry_run)) {
667
1.17k
                const auto sink_reserve_size = _sink->get_reserve_mem_size(_state, _eos);
668
669
1.17k
                if (sink_reserve_size > 0 && _should_trigger_revoking(sink_reserve_size)) {
670
0
                    LOG(INFO) << fmt::format(
671
0
                            "Query: {} sink: {}, node id: {}, task id: {}, reserve size: {} when "
672
0
                            "high memory pressure, try to spill",
673
0
                            print_id(_query_id), _sink->get_name(), _sink->node_id(),
674
0
                            _state->task_id(), sink_reserve_size);
675
0
                    ExecEnv::GetInstance()->workload_group_mgr()->add_paused_query(
676
0
                            _state->get_query_ctx()->resource_ctx()->shared_from_this(),
677
0
                            sink_reserve_size,
678
0
                            Status::Error<ErrorCode::QUERY_MEMORY_EXCEEDED>(
679
0
                                    "high memory pressure, try to spill"));
680
0
                    _spilling = true;
681
0
                    continue;
682
0
                }
683
684
1.17k
                if (sink_reserve_size > 0 &&
685
1.17k
                    !_try_to_reserve_memory(sink_reserve_size, _sink.get())) {
686
1.17k
                    continue;
687
1.17k
                }
688
1.17k
            }
689
690
17
            DBUG_EXECUTE_IF("PipelineTask::execute.sink_eos_sleep", {
691
17
                auto required_pipeline_id =
692
17
                        DebugPoints::instance()->get_debug_param_or_default<int32_t>(
693
17
                                "PipelineTask::execute.sink_eos_sleep", "pipeline_id", -1);
694
17
                auto required_task_id =
695
17
                        DebugPoints::instance()->get_debug_param_or_default<int32_t>(
696
17
                                "PipelineTask::execute.sink_eos_sleep", "task_id", -1);
697
17
                if (required_pipeline_id == pipeline_id() && required_task_id == task_id()) {
698
17
                    LOG(WARNING) << "PipelineTask::execute.sink_eos_sleep sleep 10s";
699
17
                    sleep(10);
700
17
                }
701
17
            });
702
703
17
            DBUG_EXECUTE_IF("PipelineTask::execute.terminate", {
704
17
                if (_eos) {
705
17
                    auto required_pipeline_id =
706
17
                            DebugPoints::instance()->get_debug_param_or_default<int32_t>(
707
17
                                    "PipelineTask::execute.terminate", "pipeline_id", -1);
708
17
                    auto required_task_id =
709
17
                            DebugPoints::instance()->get_debug_param_or_default<int32_t>(
710
17
                                    "PipelineTask::execute.terminate", "task_id", -1);
711
17
                    auto required_fragment_id =
712
17
                            DebugPoints::instance()->get_debug_param_or_default<int32_t>(
713
17
                                    "PipelineTask::execute.terminate", "fragment_id", -1);
714
17
                    if (required_pipeline_id == pipeline_id() && required_task_id == task_id() &&
715
17
                        fragment_context->get_fragment_id() == required_fragment_id) {
716
17
                        _wake_up_early = true;
717
17
                        unblock_all_dependencies();
718
17
                    } else if (required_pipeline_id == pipeline_id() &&
719
17
                               fragment_context->get_fragment_id() == required_fragment_id) {
720
17
                        LOG(WARNING) << "PipelineTask::execute.terminate sleep 5s";
721
17
                        sleep(5);
722
17
                    }
723
17
                }
724
17
            });
725
726
17
            status = _sink->sink(_state, block, _eos);
727
728
17
            if (_eos) {
729
11
                if (_sink->reset_to_rerun(_state, _root)) {
730
0
                    _eos = false;
731
11
                } else {
732
11
                    RETURN_IF_ERROR(close(Status::OK(), false));
733
11
                }
734
11
            }
735
736
17
            if (status.is<ErrorCode::END_OF_FILE>()) {
737
1
                set_wake_up_early();
738
1
                return Status::OK();
739
16
            } else if (!status) {
740
0
                return status;
741
0
            }
742
743
16
            if (_eos) { // just return, the scheduler will do finish work
744
10
                return Status::OK();
745
10
            }
746
16
        }
747
1.51M
    }
748
749
4
    RETURN_IF_ERROR(_state->get_query_ctx()->get_pipe_exec_scheduler()->submit(shared_from_this()));
750
4
    return Status::OK();
751
4
}
752
753
7
// Performs one round of memory revocation for this task: every operator is
// visited first, then the sink. A component is only asked to revoke when its
// revocable memory reaches SpillFile::MIN_SPILL_WRITE_BATCH_MEM.
//
// `spill_context` may be null. When it is set, on_task_finished() is invoked
// from the deferred block, i.e. once per call regardless of whether the
// revocation itself returned OK or an error.
//
// Returns InternalError if the owning fragment context is already gone,
// otherwise the first error produced by a revoke_memory() call, or OK.
//
// NOTE(review): the "fragment already finished" return happens before the
// deferred block exists, so `spill_context` is not notified on that path --
// confirm the caller accounts for it.
Status PipelineTask::do_revoke_memory(const std::shared_ptr<SpillContext>& spill_context) {
    auto frag_ctx = _fragment_context.lock();
    if (frag_ctx == nullptr) {
        return Status::InternalError("Fragment already finished! Query: {}", print_id(_query_id));
    }

    SCOPED_ATTACH_TASK(_state);
    ThreadCpuStopWatch cpu_watch;
    cpu_watch.start();
    Defer on_exit {[&]() {
        // Account the CPU time spent revoking to both the task and the query.
        const int64_t cpu_delta = cpu_watch.elapsed_time();
        _task_cpu_timer->update(cpu_delta);
        frag_ctx->get_query_ctx()->resource_ctx()->cpu_context()->update_cpu_cost_ms(cpu_delta);

        // A task that was woken up early must be closable right away: release
        // every dependency, terminate the operator chain and the sink, and
        // mark the task as finished.
        // NOTE(review): THROW_IF_ERROR presumably throws on a non-OK status;
        // if so, the spill_context notification below is skipped -- confirm.
        if (_wake_up_early) {
            unblock_all_dependencies();
            THROW_IF_ERROR(_root->terminate(_state));
            THROW_IF_ERROR(_sink->terminate(_state));
            _eos = true;
        }

        // SpillContext counts pipeline tasks, not operators, so it is told
        // only once, after the operators and the sink have been processed.
        if (spill_context) {
            spill_context->on_task_finished();
        }
    }};

    // True when `node` holds enough revocable memory to be worth spilling.
    const auto worth_revoking = [this](const auto& node) {
        return node->revocable_mem_size(_state) >= SpillFile::MIN_SPILL_WRITE_BATCH_MEM;
    };

    // Operators first ...
    for (auto& op : _operators) {
        if (worth_revoking(op)) {
            RETURN_IF_ERROR(op->revoke_memory(_state));
        }
    }

    // ... then the sink.
    if (worth_revoking(_sink)) {
        RETURN_IF_ERROR(_sink->revoke_memory(_state));
    }
    return Status::OK();
}
797
798
2.28k
bool PipelineTask::_try_to_reserve_memory(const size_t reserve_size, OperatorBase* op) {
799
2.28k
    auto st = thread_context()->thread_mem_tracker_mgr->try_reserve(reserve_size);
800
    // If reserve memory failed and the query is not enable spill, just disable reserve memory(this will enable
801
    // memory hard limit check, and will cancel the query if allocate memory failed) and let it run.
802
2.28k
    if (!st.ok() && !_state->enable_spill()) {
803
2
        LOG(INFO) << print_id(_query_id) << " reserve memory failed due to " << st
804
2
                  << ", and it is not enable spill, disable reserve memory and let it run";
805
2
        _state->get_query_ctx()->resource_ctx()->task_controller()->disable_reserve_memory();
806
2
        return true;
807
2
    }
808
2.27k
    COUNTER_UPDATE(_memory_reserve_times, 1);
809
810
    // Compute total revocable memory across all operators and the sink.
811
2.27k
    size_t total_revocable_mem_size = 0;
812
2.27k
    size_t operator_max_revocable_mem_size = 0;
813
814
2.27k
    if (!st.ok() || _state->enable_force_spill()) {
815
        // Compute total revocable memory across all operators and the sink.
816
2.27k
        total_revocable_mem_size = _sink->revocable_mem_size(_state);
817
2.27k
        operator_max_revocable_mem_size = total_revocable_mem_size;
818
2.27k
        for (auto& cur_op : _operators) {
819
2.27k
            total_revocable_mem_size += cur_op->revocable_mem_size(_state);
820
2.27k
            operator_max_revocable_mem_size =
821
2.27k
                    std::max(cur_op->revocable_mem_size(_state), operator_max_revocable_mem_size);
822
2.27k
        }
823
2.27k
    }
824
825
    // During enable force spill, other operators like scan opeartor will also try to reserve memory and will failed
826
    // here, if not add this check, it will always paused and resumed again.
827
2.27k
    if (st.ok() && _state->enable_force_spill()) {
828
0
        if (operator_max_revocable_mem_size >= _state->spill_min_revocable_mem()) {
829
0
            st = Status::Error<ErrorCode::QUERY_MEMORY_EXCEEDED>(
830
0
                    "force spill and there is an operator has memory "
831
0
                    "size {} exceeds min mem size {}",
832
0
                    PrettyPrinter::print_bytes(operator_max_revocable_mem_size),
833
0
                    PrettyPrinter::print_bytes(_state->spill_min_revocable_mem()));
834
0
        }
835
0
    }
836
837
2.27k
    if (!st.ok()) {
838
2.27k
        COUNTER_UPDATE(_memory_reserve_failed_times, 1);
839
        // build per-operator revocable memory info string for debugging
840
2.27k
        std::string ops_revocable_info;
841
2.27k
        {
842
2.27k
            fmt::memory_buffer buf;
843
2.27k
            for (auto& cur_op : _operators) {
844
2.27k
                fmt::format_to(buf, "{}({})-> ", cur_op->get_name(),
845
2.27k
                               PrettyPrinter::print_bytes(cur_op->revocable_mem_size(_state)));
846
2.27k
            }
847
2.27k
            if (_sink) {
848
2.27k
                fmt::format_to(buf, "{}({}) ", _sink->get_name(),
849
2.27k
                               PrettyPrinter::print_bytes(_sink->revocable_mem_size(_state)));
850
2.27k
            }
851
2.27k
            ops_revocable_info = fmt::to_string(buf);
852
2.27k
        }
853
854
2.27k
        auto debug_msg = fmt::format(
855
2.27k
                "Query: {} , try to reserve: {}, total revocable mem size: {}, failed reason: {}",
856
2.27k
                print_id(_query_id), PrettyPrinter::print_bytes(reserve_size),
857
2.27k
                PrettyPrinter::print_bytes(total_revocable_mem_size), st.to_string());
858
2.27k
        if (!ops_revocable_info.empty()) {
859
2.27k
            debug_msg += fmt::format(", ops_revocable=[{}]", ops_revocable_info);
860
2.27k
        }
861
        // PROCESS_MEMORY_EXCEEDED error msg already contains process_mem_log_str
862
2.27k
        if (!st.is<ErrorCode::PROCESS_MEMORY_EXCEEDED>()) {
863
2.27k
            debug_msg +=
864
2.27k
                    fmt::format(", debug info: {}", GlobalMemoryArbitrator::process_mem_log_str());
865
2.27k
        }
866
2.27k
        LOG(INFO) << debug_msg;
867
2.27k
        ExecEnv::GetInstance()->workload_group_mgr()->add_paused_query(
868
2.27k
                _state->get_query_ctx()->resource_ctx()->shared_from_this(), reserve_size, st);
869
2.27k
        _spilling = true;
870
2.27k
        return false;
871
2.27k
    }
872
0
    return true;
873
2.27k
}
874
875
160k
void PipelineTask::stop_if_finished() {
876
160k
    auto fragment = _fragment_context.lock();
877
160k
    if (!fragment) {
878
0
        return;
879
0
    }
880
160k
    SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(fragment->get_query_ctx()->query_mem_tracker());
881
160k
    if (auto sink = _sink) {
882
160k
        if (sink->is_finished(_state)) {
883
1
            set_wake_up_early();
884
1
            unblock_all_dependencies();
885
1
        }
886
160k
    }
887
160k
}
888
889
3
// Moves the task to State::FINALIZED and drops everything it owns: shared
// states, the output block, the operator chain, the sink and the pipeline.
//
// Returns OK without touching anything when the fragment context has already
// expired, and propagates the error if the state transition is rejected (in
// which case nothing is released).
Status PipelineTask::finalize() {
    const auto frag = _fragment_context.lock();
    if (frag == nullptr) {
        return Status::OK();
    }
    SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(frag->get_query_ctx()->query_mem_tracker());
    _stop_accepting_submit();

    // Synchronize with unblock_all_dependencies() before clearing state used by
    // wake_up()->submit(). The guard is held until the function returns.
    std::lock_guard<std::mutex> lifecycle_guard(_dependency_lifecycle_lock);
    RETURN_IF_ERROR(_state_transition(State::FINALIZED));

    // Release order is kept exactly as before: shared states, block,
    // operators, sink, pipeline.
    _sink_shared_state.reset();
    _op_shared_states.clear();
    _shared_state_map.clear();
    _block.reset();
    _operators.clear();
    _sink.reset();
    _pipeline.reset();
    return Status::OK();
}
908
909
19
// Closes the operators of this task and, when `close_sink` is true, the sink
// as well (the sink receives `exec_status`).
//
// With `close_sink` set, the task additionally stops accepting submissions up
// front and is moved to State::FINISHED at the end. The time spent closing is
// added to the profile counters only if the task had been opened.
//
// Returns the first non-OK status among the sink close and the operator
// closes (every operator is still closed after a failure), or the error of a
// rejected state transition.
Status PipelineTask::close(Status exec_status, bool close_sink) {
    if (close_sink) {
        _stop_accepting_submit();
    }

    Status first_error;
    int64_t elapsed_close_ns = 0;
    {
        // Everything inside this scope is measured into elapsed_close_ns.
        SCOPED_RAW_TIMER(&elapsed_close_ns);
        if (close_sink) {
            first_error = _sink->close(_state, exec_status);
        }
        for (auto& op : _operators) {
            auto op_status = op->close(_state);
            // Remember only the earliest failure; later ones are dropped.
            if (first_error.ok() && !op_status.ok()) {
                first_error = std::move(op_status);
            }
        }
    }

    if (_opened) {
        COUNTER_UPDATE(_close_timer, elapsed_close_ns);
        COUNTER_UPDATE(_task_profile->total_time_counter(), elapsed_close_ns);
        if (close_sink) {
            _task_profile->add_info_string("WakeUpEarly", std::to_string(_wake_up_early.load()));
            _fresh_profile_counter();
        }
    }

    if (close_sink) {
        // Synchronize FINISHED with forced unblocking so delayed wake_up() sees a stable state.
        std::unique_lock<std::mutex> lifecycle_guard(_dependency_lifecycle_lock);
        RETURN_IF_ERROR(_state_transition(State::FINISHED));
    }
    return first_error;
}
944
945
19.4k
std::string PipelineTask::debug_string() {
946
19.4k
    fmt::memory_buffer debug_string_buffer;
947
948
19.4k
    fmt::format_to(debug_string_buffer, "QueryId: {}\n", print_id(_query_id));
949
19.4k
    fmt::format_to(debug_string_buffer, "InstanceId: {}\n",
950
19.4k
                   print_id(_state->fragment_instance_id()));
951
952
19.4k
    fmt::format_to(debug_string_buffer,
953
19.4k
                   "PipelineTask[id = {}, open = {}, eos = {}, state = {}, dry run = "
954
19.4k
                   "{}, _wake_up_early = {}, _wake_up_by = {}, time elapsed since last state "
955
19.4k
                   "changing = {}s, spilling = {}, is running = {}]",
956
19.4k
                   _index, _opened, _eos, _to_string(_exec_state), _dry_run, _wake_up_early.load(),
957
19.4k
                   _wake_by, _state_change_watcher.elapsed_time() / NANOS_PER_SEC, _spilling,
958
19.4k
                   is_running());
959
19.4k
    std::unique_lock<std::mutex> lc(_dependency_lifecycle_lock);
960
19.4k
    auto* cur_blocked_dep = _blocked_dep;
961
19.4k
    auto fragment = _fragment_context.lock();
962
19.4k
    if (is_finalized() || !fragment) {
963
0
        fmt::format_to(debug_string_buffer, " pipeline name = {}", _pipeline_name);
964
0
        return fmt::to_string(debug_string_buffer);
965
0
    }
966
19.4k
    auto elapsed = fragment->elapsed_time() / NANOS_PER_SEC;
967
19.4k
    fmt::format_to(debug_string_buffer, " elapse time = {}s, block dependency = [{}]\n", elapsed,
968
19.4k
                   cur_blocked_dep && !is_finalized() ? cur_blocked_dep->debug_string() : "NULL");
969
970
19.4k
    if (_state && _state->local_runtime_filter_mgr()) {
971
0
        fmt::format_to(debug_string_buffer, "local_runtime_filter_mgr: [{}]\n",
972
0
                       _state->local_runtime_filter_mgr()->debug_string());
973
0
    }
974
975
19.4k
    fmt::format_to(debug_string_buffer, "operators: ");
976
38.8k
    for (size_t i = 0; i < _operators.size(); i++) {
977
19.4k
        fmt::format_to(debug_string_buffer, "\n{}",
978
19.4k
                       _opened && !is_finalized()
979
19.4k
                               ? _operators[i]->debug_string(_state, cast_set<int>(i))
980
19.4k
                               : _operators[i]->debug_string(cast_set<int>(i)));
981
19.4k
    }
982
19.4k
    fmt::format_to(debug_string_buffer, "\n{}\n",
983
19.4k
                   _opened && !is_finalized()
984
19.4k
                           ? _sink->debug_string(_state, cast_set<int>(_operators.size()))
985
19.4k
                           : _sink->debug_string(cast_set<int>(_operators.size())));
986
987
19.4k
    fmt::format_to(debug_string_buffer, "\nRead Dependency Information: \n");
988
989
19.4k
    size_t i = 0;
990
38.8k
    for (; i < _read_dependencies.size(); i++) {
991
38.8k
        for (size_t j = 0; j < _read_dependencies[i].size(); j++) {
992
19.4k
            fmt::format_to(debug_string_buffer, "{}. {}\n", i,
993
19.4k
                           _read_dependencies[i][j]->debug_string(cast_set<int>(i) + 1));
994
19.4k
        }
995
19.4k
    }
996
997
19.4k
    fmt::format_to(debug_string_buffer, "{}. {}\n", i,
998
19.4k
                   _memory_sufficient_dependency->debug_string(cast_set<int>(i++)));
999
1000
19.4k
    fmt::format_to(debug_string_buffer, "\nWrite Dependency Information: \n");
1001
38.8k
    for (size_t j = 0; j < _write_dependencies.size(); j++, i++) {
1002
19.4k
        fmt::format_to(debug_string_buffer, "{}. {}\n", i,
1003
19.4k
                       _write_dependencies[j]->debug_string(cast_set<int>(j) + 1));
1004
19.4k
    }
1005
1006
19.4k
    fmt::format_to(debug_string_buffer, "\nExecution Dependency Information: \n");
1007
58.3k
    for (size_t j = 0; j < _execution_dependencies.size(); j++, i++) {
1008
38.8k
        fmt::format_to(debug_string_buffer, "{}. {}\n", i,
1009
38.8k
                       _execution_dependencies[j]->debug_string(cast_set<int>(i) + 1));
1010
38.8k
    }
1011
1012
19.4k
    fmt::format_to(debug_string_buffer, "Finish Dependency Information: \n");
1013
58.3k
    for (size_t j = 0; j < _finish_dependencies.size(); j++, i++) {
1014
38.8k
        fmt::format_to(debug_string_buffer, "{}. {}\n", i,
1015
38.8k
                       _finish_dependencies[j]->debug_string(cast_set<int>(i) + 1));
1016
38.8k
    }
1017
19.4k
    return fmt::to_string(debug_string_buffer);
1018
19.4k
}
1019
1020
6
size_t PipelineTask::_get_revocable_size() const {
1021
    // Sum revocable memory from every operator in the pipeline + the sink.
1022
    // Each operator reports only its own revocable memory (no child recursion).
1023
6
    size_t total = 0;
1024
6
    size_t sink_revocable_size = _sink->revocable_mem_size(_state);
1025
6
    if (sink_revocable_size >= SpillFile::MIN_SPILL_WRITE_BATCH_MEM) {
1026
3
        total += sink_revocable_size;
1027
3
    }
1028
6
    for (const auto& op : _operators) {
1029
6
        size_t ops_revocable_size = op->revocable_mem_size(_state);
1030
6
        if (ops_revocable_size >= SpillFile::MIN_SPILL_WRITE_BATCH_MEM) {
1031
4
            total += ops_revocable_size;
1032
4
        }
1033
6
    }
1034
6
    return total;
1035
6
}
1036
1037
2
size_t PipelineTask::get_revocable_size() const {
1038
2
    if (!_opened || is_finalized() || _running || (_eos && !_spilling)) {
1039
0
        return 0;
1040
0
    }
1041
1042
2
    return _get_revocable_size();
1043
2
}
1044
1045
3
// Asks this task to give memory back. When the task holds at least
// SpillFile::MIN_SPILL_WRITE_BATCH_MEM of revocable memory, a RevokableTask
// wrapping this task is submitted to the scheduler. Otherwise (task finalized
// or too little revocable data) `spill_context` is told right away that this
// task is done.
//
// `spill_context` must not be null.
// NOTE(review): when submit() fails the error is returned without calling
// spill_context->on_task_finished() here -- confirm that this is intended.
Status PipelineTask::revoke_memory(const std::shared_ptr<SpillContext>& spill_context) {
    DCHECK(spill_context);

    if (is_finalized()) {
        spill_context->on_task_finished();
        VLOG_DEBUG << "Query: " << print_id(_state->query_id())
                   << ", task: " << static_cast<void*>(this) << " finalized";
        return Status::OK();
    }

    const auto revocable_size = get_revocable_size();
    if (revocable_size < SpillFile::MIN_SPILL_WRITE_BATCH_MEM) {
        spill_context->on_task_finished();
        VLOG_DEBUG << "Query: " << print_id(_state->query_id())
                   << ", task: " << static_cast<void*>(this)
                   << " has not enough data to revoke: " << revocable_size;
        return Status::OK();
    }

    // Submit a revocable task to run, the run method will call revoke memory. Currently the
    // underlying pipeline task is still blocked.
    auto revokable_task = std::make_shared<RevokableTask>(shared_from_this(), spill_context);
    RETURN_IF_ERROR(_state->get_query_ctx()->get_pipe_exec_scheduler()->submit(revokable_task));
    return Status::OK();
}
1067
1068
29
// Invoked by the dependency `dep` this task was blocked on. Clears the blocked
// dependency, moves the task to RUNNABLE and submits it to the scheduler
// again. A failure of the transition or of the submission cancels the
// fragment (if it is still alive) with that status.
void PipelineTask::wake_up(Dependency* dep, std::unique_lock<std::mutex>& /* dep_lock */) {
    const auto cancel_on_failure = [this](const Status& st) {
        if (st.ok()) {
            return;
        }
        if (auto frag = fragment_context().lock()) {
            frag->cancel(st);
        }
    };

    // call by dependency
    DCHECK_EQ(_blocked_dep, dep) << "dep : " << dep->debug_string(0) << "task: " << debug_string();
    _blocked_dep = nullptr;
    auto holder = std::dynamic_pointer_cast<PipelineTask>(shared_from_this());
    cancel_on_failure(_state_transition(PipelineTask::State::RUNNABLE));

    // Under _wake_up_early, FINISHED/FINALIZED -> RUNNABLE is a legal no-op
    // (_state_transition returns OK but the state stays unchanged). A
    // terminated task must not be resubmitted: finalize() clears
    // _sink/_operators, and submit() -> is_blockable() would dereference them
    // -> SIGSEGV.
    const bool terminated = _exec_state == State::FINISHED || _exec_state == State::FINALIZED;
    if (terminated) {
        return;
    }

    // Only resubmit while the fragment context can still be locked.
    const auto frag = _fragment_context.lock();
    if (!frag) {
        return;
    }
    cancel_on_failure(_state->get_query_ctx()->get_pipe_exec_scheduler()->submit(holder));
}
1092
1093
141
// Validates and applies a change of the task's execution state.
//
// The set of legal source states for `new_state` is taken from
// WAKE_UP_EARLY_LEGAL_STATE_TRANSITION when `_wake_up_early` is set and from
// LEGAL_STATE_TRANSITION otherwise. An illegal transition yields an
// InternalError describing the task. On success the profile info strings
// "TaskState" and "BlockedByDependency" are refreshed and the state-change
// stopwatch is restarted if the state really changes.
Status PipelineTask::_state_transition(State new_state) {
    const auto& transitions =
            _wake_up_early ? WAKE_UP_EARLY_LEGAL_STATE_TRANSITION : LEGAL_STATE_TRANSITION;
    const auto from_state = _exec_state.load();
    if (!transitions[static_cast<int>(new_state)].contains(from_state)) {
        return Status::InternalError(
                "Task state transition from {} to {} is not allowed! Task: query_id={}, "
                "instance_id={}, id={}, pipeline={}, open={}, eos={}, dry_run={}, "
                "wake_up_early={}, wake_by={}, spilling={}, running={}",
                _to_string(from_state), _to_string(new_state), print_id(_query_id),
                print_id(_state->fragment_instance_id()), _index, _pipeline_name, _opened,
                _eos.load(), _dry_run, _wake_up_early.load(), _wake_by.load(), _spilling.load(),
                is_running());
    }

    // FINISHED/FINALIZED -> RUNNABLE is legal under wake_up_early (a delayed wake_up() arriving
    // after the task already terminated), but the state must not actually move backwards and the
    // profile must not be updated (it would misleadingly show RUNNABLE for a terminated task).
    const bool already_terminated =
            _exec_state == State::FINISHED || _exec_state == State::FINALIZED;
    if (already_terminated && new_state == State::RUNNABLE) {
        return Status::OK();
    }

    // Restart the "time since last state change" stopwatch only on a real change.
    if (_exec_state != new_state) {
        _state_change_watcher.reset();
        _state_change_watcher.start();
    }
    _task_profile->add_info_string("TaskState", _to_string(new_state));
    _task_profile->add_info_string("BlockedByDependency",
                                   _blocked_dep ? _blocked_dep->name() : "");
    _exec_state = new_state;
    return Status::OK();
}
1124
1125
} // namespace doris