Coverage Report

Created: 2026-04-14 10:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/pipeline/pipeline_task.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "exec/pipeline/pipeline_task.h"
19
20
#include <fmt/core.h>
21
#include <fmt/format.h>
22
#include <gen_cpp/Metrics_types.h>
23
#include <glog/logging.h>
24
25
#include <algorithm>
26
#include <memory>
27
#include <ostream>
28
#include <vector>
29
30
#include "common/logging.h"
31
#include "common/status.h"
32
#include "core/block/block.h"
33
#include "exec/operator/exchange_source_operator.h"
34
#include "exec/operator/operator.h"
35
#include "exec/operator/rec_cte_source_operator.h"
36
#include "exec/operator/scan_operator.h"
37
#include "exec/pipeline/dependency.h"
38
#include "exec/pipeline/pipeline.h"
39
#include "exec/pipeline/pipeline_fragment_context.h"
40
#include "exec/pipeline/revokable_task.h"
41
#include "exec/pipeline/task_queue.h"
42
#include "exec/pipeline/task_scheduler.h"
43
#include "exec/spill/spill_file.h"
44
#include "runtime/descriptors.h"
45
#include "runtime/exec_env.h"
46
#include "runtime/query_context.h"
47
#include "runtime/runtime_profile.h"
48
#include "runtime/runtime_profile_counter_names.h"
49
#include "runtime/thread_context.h"
50
#include "runtime/workload_group/workload_group_manager.h"
51
#include "util/defer_op.h"
52
#include "util/mem_info.h"
53
#include "util/uid_util.h"
54
55
namespace doris {
56
class RuntimeState;
57
} // namespace doris
58
59
namespace doris {
60
61
// Builds a task for one instance of `pipeline`.
//
// @param pipeline          pipeline whose operators and sink this task runs
// @param task_id           stored as `_index`
// @param state             runtime state; its query context supplies the execution and
//                          memory-sufficient dependencies
// @param fragment_context  owning fragment context; only under BE_TEST may it be null when
//                          `_query_id` is initialised
// @param parent_profile    profile that `_init_profile()` later attaches the task profile to
// @param shared_state_map  shared states keyed by id; moved into the task
// @param task_idx          stored as `_task_idx`
PipelineTask::PipelineTask(PipelinePtr& pipeline, uint32_t task_id, RuntimeState* state,
                           std::shared_ptr<PipelineFragmentContext> fragment_context,
                           RuntimeProfile* parent_profile,
                           std::map<int, std::pair<std::shared_ptr<BasicSharedState>,
                                                   std::vector<std::shared_ptr<Dependency>>>>
                                   shared_state_map,
                           int task_idx)
        :
#ifdef BE_TEST
          _query_id(fragment_context ? fragment_context->get_query_id() : TUniqueId()),
#else
          _query_id(fragment_context->get_query_id()),
#endif
          _index(task_id),
          _pipeline(pipeline),
          _opened(false),
          _state(state),
          _fragment_context(fragment_context),
          _parent_profile(parent_profile),
          _operators(pipeline->operators()),
          _source(_operators.front().get()),
          _root(_operators.back().get()),
          _sink(pipeline->sink_shared_pointer()),
          _shared_state_map(std::move(shared_state_map)),
          _task_idx(task_idx),
          _memory_sufficient_dependency(state->get_query_ctx()->get_memory_sufficient_dependency()),
          _pipeline_name(_pipeline->name()) {
#ifndef BE_TEST
    _query_mem_tracker = fragment_context->get_query_ctx()->query_mem_tracker();
#endif
    // The query-level execution dependency is always the first entry.
    _execution_dependencies.push_back(state->get_query_ctx()->get_execution_dependency());

    // A shared state already registered for the sink's first destination takes precedence;
    // nothing to create in that case.
    if (_shared_state_map.contains(_sink->dests_id().front())) {
        return;
    }
    // Otherwise ask the sink for one; keep it only if the sink actually produced a state.
    if (auto created_state = _sink->create_shared_state()) {
        _sink_shared_state = std::move(created_state);
    }
}
99
100
2.04M
// Releases everything the task owns.
//
// The task queue also holds a reference to PipelineTask
// (https://github.com/apache/doris/pull/49753), so the task may be the last owner to be
// destructed. Because it holds operators, shared states, etc., their memory is released
// manually here.
PipelineTask::~PipelineTask() {
    const auto release_members = [this]() {
        _shared_state_map.clear();
        _sink_shared_state.reset();
        _op_shared_states.clear();
        _sink.reset();
        _operators.clear();
        _block.reset();
        _pipeline.reset();
    };
#ifndef BE_TEST
    if (_query_mem_tracker) {
        // Release while the thread's memory tracker is switched to the query tracker.
        SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(_query_mem_tracker);
        release_members();
        return;
    }
#endif
    release_members();
}
123
124
// Sets up the task before it can be scheduled.
//
// Initialises the task profile, creates the local state of the sink and then of every
// operator (from the last operator back to the source), collects the source operator's
// execution dependencies and finally moves the task to State::RUNNABLE.
//
// @param scan_range  scan ranges copied into `_scan_ranges` and handed to each operator
// @param sender_id   forwarded to the sink's local state
// @param tsink       forwarded to the sink's local state
// @return the first error from local-state setup; the query's exec status if the query is
//         already cancelled; InternalError if the fragment context is gone; otherwise the
//         result of the state transition.
Status PipelineTask::prepare(const std::vector<TScanRangeParams>& scan_range, const int sender_id,
                             const TDataSink& tsink) {
    DCHECK(_sink);
    _init_profile();
    SCOPED_TIMER(_task_profile->total_time_counter());
    SCOPED_CPU_TIMER(_task_cpu_timer);
    SCOPED_TIMER(_prepare_timer);
    DBUG_EXECUTE_IF("fault_inject::PipelineXTask::prepare", {
        Status status = Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task prepare failed");
        return status;
    });

    // Sink local state comes first.
    {
        LocalSinkStateInfo sink_info {_task_idx,         _task_profile.get(),
                                      sender_id,         get_sink_shared_state().get(),
                                      _shared_state_map, tsink};
        RETURN_IF_ERROR(_sink->setup_local_state(_state, sink_info));
    }

    _scan_ranges = scan_range;

    // Walk the operators from last to first; each operator's profile becomes the parent
    // profile of the operator before it.
    auto* upper_profile = _state->get_sink_local_state()->operator_profile();
    for (int idx = cast_set<int>(_operators.size() - 1); idx >= 0; --idx) {
        auto& current_op = _operators[idx];
        LocalStateInfo op_info {upper_profile, _scan_ranges,
                                get_op_shared_state(current_op->operator_id()), _shared_state_map,
                                _task_idx};
        RETURN_IF_ERROR(current_op->setup_local_state(_state, op_info));
        upper_profile = _state->get_local_state(current_op->operator_id())->operator_profile();
    }

    // Append the source operator's execution dependencies under the dependency lock.
    {
        const auto& source_deps =
                _state->get_local_state(_source->operator_id())->execution_dependencies();
        std::unique_lock<std::mutex> guard(_dependency_lock);
        _execution_dependencies.insert(_execution_dependencies.end(), source_deps.begin(),
                                       source_deps.end());
    }

    // `fragment` only lives for the duration of this check.
    if (auto fragment = _fragment_context.lock(); !fragment) {
        return Status::InternalError("Fragment already finished! Query: {}", print_id(_query_id));
    } else if (fragment->get_query_ctx()->is_cancelled()) {
        unblock_all_dependencies();
        return fragment->get_query_ctx()->exec_status();
    }

    _block = doris::Block::create_unique();
    return _state_transition(State::RUNNABLE);
}
171
172
2.04M
// Collects the read / write / finish dependencies from the operators' and the sink's local
// states and publishes them to the task under `_dependency_lock`.
//
// `_read_dependencies` ends up with one entry per operator, in operator order. Everything is
// gathered into locals first, so the members are untouched if the fault-injection point
// returns early.
Status PipelineTask::_extract_dependencies() {
    std::vector<std::vector<Dependency*>> op_read_deps;
    std::vector<Dependency*> sink_write_deps;
    std::vector<Dependency*> finish_deps;

    op_read_deps.reserve(_operators.size());
    for (auto& op : _operators) {
        auto* op_state = _state->get_local_state(op->operator_id());
        DCHECK(op_state);
        op_read_deps.push_back(op_state->dependencies());
        if (auto* finish_dep = op_state->finishdependency()) {
            finish_deps.push_back(finish_dep);
        }
    }

    DBUG_EXECUTE_IF("fault_inject::PipelineXTask::_extract_dependencies", {
        Status status = Status::Error<INTERNAL_ERROR>(
                "fault_inject pipeline_task _extract_dependencies failed");
        return status;
    });

    {
        auto* sink_state = _state->get_sink_local_state();
        sink_write_deps = sink_state->dependencies();
        if (auto* finish_dep = sink_state->finishdependency()) {
            finish_deps.push_back(finish_dep);
        }
    }

    {
        std::unique_lock<std::mutex> guard(_dependency_lock);
        _read_dependencies.swap(op_read_deps);
        _write_dependencies.swap(sink_write_deps);
        _finish_dependencies.swap(finish_deps);
    }
    return Status::OK();
}
209
210
1.07M
bool PipelineTask::inject_shared_state(std::shared_ptr<BasicSharedState> shared_state) {
211
1.07M
    if (!shared_state) {
212
602k
        return false;
213
602k
    }
214
    // Shared state is created by upstream task's sink operator and shared by source operator of
215
    // this task.
216
544k
    for (auto& op : _operators) {
217
544k
        if (shared_state->related_op_ids.contains(op->operator_id())) {
218
466k
            _op_shared_states.insert({op->operator_id(), shared_state});
219
466k
            return true;
220
466k
        }
221
544k
    }
222
    // Shared state is created by the first sink operator and shared by sink operator of this task.
223
    // For example, Set operations.
224
11.0k
    if (shared_state->related_op_ids.contains(_sink->dests_id().front())) {
225
11.0k
        DCHECK_EQ(_sink_shared_state, nullptr)
226
0
                << " Sink: " << _sink->get_name() << " dest id: " << _sink->dests_id().front();
227
11.0k
        _sink_shared_state = shared_state;
228
11.0k
        return true;
229
11.0k
    }
230
18.4E
    return false;
231
9.68k
}
232
233
2.04M
void PipelineTask::_init_profile() {
234
2.04M
    _task_profile = std::make_unique<RuntimeProfile>(fmt::format("PipelineTask(index={})", _index));
235
2.04M
    _parent_profile->add_child(_task_profile.get(), true, nullptr);
236
2.04M
    _task_cpu_timer = ADD_TIMER(_task_profile, profile::TASK_CPU_TIME);
237
238
2.04M
    static const char* exec_time = profile::EXECUTE_TIME;
239
2.04M
    _exec_timer = ADD_TIMER(_task_profile, exec_time);
240
2.04M
    _prepare_timer = ADD_CHILD_TIMER(_task_profile, profile::PREPARE_TIME, exec_time);
241
2.04M
    _open_timer = ADD_CHILD_TIMER(_task_profile, profile::OPEN_TIME, exec_time);
242
2.04M
    _get_block_timer = ADD_CHILD_TIMER(_task_profile, profile::GET_BLOCK_TIME, exec_time);
243
2.04M
    _get_block_counter = ADD_COUNTER(_task_profile, profile::GET_BLOCK_COUNTER, TUnit::UNIT);
244
2.04M
    _sink_timer = ADD_CHILD_TIMER(_task_profile, profile::SINK_TIME, exec_time);
245
2.04M
    _close_timer = ADD_CHILD_TIMER(_task_profile, profile::CLOSE_TIME, exec_time);
246
247
2.04M
    _wait_worker_timer = ADD_TIMER_WITH_LEVEL(_task_profile, profile::WAIT_WORKER_TIME, 1);
248
249
2.04M
    _schedule_counts = ADD_COUNTER(_task_profile, profile::NUM_SCHEDULE_TIMES, TUnit::UNIT);
250
2.04M
    _yield_counts = ADD_COUNTER(_task_profile, profile::NUM_YIELD_TIMES, TUnit::UNIT);
251
2.04M
    _core_change_times = ADD_COUNTER(_task_profile, profile::CORE_CHANGE_TIMES, TUnit::UNIT);
252
2.04M
    _memory_reserve_times = ADD_COUNTER(_task_profile, profile::MEMORY_RESERVE_TIMES, TUnit::UNIT);
253
2.04M
    _memory_reserve_failed_times =
254
2.04M
            ADD_COUNTER(_task_profile, profile::MEMORY_RESERVE_FAILED_TIMES, TUnit::UNIT);
255
2.04M
}
256
257
2.04M
void PipelineTask::_fresh_profile_counter() {
258
2.04M
    COUNTER_SET(_schedule_counts, (int64_t)_schedule_time);
259
2.04M
    COUNTER_SET(_wait_worker_timer, (int64_t)_wait_worker_watcher.elapsed_time());
260
2.04M
}
261
262
2.04M
// Opens the local state of every operator (in operator order) and then of the sink, extracts
// the task's dependencies and marks the task as opened.
//
// @return the first error encountered; `_opened` is only set when every step succeeded.
Status PipelineTask::_open() {
    SCOPED_TIMER(_task_profile->total_time_counter());
    SCOPED_CPU_TIMER(_task_cpu_timer);
    SCOPED_TIMER(_open_timer);

    _dry_run = _sink->should_dry_run(_state);

    for (const auto& op : _operators) {
        auto* op_state = _state->get_local_state(op->operator_id());
        RETURN_IF_ERROR(op_state->open(_state));
    }
    RETURN_IF_ERROR(_state->get_sink_local_state()->open(_state));
    RETURN_IF_ERROR(_extract_dependencies());

    DBUG_EXECUTE_IF("fault_inject::PipelineXTask::open", {
        Status status = Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task open failed");
        return status;
    });

    _opened = true;
    return Status::OK();
}
279
280
12.3M
// Calls `prepare()` on the local state of every operator (in operator order) and then on the
// sink's local state, stopping at the first error.
Status PipelineTask::_prepare() {
    SCOPED_TIMER(_task_profile->total_time_counter());
    SCOPED_CPU_TIMER(_task_cpu_timer);

    for (const auto& op : _operators) {
        auto* op_state = _state->get_local_state(op->operator_id());
        RETURN_IF_ERROR(op_state->prepare(_state));
    }
    // The sink's status is the function's result, whether OK or an error.
    return _state->get_sink_local_state()->prepare(_state);
}
289
290
7.05M
bool PipelineTask::_wait_to_start() {
291
    // Before task starting, we should make sure
292
    // 1. Execution dependency is ready (which is controlled by FE 2-phase commit)
293
    // 2. Runtime filter dependencies are ready
294
    // 3. All tablets are loaded into local storage
295
7.05M
    return std::any_of(
296
7.05M
            _execution_dependencies.begin(), _execution_dependencies.end(),
297
8.38M
            [&](Dependency* dep) -> bool { return dep->is_blocked_by(shared_from_this()); });
298
7.05M
}
299
300
2.41M
bool PipelineTask::_is_pending_finish() {
301
    // Spilling may be in progress if eos is true.
302
2.41M
    return std::ranges::any_of(_finish_dependencies, [&](Dependency* dep) -> bool {
303
1.22M
        return dep->is_blocked_by(shared_from_this());
304
1.22M
    });
305
2.41M
}
306
307
7.45M
bool PipelineTask::is_blockable() const {
308
    // Before task starting, we should make sure
309
    // 1. Execution dependency is ready (which is controlled by FE 2-phase commit)
310
    // 2. Runtime filter dependencies are ready
311
    // 3. All tablets are loaded into local storage
312
313
7.45M
    if (_state->enable_fuzzy_blockable_task()) {
314
2.27k
        if ((_schedule_time + _task_idx) % 2 == 0) {
315
1.20k
            return true;
316
1.20k
        }
317
2.27k
    }
318
319
7.44M
    return std::ranges::any_of(_operators,
320
9.79M
                               [&](OperatorPtr op) -> bool { return op->is_blockable(_state); }) ||
321
7.45M
           _sink->is_blockable(_state);
322
7.45M
}
323
324
9.96M
bool PipelineTask::_is_blocked() {
325
    // `_dry_run = true` means we do not need data from source operator.
326
9.96M
    if (!_dry_run) {
327
18.2M
        for (int i = cast_set<int>(_read_dependencies.size() - 1); i >= 0; i--) {
328
            // `_read_dependencies` is organized according to operators. For each operator, running condition is met iff all dependencies are ready.
329
12.9M
            for (auto* dep : _read_dependencies[i]) {
330
12.9M
                if (dep->is_blocked_by(shared_from_this())) {
331
3.14M
                    return true;
332
3.14M
                }
333
12.9M
            }
334
            // If all dependencies are ready for this operator, we can execute this task if no datum is needed from upstream operators.
335
8.35M
            if (!_operators[i]->need_more_input_data(_state)) {
336
56.4k
                break;
337
56.4k
            }
338
8.35M
        }
339
9.93M
    }
340
6.81M
    return _memory_sufficient_dependency->is_blocked_by(shared_from_this()) ||
341
8.95M
           std::ranges::any_of(_write_dependencies, [&](Dependency* dep) -> bool {
342
8.95M
               return dep->is_blocked_by(shared_from_this());
343
8.95M
           });
344
9.96M
}
345
346
1.15M
void PipelineTask::unblock_all_dependencies() {
347
    // We use a lock to assure all dependencies are not deconstructed here.
348
1.15M
    std::unique_lock<std::mutex> lc(_dependency_lock);
349
1.15M
    auto fragment = _fragment_context.lock();
350
1.15M
    if (!is_finalized() && fragment) {
351
50.4k
        try {
352
50.4k
            DCHECK(_wake_up_early || fragment->is_canceled());
353
50.4k
            std::ranges::for_each(_write_dependencies,
354
94.9k
                                  [&](Dependency* dep) { dep->set_always_ready(); });
355
50.4k
            std::ranges::for_each(_finish_dependencies,
356
50.4k
                                  [&](Dependency* dep) { dep->set_always_ready(); });
357
53.3k
            std::ranges::for_each(_read_dependencies, [&](std::vector<Dependency*>& deps) {
358
62.9k
                std::ranges::for_each(deps, [&](Dependency* dep) { dep->set_always_ready(); });
359
53.3k
            });
360
            // All `_execution_deps` will never be set blocking from ready. So we just set ready here.
361
50.4k
            std::ranges::for_each(_execution_dependencies,
362
59.9k
                                  [&](Dependency* dep) { dep->set_ready(); });
363
50.4k
            _memory_sufficient_dependency->set_ready();
364
50.4k
        } catch (const doris::Exception& e) {
365
0
            LOG(WARNING) << "unblock_all_dependencies failed: " << e.code() << ", "
366
0
                         << e.to_string();
367
0
        }
368
50.4k
    }
369
1.15M
}
370
371
// When current memory pressure is low, memory usage may increase significantly in the next
372
// operator run, while there is no revocable memory available for spilling.
373
// Trigger memory revoking when pressure is high and revocable memory is significant.
374
// Memory pressure is evaluated using two signals:
375
// 1. Query memory usage exceeds a threshold ratio of the query memory limit.
376
// 2. Workload group memory usage reaches the workload group low-watermark threshold.
377
7.80M
bool PipelineTask::_should_trigger_revoking(const size_t reserve_size) const {
378
7.80M
    if (!_state->enable_spill()) {
379
7.80M
        return false;
380
7.80M
    }
381
382
762
    auto query_mem_tracker = _state->get_query_ctx()->query_mem_tracker();
383
762
    auto wg = _state->get_query_ctx()->workload_group();
384
6.75k
    if (!query_mem_tracker || !wg) {
385
2.01k
        return false;
386
2.01k
    }
387
388
18.4E
    const auto parallelism = std::max(1, _pipeline->num_tasks());
389
18.4E
    const auto query_water_mark = 90; // 90%
390
18.4E
    const auto group_mem_limit = wg->memory_limit();
391
18.4E
    auto query_limit = query_mem_tracker->limit();
392
18.4E
    if (query_limit <= 0) {
393
1
        query_limit = group_mem_limit;
394
18.4E
    } else if (query_limit > group_mem_limit && group_mem_limit > 0) {
395
319
        query_limit = group_mem_limit;
396
319
    }
397
398
18.4E
    if (query_limit <= 0) {
399
1
        return false;
400
1
    }
401
402
18.4E
    if ((reserve_size * parallelism) <= (query_limit / 5)) {
403
4.73k
        return false;
404
4.73k
    }
405
406
18.4E
    bool is_high_memory_pressure = false;
407
18.4E
    const auto used_mem = query_mem_tracker->consumption() + reserve_size * parallelism;
408
18.4E
    if (used_mem >= int64_t((double(query_limit) * query_water_mark / 100))) {
409
2
        is_high_memory_pressure = true;
410
2
    }
411
412
18.4E
    if (!is_high_memory_pressure) {
413
4
        bool is_low_watermark;
414
4
        bool is_high_watermark;
415
4
        wg->check_mem_used(&is_low_watermark, &is_high_watermark);
416
4
        is_high_memory_pressure = is_low_watermark || is_high_watermark;
417
4
    }
418
419
18.4E
    if (is_high_memory_pressure) {
420
4
        const auto revocable_size = [&]() {
421
4
            size_t total = _sink->revocable_mem_size(_state);
422
4
            for (const auto& op : _operators) {
423
4
                total += op->revocable_mem_size(_state);
424
4
            }
425
4
            return total;
426
4
        }();
427
428
4
        const auto total_estimated_revocable = revocable_size * parallelism;
429
4
        return total_estimated_revocable >= int64_t(double(query_limit) * 0.2);
430
4
    }
431
432
18.4E
    return false;
433
18.4E
}
434
435
/**
436
 * `_eos` indicates whether the execution phase is done. `done` indicates whether we could close
437
 * this task.
438
 *
439
 * For example,
440
 * 1. if `_eos` is false which means we should continue to get next block so we cannot close (i.e.
441
 *    `done` is false)
442
 * 2. if `_eos` is true which means all blocks from source are exhausted but `_is_pending_finish()`
443
 *    is true which means we should wait for a pending dependency ready (maybe a running rpc), so we
444
 *    cannot close (i.e. `done` is false)
445
 * 3. if `_eos` is true which means all blocks from source are exhausted and `_is_pending_finish()`
446
 *    is false which means we can close immediately (i.e. `done` is true)
447
 * @param done
448
 * @return
449
 */
450
7.44M
Status PipelineTask::execute(bool* done) {
451
7.44M
    if (_exec_state != State::RUNNABLE || _blocked_dep != nullptr) [[unlikely]] {
452
#ifdef BE_TEST
453
        return Status::InternalError("Pipeline task is not runnable! Task info: {}",
454
                                     debug_string());
455
#else
456
1
        return Status::FatalError("Pipeline task is not runnable! Task info: {}", debug_string());
457
1
#endif
458
1
    }
459
460
7.44M
    auto fragment_context = _fragment_context.lock();
461
7.44M
    if (!fragment_context) {
462
0
        return Status::InternalError("Fragment already finished! Query: {}", print_id(_query_id));
463
0
    }
464
7.44M
    int64_t time_spent = 0;
465
7.44M
    ThreadCpuStopWatch cpu_time_stop_watch;
466
7.44M
    cpu_time_stop_watch.start();
467
7.44M
    SCOPED_ATTACH_TASK(_state);
468
7.44M
    Defer running_defer {[&]() {
469
7.44M
        int64_t delta_cpu_time = cpu_time_stop_watch.elapsed_time();
470
7.44M
        _task_cpu_timer->update(delta_cpu_time);
471
7.44M
        fragment_context->get_query_ctx()->resource_ctx()->cpu_context()->update_cpu_cost_ms(
472
7.44M
                delta_cpu_time);
473
474
        // If task is woke up early, we should terminate all operators, and this task could be closed immediately.
475
7.44M
        if (_wake_up_early) {
476
7.88k
            _eos = true;
477
7.88k
            *done = true;
478
7.43M
        } else if (_eos && !_spilling &&
479
7.43M
                   (fragment_context->is_canceled() || !_is_pending_finish())) {
480
            // Debug point for testing the race condition fix: inject set_wake_up_early() +
481
            // unblock_all_dependencies() here to simulate Thread B writing A then B between
482
            // Thread A's two reads of _wake_up_early.
483
2.03M
            DBUG_EXECUTE_IF("PipelineTask::execute.wake_up_early_in_else_if", {
484
2.03M
                set_wake_up_early();
485
2.03M
                unblock_all_dependencies();
486
2.03M
            });
487
2.03M
            *done = true;
488
2.03M
        }
489
490
        // NOTE: The operator terminate() call is intentionally placed AFTER the
491
        // _is_pending_finish() check above, not before. This ordering is critical to avoid a race
492
        // condition with the seq_cst memory ordering guarantee:
493
        //
494
        // Pipeline::make_all_runnable() writes in this order:
495
        //   (A) set_wake_up_early()  ->  (B) unblock_all_dependencies() [sets finish_dep._always_ready]
496
        //
497
        // If we checked _wake_up_early (A) before _is_pending_finish() (B), there would be a
498
        // window where Thread A reads _wake_up_early=false, then Thread B writes both A and B,
499
        // then Thread A reads _is_pending_finish()=false (due to _always_ready). Thread A would
500
        // then set *done=true without ever calling operator terminate(), causing close() to run
501
        // on operators that were never properly terminated (e.g. RuntimeFilterProducer still in
502
        // WAITING_FOR_SYNCED_SIZE state when insert() is called).
503
        //
504
        // By reading _is_pending_finish() (B) before the second read of _wake_up_early (A),
505
        // if Thread A observes B's effect (_always_ready=true), it is guaranteed to also observe
506
        // A's effect (_wake_up_early=true) on this second read, ensuring operator terminate() is
507
        // called. This relies on _wake_up_early and _always_ready both being std::atomic with the
508
        // default seq_cst ordering — do not weaken them to relaxed or acq/rel.
509
7.44M
        if (_wake_up_early) {
510
7.95k
            THROW_IF_ERROR(_root->terminate(_state));
511
7.95k
            THROW_IF_ERROR(_sink->terminate(_state));
512
7.95k
        }
513
7.44M
    }};
514
7.44M
    const auto query_id = _state->query_id();
515
    // If this task is already EOS and block is empty (which means we already output all blocks),
516
    // just return here.
517
7.44M
    if (_eos && !_spilling) {
518
382k
        return Status::OK();
519
382k
    }
520
    // If this task is blocked by a spilling request and waken up immediately, the spilling
521
    // dependency will not block this task and we should just run here.
522
7.06M
    if (!_block->empty()) {
523
0
        LOG(INFO) << "Query: " << print_id(query_id) << " has pending block, size: "
524
0
                  << PrettyPrinter::print_bytes(_block->allocated_bytes());
525
0
        DCHECK(_spilling);
526
0
    }
527
528
7.06M
    SCOPED_TIMER(_task_profile->total_time_counter());
529
7.06M
    SCOPED_TIMER(_exec_timer);
530
531
7.07M
    if (!_wake_up_early) {
532
7.07M
        RETURN_IF_ERROR(_prepare());
533
7.07M
    }
534
7.06M
    DBUG_EXECUTE_IF("fault_inject::PipelineXTask::execute", {
535
7.06M
        Status status = Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task execute failed");
536
7.06M
        return status;
537
7.06M
    });
538
    // `_wake_up_early` must be after `_wait_to_start()`
539
7.06M
    if (_wait_to_start() || _wake_up_early) {
540
1.79M
        return Status::OK();
541
1.79M
    }
542
5.26M
    RETURN_IF_ERROR(_prepare());
543
544
    // The status must be runnable
545
5.26M
    if (!_opened && !fragment_context->is_canceled()) {
546
2.04M
        DBUG_EXECUTE_IF("PipelineTask::execute.open_sleep", {
547
2.04M
            auto required_pipeline_id =
548
2.04M
                    DebugPoints::instance()->get_debug_param_or_default<int32_t>(
549
2.04M
                            "PipelineTask::execute.open_sleep", "pipeline_id", -1);
550
2.04M
            auto required_task_id = DebugPoints::instance()->get_debug_param_or_default<int32_t>(
551
2.04M
                    "PipelineTask::execute.open_sleep", "task_id", -1);
552
2.04M
            if (required_pipeline_id == pipeline_id() && required_task_id == task_id()) {
553
2.04M
                LOG(WARNING) << "PipelineTask::execute.open_sleep sleep 5s";
554
2.04M
                sleep(5);
555
2.04M
            }
556
2.04M
        });
557
558
2.04M
        SCOPED_RAW_TIMER(&time_spent);
559
2.04M
        RETURN_IF_ERROR(_open());
560
2.04M
    }
561
562
9.97M
    while (!fragment_context->is_canceled()) {
563
9.97M
        SCOPED_RAW_TIMER(&time_spent);
564
9.97M
        Defer defer {[&]() {
565
            // If this run is pended by a spilling request, the block will be output in next run.
566
9.96M
            if (!_spilling) {
567
9.96M
                _block->clear_column_data(_root->row_desc().num_materialized_slots());
568
9.96M
            }
569
9.96M
        }};
570
        // `_wake_up_early` must be after `_is_blocked()`
571
9.97M
        if (_is_blocked() || _wake_up_early) {
572
3.16M
            return Status::OK();
573
3.16M
        }
574
575
        /// When a task is cancelled,
576
        /// its blocking state will be cleared and it will transition to a ready state (though it is not truly ready).
577
        /// Here, checking whether it is cancelled to prevent tasks in a blocking state from being re-executed.
578
6.80M
        if (fragment_context->is_canceled()) {
579
1
            break;
580
1
        }
581
582
6.80M
        if (time_spent > _exec_time_slice) {
583
49.3k
            COUNTER_UPDATE(_yield_counts, 1);
584
49.3k
            break;
585
49.3k
        }
586
6.75M
        auto* block = _block.get();
587
588
6.75M
        DBUG_EXECUTE_IF("fault_inject::PipelineXTask::executing", {
589
6.75M
            Status status =
590
6.75M
                    Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task executing failed");
591
6.75M
            return status;
592
6.75M
        });
593
594
        // `_sink->is_finished(_state)` means sink operator should be finished
595
6.75M
        if (_sink->is_finished(_state)) {
596
18
            set_wake_up_early();
597
18
            return Status::OK();
598
18
        }
599
600
        // `_dry_run` means sink operator need no more data
601
6.75M
        _eos = _dry_run || _eos;
602
6.75M
        _spilling = false;
603
6.75M
        auto workload_group = _state->workload_group();
604
        // If last run is pended by a spilling request, `_block` is produced with some rows in last
605
        // run, so we will resume execution using the block.
606
6.75M
        if (!_eos && _block->empty()) {
607
6.73M
            SCOPED_TIMER(_get_block_timer);
608
6.73M
            if (_state->low_memory_mode()) {
609
0
                _sink->set_low_memory_mode(_state);
610
0
                for (auto& op : _operators) {
611
0
                    op->set_low_memory_mode(_state);
612
0
                }
613
0
            }
614
6.73M
            DEFER_RELEASE_RESERVED();
615
6.73M
            _get_block_counter->update(1);
616
            // Sum reserve sizes across all operators in this pipeline.
617
            // Each operator reports only its own requirement (non-recursive).
618
6.73M
            size_t reserve_size = 0;
619
7.63M
            for (auto& op : _operators) {
620
7.63M
                reserve_size += op->get_reserve_mem_size(_state);
621
7.63M
                op->reset_reserve_mem_size(_state);
622
7.63M
            }
623
6.73M
            if (workload_group &&
624
6.73M
                _state->get_query_ctx()
625
5.29M
                        ->resource_ctx()
626
5.29M
                        ->task_controller()
627
5.29M
                        ->is_enable_reserve_memory() &&
628
6.73M
                reserve_size > 0) {
629
5.25M
                if (_should_trigger_revoking(reserve_size)) {
630
0
                    LOG(INFO) << fmt::format(
631
0
                            "Query: {} sink: {}, node id: {}, task id: {}, reserve size: {} when "
632
0
                            "high memory pressure, try to spill",
633
0
                            print_id(_query_id), _sink->get_name(), _sink->node_id(),
634
0
                            _state->task_id(), reserve_size);
635
0
                    ExecEnv::GetInstance()->workload_group_mgr()->add_paused_query(
636
0
                            _state->get_query_ctx()->resource_ctx()->shared_from_this(),
637
0
                            reserve_size,
638
0
                            Status::Error<ErrorCode::QUERY_MEMORY_EXCEEDED>(
639
0
                                    "high memory pressure, try to spill"));
640
0
                    _spilling = true;
641
0
                    continue;
642
0
                }
643
5.25M
                if (!_try_to_reserve_memory(reserve_size, _root)) {
644
1.08k
                    continue;
645
1.08k
                }
646
5.25M
            }
647
648
6.73M
            bool eos = false;
649
6.73M
            RETURN_IF_ERROR(_root->get_block_after_projects(_state, block, &eos));
650
6.73M
            RETURN_IF_ERROR(block->check_type_and_column());
651
6.73M
            _eos = eos;
652
6.73M
        }
653
654
6.75M
        if (!_block->empty() || _eos) {
655
2.78M
            SCOPED_TIMER(_sink_timer);
656
2.78M
            Status status = Status::OK();
657
2.78M
            DEFER_RELEASE_RESERVED();
658
2.78M
            if (_state->get_query_ctx()
659
2.78M
                        ->resource_ctx()
660
2.78M
                        ->task_controller()
661
2.78M
                        ->is_enable_reserve_memory() &&
662
2.78M
                workload_group && !(_wake_up_early || _dry_run)) {
663
2.77M
                const auto sink_reserve_size = _sink->get_reserve_mem_size(_state, _eos);
664
665
2.77M
                if (sink_reserve_size > 0 && _should_trigger_revoking(sink_reserve_size)) {
666
0
                    LOG(INFO) << fmt::format(
667
0
                            "Query: {} sink: {}, node id: {}, task id: {}, reserve size: {} when "
668
0
                            "high memory pressure, try to spill",
669
0
                            print_id(_query_id), _sink->get_name(), _sink->node_id(),
670
0
                            _state->task_id(), sink_reserve_size);
671
0
                    ExecEnv::GetInstance()->workload_group_mgr()->add_paused_query(
672
0
                            _state->get_query_ctx()->resource_ctx()->shared_from_this(),
673
0
                            sink_reserve_size,
674
0
                            Status::Error<ErrorCode::QUERY_MEMORY_EXCEEDED>(
675
0
                                    "high memory pressure, try to spill"));
676
0
                    _spilling = true;
677
0
                    continue;
678
0
                }
679
680
2.77M
                if (sink_reserve_size > 0 &&
681
2.77M
                    !_try_to_reserve_memory(sink_reserve_size, _sink.get())) {
682
933
                    continue;
683
933
                }
684
2.77M
            }
685
686
2.78M
            DBUG_EXECUTE_IF("PipelineTask::execute.sink_eos_sleep", {
687
2.78M
                auto required_pipeline_id =
688
2.78M
                        DebugPoints::instance()->get_debug_param_or_default<int32_t>(
689
2.78M
                                "PipelineTask::execute.sink_eos_sleep", "pipeline_id", -1);
690
2.78M
                auto required_task_id =
691
2.78M
                        DebugPoints::instance()->get_debug_param_or_default<int32_t>(
692
2.78M
                                "PipelineTask::execute.sink_eos_sleep", "task_id", -1);
693
2.78M
                if (required_pipeline_id == pipeline_id() && required_task_id == task_id()) {
694
2.78M
                    LOG(WARNING) << "PipelineTask::execute.sink_eos_sleep sleep 10s";
695
2.78M
                    sleep(10);
696
2.78M
                }
697
2.78M
            });
698
699
2.78M
            DBUG_EXECUTE_IF("PipelineTask::execute.terminate", {
700
2.78M
                if (_eos) {
701
2.78M
                    auto required_pipeline_id =
702
2.78M
                            DebugPoints::instance()->get_debug_param_or_default<int32_t>(
703
2.78M
                                    "PipelineTask::execute.terminate", "pipeline_id", -1);
704
2.78M
                    auto required_task_id =
705
2.78M
                            DebugPoints::instance()->get_debug_param_or_default<int32_t>(
706
2.78M
                                    "PipelineTask::execute.terminate", "task_id", -1);
707
2.78M
                    auto required_fragment_id =
708
2.78M
                            DebugPoints::instance()->get_debug_param_or_default<int32_t>(
709
2.78M
                                    "PipelineTask::execute.terminate", "fragment_id", -1);
710
2.78M
                    if (required_pipeline_id == pipeline_id() && required_task_id == task_id() &&
711
2.78M
                        fragment_context->get_fragment_id() == required_fragment_id) {
712
2.78M
                        _wake_up_early = true;
713
2.78M
                        unblock_all_dependencies();
714
2.78M
                    } else if (required_pipeline_id == pipeline_id() &&
715
2.78M
                               fragment_context->get_fragment_id() == required_fragment_id) {
716
2.78M
                        LOG(WARNING) << "PipelineTask::execute.terminate sleep 5s";
717
2.78M
                        sleep(5);
718
2.78M
                    }
719
2.78M
                }
720
2.78M
            });
721
2.78M
            RETURN_IF_ERROR(block->check_type_and_column());
722
2.78M
            status = _sink->sink(_state, block, _eos);
723
724
2.78M
            if (_eos) {
725
2.04M
                if (_sink->reset_to_rerun(_state, _root)) {
726
1.97k
                    _eos = false;
727
2.04M
                } else {
728
2.04M
                    RETURN_IF_ERROR(close(Status::OK(), false));
729
2.04M
                }
730
2.04M
            }
731
732
2.78M
            if (status.is<ErrorCode::END_OF_FILE>()) {
733
14
                set_wake_up_early();
734
14
                return Status::OK();
735
2.78M
            } else if (!status) {
736
12
                return status;
737
12
            }
738
739
2.78M
            if (_eos) { // just return, the scheduler will do finish work
740
2.04M
                return Status::OK();
741
2.04M
            }
742
2.78M
        }
743
6.75M
    }
744
745
52.3k
    RETURN_IF_ERROR(_state->get_query_ctx()->get_pipe_exec_scheduler()->submit(shared_from_this()));
746
52.3k
    return Status::OK();
747
52.3k
}
748
749
7
// Spills revocable memory held by this task's operators and sink.
//
// Returns an InternalError when the owning fragment context is already gone; otherwise
// returns the first error reported by an operator's / the sink's revoke_memory(), or OK.
// Once the fragment context has been locked, the deferred block below always runs on exit
// (including error exits) to account CPU time and notify `spill_context`.
Status PipelineTask::do_revoke_memory(const std::shared_ptr<SpillContext>& spill_context) {
    auto frag_ctx = _fragment_context.lock();
    if (!frag_ctx) {
        return Status::InternalError("Fragment already finished! Query: {}", print_id(_query_id));
    }

    SCOPED_ATTACH_TASK(_state);
    ThreadCpuStopWatch cpu_watch;
    cpu_watch.start();
    Defer finish_guard {[&]() {
        // Charge the CPU time spent revoking to both the task profile and the query.
        const int64_t spent_cpu_time = cpu_watch.elapsed_time();
        _task_cpu_timer->update(spent_cpu_time);
        frag_ctx->get_query_ctx()->resource_ctx()->cpu_context()->update_cpu_cost_ms(
                spent_cpu_time);

        // A task that was woken up early unblocks its dependencies and terminates its
        // operators so that it can be closed right away.
        if (_wake_up_early) {
            unblock_all_dependencies();
            THROW_IF_ERROR(_root->terminate(_state));
            THROW_IF_ERROR(_sink->terminate(_state));
            _eos = true;
        }

        // SpillContext counts pipeline tasks, not operators: report completion exactly once,
        // after every operator and the sink had their chance to revoke.
        if (spill_context) {
            spill_context->on_task_finished();
        }
    }};

    // Operators first: skip those holding less than one spill write batch.
    for (auto& op : _operators) {
        if (op->revocable_mem_size(_state) < SpillFile::MIN_SPILL_WRITE_BATCH_MEM) {
            continue;
        }
        RETURN_IF_ERROR(op->revoke_memory(_state));
    }

    // Then the sink, under the same threshold.
    const bool sink_worth_spilling =
            _sink->revocable_mem_size(_state) >= SpillFile::MIN_SPILL_WRITE_BATCH_MEM;
    if (sink_worth_spilling) {
        RETURN_IF_ERROR(_sink->revoke_memory(_state));
    }
    return Status::OK();
}
793
794
7.81M
bool PipelineTask::_try_to_reserve_memory(const size_t reserve_size, OperatorBase* op) {
795
7.81M
    auto st = thread_context()->thread_mem_tracker_mgr->try_reserve(reserve_size);
796
    // If reserve memory failed and the query is not enable spill, just disable reserve memory(this will enable
797
    // memory hard limit check, and will cancel the query if allocate memory failed) and let it run.
798
7.81M
    if (!st.ok() && !_state->enable_spill()) {
799
2
        LOG(INFO) << print_id(_query_id) << " reserve memory failed due to " << st
800
2
                  << ", and it is not enable spill, disable reserve memory and let it run";
801
2
        _state->get_query_ctx()->resource_ctx()->task_controller()->disable_reserve_memory();
802
2
        return true;
803
2
    }
804
7.81M
    COUNTER_UPDATE(_memory_reserve_times, 1);
805
806
    // Compute total revocable memory across all operators and the sink.
807
7.81M
    size_t total_revocable_mem_size = 0;
808
7.81M
    size_t operator_max_revocable_mem_size = 0;
809
810
7.81M
    if (!st.ok() || _state->enable_force_spill()) {
811
        // Compute total revocable memory across all operators and the sink.
812
4.99k
        total_revocable_mem_size = _sink->revocable_mem_size(_state);
813
4.99k
        operator_max_revocable_mem_size = total_revocable_mem_size;
814
4.99k
        for (auto& cur_op : _operators) {
815
4.99k
            total_revocable_mem_size += cur_op->revocable_mem_size(_state);
816
4.99k
            operator_max_revocable_mem_size =
817
4.99k
                    std::max(cur_op->revocable_mem_size(_state), operator_max_revocable_mem_size);
818
4.99k
        }
819
4.99k
    }
820
821
    // During enable force spill, other operators like scan opeartor will also try to reserve memory and will failed
822
    // here, if not add this check, it will always paused and resumed again.
823
7.81M
    if (st.ok() && _state->enable_force_spill()) {
824
2.97k
        if (operator_max_revocable_mem_size >= _state->spill_min_revocable_mem()) {
825
0
            st = Status::Error<ErrorCode::QUERY_MEMORY_EXCEEDED>(
826
0
                    "force spill and there is an operator has memory "
827
0
                    "size {} exceeds min mem size {}",
828
0
                    PrettyPrinter::print_bytes(operator_max_revocable_mem_size),
829
0
                    PrettyPrinter::print_bytes(_state->spill_min_revocable_mem()));
830
0
        }
831
2.97k
    }
832
833
7.81M
    if (!st.ok()) {
834
2.01k
        COUNTER_UPDATE(_memory_reserve_failed_times, 1);
835
        // build per-operator revocable memory info string for debugging
836
2.01k
        std::string ops_revocable_info;
837
2.01k
        {
838
2.01k
            fmt::memory_buffer buf;
839
2.01k
            for (auto& cur_op : _operators) {
840
2.01k
                fmt::format_to(buf, "{}({})-> ", cur_op->get_name(),
841
2.01k
                               PrettyPrinter::print_bytes(cur_op->revocable_mem_size(_state)));
842
2.01k
            }
843
2.01k
            if (_sink) {
844
2.01k
                fmt::format_to(buf, "{}({}) ", _sink->get_name(),
845
2.01k
                               PrettyPrinter::print_bytes(_sink->revocable_mem_size(_state)));
846
2.01k
            }
847
2.01k
            ops_revocable_info = fmt::to_string(buf);
848
2.01k
        }
849
850
2.01k
        auto debug_msg = fmt::format(
851
2.01k
                "Query: {} , try to reserve: {}, total revocable mem size: {}, failed reason: {}",
852
2.01k
                print_id(_query_id), PrettyPrinter::print_bytes(reserve_size),
853
2.01k
                PrettyPrinter::print_bytes(total_revocable_mem_size), st.to_string());
854
2.01k
        if (!ops_revocable_info.empty()) {
855
2.01k
            debug_msg += fmt::format(", ops_revocable=[{}]", ops_revocable_info);
856
2.01k
        }
857
        // PROCESS_MEMORY_EXCEEDED error msg already contains process_mem_log_str
858
2.01k
        if (!st.is<ErrorCode::PROCESS_MEMORY_EXCEEDED>()) {
859
2.01k
            debug_msg +=
860
2.01k
                    fmt::format(", debug info: {}", GlobalMemoryArbitrator::process_mem_log_str());
861
2.01k
        }
862
2.01k
        LOG(INFO) << debug_msg;
863
2.01k
        ExecEnv::GetInstance()->workload_group_mgr()->add_paused_query(
864
2.01k
                _state->get_query_ctx()->resource_ctx()->shared_from_this(), reserve_size, st);
865
2.01k
        _spilling = true;
866
2.01k
        return false;
867
2.01k
    }
868
7.81M
    return true;
869
7.81M
}
870
871
922k
void PipelineTask::stop_if_finished() {
872
922k
    auto fragment = _fragment_context.lock();
873
922k
    if (!fragment) {
874
0
        return;
875
0
    }
876
922k
    SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(fragment->get_query_ctx()->query_mem_tracker());
877
922k
    if (auto sink = _sink) {
878
742k
        if (sink->is_finished(_state)) {
879
1.81k
            set_wake_up_early();
880
1.81k
            unblock_all_dependencies();
881
1.81k
        }
882
742k
    }
883
922k
}
884
885
2.04M
// Moves the task to State::FINALIZED and drops everything it holds: shared states, the
// working block, the operators, the sink and the pipeline.
//
// Returns OK immediately when the fragment context is already gone, and propagates the error
// from _state_transition() when the transition is rejected (nothing is released then).
Status PipelineTask::finalize() {
    auto frag_ctx = _fragment_context.lock();
    if (frag_ctx == nullptr) {
        return Status::OK();
    }
    SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(frag_ctx->get_query_ctx()->query_mem_tracker());
    RETURN_IF_ERROR(_state_transition(State::FINALIZED));

    // The members below are released while holding the dependency lock.
    std::unique_lock<std::mutex> dependency_guard(_dependency_lock);
    // Shared states first ...
    _sink_shared_state.reset();
    _op_shared_states.clear();
    _shared_state_map.clear();
    // ... then the block, the operator chain, the sink and finally the pipeline itself.
    _block.reset();
    _operators.clear();
    _sink.reset();
    _pipeline.reset();
    return Status::OK();
}
902
903
4.08M
// Closes every operator of the task and, when `close_sink` is true, the sink as well.
//
// @param exec_status status forwarded to the sink's close().
// @param close_sink  when true the sink is closed too, the profile is refreshed (if the task
//                    was opened) and the task moves to State::FINISHED.
// @return the first non-OK status produced by the sink / operators; a failed transition to
//         FINISHED is returned instead when it happens.
Status PipelineTask::close(Status exec_status, bool close_sink) {
    int64_t elapsed_close_ns = 0;
    Status first_error;
    {
        SCOPED_RAW_TIMER(&elapsed_close_ns);
        if (close_sink) {
            first_error = _sink->close(_state, exec_status);
        }
        // Every operator is closed even after a failure; only the first error is kept.
        for (auto& op : _operators) {
            auto op_status = op->close(_state);
            if (!op_status.ok() && first_error.ok()) {
                first_error = std::move(op_status);
            }
        }
    }

    // The timers are only updated for a task that was actually opened.
    if (_opened) {
        COUNTER_UPDATE(_close_timer, elapsed_close_ns);
        COUNTER_UPDATE(_task_profile->total_time_counter(), elapsed_close_ns);
    }

    if (close_sink && _opened) {
        _task_profile->add_info_string("WakeUpEarly", std::to_string(_wake_up_early.load()));
        _fresh_profile_counter();
    }

    if (close_sink) {
        RETURN_IF_ERROR(_state_transition(State::FINISHED));
    }
    return first_error;
}
933
934
16.8k
std::string PipelineTask::debug_string() {
935
16.8k
    fmt::memory_buffer debug_string_buffer;
936
937
16.8k
    fmt::format_to(debug_string_buffer, "QueryId: {}\n", print_id(_query_id));
938
16.8k
    fmt::format_to(debug_string_buffer, "InstanceId: {}\n",
939
16.8k
                   print_id(_state->fragment_instance_id()));
940
941
16.8k
    fmt::format_to(debug_string_buffer,
942
16.8k
                   "PipelineTask[id = {}, open = {}, eos = {}, state = {}, dry run = "
943
16.8k
                   "{}, _wake_up_early = {}, _wake_up_by = {}, time elapsed since last state "
944
16.8k
                   "changing = {}s, spilling = {}, is running = {}]",
945
16.8k
                   _index, _opened, _eos, _to_string(_exec_state), _dry_run, _wake_up_early.load(),
946
16.8k
                   _wake_by, _state_change_watcher.elapsed_time() / NANOS_PER_SEC, _spilling,
947
16.8k
                   is_running());
948
16.8k
    std::unique_lock<std::mutex> lc(_dependency_lock);
949
16.8k
    auto* cur_blocked_dep = _blocked_dep;
950
16.8k
    auto fragment = _fragment_context.lock();
951
16.8k
    if (is_finalized() || !fragment) {
952
142
        fmt::format_to(debug_string_buffer, " pipeline name = {}", _pipeline_name);
953
142
        return fmt::to_string(debug_string_buffer);
954
142
    }
955
16.7k
    auto elapsed = fragment->elapsed_time() / NANOS_PER_SEC;
956
16.7k
    fmt::format_to(debug_string_buffer, " elapse time = {}s, block dependency = [{}]\n", elapsed,
957
16.7k
                   cur_blocked_dep && !is_finalized() ? cur_blocked_dep->debug_string() : "NULL");
958
959
16.7k
    if (_state && _state->local_runtime_filter_mgr()) {
960
582
        fmt::format_to(debug_string_buffer, "local_runtime_filter_mgr: [{}]\n",
961
582
                       _state->local_runtime_filter_mgr()->debug_string());
962
582
    }
963
964
16.7k
    fmt::format_to(debug_string_buffer, "operators: ");
965
33.7k
    for (size_t i = 0; i < _operators.size(); i++) {
966
17.0k
        fmt::format_to(debug_string_buffer, "\n{}",
967
17.0k
                       _opened && !is_finalized()
968
17.0k
                               ? _operators[i]->debug_string(_state, cast_set<int>(i))
969
17.0k
                               : _operators[i]->debug_string(cast_set<int>(i)));
970
17.0k
    }
971
16.7k
    fmt::format_to(debug_string_buffer, "\n{}\n",
972
16.7k
                   _opened && !is_finalized()
973
16.7k
                           ? _sink->debug_string(_state, cast_set<int>(_operators.size()))
974
16.7k
                           : _sink->debug_string(cast_set<int>(_operators.size())));
975
976
16.7k
    fmt::format_to(debug_string_buffer, "\nRead Dependency Information: \n");
977
978
16.7k
    size_t i = 0;
979
33.5k
    for (; i < _read_dependencies.size(); i++) {
980
33.4k
        for (size_t j = 0; j < _read_dependencies[i].size(); j++) {
981
16.6k
            fmt::format_to(debug_string_buffer, "{}. {}\n", i,
982
16.6k
                           _read_dependencies[i][j]->debug_string(cast_set<int>(i) + 1));
983
16.6k
        }
984
16.8k
    }
985
986
16.7k
    fmt::format_to(debug_string_buffer, "{}. {}\n", i,
987
16.7k
                   _memory_sufficient_dependency->debug_string(cast_set<int>(i++)));
988
989
16.7k
    fmt::format_to(debug_string_buffer, "\nWrite Dependency Information: \n");
990
33.2k
    for (size_t j = 0; j < _write_dependencies.size(); j++, i++) {
991
16.5k
        fmt::format_to(debug_string_buffer, "{}. {}\n", i,
992
16.5k
                       _write_dependencies[j]->debug_string(cast_set<int>(j) + 1));
993
16.5k
    }
994
995
16.7k
    fmt::format_to(debug_string_buffer, "\nExecution Dependency Information: \n");
996
50.3k
    for (size_t j = 0; j < _execution_dependencies.size(); j++, i++) {
997
33.6k
        fmt::format_to(debug_string_buffer, "{}. {}\n", i,
998
33.6k
                       _execution_dependencies[j]->debug_string(cast_set<int>(i) + 1));
999
33.6k
    }
1000
1001
16.7k
    fmt::format_to(debug_string_buffer, "Finish Dependency Information: \n");
1002
49.3k
    for (size_t j = 0; j < _finish_dependencies.size(); j++, i++) {
1003
32.6k
        fmt::format_to(debug_string_buffer, "{}. {}\n", i,
1004
32.6k
                       _finish_dependencies[j]->debug_string(cast_set<int>(i) + 1));
1005
32.6k
    }
1006
16.7k
    return fmt::to_string(debug_string_buffer);
1007
16.8k
}
1008
1009
2
size_t PipelineTask::get_revocable_size() const {
1010
2
    if (!_opened || is_finalized() || _running || (_eos && !_spilling)) {
1011
0
        return 0;
1012
0
    }
1013
1014
    // Sum revocable memory from every operator in the pipeline + the sink.
1015
    // Each operator reports only its own revocable memory (no child recursion).
1016
2
    size_t total = _sink->revocable_mem_size(_state);
1017
2
    for (const auto& op : _operators) {
1018
2
        total += op->revocable_mem_size(_state);
1019
2
    }
1020
2
    return total;
1021
2
}
1022
1023
3
// Entry point used to ask this task to spill.
//
// When the task is finalized, or holds less revocable memory than one spill write batch,
// `spill_context` is notified right away and OK is returned. Otherwise a RevokableTask
// wrapping this task is submitted to the query's pipeline scheduler; a submit failure is
// returned to the caller.
Status PipelineTask::revoke_memory(const std::shared_ptr<SpillContext>& spill_context) {
    DCHECK(spill_context);
    if (is_finalized()) {
        spill_context->on_task_finished();
        VLOG_DEBUG << "Query: " << print_id(_state->query_id())
                   << ", task: " << static_cast<void*>(this) << " finalized";
        return Status::OK();
    }

    const auto revocable_size = get_revocable_size();
    if (revocable_size < SpillFile::MIN_SPILL_WRITE_BATCH_MEM) {
        // Nothing worth spilling: report this task as done immediately.
        spill_context->on_task_finished();
        VLOG_DEBUG << "Query: " << print_id(_state->query_id())
                   << ", task: " << static_cast<void*>(this)
                   << " has not enough data to revoke: " << revocable_size;
        return Status::OK();
    }

    // Submit a revocable task to run; its run method will call revoke memory. The underlying
    // pipeline task is still blocked at this point.
    auto spill_job = std::make_shared<RevokableTask>(shared_from_this(), spill_context);
    RETURN_IF_ERROR(_state->get_query_ctx()->get_pipe_exec_scheduler()->submit(spill_job));
    return Status::OK();
}
1045
1046
5.35M
// Called by a dependency to make the task it was blocking runnable again.
//
// Clears `_blocked_dep`, requests the RUNNABLE transition and resubmits the task to the
// scheduler. Any error on the way cancels the fragment (when it is still alive) instead of
// being returned.
void PipelineTask::wake_up(Dependency* dep, std::unique_lock<std::mutex>& /* dep_lock */) {
    // Errors cannot be returned from here, so they are turned into a fragment cancellation.
    auto cancel_fragment_on_failure = [&](const Status& result) {
        if (result.ok()) {
            return;
        }
        if (auto frag = fragment_context().lock()) {
            frag->cancel(result);
        }
    };

    // call by dependency
    DCHECK_EQ(_blocked_dep, dep) << "dep : " << dep->debug_string(0) << "task: " << debug_string();
    _blocked_dep = nullptr;
    auto self = std::dynamic_pointer_cast<PipelineTask>(shared_from_this());
    cancel_fragment_on_failure(_state_transition(PipelineTask::State::RUNNABLE));

    // Under _wake_up_early, FINISHED/FINALIZED → RUNNABLE is a legal no-op
    // (_state_transition returns OK but the state stays unchanged). A terminated task must
    // not be resubmitted: finalize() clears _sink/_operators, and submit() → is_blockable()
    // would dereference them → SIGSEGV.
    if (_exec_state == State::FINISHED || _exec_state == State::FINALIZED) {
        return;
    }

    // Only resubmit while the fragment context is still alive.
    if (auto frag_ctx = _fragment_context.lock(); frag_ctx) {
        cancel_fragment_on_failure(
                _state->get_query_ctx()->get_pipe_exec_scheduler()->submit(self));
    }
}
1070
1071
16.7M
// Moves the task to `new_state` if the current state is a legal predecessor.
//
// The table of legal transitions depends on `_wake_up_early`. An illegal transition yields an
// InternalError that embeds debug_string(). On a real move the state-change stopwatch is
// restarted (only when the state actually differs) and the profile info strings are updated.
Status PipelineTask::_state_transition(State new_state) {
    const auto& transitions =
            _wake_up_early ? WAKE_UP_EARLY_LEGAL_STATE_TRANSITION : LEGAL_STATE_TRANSITION;
    if (!transitions[static_cast<int>(new_state)].contains(_exec_state)) {
        return Status::InternalError(
                "Task state transition from {} to {} is not allowed! Task info: {}",
                _to_string(_exec_state), _to_string(new_state), debug_string());
    }

    // FINISHED/FINALIZED → RUNNABLE is legal under wake_up_early (a delayed wake_up() arriving
    // after the task already terminated), but the state must not actually move backwards and
    // the profile must not be updated (it would misleadingly show RUNNABLE for a terminated
    // task). Such a request is accepted and otherwise ignored.
    const bool already_terminated =
            _exec_state == State::FINISHED || _exec_state == State::FINALIZED;
    if (already_terminated && new_state == State::RUNNABLE) {
        return Status::OK();
    }

    if (_exec_state != new_state) {
        _state_change_watcher.reset();
        _state_change_watcher.start();
    }
    _task_profile->add_info_string("TaskState", _to_string(new_state));
    _task_profile->add_info_string("BlockedByDependency",
                                   _blocked_dep ? _blocked_dep->name() : "");
    _exec_state = new_state;
    return Status::OK();
}
1096
1097
} // namespace doris