Coverage Report

Created: 2026-06-30 19:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/pipeline/pipeline_task.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "pipeline_task.h"
19
20
#include <fmt/core.h>
21
#include <fmt/format.h>
22
#include <gen_cpp/Metrics_types.h>
23
#include <glog/logging.h>
24
25
#include <algorithm>
26
#include <memory>
27
#include <ostream>
28
#include <vector>
29
30
#include "common/logging.h"
31
#include "common/status.h"
32
#include "pipeline/dependency.h"
33
#include "pipeline/exec/operator.h"
34
#include "pipeline/exec/scan_operator.h"
35
#include "pipeline/pipeline.h"
36
#include "pipeline/pipeline_fragment_context.h"
37
#include "pipeline/task_queue.h"
38
#include "pipeline/task_scheduler.h"
39
#include "revokable_task.h"
40
#include "runtime/descriptors.h"
41
#include "runtime/exec_env.h"
42
#include "runtime/query_context.h"
43
#include "runtime/thread_context.h"
44
#include "runtime/workload_group/workload_group_manager.h"
45
#include "util/defer_op.h"
46
#include "util/mem_info.h"
47
#include "util/runtime_profile.h"
48
#include "util/uid_util.h"
49
#include "vec/core/block.h"
50
#include "vec/spill/spill_stream.h"
51
52
namespace doris {
53
class RuntimeState;
54
} // namespace doris
55
56
namespace doris::pipeline {
57
#include "common/compile_check_begin.h"
58
59
// Builds a task for `pipeline`.
//
// The operator list, source (front operator), root (back operator) and sink are taken from the
// pipeline. `shared_state_map` is moved into the task. In BE_TEST builds `fragment_context` may be
// null, in which case a default-constructed query id is used and no query memory tracker is
// attached.
PipelineTask::PipelineTask(PipelinePtr& pipeline, uint32_t task_id, RuntimeState* state,
                           std::shared_ptr<PipelineFragmentContext> fragment_context,
                           RuntimeProfile* parent_profile,
                           std::map<int, std::pair<std::shared_ptr<BasicSharedState>,
                                                   std::vector<std::shared_ptr<Dependency>>>>
                                   shared_state_map,
                           int task_idx)
        :
#ifdef BE_TEST
          _query_id(fragment_context ? fragment_context->get_query_id() : TUniqueId()),
#else
          _query_id(fragment_context->get_query_id()),
#endif
          _index(task_id),
          _pipeline(pipeline),
          _opened(false),
          _state(state),
          _fragment_context(fragment_context),
          _parent_profile(parent_profile),
          _operators(pipeline->operators()),
          _source(_operators.front().get()),
          _root(_operators.back().get()),
          _sink(pipeline->sink_shared_pointer()),
          _shared_state_map(std::move(shared_state_map)),
          _task_idx(task_idx),
          _memory_sufficient_dependency(state->get_query_ctx()->get_memory_sufficient_dependency()),
          _pipeline_name(_pipeline->name()) {
#ifndef BE_TEST
    _query_mem_tracker = fragment_context->get_query_ctx()->query_mem_tracker();
#endif
    // Every task waits on the query-level execution dependency before it starts.
    _execution_dependencies.push_back(state->get_query_ctx()->get_execution_dependency());

    // If the map has no entry for the sink's first destination id, ask the sink to create a
    // shared state itself; the sink may return null, in which case nothing is stored.
    const bool has_mapped_state = _shared_state_map.contains(_sink->dests_id().front());
    if (!has_mapped_state) {
        if (auto created = _sink->create_shared_state()) {
            _sink_shared_state = created;
        }
    }
}
97
98
72.1k
// Releases everything the task owns, in a fixed order.
//
// A task is also held by the task queue (https://github.com/apache/doris/pull/49753), so it may be
// the last object to be destructed. Because it holds operators, shared states, etc., that memory
// is released manually here. Outside BE_TEST builds, when a query memory tracker is set, the
// release runs with the thread memory tracker switched to it.
PipelineTask::~PipelineTask() {
    const auto release_owned_objects = [this]() {
        _shared_state_map.clear();
        _sink_shared_state.reset();
        _op_shared_states.clear();
        _sink.reset();
        _operators.clear();
        _block.reset();
        _pipeline.reset();
    };
#ifndef BE_TEST
    if (_query_mem_tracker) {
        SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(_query_mem_tracker);
        release_owned_objects();
        return;
    }
#endif
    release_owned_objects();
}
121
122
// Sets up the task before its first run.
//
// Steps, in order: create the task profile, set up the sink's local state, set up each operator's
// local state from the last operator back to the first (each operator's profile becomes the parent
// profile handed to the next one processed), append the source operator's execution dependencies
// to the task's own list, then move the task to RUNNABLE.
//
// Returns an error if any local-state setup fails, if the fragment context has already expired, or
// (the query's exec status) if the query is already cancelled.
Status PipelineTask::prepare(const std::vector<TScanRangeParams>& scan_range, const int sender_id,
                             const TDataSink& tsink) {
    DCHECK(_sink);
    _init_profile();
    SCOPED_TIMER(_task_profile->total_time_counter());
    SCOPED_CPU_TIMER(_task_cpu_timer);
    SCOPED_TIMER(_prepare_timer);
    DBUG_EXECUTE_IF("fault_inject::PipelineXTask::prepare", {
        return Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task prepare failed");
    });

    // Sink local state first.
    {
        LocalSinkStateInfo sink_info {_task_idx,         _task_profile.get(),
                                      sender_id,         get_sink_shared_state().get(),
                                      _shared_state_map, tsink};
        RETURN_IF_ERROR(_sink->setup_local_state(_state, sink_info));
    }

    _scan_ranges = scan_range;

    // Walk the operators backwards so that each one is parented under the profile of the operator
    // (or sink) that was set up just before it.
    auto* downstream_profile = _state->get_sink_local_state()->operator_profile();
    for (int idx = cast_set<int>(_operators.size() - 1); idx >= 0; --idx) {
        auto& current_op = _operators[idx];
        LocalStateInfo op_info {downstream_profile, _scan_ranges,
                                get_op_shared_state(current_op->operator_id()), _shared_state_map,
                                _task_idx};
        RETURN_IF_ERROR(current_op->setup_local_state(_state, op_info));
        downstream_profile = _state->get_local_state(current_op->operator_id())->operator_profile();
    }

    // Add the source operator's execution dependencies to the task-level list under the lock.
    {
        const auto& source_deps =
                _state->get_local_state(_source->operator_id())->execution_dependencies();
        std::unique_lock<std::mutex> lc(_dependency_lock);
        std::copy(source_deps.begin(), source_deps.end(),
                  std::inserter(_execution_dependencies, _execution_dependencies.end()));
    }

    if (auto fragment = _fragment_context.lock(); !fragment) {
        return Status::InternalError("Fragment already finished! Query: {}", print_id(_query_id));
    } else if (fragment->get_query_ctx()->is_cancelled()) {
        terminate();
        return fragment->get_query_ctx()->exec_status();
    }

    _block = doris::vectorized::Block::create_unique();
    return _state_transition(State::RUNNABLE);
}
169
170
15
// Collects the read / write / finish dependencies from the operators' and sink's local states and
// publishes them to the task under `_dependency_lock`.
//
// Read dependencies are stored per operator (same index as in `_operators`); write dependencies
// come from the sink; finish dependencies come from every local state that has one. Returns the
// lookup error if an operator's local state cannot be found.
Status PipelineTask::_extract_dependencies() {
    std::vector<std::vector<Dependency*>> read_deps(_operators.size());
    std::vector<Dependency*> write_deps;
    std::vector<Dependency*> finish_deps;

    for (size_t idx = 0; idx < _operators.size(); ++idx) {
        auto lookup = _state->get_local_state_result(_operators[idx]->operator_id());
        if (!lookup) {
            return lookup.error();
        }
        auto* op_state = lookup.value();
        read_deps[idx] = op_state->dependencies();
        if (auto* op_finish_dep = op_state->finishdependency()) {
            finish_deps.push_back(op_finish_dep);
        }
    }

    DBUG_EXECUTE_IF("fault_inject::PipelineXTask::_extract_dependencies", {
        return Status::Error<INTERNAL_ERROR>(
                "fault_inject pipeline_task _extract_dependencies failed");
    });

    {
        auto* sink_state = _state->get_sink_local_state();
        write_deps = sink_state->dependencies();
        if (auto* sink_finish_dep = sink_state->finishdependency()) {
            finish_deps.push_back(sink_finish_dep);
        }
    }

    // Publish all three lists together while holding the lock; the previous contents end up in
    // the locals and are destroyed after the lock is released.
    {
        std::unique_lock<std::mutex> lc(_dependency_lock);
        _read_dependencies.swap(read_deps);
        _write_dependencies.swap(write_deps);
        _finish_dependencies.swap(finish_deps);
    }
    return Status::OK();
}
210
211
6
bool PipelineTask::inject_shared_state(std::shared_ptr<BasicSharedState> shared_state) {
212
6
    if (!shared_state) {
213
1
        return false;
214
1
    }
215
    // Shared state is created by upstream task's sink operator and shared by source operator of
216
    // this task.
217
7
    for (auto& op : _operators) {
218
7
        if (shared_state->related_op_ids.contains(op->operator_id())) {
219
3
            _op_shared_states.insert({op->operator_id(), shared_state});
220
3
            return true;
221
3
        }
222
7
    }
223
    // Shared state is created by the first sink operator and shared by sink operator of this task.
224
    // For example, Set operations.
225
2
    if (shared_state->related_op_ids.contains(_sink->dests_id().front())) {
226
1
        DCHECK_EQ(_sink_shared_state, nullptr)
227
0
                << " Sink: " << _sink->get_name() << " dest id: " << _sink->dests_id().front();
228
1
        _sink_shared_state = shared_state;
229
1
        return true;
230
1
    }
231
1
    return false;
232
2
}
233
234
19
void PipelineTask::_init_profile() {
235
19
    _task_profile = std::make_unique<RuntimeProfile>(fmt::format("PipelineTask(index={})", _index));
236
19
    _parent_profile->add_child(_task_profile.get(), true, nullptr);
237
19
    _task_cpu_timer = ADD_TIMER(_task_profile, "TaskCpuTime");
238
239
19
    static const char* exec_time = "ExecuteTime";
240
19
    _exec_timer = ADD_TIMER(_task_profile, exec_time);
241
19
    _prepare_timer = ADD_CHILD_TIMER(_task_profile, "PrepareTime", exec_time);
242
19
    _open_timer = ADD_CHILD_TIMER(_task_profile, "OpenTime", exec_time);
243
19
    _get_block_timer = ADD_CHILD_TIMER(_task_profile, "GetBlockTime", exec_time);
244
19
    _get_block_counter = ADD_COUNTER(_task_profile, "GetBlockCounter", TUnit::UNIT);
245
19
    _sink_timer = ADD_CHILD_TIMER(_task_profile, "SinkTime", exec_time);
246
19
    _close_timer = ADD_CHILD_TIMER(_task_profile, "CloseTime", exec_time);
247
248
19
    _wait_worker_timer = ADD_TIMER_WITH_LEVEL(_task_profile, "WaitWorkerTime", 1);
249
250
19
    _schedule_counts = ADD_COUNTER(_task_profile, "NumScheduleTimes", TUnit::UNIT);
251
19
    _yield_counts = ADD_COUNTER(_task_profile, "NumYieldTimes", TUnit::UNIT);
252
19
    _core_change_times = ADD_COUNTER(_task_profile, "CoreChangeTimes", TUnit::UNIT);
253
19
    _memory_reserve_times = ADD_COUNTER(_task_profile, "MemoryReserveTimes", TUnit::UNIT);
254
19
    _memory_reserve_failed_times =
255
19
            ADD_COUNTER(_task_profile, "MemoryReserveFailedTimes", TUnit::UNIT);
256
19
}
257
258
6
void PipelineTask::_fresh_profile_counter() {
259
6
    COUNTER_SET(_schedule_counts, (int64_t)_schedule_time);
260
6
    COUNTER_SET(_wait_worker_timer, (int64_t)_wait_worker_watcher.elapsed_time());
261
6
}
262
263
14
// Opens the local state of every operator (in `_operators` order) and then of the sink, extracts
// the task's dependencies, and marks the task as opened. Also records whether the sink asks for a
// dry run. Stops at, and returns, the first error.
Status PipelineTask::_open() {
    SCOPED_TIMER(_task_profile->total_time_counter());
    SCOPED_CPU_TIMER(_task_cpu_timer);
    SCOPED_TIMER(_open_timer);

    _dry_run = _sink->should_dry_run(_state);

    for (auto& current_op : _operators) {
        auto* op_state = _state->get_local_state(current_op->operator_id());
        RETURN_IF_ERROR(op_state->open(_state));
    }
    RETURN_IF_ERROR(_state->get_sink_local_state()->open(_state));

    // Dependencies are collected only after every local state has been opened.
    RETURN_IF_ERROR(_extract_dependencies());

    DBUG_EXECUTE_IF("fault_inject::PipelineXTask::open", {
        return Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task open failed");
    });

    _opened = true;
    return Status::OK();
}
280
281
68
// Calls `prepare` on the local state of every operator (in `_operators` order) and then on the
// sink's local state. Stops at, and returns, the first error.
Status PipelineTask::_prepare() {
    SCOPED_TIMER(_task_profile->total_time_counter());
    SCOPED_CPU_TIMER(_task_cpu_timer);

    for (auto& current_op : _operators) {
        auto* op_state = _state->get_local_state(current_op->operator_id());
        RETURN_IF_ERROR(op_state->prepare(_state));
    }
    RETURN_IF_ERROR(_state->get_sink_local_state()->prepare(_state));
    return Status::OK();
}
290
291
45
bool PipelineTask::_wait_to_start() {
292
    // Before task starting, we should make sure
293
    // 1. Execution dependency is ready (which is controlled by FE 2-phase commit)
294
    // 2. Runtime filter dependencies are ready
295
    // 3. All tablets are loaded into local storage
296
45
    return std::any_of(
297
45
            _execution_dependencies.begin(), _execution_dependencies.end(),
298
62
            [&](Dependency* dep) -> bool { return dep->is_blocked_by(shared_from_this()); });
299
45
}
300
301
18
bool PipelineTask::_is_pending_finish() {
302
    // Spilling may be in progress if eos is true.
303
25
    return std::ranges::any_of(_finish_dependencies, [&](Dependency* dep) -> bool {
304
25
        return dep->is_blocked_by(shared_from_this());
305
25
    });
306
18
}
307
308
0
bool PipelineTask::is_blockable() const {
309
    // Before task starting, we should make sure
310
    // 1. Execution dependency is ready (which is controlled by FE 2-phase commit)
311
    // 2. Runtime filter dependencies are ready
312
    // 3. All tablets are loaded into local storage
313
314
0
    if (_state->enable_fuzzy_blockable_task()) {
315
0
        if ((_schedule_time + _task_idx) % 2 == 0) {
316
0
            return true;
317
0
        }
318
0
    }
319
320
0
    return std::ranges::any_of(_operators,
321
0
                               [&](OperatorPtr op) -> bool { return op->is_blockable(_state); }) ||
322
0
           _sink->is_blockable(_state);
323
0
}
324
325
11
void PipelineTask::_stop_accepting_submit() {
326
11
    std::unique_lock<std::mutex> lock(_blockable_check_lock);
327
11
    _accept_submit = false;
328
11
}
329
330
543k
bool PipelineTask::_is_blocked() {
331
    // `_dry_run = true` means we do not need data from source operator.
332
543k
    if (!_dry_run) {
333
1.08M
        for (int i = cast_set<int>(_read_dependencies.size() - 1); i >= 0; i--) {
334
            // `_read_dependencies` is organized according to operators. For each operator, running condition is met iff all dependencies are ready.
335
543k
            for (auto* dep : _read_dependencies[i]) {
336
543k
                if (dep->is_blocked_by(shared_from_this())) {
337
15
                    return true;
338
15
                }
339
543k
            }
340
            // If all dependencies are ready for this operator, we can execute this task if no datum is needed from upstream operators.
341
543k
            if (!_operators[i]->need_more_input_data(_state)) {
342
2
                break;
343
2
            }
344
543k
        }
345
543k
    }
346
543k
    return _memory_sufficient_dependency->is_blocked_by(shared_from_this()) ||
347
543k
           std::ranges::any_of(_write_dependencies, [&](Dependency* dep) -> bool {
348
543k
               return dep->is_blocked_by(shared_from_this());
349
543k
           });
350
543k
}
351
352
7
void PipelineTask::terminate() {
353
    // We use a lock to assure all dependencies are not deconstructed here.
354
7
    std::unique_lock<std::mutex> lc(_dependency_lock);
355
7
    auto fragment = _fragment_context.lock();
356
7
    if (!is_finalized() && fragment) {
357
7
        try {
358
7
            DCHECK(_wake_up_early || fragment->is_canceled());
359
7
            std::ranges::for_each(_write_dependencies,
360
7
                                  [&](Dependency* dep) { dep->set_always_ready(); });
361
7
            std::ranges::for_each(_finish_dependencies,
362
14
                                  [&](Dependency* dep) { dep->set_always_ready(); });
363
7
            std::ranges::for_each(_read_dependencies, [&](std::vector<Dependency*>& deps) {
364
7
                std::ranges::for_each(deps, [&](Dependency* dep) { dep->set_always_ready(); });
365
7
            });
366
            // All `_execution_deps` will never be set blocking from ready. So we just set ready here.
367
7
            std::ranges::for_each(_execution_dependencies,
368
14
                                  [&](Dependency* dep) { dep->set_ready(); });
369
7
            _memory_sufficient_dependency->set_ready();
370
7
        } catch (const doris::Exception& e) {
371
0
            LOG(WARNING) << "Terminate failed: " << e.code() << ", " << e.to_string();
372
0
        }
373
7
    }
374
7
}
375
376
/**
377
 * `_eos` indicates whether the execution phase is done. `done` indicates whether we could close
378
 * this task.
379
 *
380
 * For example,
381
 * 1. if `_eos` is false which means we should continue to get next block so we cannot close (e.g.
382
 *    `done` is false)
383
 * 2. if `_eos` is true which means all blocks from source are exhausted but `_is_pending_finish()`
384
 *    is true which means we should wait for a pending dependency ready (maybe a running rpc), so we
385
 *    cannot close (e.g. `done` is false)
386
 * 3. if `_eos` is true which means all blocks from source are exhausted and `_is_pending_finish()`
387
 *    is false which means we can close immediately (e.g. `done` is true)
388
 * @param done
389
 * @return
390
 */
391
39
Status PipelineTask::execute(bool* done) {
392
39
    if (_exec_state != State::RUNNABLE || _blocked_dep != nullptr) [[unlikely]] {
393
1
#ifdef BE_TEST
394
1
        return Status::InternalError("Pipeline task is not runnable! Task info: {}",
395
1
                                     debug_string());
396
#else
397
        return Status::FatalError("Pipeline task is not runnable! Task info: {}", debug_string());
398
#endif
399
1
    }
400
401
38
    auto fragment_context = _fragment_context.lock();
402
38
    if (!fragment_context) {
403
0
        return Status::InternalError("Fragment already finished! Query: {}", print_id(_query_id));
404
0
    }
405
38
    int64_t time_spent = 0;
406
38
    ThreadCpuStopWatch cpu_time_stop_watch;
407
38
    cpu_time_stop_watch.start();
408
38
    SCOPED_ATTACH_TASK(_state);
409
38
    Defer running_defer {[&]() {
410
38
        int64_t delta_cpu_time = cpu_time_stop_watch.elapsed_time();
411
38
        _task_cpu_timer->update(delta_cpu_time);
412
38
        fragment_context->get_query_ctx()->resource_ctx()->cpu_context()->update_cpu_cost_ms(
413
38
                delta_cpu_time);
414
415
        // If task is woke up early, we should terminate all operators, and this task could be closed immediately.
416
38
        if (_wake_up_early) {
417
3
            _eos = true;
418
3
            *done = true;
419
35
        } else if (_eos && !_spilling &&
420
35
                   (fragment_context->is_canceled() || !_is_pending_finish())) {
421
            // Debug point for testing the race condition fix: inject set_wake_up_early() +
422
            // terminate() here to simulate Thread B writing A then B between Thread A's two
423
            // reads of _wake_up_early.
424
11
            DBUG_EXECUTE_IF("PipelineTask::execute.wake_up_early_in_else_if", {
425
11
                set_wake_up_early();
426
11
                terminate();
427
11
            });
428
11
            *done = true;
429
11
        }
430
431
        // NOTE: The terminate() call is intentionally placed AFTER the _is_pending_finish() check
432
        // above, not before. This ordering is critical to avoid a race condition:
433
        //
434
        // Pipeline::make_all_runnable() writes in this order:
435
        //   (A) set_wake_up_early()  ->  (B) terminate() [sets finish_dep._always_ready]
436
        //
437
        // If we checked _wake_up_early (A) before _is_pending_finish() (B), there would be a
438
        // window where Thread A reads _wake_up_early=false, then Thread B writes both A and B,
439
        // then Thread A reads _is_pending_finish()=false (due to _always_ready). Thread A would
440
        // then set *done=true without ever calling operator terminate(), causing close() to run
441
        // on operators that were never properly terminated (e.g. RuntimeFilterProducer still in
442
        // WAITING_FOR_SYNCED_SIZE state when insert() is called).
443
        //
444
        // By reading _is_pending_finish() (B) before the second read of _wake_up_early (A),
445
        // if Thread A observes B's effect (_always_ready=true), it is guaranteed to also observe
446
        // A's effect (_wake_up_early=true) on this second read, ensuring terminate() is called.
447
38
        if (_wake_up_early) {
448
4
            terminate();
449
4
            THROW_IF_ERROR(_root->terminate(_state));
450
4
            THROW_IF_ERROR(_sink->terminate(_state));
451
4
        }
452
38
    }};
453
38
    const auto query_id = _state->query_id();
454
    // If this task is already EOS and block is empty (which means we already output all blocks),
455
    // just return here.
456
38
    if (_eos && !_spilling) {
457
3
        return Status::OK();
458
3
    }
459
    // If this task is blocked by a spilling request and waken up immediately, the spilling
460
    // dependency will not block this task and we should just run here.
461
35
    if (!_block->empty()) {
462
0
        LOG(INFO) << "Query: " << print_id(query_id) << " has pending block, size: "
463
0
                  << PrettyPrinter::print_bytes(_block->allocated_bytes());
464
0
        DCHECK(_spilling);
465
0
    }
466
467
35
    SCOPED_TIMER(_task_profile->total_time_counter());
468
35
    SCOPED_TIMER(_exec_timer);
469
470
35
    if (!_wake_up_early) {
471
35
        RETURN_IF_ERROR(_prepare());
472
35
    }
473
35
    DBUG_EXECUTE_IF("fault_inject::PipelineXTask::execute", {
474
35
        Status status = Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task execute failed");
475
35
        return status;
476
35
    });
477
    // `_wake_up_early` must be after `_wait_to_start()`
478
35
    if (_wait_to_start() || _wake_up_early) {
479
2
        return Status::OK();
480
2
    }
481
33
    RETURN_IF_ERROR(_prepare());
482
483
    // The status must be runnable
484
33
    if (!_opened && !fragment_context->is_canceled()) {
485
13
        DBUG_EXECUTE_IF("PipelineTask::execute.open_sleep", {
486
13
            auto required_pipeline_id =
487
13
                    DebugPoints::instance()->get_debug_param_or_default<int32_t>(
488
13
                            "PipelineTask::execute.open_sleep", "pipeline_id", -1);
489
13
            auto required_task_id = DebugPoints::instance()->get_debug_param_or_default<int32_t>(
490
13
                    "PipelineTask::execute.open_sleep", "task_id", -1);
491
13
            if (required_pipeline_id == pipeline_id() && required_task_id == task_id()) {
492
13
                LOG(WARNING) << "PipelineTask::execute.open_sleep sleep 5s";
493
13
                sleep(5);
494
13
            }
495
13
        });
496
497
13
        SCOPED_RAW_TIMER(&time_spent);
498
13
        RETURN_IF_ERROR(_open());
499
13
    }
500
501
543k
    while (!fragment_context->is_canceled()) {
502
543k
        SCOPED_RAW_TIMER(&time_spent);
503
543k
        Defer defer {[&]() {
504
            // If this run is pended by a spilling request, the block will be output in next run.
505
543k
            if (!_spilling) {
506
541k
                _block->clear_column_data(_root->row_desc().num_materialized_slots());
507
541k
            }
508
543k
        }};
509
        // `_wake_up_early` must be after `_is_blocked()`
510
543k
        if (_is_blocked() || _wake_up_early) {
511
18
            return Status::OK();
512
18
        }
513
514
        /// When a task is cancelled,
515
        /// its blocking state will be cleared and it will transition to a ready state (though it is not truly ready).
516
        /// Here, checking whether it is cancelled to prevent tasks in a blocking state from being re-executed.
517
543k
        if (fragment_context->is_canceled()) {
518
0
            break;
519
0
        }
520
521
543k
        if (time_spent > _exec_time_slice) {
522
4
            COUNTER_UPDATE(_yield_counts, 1);
523
4
            break;
524
4
        }
525
543k
        auto* block = _block.get();
526
527
543k
        DBUG_EXECUTE_IF("fault_inject::PipelineXTask::executing", {
528
543k
            Status status =
529
543k
                    Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task executing failed");
530
543k
            return status;
531
543k
        });
532
533
        // `_sink->is_finished(_state)` means sink operator should be finished
534
543k
        if (_sink->is_finished(_state)) {
535
0
            set_wake_up_early();
536
0
            return Status::OK();
537
0
        }
538
539
        // `_dry_run` means sink operator need no more data
540
543k
        _eos = _dry_run || _eos;
541
543k
        _spilling = false;
542
543k
        auto workload_group = _state->workload_group();
543
        // If last run is pended by a spilling request, `_block` is produced with some rows in last
544
        // run, so we will resume execution using the block.
545
543k
        if (!_eos && _block->empty()) {
546
542k
            SCOPED_TIMER(_get_block_timer);
547
542k
            if (_state->low_memory_mode()) {
548
0
                _sink->set_low_memory_mode(_state);
549
0
                _root->set_low_memory_mode(_state);
550
0
            }
551
542k
            DEFER_RELEASE_RESERVED();
552
542k
            _get_block_counter->update(1);
553
542k
            const auto reserve_size = _root->get_reserve_mem_size(_state);
554
542k
            _root->reset_reserve_mem_size(_state);
555
556
542k
            if (workload_group &&
557
542k
                _state->get_query_ctx()
558
53.6k
                        ->resource_ctx()
559
53.6k
                        ->task_controller()
560
53.6k
                        ->is_enable_reserve_memory() &&
561
542k
                reserve_size > 0) {
562
1.06k
                if (!_try_to_reserve_memory(reserve_size, _root)) {
563
1.06k
                    continue;
564
1.06k
                }
565
1.06k
            }
566
567
541k
            bool eos = false;
568
541k
            RETURN_IF_ERROR(_root->get_block_after_projects(_state, block, &eos));
569
541k
            RETURN_IF_ERROR(block->check_type_and_column());
570
541k
            _eos = eos;
571
541k
        }
572
573
542k
        if (!_block->empty() || _eos) {
574
1.08k
            SCOPED_TIMER(_sink_timer);
575
1.08k
            Status status = Status::OK();
576
1.08k
            DEFER_RELEASE_RESERVED();
577
1.08k
            if (_state->get_query_ctx()
578
1.08k
                        ->resource_ctx()
579
1.08k
                        ->task_controller()
580
1.08k
                        ->is_enable_reserve_memory() &&
581
1.08k
                workload_group && !(_wake_up_early || _dry_run)) {
582
1.07k
                const auto sink_reserve_size = _sink->get_reserve_mem_size(_state, _eos);
583
1.07k
                if (sink_reserve_size > 0 &&
584
1.07k
                    !_try_to_reserve_memory(sink_reserve_size, _sink.get())) {
585
1.06k
                    continue;
586
1.06k
                }
587
1.07k
            }
588
589
17
            if (_eos) {
590
11
                RETURN_IF_ERROR(close(Status::OK(), false));
591
11
            }
592
593
17
            DBUG_EXECUTE_IF("PipelineTask::execute.sink_eos_sleep", {
594
17
                auto required_pipeline_id =
595
17
                        DebugPoints::instance()->get_debug_param_or_default<int32_t>(
596
17
                                "PipelineTask::execute.sink_eos_sleep", "pipeline_id", -1);
597
17
                auto required_task_id =
598
17
                        DebugPoints::instance()->get_debug_param_or_default<int32_t>(
599
17
                                "PipelineTask::execute.sink_eos_sleep", "task_id", -1);
600
17
                if (required_pipeline_id == pipeline_id() && required_task_id == task_id()) {
601
17
                    LOG(WARNING) << "PipelineTask::execute.sink_eos_sleep sleep 10s";
602
17
                    sleep(10);
603
17
                }
604
17
            });
605
606
17
            DBUG_EXECUTE_IF("PipelineTask::execute.terminate", {
607
17
                if (_eos) {
608
17
                    auto required_pipeline_id =
609
17
                            DebugPoints::instance()->get_debug_param_or_default<int32_t>(
610
17
                                    "PipelineTask::execute.terminate", "pipeline_id", -1);
611
17
                    auto required_task_id =
612
17
                            DebugPoints::instance()->get_debug_param_or_default<int32_t>(
613
17
                                    "PipelineTask::execute.terminate", "task_id", -1);
614
17
                    auto required_fragment_id =
615
17
                            DebugPoints::instance()->get_debug_param_or_default<int32_t>(
616
17
                                    "PipelineTask::execute.terminate", "fragment_id", -1);
617
17
                    if (required_pipeline_id == pipeline_id() && required_task_id == task_id() &&
618
17
                        fragment_context->get_fragment_id() == required_fragment_id) {
619
17
                        _wake_up_early = true;
620
17
                        terminate();
621
17
                    } else if (required_pipeline_id == pipeline_id() &&
622
17
                               fragment_context->get_fragment_id() == required_fragment_id) {
623
17
                        LOG(WARNING) << "PipelineTask::execute.terminate sleep 5s";
624
17
                        sleep(5);
625
17
                    }
626
17
                }
627
17
            });
628
17
            RETURN_IF_ERROR(block->check_type_and_column());
629
17
            status = _sink->sink(_state, block, _eos);
630
631
17
            if (status.is<ErrorCode::END_OF_FILE>()) {
632
1
                set_wake_up_early();
633
1
                return Status::OK();
634
16
            } else if (!status) {
635
0
                return status;
636
0
            }
637
638
16
            if (_eos) { // just return, the scheduler will do finish work
639
10
                return Status::OK();
640
10
            }
641
16
        }
642
542k
    }
643
644
4
    RETURN_IF_ERROR(_state->get_query_ctx()->get_pipe_exec_scheduler()->submit(shared_from_this()));
645
4
    return Status::OK();
646
4
}
647
648
0
// Asks the sink to revoke memory for `spill_context` and returns the sink's status.
//
// Fails with an internal error if the fragment context has already expired. On exit the CPU time
// spent is recorded on the task and query; if the task was woken up early in the meantime, the
// task, its root operator and its sink are terminated and `_eos` is set.
Status PipelineTask::do_revoke_memory(const std::shared_ptr<SpillContext>& spill_context) {
    auto frag_ctx = _fragment_context.lock();
    if (frag_ctx == nullptr) {
        return Status::InternalError("Fragment already finished! Query: {}", print_id(_query_id));
    }

    SCOPED_ATTACH_TASK(_state);
    ThreadCpuStopWatch cpu_watch;
    cpu_watch.start();
    Defer on_exit {[&]() {
        const int64_t cpu_delta = cpu_watch.elapsed_time();
        _task_cpu_timer->update(cpu_delta);
        frag_ctx->get_query_ctx()->resource_ctx()->cpu_context()->update_cpu_cost_ms(cpu_delta);

        // A task that was woken up early terminates all operators and can be closed right away.
        if (_wake_up_early) {
            terminate();
            THROW_IF_ERROR(_root->terminate(_state));
            THROW_IF_ERROR(_sink->terminate(_state));
            _eos = true;
        }
    }};

    return _sink->revoke_memory(_state, spill_context);
}
674
675
2.13k
bool PipelineTask::_try_to_reserve_memory(const size_t reserve_size, OperatorBase* op) {
676
2.13k
    auto st = thread_context()->thread_mem_tracker_mgr->try_reserve(reserve_size);
677
    // If reserve memory failed and the query is not enable spill, just disable reserve memory(this will enable
678
    // memory hard limit check, and will cancel the query if allocate memory failed) and let it run.
679
2.13k
    if (!st.ok() && !_state->enable_spill()) {
680
2
        LOG(INFO) << print_id(_query_id) << " reserve memory failed due to " << st
681
2
                  << ", and it is not enable spill, disable reserve memory and let it run";
682
2
        _state->get_query_ctx()->resource_ctx()->task_controller()->disable_reserve_memory();
683
2
        return true;
684
2
    }
685
2.13k
    COUNTER_UPDATE(_memory_reserve_times, 1);
686
2.13k
    auto sink_revocable_mem_size = _sink->revocable_mem_size(_state);
687
2.13k
    if (st.ok() && _state->enable_force_spill() && _sink->is_spillable() &&
688
2.13k
        sink_revocable_mem_size >= vectorized::SpillStream::MIN_SPILL_WRITE_BATCH_MEM) {
689
0
        st = Status(ErrorCode::QUERY_MEMORY_EXCEEDED, "Force Spill");
690
0
    }
691
2.13k
    if (!st.ok()) {
692
2.13k
        COUNTER_UPDATE(_memory_reserve_failed_times, 1);
693
2.13k
        auto debug_msg = fmt::format(
694
2.13k
                "Query: {} , try to reserve: {}, operator name: {}, operator "
695
2.13k
                "id: {}, task id: {}, root revocable mem size: {}, sink revocable mem"
696
2.13k
                "size: {}, failed: {}",
697
2.13k
                print_id(_query_id), PrettyPrinter::print_bytes(reserve_size), op->get_name(),
698
2.13k
                op->node_id(), _state->task_id(),
699
2.13k
                PrettyPrinter::print_bytes(op->revocable_mem_size(_state)),
700
2.13k
                PrettyPrinter::print_bytes(sink_revocable_mem_size), st.to_string());
701
        // PROCESS_MEMORY_EXCEEDED error msg already contains process_mem_log_str
702
2.13k
        if (!st.is<ErrorCode::PROCESS_MEMORY_EXCEEDED>()) {
703
2.13k
            debug_msg +=
704
2.13k
                    fmt::format(", debug info: {}", GlobalMemoryArbitrator::process_mem_log_str());
705
2.13k
        }
706
        // If sink has enough revocable memory, trigger revoke memory
707
2.13k
        LOG(INFO) << fmt::format(
708
2.13k
                "Query: {} sink: {}, node id: {}, task id: "
709
2.13k
                "{}, revocable mem size: {}",
710
2.13k
                print_id(_query_id), _sink->get_name(), _sink->node_id(), _state->task_id(),
711
2.13k
                PrettyPrinter::print_bytes(sink_revocable_mem_size));
712
2.13k
        ExecEnv::GetInstance()->workload_group_mgr()->add_paused_query(
713
2.13k
                _state->get_query_ctx()->resource_ctx()->shared_from_this(), reserve_size, st);
714
2.13k
        _spilling = true;
715
2.13k
        return false;
716
        // !!! Attention:
717
        // In the past, if reserve failed, not add this query to paused list, because it is very small, will not
718
        // consume a lot of memory. But need set low memory mode to indicate that the system should
719
        // not use too much memory.
720
        // But if we only set _state->get_query_ctx()->set_low_memory_mode() here, and return true, the query will
721
        // continue to run and not blocked, and this reserve maybe the last block of join sink opertorator, and it will
722
        // build hash table directly and will consume a lot of memory. So that should return false directly.
723
        // TODO: we should using a global system buffer management logic to deal with low memory mode.
724
        /**
725
        if (sink_revocable_mem_size >= vectorized::SpillStream::MIN_SPILL_WRITE_BATCH_MEM) {
726
            LOG(INFO) << fmt::format(
727
                    "Query: {} sink: {}, node id: {}, task id: "
728
                    "{}, revocable mem size: {}",
729
                    print_id(_query_id), _sink->get_name(), _sink->node_id(), _state->task_id(),
730
                    PrettyPrinter::print_bytes(sink_revocable_mem_size));
731
            ExecEnv::GetInstance()->workload_group_mgr()->add_paused_query(
732
                    _state->get_query_ctx()->resource_ctx()->shared_from_this(), reserve_size, st);
733
            _spilling = true;
734
            return false;
735
        } else {
736
            _state->get_query_ctx()->set_low_memory_mode();
737
        } */
738
2.13k
    }
739
0
    return true;
740
2.13k
}
741
742
259k
void PipelineTask::stop_if_finished() {
743
259k
    auto fragment = _fragment_context.lock();
744
259k
    if (!fragment) {
745
0
        return;
746
0
    }
747
259k
    SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(fragment->get_query_ctx()->query_mem_tracker());
748
259k
    if (auto sink = _sink) {
749
259k
        if (sink->is_finished(_state)) {
750
1
            set_wake_up_early();
751
1
            terminate();
752
1
        }
753
259k
    }
754
259k
}
755
756
3
/// Moves the task to the FINALIZED state and drops everything it still owns.
///
/// @return OK when the fragment context is already gone (nothing is touched in that case) or
///         when finalization completed; otherwise the error from the state transition.
Status PipelineTask::finalize() {
    auto fragment_ctx = _fragment_context.lock();
    if (fragment_ctx == nullptr) {
        return Status::OK();
    }
    SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(fragment_ctx->get_query_ctx()->query_mem_tracker());

    _stop_accepting_submit();
    RETURN_IF_ERROR(_state_transition(State::FINALIZED));

    // Shared states, the working block, operators, sink and pipeline are all released while
    // holding the dependency lock.
    std::unique_lock<std::mutex> dependency_guard(_dependency_lock);
    _sink_shared_state.reset();
    _op_shared_states.clear();
    _shared_state_map.clear();
    _block.reset();
    _operators.clear();
    _sink.reset();
    _pipeline.reset();
    return Status::OK();
}
774
775
19
/// Closes every operator of the task and, when `close_sink` is set, the sink as well.
///
/// @param exec_status status handed to the sink's `close`.
/// @param close_sink  when true, also stop accepting submits, close the sink, refresh the
///                    profile counters (only if the task was opened) and move the task to
///                    the FINISHED state.
/// @return the first non-OK status among the sink close and the operator closes, or the
///         error from the FINISHED state transition.
Status PipelineTask::close(Status exec_status, bool close_sink) {
    if (close_sink) {
        _stop_accepting_submit();
    }

    int64_t elapsed_close_ns = 0;
    Status first_error;
    {
        SCOPED_RAW_TIMER(&elapsed_close_ns);
        if (close_sink) {
            first_error = _sink->close(_state, exec_status);
        }
        // Keep closing the remaining operators after a failure; only the first error is kept.
        for (auto& op : _operators) {
            auto op_status = op->close(_state);
            if (!op_status.ok() && first_error.ok()) {
                first_error = op_status;
            }
        }
    }

    if (_opened) {
        COUNTER_UPDATE(_close_timer, elapsed_close_ns);
        COUNTER_UPDATE(_task_profile->total_time_counter(), elapsed_close_ns);
        if (close_sink) {
            _task_profile->add_info_string("WakeUpEarly", std::to_string(_wake_up_early.load()));
            _fresh_profile_counter();
        }
    }

    if (close_sink) {
        RETURN_IF_ERROR(_state_transition(State::FINISHED));
    }
    return first_error;
}
808
809
34.7k
std::string PipelineTask::debug_string() {
810
34.7k
    fmt::memory_buffer debug_string_buffer;
811
812
34.7k
    fmt::format_to(debug_string_buffer, "QueryId: {}\n", print_id(_query_id));
813
34.7k
    fmt::format_to(debug_string_buffer, "InstanceId: {}\n",
814
34.7k
                   print_id(_state->fragment_instance_id()));
815
816
34.7k
    fmt::format_to(debug_string_buffer,
817
34.7k
                   "PipelineTask[id = {}, open = {}, eos = {}, state = {}, dry run = "
818
34.7k
                   "{}, _wake_up_early = {}, _wake_up_by = {}, time elapsed since last state "
819
34.7k
                   "changing = {}s, spilling = {}, is running = {}]",
820
34.7k
                   _index, _opened, _eos, _to_string(_exec_state), _dry_run, _wake_up_early.load(),
821
34.7k
                   _wake_by, _state_change_watcher.elapsed_time() / NANOS_PER_SEC, _spilling,
822
34.7k
                   is_running());
823
34.7k
    std::unique_lock<std::mutex> lc(_dependency_lock);
824
34.7k
    auto* cur_blocked_dep = _blocked_dep;
825
34.7k
    auto fragment = _fragment_context.lock();
826
34.7k
    if (is_finalized() || !fragment) {
827
5
        fmt::format_to(debug_string_buffer, " pipeline name = {}", _pipeline_name);
828
5
        return fmt::to_string(debug_string_buffer);
829
5
    }
830
34.7k
    auto elapsed = fragment->elapsed_time() / NANOS_PER_SEC;
831
34.7k
    fmt::format_to(debug_string_buffer, " elapse time = {}s, block dependency = [{}]\n", elapsed,
832
34.7k
                   cur_blocked_dep && !is_finalized() ? cur_blocked_dep->debug_string() : "NULL");
833
834
34.7k
    if (_state && _state->local_runtime_filter_mgr()) {
835
0
        fmt::format_to(debug_string_buffer, "local_runtime_filter_mgr: [{}]\n",
836
0
                       _state->local_runtime_filter_mgr()->debug_string());
837
0
    }
838
839
34.7k
    fmt::format_to(debug_string_buffer, "operators: ");
840
69.4k
    for (size_t i = 0; i < _operators.size(); i++) {
841
34.7k
        fmt::format_to(debug_string_buffer, "\n{}",
842
34.7k
                       _opened && !is_finalized()
843
34.7k
                               ? _operators[i]->debug_string(_state, cast_set<int>(i))
844
34.7k
                               : _operators[i]->debug_string(cast_set<int>(i)));
845
34.7k
    }
846
34.7k
    fmt::format_to(debug_string_buffer, "\n{}\n",
847
34.7k
                   _opened && !is_finalized()
848
34.7k
                           ? _sink->debug_string(_state, cast_set<int>(_operators.size()))
849
34.7k
                           : _sink->debug_string(cast_set<int>(_operators.size())));
850
851
34.7k
    fmt::format_to(debug_string_buffer, "\nRead Dependency Information: \n");
852
853
34.7k
    size_t i = 0;
854
69.4k
    for (; i < _read_dependencies.size(); i++) {
855
69.4k
        for (size_t j = 0; j < _read_dependencies[i].size(); j++) {
856
34.7k
            fmt::format_to(debug_string_buffer, "{}. {}\n", i,
857
34.7k
                           _read_dependencies[i][j]->debug_string(cast_set<int>(i) + 1));
858
34.7k
        }
859
34.7k
    }
860
861
34.7k
    fmt::format_to(debug_string_buffer, "{}. {}\n", i,
862
34.7k
                   _memory_sufficient_dependency->debug_string(cast_set<int>(i++)));
863
864
34.7k
    fmt::format_to(debug_string_buffer, "\nWrite Dependency Information: \n");
865
69.4k
    for (size_t j = 0; j < _write_dependencies.size(); j++, i++) {
866
34.7k
        fmt::format_to(debug_string_buffer, "{}. {}\n", i,
867
34.7k
                       _write_dependencies[j]->debug_string(cast_set<int>(j) + 1));
868
34.7k
    }
869
870
34.7k
    fmt::format_to(debug_string_buffer, "\nExecution Dependency Information: \n");
871
104k
    for (size_t j = 0; j < _execution_dependencies.size(); j++, i++) {
872
69.4k
        fmt::format_to(debug_string_buffer, "{}. {}\n", i,
873
69.4k
                       _execution_dependencies[j]->debug_string(cast_set<int>(i) + 1));
874
69.4k
    }
875
876
34.7k
    fmt::format_to(debug_string_buffer, "Finish Dependency Information: \n");
877
104k
    for (size_t j = 0; j < _finish_dependencies.size(); j++, i++) {
878
69.4k
        fmt::format_to(debug_string_buffer, "{}. {}\n", i,
879
69.4k
                       _finish_dependencies[j]->debug_string(cast_set<int>(i) + 1));
880
69.4k
    }
881
34.7k
    return fmt::to_string(debug_string_buffer);
882
34.7k
}
883
884
0
size_t PipelineTask::get_revocable_size() const {
885
0
    if (!_opened || is_finalized() || _running || (_eos && !_spilling)) {
886
0
        return 0;
887
0
    }
888
889
0
    return _sink->revocable_mem_size(_state);
890
0
}
891
892
0
/// Schedules a memory revocation for this task when the sink holds enough revocable data.
///
/// @param spill_context must be non-null (checked with DCHECK). `on_task_finished()` is
///                      called on it right away when the task is finalized or has too little
///                      revocable data; otherwise it is handed to a RevokableTask that is
///                      submitted to the query's pipeline scheduler.
/// @return OK, or the error returned by the scheduler's `submit`.
Status PipelineTask::revoke_memory(const std::shared_ptr<SpillContext>& spill_context) {
    DCHECK(spill_context);
    if (is_finalized()) {
        spill_context->on_task_finished();
        VLOG_DEBUG << "Query: " << print_id(_state->query_id()) << ", task: " << ((void*)this)
                   << " finalized";
        return Status::OK();
    }

    const auto sink_revocable_bytes = _sink->revocable_mem_size(_state);
    if (sink_revocable_bytes < vectorized::SpillStream::MIN_SPILL_WRITE_BATCH_MEM) {
        // Not worth a spill write: report completion immediately.
        spill_context->on_task_finished();
        LOG(INFO) << "Query: " << print_id(_state->query_id()) << ", task: " << ((void*)this)
                  << " has not enough data to revoke: " << sink_revocable_bytes;
        return Status::OK();
    }

    auto spill_task = std::make_shared<RevokableTask>(shared_from_this(), spill_context);
    RETURN_IF_ERROR(_state->get_query_ctx()->get_pipe_exec_scheduler()->submit(spill_task));
    return Status::OK();
}
912
913
25
/// Called by a dependency to make this task runnable again.
///
/// @param dep the dependency doing the wake-up; expected to be the one recorded in
///            `_blocked_dep` (checked with DCHECK_EQ).
/// @return the error from the RUNNABLE state transition or from the scheduler's `submit`,
///         OK otherwise.
Status PipelineTask::wake_up(Dependency* dep) {
    DCHECK_EQ(_blocked_dep, dep) << "dep : " << dep->debug_string(0) << "task: " << debug_string();
    // Clear the blocker before the state transition, which reads `_blocked_dep`.
    _blocked_dep = nullptr;

    auto self = std::dynamic_pointer_cast<PipelineTask>(shared_from_this());
    RETURN_IF_ERROR(_state_transition(State::RUNNABLE));
    RETURN_IF_ERROR(_state->get_query_ctx()->get_pipe_exec_scheduler()->submit(self));
    return Status::OK();
}
922
923
104
/// Switches the task's execution state to `new_state` if that transition is legal.
///
/// The state-change stopwatch is restarted (when the state differs) and the "TaskState" /
/// "BlockedByDependency" profile strings are refreshed before the legality check, so they
/// are updated even when the transition is rejected.
///
/// @return OK on success; InternalError carrying the task's debug string when
///         LEGAL_STATE_TRANSITION does not allow moving from the current state to
///         `new_state`, in which case `_exec_state` is left unchanged.
Status PipelineTask::_state_transition(State new_state) {
    const bool state_differs = _exec_state != new_state;
    if (state_differs) {
        _state_change_watcher.reset();
        _state_change_watcher.start();
    }

    _task_profile->add_info_string("TaskState", _to_string(new_state));
    _task_profile->add_info_string("BlockedByDependency", _blocked_dep ? _blocked_dep->name() : "");

    const bool transition_allowed =
            LEGAL_STATE_TRANSITION[static_cast<int>(new_state)].contains(_exec_state);
    if (transition_allowed) {
        _exec_state = new_state;
        return Status::OK();
    }
    return Status::InternalError(
            "Task state transition from {} to {} is not allowed! Task info: {}",
            _to_string(_exec_state), _to_string(new_state), debug_string());
}
938
939
#include "common/compile_check_end.h"
940
} // namespace doris::pipeline