Coverage Report

Created: 2026-04-16 04:10

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/pipeline/pipeline_fragment_context.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <brpc/closure_guard.h>
21
#include <gen_cpp/Types_types.h>
22
#include <gen_cpp/types.pb.h>
23
24
#include <atomic>
25
#include <cstddef>
26
#include <cstdint>
27
#include <functional>
28
#include <memory>
29
#include <mutex>
30
#include <set>
31
#include <string>
32
#include <vector>
33
34
#include "common/status.h"
35
#include "exec/pipeline/pipeline.h"
36
#include "exec/pipeline/pipeline_task.h"
37
#include "runtime/query_context.h"
38
#include "runtime/runtime_profile.h"
39
#include "runtime/runtime_state.h"
40
#include "runtime/task_execution_context.h"
41
#include "util/stopwatch.hpp"
42
43
namespace doris {
44
struct ReportStatusRequest;
45
class ExecEnv;
46
class RuntimeFilterMergeControllerEntity;
47
class TDataSink;
48
class TPipelineFragmentParams;
49
50
class Dependency;
51
52
class PipelineFragmentContext : public TaskExecutionContext {
53
public:
54
    ENABLE_FACTORY_CREATOR(PipelineFragmentContext);
55
    PipelineFragmentContext(TUniqueId query_id, const TPipelineFragmentParams& request,
56
                            std::shared_ptr<QueryContext> query_ctx, ExecEnv* exec_env,
57
                            const std::function<void(RuntimeState*, Status*)>& call_back);
58
59
    ~PipelineFragmentContext() override;
60
61
    void print_profile(const std::string& extra_info);
62
63
    std::vector<std::shared_ptr<TRuntimeProfileTree>> collect_realtime_profile() const;
64
    std::shared_ptr<TRuntimeProfileTree> collect_realtime_load_channel_profile() const;
65
66
    bool is_timeout(timespec now) const;
67
68
15.6k
    uint64_t elapsed_time() const { return _fragment_watcher.elapsed_time(); }
69
70
54
    int timeout_second() const { return _timeout; }
71
72
    PipelinePtr add_pipeline(PipelinePtr parent = nullptr, int idx = -1);
73
74
14.0M
    QueryContext* get_query_ctx() { return _query_ctx.get(); }
75
74.7M
    [[nodiscard]] bool is_canceled() const { return _query_ctx->is_cancelled(); }
76
77
    Status prepare(ThreadPool* thread_pool);
78
79
    Status submit();
80
81
428k
    void set_is_report_success(bool is_report_success) { _is_report_success = is_report_success; }
82
83
    void cancel(const Status reason);
84
85
    bool notify_close();
86
87
2.00M
    TUniqueId get_query_id() const { return _query_id; }
88
89
6
    [[nodiscard]] int get_fragment_id() const { return _fragment_id; }
90
91
118k
    uint32_t rec_cte_stage() const { return _rec_cte_stage; }
92
3.28k
    void set_rec_cte_stage(uint32_t stage) { _rec_cte_stage = stage; }
93
94
    void decrement_running_task(PipelineId pipeline_id);
95
96
    Status send_report(bool);
97
98
    void trigger_report_if_necessary();
99
    void refresh_next_report_time();
100
101
    std::string debug_string();
102
103
783k
    [[nodiscard]] int next_operator_id() { return _operator_id--; }
104
105
4.00M
    [[nodiscard]] int max_operator_id() const { return _operator_id; }
106
107
671k
    [[nodiscard]] int next_sink_operator_id() { return _sink_operator_id--; }
108
109
    [[nodiscard]] size_t get_revocable_size(bool* has_running_task) const;
110
111
    [[nodiscard]] std::vector<PipelineTask*> get_revocable_tasks() const;
112
113
76.6k
    void clear_finished_tasks() {
114
76.6k
        if (_need_notify_close) {
115
267
            return;
116
267
        }
117
357k
        for (size_t j = 0; j < _tasks.size(); j++) {
118
888k
            for (size_t i = 0; i < _tasks[j].size(); i++) {
119
606k
                _tasks[j][i].first->stop_if_finished();
120
606k
            }
121
281k
        }
122
76.4k
    }
123
124
    std::string get_load_error_url();
125
    std::string get_first_error_msg();
126
127
    std::set<int> get_deregister_runtime_filter() const;
128
129
    // Store the brpc ClosureGuard so the RPC response is deferred until this PFC is destroyed.
130
    // When need_send_report_on_destruction is true (final_close), send the report immediately
131
    // and do not store the guard (let it fire on return to complete the RPC).
132
    //
133
    // Thread safety: This method is NOT thread-safe. It reads/writes _wait_close_guard without
134
    // synchronization. Currently it is only called from rerun_fragment() which is invoked
135
// sequentially by RecCTESourceOperatorX (a serial operator) — one rerun call at a time per
136
    // fragment. Do NOT call this concurrently from multiple threads.
137
    Status listen_wait_close(const std::shared_ptr<brpc::ClosureGuard>& guard,
138
3.47k
                             bool need_send_report_on_destruction) {
139
3.47k
        if (_wait_close_guard) {
140
0
            return Status::InternalError("Already listening wait close");
141
0
        }
142
3.47k
        if (need_send_report_on_destruction) {
143
191
            return send_report(true);
144
3.28k
        } else {
145
3.28k
            _wait_close_guard = guard;
146
3.28k
        }
147
3.28k
        return Status::OK();
148
3.47k
    }
149
150
private:
151
    void _coordinator_callback(const ReportStatusRequest& req);
152
    std::string _to_http_path(const std::string& file_name) const;
153
154
    void _release_resource();
155
156
    Status _build_and_prepare_full_pipeline(ThreadPool* thread_pool);
157
158
    Status _build_pipelines(ObjectPool* pool, const DescriptorTbl& descs, OperatorPtr* root,
159
                            PipelinePtr cur_pipe);
160
    Status _create_tree_helper(ObjectPool* pool, const std::vector<TPlanNode>& tnodes,
161
                               const DescriptorTbl& descs, OperatorPtr parent, int* node_idx,
162
                               OperatorPtr* root, PipelinePtr& cur_pipe, int child_idx,
163
                               const bool followed_by_shuffled_join,
164
                               const bool require_bucket_distribution);
165
166
    Status _create_operator(ObjectPool* pool, const TPlanNode& tnode, const DescriptorTbl& descs,
167
                            OperatorPtr& op, PipelinePtr& cur_pipe, int parent_idx, int child_idx,
168
                            const bool followed_by_shuffled_join,
169
                            const bool require_bucket_distribution, OperatorPtr& cache_op);
170
    template <bool is_intersect>
171
    Status _build_operators_for_set_operation_node(ObjectPool* pool, const TPlanNode& tnode,
172
                                                   const DescriptorTbl& descs, OperatorPtr& op,
173
                                                   PipelinePtr& cur_pipe,
174
                                                   std::vector<DataSinkOperatorPtr>& sink_ops);
175
176
    Status _create_data_sink(ObjectPool* pool, const TDataSink& thrift_sink,
177
                             const std::vector<TExpr>& output_exprs,
178
                             const TPipelineFragmentParams& params, const RowDescriptor& row_desc,
179
                             RuntimeState* state, DescriptorTbl& desc_tbl,
180
                             PipelineId cur_pipeline_id);
181
    Status _plan_local_exchange(int num_buckets,
182
                                const std::map<int, int>& bucket_seq_to_instance_idx,
183
                                const std::map<int, int>& shuffle_idx_to_instance_idx);
184
    Status _plan_local_exchange(int num_buckets, int pip_idx, PipelinePtr pip,
185
                                const std::map<int, int>& bucket_seq_to_instance_idx,
186
                                const std::map<int, int>& shuffle_idx_to_instance_idx);
187
    void _inherit_pipeline_properties(const DataDistribution& data_distribution,
188
                                      PipelinePtr pipe_with_source, PipelinePtr pipe_with_sink);
189
    Status _add_local_exchange(int pip_idx, int idx, int node_id, ObjectPool* pool,
190
                               PipelinePtr cur_pipe, DataDistribution data_distribution,
191
                               bool* do_local_exchange, int num_buckets,
192
                               const std::map<int, int>& bucket_seq_to_instance_idx,
193
                               const std::map<int, int>& shuffle_idx_to_instance_idx);
194
    Status _add_local_exchange_impl(int idx, ObjectPool* pool, PipelinePtr cur_pipe,
195
                                    PipelinePtr new_pip, DataDistribution data_distribution,
196
                                    bool* do_local_exchange, int num_buckets,
197
                                    const std::map<int, int>& bucket_seq_to_instance_idx,
198
                                    const std::map<int, int>& shuffle_idx_to_instance_idx);
199
200
    Status _build_pipeline_tasks(ThreadPool* thread_pool);
201
    Status _build_pipeline_tasks_for_instance(
202
            int instance_idx,
203
            const std::vector<std::shared_ptr<RuntimeProfile>>& pipeline_id_to_profile);
204
    // Close the fragment instance and return true if the caller should call
205
    // remove_pipeline_context() **after** releasing _task_mutex. This avoids
206
    // holding _task_mutex while acquiring _pipeline_map's shard lock, which
207
    // would create an ABBA deadlock with dump_pipeline_tasks().
208
    bool _close_fragment_instance();
209
    void _init_next_report_time();
210
211
    // Id of this query
212
    TUniqueId _query_id;
213
    int _fragment_id;
214
215
    ExecEnv* _exec_env = nullptr;
216
217
    std::atomic_bool _prepared = false;
218
    bool _submitted = false;
219
220
    Pipelines _pipelines;
221
    PipelineId _next_pipeline_id = 0;
222
    std::mutex _task_mutex;
223
    int _closed_tasks = 0;
224
    // After prepared, `_total_tasks` is equal to the size of `_tasks`.
225
    // When submit fail, `_total_tasks` is equal to the number of tasks submitted.
226
    std::atomic<int> _total_tasks = 0;
227
228
    std::unique_ptr<RuntimeProfile> _fragment_level_profile;
229
    bool _is_report_success = false;
230
231
    std::unique_ptr<RuntimeState> _runtime_state;
232
233
    std::shared_ptr<QueryContext> _query_ctx;
234
235
    MonotonicStopWatch _fragment_watcher;
236
    RuntimeProfile::Counter* _prepare_timer = nullptr;
237
    RuntimeProfile::Counter* _init_context_timer = nullptr;
238
    RuntimeProfile::Counter* _build_pipelines_timer = nullptr;
239
    RuntimeProfile::Counter* _plan_local_exchanger_timer = nullptr;
240
    RuntimeProfile::Counter* _prepare_all_pipelines_timer = nullptr;
241
    RuntimeProfile::Counter* _build_tasks_timer = nullptr;
242
243
    std::function<void(RuntimeState*, Status*)> _call_back;
244
    std::atomic_bool _is_fragment_instance_closed = false;
245
246
    // If this is set to false, and '_is_report_success' is false as well,
247
    // This executor will not report status to FE on being cancelled.
248
    bool _is_report_on_cancel;
249
250
    // 0 indicates reporting is in progress or not required
251
    std::atomic_bool _disable_period_report = true;
252
    std::atomic_uint64_t _previous_report_time = 0;
253
254
    DescriptorTbl* _desc_tbl = nullptr;
255
    int _num_instances = 1;
256
257
    int _timeout = -1;
258
    bool _use_serial_source = false;
259
260
    OperatorPtr _root_op = nullptr;
261
    //
262
    /**
263
     * Matrix stores tasks with local runtime states.
264
     * This is an [n * m] matrix, where n is the parallelism of the pipeline engine and m is the number of pipelines.
265
     *
266
     * 2-D matrix:
267
     * +-------------------------+------------+-------+
268
     * |            | Pipeline 0 | Pipeline 1 |  ...  |
269
     * +------------+------------+------------+-------+
270
     * | Instance 0 |  task 0-0  |  task 0-1  |  ...  |
271
     * +------------+------------+------------+-------+
272
     * | Instance 1 |  task 1-0  |  task 1-1  |  ...  |
273
     * +------------+------------+------------+-------+
274
     * | ...                                          |
275
     * +--------------------------------------+-------+
276
     */
277
    std::vector<
278
            std::vector<std::pair<std::shared_ptr<PipelineTask>, std::unique_ptr<RuntimeState>>>>
279
            _tasks;
280
281
    // TODO: remove the _sink and _multi_cast_stream_sink_senders to set both
282
    // of it in pipeline task not the fragment_context
283
#ifdef __clang__
284
#pragma clang diagnostic push
285
#pragma clang diagnostic ignored "-Wshadow-field"
286
#endif
287
    DataSinkOperatorPtr _sink = nullptr;
288
#ifdef __clang__
289
#pragma clang diagnostic pop
290
#endif
291
292
    // `_dag` manages dependencies between pipelines by pipeline ID; a pipeline is blocked by the pipelines it depends on.
293
    std::map<PipelineId, std::vector<PipelineId>> _dag;
294
295
    // We use preorder traversal to create an operator tree. When we meet a join node, we should
296
    // build probe operator and build operator in separate pipelines. To do this, we should build
297
    // ProbeSide first, and use `_pipelines_to_build` to store which pipeline the build operator
298
    // is in, so we can build BuildSide once we complete probe side.
299
    struct pipeline_parent_map {
300
        std::map<int, std::vector<PipelinePtr>> _build_side_pipelines;
301
27.3k
        void push(int parent_node_id, PipelinePtr pipeline) {
302
27.3k
            if (!_build_side_pipelines.contains(parent_node_id)) {
303
13.6k
                _build_side_pipelines.insert({parent_node_id, {pipeline}});
304
13.7k
            } else {
305
13.7k
                _build_side_pipelines[parent_node_id].push_back(pipeline);
306
13.7k
            }
307
27.3k
        }
308
660k
        void pop(PipelinePtr& cur_pipe, int parent_node_id, int child_idx) {
309
660k
            if (!_build_side_pipelines.contains(parent_node_id)) {
310
634k
                return;
311
634k
            }
312
660k
            DCHECK(_build_side_pipelines.contains(parent_node_id));
313
26.1k
            auto& child_pipeline = _build_side_pipelines[parent_node_id];
314
26.1k
            DCHECK(child_idx < child_pipeline.size());
315
26.1k
            cur_pipe = child_pipeline[child_idx];
316
26.1k
        }
317
428k
        void clear() { _build_side_pipelines.clear(); }
318
    } _pipeline_parent_map;
319
320
    std::mutex _state_map_lock;
321
322
    int _operator_id = 0;
323
    int _sink_operator_id = 0;
324
    /**
325
     * Some states are shared by tasks in different pipeline task (e.g. local exchange , broadcast join).
326
     *
327
     * local exchange sink 0 ->                               -> local exchange source 0
328
     *                            LocalExchangeSharedState
329
     * local exchange sink 1 ->                               -> local exchange source 1
330
     *
331
     * hash join build sink 0 ->                               -> hash join build source 0
332
     *                              HashJoinSharedState
333
     * hash join build sink 1 ->                               -> hash join build source 1
334
     *
335
     * So we should keep states here.
336
     */
337
    std::map<int,
338
             std::pair<std::shared_ptr<BasicSharedState>, std::vector<std::shared_ptr<Dependency>>>>
339
            _op_id_to_shared_state;
340
341
    std::map<PipelineId, Pipeline*> _pip_id_to_pipeline;
342
    std::vector<std::unique_ptr<RuntimeFilterMgr>> _runtime_filter_mgr_map;
343
344
    // There are two types of runtime states:
345
    //    - _runtime_state is at the Fragment level.
346
    //    - _task_runtime_states is at the task level, unique to each task.
347
348
    std::vector<TUniqueId> _fragment_instance_ids;
349
350
    // Total instance num running on all BEs
351
    int _total_instances = -1;
352
353
    TPipelineFragmentParams _params;
354
    int32_t _parallel_instances = 0;
355
356
    std::atomic<bool> _need_notify_close = false;
357
    // Holds the brpc ClosureGuard for async wait-close during recursive CTE rerun.
358
    // When the PFC finishes closing and is destroyed, the shared_ptr destructor fires
359
    // the ClosureGuard, which completes the brpc response to the RecCTESourceOperatorX.
360
    // Only written by listen_wait_close() from a single rerun_fragment RPC thread.
361
    std::shared_ptr<brpc::ClosureGuard> _wait_close_guard = nullptr;
362
363
    // The recursion round number for recursive CTE fragments.
364
    // Incremented each time the fragment is rebuilt via rerun_fragment(rebuild).
365
    // Used to stamp runtime filter RPCs so stale messages from old rounds are discarded.
366
    uint32_t _rec_cte_stage = 0;
367
};
368
} // namespace doris