be/src/exec/pipeline/pipeline_task.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "exec/pipeline/pipeline_task.h" |
19 | | |
20 | | #include <fmt/core.h> |
21 | | #include <fmt/format.h> |
22 | | #include <gen_cpp/Metrics_types.h> |
23 | | #include <glog/logging.h> |
24 | | |
25 | | #include <algorithm> |
26 | | #include <memory> |
27 | | #include <ostream> |
28 | | #include <vector> |
29 | | |
30 | | #include "common/logging.h" |
31 | | #include "common/status.h" |
32 | | #include "core/block/block.h" |
33 | | #include "exec/operator/exchange_source_operator.h" |
34 | | #include "exec/operator/operator.h" |
35 | | #include "exec/operator/rec_cte_source_operator.h" |
36 | | #include "exec/operator/scan_operator.h" |
37 | | #include "exec/pipeline/dependency.h" |
38 | | #include "exec/pipeline/pipeline.h" |
39 | | #include "exec/pipeline/pipeline_fragment_context.h" |
40 | | #include "exec/pipeline/revokable_task.h" |
41 | | #include "exec/pipeline/task_queue.h" |
42 | | #include "exec/pipeline/task_scheduler.h" |
43 | | #include "exec/spill/spill_file.h" |
44 | | #include "runtime/descriptors.h" |
45 | | #include "runtime/exec_env.h" |
46 | | #include "runtime/query_context.h" |
47 | | #include "runtime/runtime_profile.h" |
48 | | #include "runtime/runtime_profile_counter_names.h" |
49 | | #include "runtime/thread_context.h" |
50 | | #include "runtime/workload_group/workload_group_manager.h" |
51 | | #include "util/defer_op.h" |
52 | | #include "util/mem_info.h" |
53 | | #include "util/uid_util.h" |
54 | | |
55 | | namespace doris { |
56 | | class RuntimeState; |
57 | | } // namespace doris |
58 | | |
59 | | namespace doris { |
60 | | |
61 | | PipelineTask::PipelineTask(PipelinePtr& pipeline, uint32_t task_id, RuntimeState* state, |
62 | | std::shared_ptr<PipelineFragmentContext> fragment_context, |
63 | | RuntimeProfile* parent_profile, |
64 | | std::map<int, std::pair<std::shared_ptr<BasicSharedState>, |
65 | | std::vector<std::shared_ptr<Dependency>>>> |
66 | | shared_state_map, |
67 | | int task_idx) |
68 | | : |
69 | | #ifdef BE_TEST |
70 | | _query_id(fragment_context ? fragment_context->get_query_id() : TUniqueId()), |
71 | | #else |
72 | 2.02M | _query_id(fragment_context->get_query_id()), |
73 | | #endif |
74 | 2.02M | _index(task_id), |
75 | 2.02M | _pipeline(pipeline), |
76 | 2.02M | _opened(false), |
77 | 2.02M | _state(state), |
78 | 2.02M | _fragment_context(fragment_context), |
79 | 2.02M | _parent_profile(parent_profile), |
80 | 2.02M | _operators(pipeline->operators()), |
81 | 2.02M | _source(_operators.front().get()), |
82 | 2.02M | _root(_operators.back().get()), |
83 | 2.02M | _sink(pipeline->sink_shared_pointer()), |
84 | 2.02M | _shared_state_map(std::move(shared_state_map)), |
85 | 2.02M | _task_idx(task_idx), |
86 | 2.02M | _memory_sufficient_dependency(state->get_query_ctx()->get_memory_sufficient_dependency()), |
87 | 2.02M | _pipeline_name(_pipeline->name()) { |
88 | 2.02M | #ifndef BE_TEST |
89 | 2.02M | _query_mem_tracker = fragment_context->get_query_ctx()->query_mem_tracker(); |
90 | 2.02M | #endif |
91 | 2.02M | _execution_dependencies.push_back(state->get_query_ctx()->get_execution_dependency()); |
92 | 2.02M | if (!_shared_state_map.contains(_sink->dests_id().front())) { |
93 | 1.66M | auto shared_state = _sink->create_shared_state(); |
94 | 1.66M | if (shared_state) { |
95 | 1.65M | _sink_shared_state = shared_state; |
96 | 1.65M | } |
97 | 1.66M | } |
98 | 2.02M | } |
99 | | |
100 | 2.03M | PipelineTask::~PipelineTask() { |
101 | 2.10M | auto reset_member = [&]() { |
102 | 2.10M | _shared_state_map.clear(); |
103 | 2.10M | _sink_shared_state.reset(); |
104 | 2.10M | _op_shared_states.clear(); |
105 | 2.10M | _sink.reset(); |
106 | 2.10M | _operators.clear(); |
107 | 2.10M | _block.reset(); |
108 | 2.10M | _pipeline.reset(); |
109 | 2.10M | }; |
110 | | // PipelineTask is also hold by task queue( https://github.com/apache/doris/pull/49753), |
111 | | // so that it maybe the last one to be destructed. |
112 | | // But pipeline task hold some objects, like operators, shared state, etc. So that should release |
113 | | // memory manually. |
114 | 2.03M | #ifndef BE_TEST |
115 | 2.03M | if (_query_mem_tracker) { |
116 | 2.03M | SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(_query_mem_tracker); |
117 | 2.03M | reset_member(); |
118 | 2.03M | return; |
119 | 2.03M | } |
120 | 18.4E | #endif |
121 | 18.4E | reset_member(); |
122 | 18.4E | } |
123 | | |
124 | | Status PipelineTask::prepare(const std::vector<TScanRangeParams>& scan_range, const int sender_id, |
125 | 2.02M | const TDataSink& tsink) { |
126 | 2.02M | DCHECK(_sink); |
127 | 2.02M | _init_profile(); |
128 | 2.02M | SCOPED_TIMER(_task_profile->total_time_counter()); |
129 | 2.02M | SCOPED_CPU_TIMER(_task_cpu_timer); |
130 | 2.02M | SCOPED_TIMER(_prepare_timer); |
131 | 2.02M | DBUG_EXECUTE_IF("fault_inject::PipelineXTask::prepare", { |
132 | 2.02M | Status status = Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task prepare failed"); |
133 | 2.02M | return status; |
134 | 2.02M | }); |
135 | 2.02M | { |
136 | | // set sink local state |
137 | 2.02M | LocalSinkStateInfo info {_task_idx, _task_profile.get(), |
138 | 2.02M | sender_id, get_sink_shared_state().get(), |
139 | 2.02M | _shared_state_map, tsink}; |
140 | 2.02M | RETURN_IF_ERROR(_sink->setup_local_state(_state, info)); |
141 | 2.02M | } |
142 | | |
143 | 2.02M | _scan_ranges = scan_range; |
144 | 2.02M | auto* parent_profile = _state->get_sink_local_state()->operator_profile(); |
145 | | |
146 | 4.59M | for (int op_idx = cast_set<int>(_operators.size() - 1); op_idx >= 0; op_idx--) { |
147 | 2.57M | auto& op = _operators[op_idx]; |
148 | 2.57M | LocalStateInfo info {parent_profile, _scan_ranges, get_op_shared_state(op->operator_id()), |
149 | 2.57M | _shared_state_map, _task_idx}; |
150 | 2.57M | RETURN_IF_ERROR(op->setup_local_state(_state, info)); |
151 | 2.57M | parent_profile = _state->get_local_state(op->operator_id())->operator_profile(); |
152 | 2.57M | } |
153 | 2.02M | { |
154 | 2.02M | const auto& deps = |
155 | 2.02M | _state->get_local_state(_source->operator_id())->execution_dependencies(); |
156 | 2.02M | std::unique_lock<std::mutex> lc(_dependency_lock); |
157 | 2.02M | std::copy(deps.begin(), deps.end(), |
158 | 2.02M | std::inserter(_execution_dependencies, _execution_dependencies.end())); |
159 | 2.02M | } |
160 | 2.02M | if (auto fragment = _fragment_context.lock()) { |
161 | 2.02M | if (fragment->get_query_ctx()->is_cancelled()) { |
162 | 0 | unblock_all_dependencies(); |
163 | 0 | return fragment->get_query_ctx()->exec_status(); |
164 | 0 | } |
165 | 2.02M | } else { |
166 | 1.93k | return Status::InternalError("Fragment already finished! Query: {}", print_id(_query_id)); |
167 | 1.93k | } |
168 | 2.02M | _block = doris::Block::create_unique(); |
169 | 2.02M | return _state_transition(State::RUNNABLE); |
170 | 2.02M | } |
171 | | |
172 | 2.02M | Status PipelineTask::_extract_dependencies() { |
173 | 2.02M | std::vector<std::vector<Dependency*>> read_dependencies; |
174 | 2.02M | std::vector<Dependency*> write_dependencies; |
175 | 2.02M | std::vector<Dependency*> finish_dependencies; |
176 | 2.02M | read_dependencies.resize(_operators.size()); |
177 | 2.02M | size_t i = 0; |
178 | 2.57M | for (auto& op : _operators) { |
179 | 2.57M | auto* local_state = _state->get_local_state(op->operator_id()); |
180 | 2.57M | DCHECK(local_state); |
181 | 2.57M | read_dependencies[i] = local_state->dependencies(); |
182 | 2.57M | auto* fin_dep = local_state->finishdependency(); |
183 | 2.57M | if (fin_dep) { |
184 | 9 | finish_dependencies.push_back(fin_dep); |
185 | 9 | } |
186 | 2.57M | i++; |
187 | 2.57M | } |
188 | 2.02M | DBUG_EXECUTE_IF("fault_inject::PipelineXTask::_extract_dependencies", { |
189 | 2.02M | Status status = Status::Error<INTERNAL_ERROR>( |
190 | 2.02M | "fault_inject pipeline_task _extract_dependencies failed"); |
191 | 2.02M | return status; |
192 | 2.02M | }); |
193 | 2.02M | { |
194 | 2.02M | auto* local_state = _state->get_sink_local_state(); |
195 | 2.02M | write_dependencies = local_state->dependencies(); |
196 | 2.02M | auto* fin_dep = local_state->finishdependency(); |
197 | 2.02M | if (fin_dep) { |
198 | 810k | finish_dependencies.push_back(fin_dep); |
199 | 810k | } |
200 | 2.02M | } |
201 | 2.02M | { |
202 | 2.02M | std::unique_lock<std::mutex> lc(_dependency_lock); |
203 | 2.02M | read_dependencies.swap(_read_dependencies); |
204 | 2.02M | write_dependencies.swap(_write_dependencies); |
205 | 2.02M | finish_dependencies.swap(_finish_dependencies); |
206 | 2.02M | } |
207 | 2.02M | return Status::OK(); |
208 | 2.02M | } |
209 | | |
210 | 1.20M | bool PipelineTask::inject_shared_state(std::shared_ptr<BasicSharedState> shared_state) { |
211 | 1.20M | if (!shared_state) { |
212 | 693k | return false; |
213 | 693k | } |
214 | | // Shared state is created by upstream task's sink operator and shared by source operator of |
215 | | // this task. |
216 | 580k | for (auto& op : _operators) { |
217 | 580k | if (shared_state->related_op_ids.contains(op->operator_id())) { |
218 | 501k | _op_shared_states.insert({op->operator_id(), shared_state}); |
219 | 501k | return true; |
220 | 501k | } |
221 | 580k | } |
222 | | // Shared state is created by the first sink operator and shared by sink operator of this task. |
223 | | // For example, Set operations. |
224 | 11.6k | if (shared_state->related_op_ids.contains(_sink->dests_id().front())) { |
225 | 11.6k | DCHECK_EQ(_sink_shared_state, nullptr) |
226 | 0 | << " Sink: " << _sink->get_name() << " dest id: " << _sink->dests_id().front(); |
227 | 11.6k | _sink_shared_state = shared_state; |
228 | 11.6k | return true; |
229 | 11.6k | } |
230 | 18.4E | return false; |
231 | 10.6k | } |
232 | | |
233 | 2.02M | void PipelineTask::_init_profile() { |
234 | 2.02M | _task_profile = std::make_unique<RuntimeProfile>(fmt::format("PipelineTask(index={})", _index)); |
235 | 2.02M | _parent_profile->add_child(_task_profile.get(), true, nullptr); |
236 | 2.02M | _task_cpu_timer = ADD_TIMER(_task_profile, profile::TASK_CPU_TIME); |
237 | | |
238 | 2.02M | static const char* exec_time = profile::EXECUTE_TIME; |
239 | 2.02M | _exec_timer = ADD_TIMER(_task_profile, exec_time); |
240 | 2.02M | _prepare_timer = ADD_CHILD_TIMER(_task_profile, profile::PREPARE_TIME, exec_time); |
241 | 2.02M | _open_timer = ADD_CHILD_TIMER(_task_profile, profile::OPEN_TIME, exec_time); |
242 | 2.02M | _get_block_timer = ADD_CHILD_TIMER(_task_profile, profile::GET_BLOCK_TIME, exec_time); |
243 | 2.02M | _get_block_counter = ADD_COUNTER(_task_profile, profile::GET_BLOCK_COUNTER, TUnit::UNIT); |
244 | 2.02M | _sink_timer = ADD_CHILD_TIMER(_task_profile, profile::SINK_TIME, exec_time); |
245 | 2.02M | _close_timer = ADD_CHILD_TIMER(_task_profile, profile::CLOSE_TIME, exec_time); |
246 | | |
247 | 2.02M | _wait_worker_timer = ADD_TIMER_WITH_LEVEL(_task_profile, profile::WAIT_WORKER_TIME, 1); |
248 | | |
249 | 2.02M | _schedule_counts = ADD_COUNTER(_task_profile, profile::NUM_SCHEDULE_TIMES, TUnit::UNIT); |
250 | 2.02M | _yield_counts = ADD_COUNTER(_task_profile, profile::NUM_YIELD_TIMES, TUnit::UNIT); |
251 | 2.02M | _core_change_times = ADD_COUNTER(_task_profile, profile::CORE_CHANGE_TIMES, TUnit::UNIT); |
252 | 2.02M | _memory_reserve_times = ADD_COUNTER(_task_profile, profile::MEMORY_RESERVE_TIMES, TUnit::UNIT); |
253 | 2.02M | _memory_reserve_failed_times = |
254 | 2.02M | ADD_COUNTER(_task_profile, profile::MEMORY_RESERVE_FAILED_TIMES, TUnit::UNIT); |
255 | 2.02M | } |
256 | | |
257 | 2.02M | void PipelineTask::_fresh_profile_counter() { |
258 | 2.02M | COUNTER_SET(_schedule_counts, (int64_t)_schedule_time); |
259 | 2.02M | COUNTER_SET(_wait_worker_timer, (int64_t)_wait_worker_watcher.elapsed_time()); |
260 | 2.02M | } |
261 | | |
262 | 2.02M | Status PipelineTask::_open() { |
263 | 2.02M | SCOPED_TIMER(_task_profile->total_time_counter()); |
264 | 2.02M | SCOPED_CPU_TIMER(_task_cpu_timer); |
265 | 2.02M | SCOPED_TIMER(_open_timer); |
266 | 2.02M | _dry_run = _sink->should_dry_run(_state); |
267 | 2.58M | for (auto& o : _operators) { |
268 | 2.58M | RETURN_IF_ERROR(_state->get_local_state(o->operator_id())->open(_state)); |
269 | 2.58M | } |
270 | 2.02M | RETURN_IF_ERROR(_state->get_sink_local_state()->open(_state)); |
271 | 2.02M | RETURN_IF_ERROR(_extract_dependencies()); |
272 | 2.02M | DBUG_EXECUTE_IF("fault_inject::PipelineXTask::open", { |
273 | 2.02M | Status status = Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task open failed"); |
274 | 2.02M | return status; |
275 | 2.02M | }); |
276 | 2.02M | _opened = true; |
277 | 2.02M | return Status::OK(); |
278 | 2.02M | } |
279 | | |
280 | 12.2M | Status PipelineTask::_prepare() { |
281 | 12.2M | SCOPED_TIMER(_task_profile->total_time_counter()); |
282 | 12.2M | SCOPED_CPU_TIMER(_task_cpu_timer); |
283 | 15.2M | for (auto& o : _operators) { |
284 | 15.2M | RETURN_IF_ERROR(_state->get_local_state(o->operator_id())->prepare(_state)); |
285 | 15.2M | } |
286 | 12.2M | RETURN_IF_ERROR(_state->get_sink_local_state()->prepare(_state)); |
287 | 12.2M | return Status::OK(); |
288 | 12.2M | } |
289 | | |
290 | 7.02M | bool PipelineTask::_wait_to_start() { |
291 | | // Before task starting, we should make sure |
292 | | // 1. Execution dependency is ready (which is controlled by FE 2-phase commit) |
293 | | // 2. Runtime filter dependencies are ready |
294 | | // 3. All tablets are loaded into local storage |
295 | 7.02M | return std::any_of( |
296 | 7.02M | _execution_dependencies.begin(), _execution_dependencies.end(), |
297 | 8.30M | [&](Dependency* dep) -> bool { return dep->is_blocked_by(shared_from_this()); }); |
298 | 7.02M | } |
299 | | |
300 | 2.30M | bool PipelineTask::_is_pending_finish() { |
301 | | // Spilling may be in progress if eos is true. |
302 | 2.30M | return std::ranges::any_of(_finish_dependencies, [&](Dependency* dep) -> bool { |
303 | 1.09M | return dep->is_blocked_by(shared_from_this()); |
304 | 1.09M | }); |
305 | 2.30M | } |
306 | | |
307 | 7.32M | bool PipelineTask::is_blockable() const { |
308 | | // Before task starting, we should make sure |
309 | | // 1. Execution dependency is ready (which is controlled by FE 2-phase commit) |
310 | | // 2. Runtime filter dependencies are ready |
311 | | // 3. All tablets are loaded into local storage |
312 | | |
313 | 7.32M | if (_state->enable_fuzzy_blockable_task()) { |
314 | 7.68k | if ((_schedule_time + _task_idx) % 2 == 0) { |
315 | 4.32k | return true; |
316 | 4.32k | } |
317 | 7.68k | } |
318 | | |
319 | 7.32M | return std::ranges::any_of(_operators, |
320 | 9.20M | [&](OperatorPtr op) -> bool { return op->is_blockable(_state); }) || |
321 | 7.32M | _sink->is_blockable(_state); |
322 | 7.32M | } |
323 | | |
324 | 10.2M | bool PipelineTask::_is_blocked() { |
325 | | // `_dry_run = true` means we do not need data from source operator. |
326 | 10.2M | if (!_dry_run) { |
327 | 18.4M | for (int i = cast_set<int>(_read_dependencies.size() - 1); i >= 0; i--) { |
328 | | // `_read_dependencies` is organized according to operators. For each operator, running condition is met iff all dependencies are ready. |
329 | 13.7M | for (auto* dep : _read_dependencies[i]) { |
330 | 13.7M | if (dep->is_blocked_by(shared_from_this())) { |
331 | 3.13M | return true; |
332 | 3.13M | } |
333 | 13.7M | } |
334 | | // If all dependencies are ready for this operator, we can execute this task if no datum is needed from upstream operators. |
335 | 8.34M | if (!_operators[i]->need_more_input_data(_state)) { |
336 | 55.0k | break; |
337 | 55.0k | } |
338 | 8.34M | } |
339 | 10.1M | } |
340 | 7.07M | return _memory_sufficient_dependency->is_blocked_by(shared_from_this()) || |
341 | 9.18M | std::ranges::any_of(_write_dependencies, [&](Dependency* dep) -> bool { |
342 | 9.18M | return dep->is_blocked_by(shared_from_this()); |
343 | 9.18M | }); |
344 | 10.2M | } |
345 | | |
346 | 1.26M | void PipelineTask::unblock_all_dependencies() { |
347 | | // We use a lock to assure all dependencies are not deconstructed here. |
348 | 1.26M | std::unique_lock<std::mutex> lc(_dependency_lock); |
349 | 1.26M | auto fragment = _fragment_context.lock(); |
350 | 1.26M | if (!is_finalized() && fragment) { |
351 | 45.6k | try { |
352 | 45.6k | DCHECK(_wake_up_early || fragment->is_canceled()); |
353 | 45.6k | std::ranges::for_each(_write_dependencies, |
354 | 69.3k | [&](Dependency* dep) { dep->set_always_ready(); }); |
355 | 45.6k | std::ranges::for_each(_finish_dependencies, |
356 | 45.6k | [&](Dependency* dep) { dep->set_always_ready(); }); |
357 | 48.6k | std::ranges::for_each(_read_dependencies, [&](std::vector<Dependency*>& deps) { |
358 | 59.0k | std::ranges::for_each(deps, [&](Dependency* dep) { dep->set_always_ready(); }); |
359 | 48.6k | }); |
360 | | // All `_execution_deps` will never be set blocking from ready. So we just set ready here. |
361 | 45.6k | std::ranges::for_each(_execution_dependencies, |
362 | 55.6k | [&](Dependency* dep) { dep->set_ready(); }); |
363 | 45.6k | _memory_sufficient_dependency->set_ready(); |
364 | 45.6k | } catch (const doris::Exception& e) { |
365 | 0 | LOG(WARNING) << "unblock_all_dependencies failed: " << e.code() << ", " |
366 | 0 | << e.to_string(); |
367 | 0 | } |
368 | 45.6k | } |
369 | 1.26M | } |
370 | | |
371 | | // When current memory pressure is low, memory usage may increase significantly in the next |
372 | | // operator run, while there is no revocable memory available for spilling. |
373 | | // Trigger memory revoking when pressure is high and revocable memory is significant. |
374 | | // Memory pressure is evaluated using two signals: |
375 | | // 1. Query memory usage exceeds a threshold ratio of the query memory limit. |
376 | | // 2. Workload group memory usage reaches the workload group low-watermark threshold. |
377 | 8.08M | bool PipelineTask::_should_trigger_revoking(const size_t reserve_size) const { |
378 | 8.08M | if (!_state->enable_spill()) { |
379 | 8.07M | return false; |
380 | 8.07M | } |
381 | | |
382 | 8.85k | auto query_mem_tracker = _state->get_query_ctx()->query_mem_tracker(); |
383 | 8.85k | auto wg = _state->get_query_ctx()->workload_group(); |
384 | 15.3k | if (!query_mem_tracker || !wg) { |
385 | 2.08k | return false; |
386 | 2.08k | } |
387 | | |
388 | 6.76k | const auto parallelism = std::max(1, _pipeline->num_tasks()); |
389 | 6.76k | const auto query_water_mark = 90; // 90% |
390 | 6.76k | const auto group_mem_limit = wg->memory_limit(); |
391 | 6.76k | auto query_limit = query_mem_tracker->limit(); |
392 | 6.76k | if (query_limit <= 0) { |
393 | 1 | query_limit = group_mem_limit; |
394 | 6.76k | } else if (query_limit > group_mem_limit && group_mem_limit > 0) { |
395 | 724 | query_limit = group_mem_limit; |
396 | 724 | } |
397 | | |
398 | 6.76k | if (query_limit <= 0) { |
399 | 1 | return false; |
400 | 1 | } |
401 | | |
402 | 13.2k | if ((reserve_size * parallelism) <= (query_limit / 5)) { |
403 | 13.2k | return false; |
404 | 13.2k | } |
405 | | |
406 | 18.4E | bool is_high_memory_pressure = false; |
407 | 18.4E | const auto used_mem = query_mem_tracker->consumption() + reserve_size * parallelism; |
408 | 18.4E | if (used_mem >= int64_t((double(query_limit) * query_water_mark / 100))) { |
409 | 2 | is_high_memory_pressure = true; |
410 | 2 | } |
411 | | |
412 | 18.4E | if (!is_high_memory_pressure) { |
413 | 4 | bool is_low_watermark; |
414 | 4 | bool is_high_watermark; |
415 | 4 | wg->check_mem_used(&is_low_watermark, &is_high_watermark); |
416 | 4 | is_high_memory_pressure = is_low_watermark || is_high_watermark; |
417 | 4 | } |
418 | | |
419 | 18.4E | if (is_high_memory_pressure) { |
420 | 4 | const auto revocable_size = [&]() { |
421 | 4 | size_t total = _sink->revocable_mem_size(_state); |
422 | 4 | for (const auto& op : _operators) { |
423 | 4 | total += op->revocable_mem_size(_state); |
424 | 4 | } |
425 | 4 | return total; |
426 | 4 | }(); |
427 | | |
428 | 4 | const auto total_estimated_revocable = revocable_size * parallelism; |
429 | 4 | return total_estimated_revocable >= int64_t(double(query_limit) * 0.2); |
430 | 4 | } |
431 | | |
432 | 18.4E | return false; |
433 | 18.4E | } |
434 | | |
435 | | /**
436 | | * `_eos` indicates whether the execution phase is done. `done` indicates whether we can close
437 | | * this task.
438 | | *
439 | | * For example,
440 | | * 1. if `_eos` is false, we should continue to get the next block, so we cannot close (i.e.
441 | | * `done` is false)
442 | | * 2. if `_eos` is true, all blocks from the source are exhausted, but if `_is_pending_finish()`
443 | | * is true we must still wait for a pending dependency to become ready (maybe a running rpc),
444 | | * so we cannot close (i.e. `done` is false)
445 | | * 3. if `_eos` is true, all blocks from the source are exhausted, and if `_is_pending_finish()`
446 | | * is false we can close immediately (i.e. `done` is true)
447 | | * @param done output flag; set to true when this task can be closed
448 | | * @return the status of this execution round
449 | | */
450 | 7.32M | Status PipelineTask::execute(bool* done) { |
451 | 7.32M | if (_exec_state != State::RUNNABLE || _blocked_dep != nullptr) [[unlikely]] { |
452 | | #ifdef BE_TEST |
453 | | return Status::InternalError("Pipeline task is not runnable! Task info: {}", |
454 | | debug_string()); |
455 | | #else |
456 | 1 | return Status::FatalError("Pipeline task is not runnable! Task info: {}", debug_string()); |
457 | 1 | #endif |
458 | 1 | } |
459 | | |
460 | 7.32M | auto fragment_context = _fragment_context.lock(); |
461 | 7.32M | if (!fragment_context) { |
462 | 0 | return Status::InternalError("Fragment already finished! Query: {}", print_id(_query_id)); |
463 | 0 | } |
464 | 7.32M | int64_t time_spent = 0; |
465 | 7.32M | ThreadCpuStopWatch cpu_time_stop_watch; |
466 | 7.32M | cpu_time_stop_watch.start(); |
467 | 7.32M | SCOPED_ATTACH_TASK(_state); |
468 | 7.32M | Defer running_defer {[&]() { |
469 | 7.31M | int64_t delta_cpu_time = cpu_time_stop_watch.elapsed_time(); |
470 | 7.31M | _task_cpu_timer->update(delta_cpu_time); |
471 | 7.31M | fragment_context->get_query_ctx()->resource_ctx()->cpu_context()->update_cpu_cost_ms( |
472 | 7.31M | delta_cpu_time); |
473 | | |
474 | | // If task is woke up early, we should terminate all operators, and this task could be closed immediately. |
475 | 7.31M | if (_wake_up_early) { |
476 | 7.24k | _eos = true; |
477 | 7.24k | *done = true; |
478 | 7.31M | } else if (_eos && !_spilling && |
479 | 7.31M | (fragment_context->is_canceled() || !_is_pending_finish())) { |
480 | | // Debug point for testing the race condition fix: inject set_wake_up_early() + |
481 | | // unblock_all_dependencies() here to simulate Thread B writing A then B between |
482 | | // Thread A's two reads of _wake_up_early. |
483 | 2.01M | DBUG_EXECUTE_IF("PipelineTask::execute.wake_up_early_in_else_if", { |
484 | 2.01M | set_wake_up_early(); |
485 | 2.01M | unblock_all_dependencies(); |
486 | 2.01M | }); |
487 | 2.01M | *done = true; |
488 | 2.01M | } |
489 | | |
490 | | // NOTE: The operator terminate() call is intentionally placed AFTER the |
491 | | // _is_pending_finish() check above, not before. This ordering is critical to avoid a race |
492 | | // condition with the seq_cst memory ordering guarantee: |
493 | | // |
494 | | // Pipeline::make_all_runnable() writes in this order: |
495 | | // (A) set_wake_up_early() -> (B) unblock_all_dependencies() [sets finish_dep._always_ready] |
496 | | // |
497 | | // If we checked _wake_up_early (A) before _is_pending_finish() (B), there would be a |
498 | | // window where Thread A reads _wake_up_early=false, then Thread B writes both A and B, |
499 | | // then Thread A reads _is_pending_finish()=false (due to _always_ready). Thread A would |
500 | | // then set *done=true without ever calling operator terminate(), causing close() to run |
501 | | // on operators that were never properly terminated (e.g. RuntimeFilterProducer still in |
502 | | // WAITING_FOR_SYNCED_SIZE state when insert() is called). |
503 | | // |
504 | | // By reading _is_pending_finish() (B) before the second read of _wake_up_early (A), |
505 | | // if Thread A observes B's effect (_always_ready=true), it is guaranteed to also observe |
506 | | // A's effect (_wake_up_early=true) on this second read, ensuring operator terminate() is |
507 | | // called. This relies on _wake_up_early and _always_ready both being std::atomic with the |
508 | | // default seq_cst ordering — do not weaken them to relaxed or acq/rel. |
509 | 7.31M | if (_wake_up_early) { |
510 | 7.30k | THROW_IF_ERROR(_root->terminate(_state)); |
511 | 7.30k | THROW_IF_ERROR(_sink->terminate(_state)); |
512 | 7.30k | } |
513 | 7.31M | }}; |
514 | 7.32M | const auto query_id = _state->query_id(); |
515 | | // If this task is already EOS and block is empty (which means we already output all blocks), |
516 | | // just return here. |
517 | 7.32M | if (_eos && !_spilling) { |
518 | 285k | return Status::OK(); |
519 | 285k | } |
520 | | // If this task is blocked by a spilling request and waken up immediately, the spilling |
521 | | // dependency will not block this task and we should just run here. |
522 | 7.04M | if (!_block->empty()) { |
523 | 0 | LOG(INFO) << "Query: " << print_id(query_id) << " has pending block, size: " |
524 | 0 | << PrettyPrinter::print_bytes(_block->allocated_bytes()); |
525 | 0 | DCHECK(_spilling); |
526 | 0 | } |
527 | | |
528 | 7.04M | SCOPED_TIMER(_task_profile->total_time_counter()); |
529 | 7.04M | SCOPED_TIMER(_exec_timer); |
530 | | |
531 | 7.04M | if (!_wake_up_early) { |
532 | 7.04M | RETURN_IF_ERROR(_prepare()); |
533 | 7.04M | } |
534 | 7.04M | DBUG_EXECUTE_IF("fault_inject::PipelineXTask::execute", { |
535 | 7.04M | Status status = Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task execute failed"); |
536 | 7.04M | return status; |
537 | 7.04M | }); |
538 | | // `_wake_up_early` must be after `_wait_to_start()` |
539 | 7.04M | if (_wait_to_start() || _wake_up_early) { |
540 | 1.81M | return Status::OK(); |
541 | 1.81M | } |
542 | 5.23M | RETURN_IF_ERROR(_prepare()); |
543 | | |
544 | | // The status must be runnable |
545 | 5.23M | if (!_opened && !fragment_context->is_canceled()) { |
546 | 2.03M | DBUG_EXECUTE_IF("PipelineTask::execute.open_sleep", { |
547 | 2.03M | auto required_pipeline_id = |
548 | 2.03M | DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
549 | 2.03M | "PipelineTask::execute.open_sleep", "pipeline_id", -1); |
550 | 2.03M | auto required_task_id = DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
551 | 2.03M | "PipelineTask::execute.open_sleep", "task_id", -1); |
552 | 2.03M | if (required_pipeline_id == pipeline_id() && required_task_id == task_id()) { |
553 | 2.03M | LOG(WARNING) << "PipelineTask::execute.open_sleep sleep 5s"; |
554 | 2.03M | sleep(5); |
555 | 2.03M | } |
556 | 2.03M | }); |
557 | | |
558 | 2.03M | SCOPED_RAW_TIMER(&time_spent); |
559 | 2.03M | RETURN_IF_ERROR(_open()); |
560 | 2.03M | } |
561 | | |
562 | 10.2M | while (!fragment_context->is_canceled()) { |
563 | 10.2M | SCOPED_RAW_TIMER(&time_spent); |
564 | 10.2M | Defer defer {[&]() { |
565 | | // If this run is pended by a spilling request, the block will be output in next run. |
566 | 10.2M | if (!_spilling) { |
567 | 10.2M | _block->clear_column_data(_root->row_desc().num_materialized_slots()); |
568 | 10.2M | } |
569 | 10.2M | }}; |
570 | | // `_wake_up_early` must be after `_is_blocked()` |
571 | 10.2M | if (_is_blocked() || _wake_up_early) { |
572 | 3.15M | return Status::OK(); |
573 | 3.15M | } |
574 | | |
575 | | /// When a task is cancelled, |
576 | | /// its blocking state will be cleared and it will transition to a ready state (though it is not truly ready). |
577 | | /// Here, checking whether it is cancelled to prevent tasks in a blocking state from being re-executed. |
578 | 7.06M | if (fragment_context->is_canceled()) { |
579 | 1 | break; |
580 | 1 | } |
581 | | |
582 | 7.06M | if (time_spent > _exec_time_slice) { |
583 | 45.5k | COUNTER_UPDATE(_yield_counts, 1); |
584 | 45.5k | break; |
585 | 45.5k | } |
586 | 7.02M | auto* block = _block.get(); |
587 | | |
588 | 7.02M | DBUG_EXECUTE_IF("fault_inject::PipelineXTask::executing", { |
589 | 7.02M | Status status = |
590 | 7.02M | Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task executing failed"); |
591 | 7.02M | return status; |
592 | 7.02M | }); |
593 | | |
594 | | // `_sink->is_finished(_state)` means sink operator should be finished |
595 | 7.02M | if (_sink->is_finished(_state)) { |
596 | 12 | set_wake_up_early(); |
597 | 12 | return Status::OK(); |
598 | 12 | } |
599 | | |
600 | | // `_dry_run` means sink operator need no more data |
601 | 7.02M | _eos = _dry_run || _eos; |
602 | 7.02M | _spilling = false; |
603 | 7.02M | auto workload_group = _state->workload_group(); |
604 | | // If last run is pended by a spilling request, `_block` is produced with some rows in last |
605 | | // run, so we will resume execution using the block. |
606 | 7.02M | if (!_eos && _block->empty()) { |
607 | 6.99M | SCOPED_TIMER(_get_block_timer); |
608 | 6.99M | if (_state->low_memory_mode()) { |
609 | 0 | _sink->set_low_memory_mode(_state); |
610 | 0 | for (auto& op : _operators) { |
611 | 0 | op->set_low_memory_mode(_state); |
612 | 0 | } |
613 | 0 | } |
614 | 6.99M | DEFER_RELEASE_RESERVED(); |
615 | 6.99M | _get_block_counter->update(1); |
616 | | // Sum reserve sizes across all operators in this pipeline. |
617 | | // Each operator reports only its own requirement (non-recursive). |
618 | 6.99M | size_t reserve_size = 0; |
619 | 7.76M | for (auto& op : _operators) { |
620 | 7.76M | reserve_size += op->get_reserve_mem_size(_state); |
621 | 7.76M | op->reset_reserve_mem_size(_state); |
622 | 7.76M | } |
623 | 6.99M | if (workload_group && |
624 | 6.99M | _state->get_query_ctx() |
625 | 5.56M | ->resource_ctx() |
626 | 5.56M | ->task_controller() |
627 | 5.56M | ->is_enable_reserve_memory() && |
628 | 6.99M | reserve_size > 0) { |
629 | 5.52M | if (_should_trigger_revoking(reserve_size)) { |
630 | 0 | LOG(INFO) << fmt::format( |
631 | 0 | "Query: {} sink: {}, node id: {}, task id: {}, reserve size: {} when " |
632 | 0 | "high memory pressure, try to spill", |
633 | 0 | print_id(_query_id), _sink->get_name(), _sink->node_id(), |
634 | 0 | _state->task_id(), reserve_size); |
635 | 0 | ExecEnv::GetInstance()->workload_group_mgr()->add_paused_query( |
636 | 0 | _state->get_query_ctx()->resource_ctx()->shared_from_this(), |
637 | 0 | reserve_size, |
638 | 0 | Status::Error<ErrorCode::QUERY_MEMORY_EXCEEDED>( |
639 | 0 | "high memory pressure, try to spill")); |
640 | 0 | _spilling = true; |
641 | 0 | continue; |
642 | 0 | } |
643 | 5.52M | if (!_try_to_reserve_memory(reserve_size, _root)) { |
644 | 1.11k | continue; |
645 | 1.11k | } |
646 | 5.52M | } |
647 | | |
648 | 6.99M | bool eos = false; |
649 | 6.99M | RETURN_IF_ERROR(_root->get_block_after_projects(_state, block, &eos)); |
650 | 6.99M | RETURN_IF_ERROR(block->check_type_and_column()); |
651 | 6.99M | _eos = eos; |
652 | 6.99M | } |
653 | | |
654 | 7.01M | if (!_block->empty() || _eos) { |
655 | 2.80M | SCOPED_TIMER(_sink_timer); |
656 | 2.80M | Status status = Status::OK(); |
657 | 2.80M | DEFER_RELEASE_RESERVED(); |
658 | 2.80M | if (_state->get_query_ctx() |
659 | 2.80M | ->resource_ctx() |
660 | 2.80M | ->task_controller() |
661 | 2.80M | ->is_enable_reserve_memory() && |
662 | 2.80M | workload_group && !(_wake_up_early || _dry_run)) { |
663 | 2.79M | const auto sink_reserve_size = _sink->get_reserve_mem_size(_state, _eos); |
664 | | |
665 | 2.79M | if (sink_reserve_size > 0 && _should_trigger_revoking(sink_reserve_size)) { |
666 | 0 | LOG(INFO) << fmt::format( |
667 | 0 | "Query: {} sink: {}, node id: {}, task id: {}, reserve size: {} when " |
668 | 0 | "high memory pressure, try to spill", |
669 | 0 | print_id(_query_id), _sink->get_name(), _sink->node_id(), |
670 | 0 | _state->task_id(), sink_reserve_size); |
671 | 0 | ExecEnv::GetInstance()->workload_group_mgr()->add_paused_query( |
672 | 0 | _state->get_query_ctx()->resource_ctx()->shared_from_this(), |
673 | 0 | sink_reserve_size, |
674 | 0 | Status::Error<ErrorCode::QUERY_MEMORY_EXCEEDED>( |
675 | 0 | "high memory pressure, try to spill")); |
676 | 0 | _spilling = true; |
677 | 0 | continue; |
678 | 0 | } |
679 | | |
680 | 2.79M | if (sink_reserve_size > 0 && |
681 | 2.79M | !_try_to_reserve_memory(sink_reserve_size, _sink.get())) { |
682 | 971 | continue; |
683 | 971 | } |
684 | 2.79M | } |
685 | | |
686 | 2.80M | DBUG_EXECUTE_IF("PipelineTask::execute.sink_eos_sleep", { |
687 | 2.80M | auto required_pipeline_id = |
688 | 2.80M | DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
689 | 2.80M | "PipelineTask::execute.sink_eos_sleep", "pipeline_id", -1); |
690 | 2.80M | auto required_task_id = |
691 | 2.80M | DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
692 | 2.80M | "PipelineTask::execute.sink_eos_sleep", "task_id", -1); |
693 | 2.80M | if (required_pipeline_id == pipeline_id() && required_task_id == task_id()) { |
694 | 2.80M | LOG(WARNING) << "PipelineTask::execute.sink_eos_sleep sleep 10s"; |
695 | 2.80M | sleep(10); |
696 | 2.80M | } |
697 | 2.80M | }); |
698 | | |
699 | 2.80M | DBUG_EXECUTE_IF("PipelineTask::execute.terminate", { |
700 | 2.80M | if (_eos) { |
701 | 2.80M | auto required_pipeline_id = |
702 | 2.80M | DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
703 | 2.80M | "PipelineTask::execute.terminate", "pipeline_id", -1); |
704 | 2.80M | auto required_task_id = |
705 | 2.80M | DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
706 | 2.80M | "PipelineTask::execute.terminate", "task_id", -1); |
707 | 2.80M | auto required_fragment_id = |
708 | 2.80M | DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
709 | 2.80M | "PipelineTask::execute.terminate", "fragment_id", -1); |
710 | 2.80M | if (required_pipeline_id == pipeline_id() && required_task_id == task_id() && |
711 | 2.80M | fragment_context->get_fragment_id() == required_fragment_id) { |
712 | 2.80M | _wake_up_early = true; |
713 | 2.80M | unblock_all_dependencies(); |
714 | 2.80M | } else if (required_pipeline_id == pipeline_id() && |
715 | 2.80M | fragment_context->get_fragment_id() == required_fragment_id) { |
716 | 2.80M | LOG(WARNING) << "PipelineTask::execute.terminate sleep 5s"; |
717 | 2.80M | sleep(5); |
718 | 2.80M | } |
719 | 2.80M | } |
720 | 2.80M | }); |
721 | 2.80M | RETURN_IF_ERROR(block->check_type_and_column()); |
722 | 2.80M | status = _sink->sink(_state, block, _eos); |
723 | | |
724 | 2.80M | if (_eos) { |
725 | 2.02M | if (_sink->reset_to_rerun(_state, _root)) { |
726 | 1.97k | _eos = false; |
727 | 2.02M | } else { |
728 | 2.02M | RETURN_IF_ERROR(close(Status::OK(), false)); |
729 | 2.02M | } |
730 | 2.02M | } |
731 | | |
732 | 2.80M | if (status.is<ErrorCode::END_OF_FILE>()) { |
733 | 21 | set_wake_up_early(); |
734 | 21 | return Status::OK(); |
735 | 2.80M | } else if (!status) { |
736 | 13 | return status; |
737 | 13 | } |
738 | | |
739 | 2.80M | if (_eos) { // just return, the scheduler will do finish work |
740 | 2.02M | return Status::OK(); |
741 | 2.02M | } |
742 | 2.80M | } |
743 | 7.01M | } |
744 | | |
745 | 51.2k | RETURN_IF_ERROR(_state->get_query_ctx()->get_pipe_exec_scheduler()->submit(shared_from_this())); |
746 | 51.2k | return Status::OK(); |
747 | 51.2k | } |
748 | | |
// Revokes memory from this task's operators and then from its sink.
//
// Only operators/sink whose revocable_mem_size() is at least
// SpillFile::MIN_SPILL_WRITE_BATCH_MEM are asked to revoke_memory().
//
// @param spill_context may be null; when non-null, on_task_finished() is called once
//        from the deferred callback below.
// @return InternalError when the fragment context has already expired, the first
//         revoke_memory() error otherwise, or OK.
// NOTE(review): the early "fragment already finished" return happens before the Defer is
// created, so spill_context->on_task_finished() is not called on that path - confirm the
// caller accounts for it.
749 | 7 | Status PipelineTask::do_revoke_memory(const std::shared_ptr<SpillContext>& spill_context) { |
750 | 7 | auto fragment_context = _fragment_context.lock(); |
751 | 7 | if (!fragment_context) { |
752 | 1 | return Status::InternalError("Fragment already finished! Query: {}", print_id(_query_id)); |
753 | 1 | } |
754 | | |
755 | 6 | SCOPED_ATTACH_TASK(_state); |
756 | 6 | ThreadCpuStopWatch cpu_time_stop_watch; |
757 | 6 | cpu_time_stop_watch.start(); |
// Deferred bookkeeping: runs on every exit path below, including early error returns.
758 | 6 | Defer running_defer {[&]() { |
759 | 6 | int64_t delta_cpu_time = cpu_time_stop_watch.elapsed_time(); |
760 | 6 | _task_cpu_timer->update(delta_cpu_time); |
// NOTE(review): the raw elapsed_time() value is passed to update_cpu_cost_ms - confirm units.
761 | 6 | fragment_context->get_query_ctx()->resource_ctx()->cpu_context()->update_cpu_cost_ms( |
762 | 6 | delta_cpu_time); |
763 | | |
764 | | // If the task was woken up early, unblock all dependencies and terminate the root operator and the sink, |
765 | | // so that this task can be closed immediately. |
// NOTE(review): THROW_IF_ERROR is used inside the deferred callback - confirm that Defer
// tolerates an error escaping from here.
766 | 6 | if (_wake_up_early) { |
767 | 1 | unblock_all_dependencies(); |
768 | 1 | THROW_IF_ERROR(_root->terminate(_state)); |
769 | 1 | THROW_IF_ERROR(_sink->terminate(_state)); |
770 | 1 | _eos = true; |
771 | 1 | } |
772 | | |
773 | | // SpillContext tracks pipeline task count, not operator count. |
774 | | // Notify completion once, after all operators and the sink have finished revoking. |
775 | 6 | if (spill_context) { |
776 | 3 | spill_context->on_task_finished(); |
777 | 3 | } |
778 | 6 | }}; |
779 | | |
780 | | // Revoke memory from every operator that has enough revocable memory, |
781 | | // then revoke from the sink. The first failure aborts the remaining revokes. |
782 | 6 | for (auto& op : _operators) { |
783 | 6 | if (op->revocable_mem_size(_state) >= SpillFile::MIN_SPILL_WRITE_BATCH_MEM) { |
784 | 2 | RETURN_IF_ERROR(op->revoke_memory(_state)); |
785 | 2 | } |
786 | 6 | } |
787 | | |
788 | 6 | if (_sink->revocable_mem_size(_state) >= SpillFile::MIN_SPILL_WRITE_BATCH_MEM) { |
789 | 1 | RETURN_IF_ERROR(_sink->revoke_memory(_state)); |
790 | 1 | } |
791 | 6 | return Status::OK(); |
792 | 6 | } |
793 | | |
// Tries to reserve `reserve_size` bytes through the current thread's memory tracker.
//
// @param reserve_size number of bytes to reserve.
// @param op not referenced anywhere in this function body.
// @return true when the caller may proceed: the reservation succeeded (and force spill did
//         not veto it), or it failed while spill is disabled, in which case reserve-memory
//         is disabled for the query.
//         false when the query has been registered as paused with the workload group
//         manager; _spilling is set to true in that case.
794 | 8.09M | bool PipelineTask::_try_to_reserve_memory(const size_t reserve_size, OperatorBase* op) { |
795 | 8.09M | auto st = thread_context()->thread_mem_tracker_mgr->try_reserve(reserve_size); |
796 | | // If the reservation failed and spill is not enabled for the query, just disable reserve memory (this will enable |
797 | | // the memory hard limit check, which will cancel the query if a memory allocation fails) and let it run. |
// This path returns before _memory_reserve_times is updated.
798 | 8.09M | if (!st.ok() && !_state->enable_spill()) { |
799 | 2 | LOG(INFO) << print_id(_query_id) << " reserve memory failed due to " << st |
800 | 2 | << ", and it is not enable spill, disable reserve memory and let it run"; |
801 | 2 | _state->get_query_ctx()->resource_ctx()->task_controller()->disable_reserve_memory(); |
802 | 2 | return true; |
803 | 2 | } |
804 | 8.09M | COUNTER_UPDATE(_memory_reserve_times, 1); |
805 | | |
806 | | // Both values stay 0 unless the reservation failed or force spill is enabled (see below). |
807 | 8.09M | size_t total_revocable_mem_size = 0; |
808 | 8.09M | size_t operator_max_revocable_mem_size = 0; |
809 | | |
810 | 8.09M | if (!st.ok() || _state->enable_force_spill()) { |
811 | | // Start from the sink, then fold in every operator: the sum gives the total, the max gives the largest single holder. |
812 | 8.91k | total_revocable_mem_size = _sink->revocable_mem_size(_state); |
813 | 8.91k | operator_max_revocable_mem_size = total_revocable_mem_size; |
814 | 9.41k | for (auto& cur_op : _operators) { |
815 | 9.41k | total_revocable_mem_size += cur_op->revocable_mem_size(_state); |
816 | 9.41k | operator_max_revocable_mem_size = |
817 | 9.41k | std::max(cur_op->revocable_mem_size(_state), operator_max_revocable_mem_size); |
818 | 9.41k | } |
819 | 8.91k | } |
820 | | |
821 | | // When force spill is enabled, other operators such as the scan operator also try to reserve memory and would fail |
822 | | // here; without this check the query would be paused and resumed again and again. |
// A successful reservation is only turned into a failure when some single operator or the
// sink holds at least spill_min_revocable_mem() of revocable memory.
823 | 8.09M | if (st.ok() && _state->enable_force_spill()) { |
824 | 6.82k | if (operator_max_revocable_mem_size >= _state->spill_min_revocable_mem()) { |
825 | 0 | st = Status::Error<ErrorCode::QUERY_MEMORY_EXCEEDED>( |
826 | 0 | "force spill and there is an operator has memory " |
827 | 0 | "size {} exceeds min mem size {}", |
828 | 0 | PrettyPrinter::print_bytes(operator_max_revocable_mem_size), |
829 | 0 | PrettyPrinter::print_bytes(_state->spill_min_revocable_mem())); |
830 | 0 | } |
831 | 6.82k | } |
832 | | |
833 | 8.09M | if (!st.ok()) { |
834 | 2.08k | COUNTER_UPDATE(_memory_reserve_failed_times, 1); |
835 | | // Build a per-operator revocable memory info string for debugging. |
836 | 2.08k | std::string ops_revocable_info; |
837 | 2.08k | { |
838 | 2.08k | fmt::memory_buffer buf; |
839 | 2.08k | for (auto& cur_op : _operators) { |
840 | 2.08k | fmt::format_to(buf, "{}({})-> ", cur_op->get_name(), |
841 | 2.08k | PrettyPrinter::print_bytes(cur_op->revocable_mem_size(_state))); |
842 | 2.08k | } |
843 | 2.08k | if (_sink) { |
844 | 2.08k | fmt::format_to(buf, "{}({}) ", _sink->get_name(), |
845 | 2.08k | PrettyPrinter::print_bytes(_sink->revocable_mem_size(_state))); |
846 | 2.08k | } |
847 | 2.08k | ops_revocable_info = fmt::to_string(buf); |
848 | 2.08k | } |
849 | | |
850 | 2.08k | auto debug_msg = fmt::format( |
851 | 2.08k | "Query: {} , try to reserve: {}, total revocable mem size: {}, failed reason: {}", |
852 | 2.08k | print_id(_query_id), PrettyPrinter::print_bytes(reserve_size), |
853 | 2.08k | PrettyPrinter::print_bytes(total_revocable_mem_size), st.to_string()); |
854 | 2.08k | if (!ops_revocable_info.empty()) { |
855 | 2.08k | debug_msg += fmt::format(", ops_revocable=[{}]", ops_revocable_info); |
856 | 2.08k | } |
857 | | // The PROCESS_MEMORY_EXCEEDED error message already contains process_mem_log_str, so do not append it twice. |
858 | 2.08k | if (!st.is<ErrorCode::PROCESS_MEMORY_EXCEEDED>()) { |
859 | 2.08k | debug_msg += |
860 | 2.08k | fmt::format(", debug info: {}", GlobalMemoryArbitrator::process_mem_log_str()); |
861 | 2.08k | } |
862 | 2.08k | LOG(INFO) << debug_msg; |
// Hand the query over to the workload group manager as paused, together with the failure status.
863 | 2.08k | ExecEnv::GetInstance()->workload_group_mgr()->add_paused_query( |
864 | 2.08k | _state->get_query_ctx()->resource_ctx()->shared_from_this(), reserve_size, st); |
865 | 2.08k | _spilling = true; |
866 | 2.08k | return false; |
867 | 2.08k | } |
868 | 8.09M | return true; |
869 | 8.09M | } |
870 | | |
// If the sink reports is_finished(), marks this task as woken up early and unblocks all of
// its dependencies. Does nothing when the fragment context has expired or _sink is null.
871 | 879k | void PipelineTask::stop_if_finished() { |
872 | 879k | auto fragment = _fragment_context.lock(); |
873 | 879k | if (!fragment) { |
874 | 0 | return; |
875 | 0 | } |
876 | 879k | SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(fragment->get_query_ctx()->query_mem_tracker()); |
// Work on a local copy of _sink; the member itself is reset in finalize().
877 | 879k | if (auto sink = _sink) { |
878 | 718k | if (sink->is_finished(_state)) { |
879 | 1.29k | set_wake_up_early(); |
880 | 1.29k | unblock_all_dependencies(); |
881 | 1.29k | } |
882 | 718k | } |
883 | 879k | } |
884 | | |
// Moves the task to State::FINALIZED and drops what it holds: shared states, the block,
// the operators, the sink and the pipeline.
//
// @return OK without doing anything when the fragment context has expired; the error from
//         _state_transition() when the transition is rejected (nothing is released then);
//         OK otherwise.
885 | 2.02M | Status PipelineTask::finalize() { |
886 | 2.02M | auto fragment = _fragment_context.lock(); |
887 | 2.02M | if (!fragment) { |
888 | 0 | return Status::OK(); |
889 | 0 | } |
890 | 2.02M | SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(fragment->get_query_ctx()->query_mem_tracker()); |
891 | 2.02M | RETURN_IF_ERROR(_state_transition(State::FINALIZED)); |
// The members below are released while holding _dependency_lock (debug_string() takes the
// same lock before reading _operators and _sink).
892 | 2.02M | std::unique_lock<std::mutex> lc(_dependency_lock); |
893 | 2.02M | _sink_shared_state.reset(); |
894 | 2.02M | _op_shared_states.clear(); |
895 | 2.02M | _shared_state_map.clear(); |
896 | 2.02M | _block.reset(); |
897 | 2.02M | _operators.clear(); |
898 | 2.02M | _sink.reset(); |
899 | 2.02M | _pipeline.reset(); |
900 | 2.02M | return Status::OK(); |
901 | 2.02M | } |
902 | | |
// Closes every operator of this task and, when `close_sink` is true, the sink as well.
//
// @param exec_status forwarded to _sink->close() when the sink is closed.
// @param close_sink  when true, additionally closes the sink, refreshes the profile (if the
//                    task was opened) and transitions the task to State::FINISHED.
// @return the error of a rejected FINISHED transition if there is one; otherwise the first
//         non-OK status among the sink close and the operator closes (in that order), or OK.
//         All operators are closed even after an earlier close has failed.
903 | 4.04M | Status PipelineTask::close(Status exec_status, bool close_sink) { |
904 | 4.04M | int64_t close_ns = 0; |
905 | 4.04M | Status s; |
906 | 4.04M | { |
// Only the sink/operator close calls inside this scope are timed into close_ns.
907 | 4.04M | SCOPED_RAW_TIMER(&close_ns); |
908 | 4.04M | if (close_sink) { |
909 | 2.03M | s = _sink->close(_state, exec_status); |
910 | 2.03M | } |
911 | 5.16M | for (auto& op : _operators) { |
912 | 5.16M | auto tem = op->close(_state); |
// Keep the earliest failure; later failures do not overwrite it.
913 | 5.16M | if (!tem.ok() && s.ok()) { |
914 | 9 | s = std::move(tem); |
915 | 9 | } |
916 | 5.16M | } |
917 | 4.04M | } |
// Timers are only updated when the task was opened.
918 | 4.05M | if (_opened) { |
919 | 4.05M | COUNTER_UPDATE(_close_timer, close_ns); |
920 | 4.05M | COUNTER_UPDATE(_task_profile->total_time_counter(), close_ns); |
921 | 4.05M | } |
922 | | |
923 | 4.04M | if (close_sink && _opened) { |
924 | 2.03M | _task_profile->add_info_string("WakeUpEarly", std::to_string(_wake_up_early.load())); |
925 | 2.03M | _fresh_profile_counter(); |
926 | 2.03M | } |
927 | | |
928 | 4.04M | if (close_sink) { |
929 | 2.03M | RETURN_IF_ERROR(_state_transition(State::FINISHED)); |
930 | 2.03M | } |
931 | 4.04M | return s; |
932 | 4.04M | } |
933 | | |
// Builds a multi-line, human-readable description of this task: ids, state flags, the
// dependency it is blocked on, every operator and the sink, and all read / write /
// execution / finish dependencies.
//
// When the task is finalized or the fragment context has expired, only the header plus
// the pipeline name is returned, because operators and sink are not accessed in that case.
// NOTE(review): `_state` is dereferenced for the InstanceId line before the later
// `_state && ...` null check - confirm `_state` can never be null here.
934 | 17.4k | std::string PipelineTask::debug_string() { |
935 | 17.4k | fmt::memory_buffer debug_string_buffer; |
936 | | |
937 | 17.4k | fmt::format_to(debug_string_buffer, "QueryId: {}\n", print_id(_query_id)); |
938 | 17.4k | fmt::format_to(debug_string_buffer, "InstanceId: {}\n", |
939 | 17.4k | print_id(_state->fragment_instance_id())); |
940 | | |
941 | 17.4k | fmt::format_to(debug_string_buffer, |
942 | 17.4k | "PipelineTask[id = {}, open = {}, eos = {}, state = {}, dry run = " |
943 | 17.4k | "{}, _wake_up_early = {}, _wake_up_by = {}, time elapsed since last state " |
944 | 17.4k | "changing = {}s, spilling = {}, is running = {}]", |
945 | 17.4k | _index, _opened, _eos, _to_string(_exec_state), _dry_run, _wake_up_early.load(), |
946 | 17.4k | _wake_by, _state_change_watcher.elapsed_time() / NANOS_PER_SEC, _spilling, |
947 | 17.4k | is_running()); |
// From here on _dependency_lock is held; finalize() clears _operators/_sink under the same lock.
948 | 17.4k | std::unique_lock<std::mutex> lc(_dependency_lock); |
949 | 17.4k | auto* cur_blocked_dep = _blocked_dep; |
950 | 17.4k | auto fragment = _fragment_context.lock(); |
951 | 17.4k | if (is_finalized() || !fragment) { |
952 | 62 | fmt::format_to(debug_string_buffer, " pipeline name = {}", _pipeline_name); |
953 | 62 | return fmt::to_string(debug_string_buffer); |
954 | 62 | } |
// NOTE(review): the `!is_finalized()` checks below come after the early return above taken
// when is_finalized() is true - presumably redundant while the lock is held; confirm.
955 | 17.4k | auto elapsed = fragment->elapsed_time() / NANOS_PER_SEC; |
956 | 17.4k | fmt::format_to(debug_string_buffer, " elapse time = {}s, block dependency = [{}]\n", elapsed, |
957 | 17.4k | cur_blocked_dep && !is_finalized() ? cur_blocked_dep->debug_string() : "NULL"); |
958 | | |
959 | 17.4k | if (_state && _state->local_runtime_filter_mgr()) { |
960 | 370 | fmt::format_to(debug_string_buffer, "local_runtime_filter_mgr: [{}]\n", |
961 | 370 | _state->local_runtime_filter_mgr()->debug_string()); |
962 | 370 | } |
963 | | |
// Operators (and the sink below) use the state-aware debug_string overload only when the task is opened.
964 | 17.4k | fmt::format_to(debug_string_buffer, "operators: "); |
965 | 34.8k | for (size_t i = 0; i < _operators.size(); i++) { |
966 | 17.4k | fmt::format_to(debug_string_buffer, "\n{}", |
967 | 17.4k | _opened && !is_finalized() |
968 | 17.4k | ? _operators[i]->debug_string(_state, cast_set<int>(i)) |
969 | 17.4k | : _operators[i]->debug_string(cast_set<int>(i))); |
970 | 17.4k | } |
971 | 17.4k | fmt::format_to(debug_string_buffer, "\n{}\n", |
972 | 17.4k | _opened && !is_finalized() |
973 | 17.4k | ? _sink->debug_string(_state, cast_set<int>(_operators.size())) |
974 | 17.4k | : _sink->debug_string(cast_set<int>(_operators.size()))); |
975 | | |
976 | 17.4k | fmt::format_to(debug_string_buffer, "\nRead Dependency Information: \n"); |
977 | | |
// `i` is the running number printed in front of each dependency line; it keeps counting
// across the sections below.
978 | 17.4k | size_t i = 0; |
979 | 34.8k | for (; i < _read_dependencies.size(); i++) { |
980 | 34.8k | for (size_t j = 0; j < _read_dependencies[i].size(); j++) { |
981 | 17.4k | fmt::format_to(debug_string_buffer, "{}. {}\n", i, |
982 | 17.4k | _read_dependencies[i][j]->debug_string(cast_set<int>(i) + 1)); |
983 | 17.4k | } |
984 | 17.4k | } |
985 | | |
986 | 17.4k | fmt::format_to(debug_string_buffer, "{}. {}\n", i, |
987 | 17.4k | _memory_sufficient_dependency->debug_string(cast_set<int>(i++))); |
988 | | |
// NOTE(review): this section passes `j + 1` to debug_string() while the execution and
// finish sections pass `i + 1` - confirm which one is intended.
989 | 17.4k | fmt::format_to(debug_string_buffer, "\nWrite Dependency Information: \n"); |
990 | 34.7k | for (size_t j = 0; j < _write_dependencies.size(); j++, i++) { |
991 | 17.3k | fmt::format_to(debug_string_buffer, "{}. {}\n", i, |
992 | 17.3k | _write_dependencies[j]->debug_string(cast_set<int>(j) + 1)); |
993 | 17.3k | } |
994 | | |
995 | 17.4k | fmt::format_to(debug_string_buffer, "\nExecution Dependency Information: \n"); |
996 | 51.9k | for (size_t j = 0; j < _execution_dependencies.size(); j++, i++) { |
997 | 34.5k | fmt::format_to(debug_string_buffer, "{}. {}\n", i, |
998 | 34.5k | _execution_dependencies[j]->debug_string(cast_set<int>(i) + 1)); |
999 | 34.5k | } |
1000 | | |
1001 | 17.4k | fmt::format_to(debug_string_buffer, "Finish Dependency Information: \n"); |
1002 | 51.6k | for (size_t j = 0; j < _finish_dependencies.size(); j++, i++) { |
1003 | 34.2k | fmt::format_to(debug_string_buffer, "{}. {}\n", i, |
1004 | 34.2k | _finish_dependencies[j]->debug_string(cast_set<int>(i) + 1)); |
1005 | 34.2k | } |
1006 | 17.4k | return fmt::to_string(debug_string_buffer); |
1007 | 17.4k | } |
1008 | | |
// Returns the revocable memory currently held by this task: the sink's
// revocable_mem_size() plus that of every operator.
//
// Returns 0 when the task is not opened, is finalized, is currently running, or has
// reached eos without being in the spilling state.
1009 | 2 | size_t PipelineTask::get_revocable_size() const { |
1010 | 2 | if (!_opened || is_finalized() || _running || (_eos && !_spilling)) { |
1011 | 0 | return 0; |
1012 | 0 | } |
1013 | | |
1014 | | // Sum the revocable memory of the sink and of every operator in the pipeline. |
1015 | | // Each operator reports only its own revocable memory (no child recursion). |
1016 | 2 | size_t total = _sink->revocable_mem_size(_state); |
1017 | 2 | for (const auto& op : _operators) { |
1018 | 2 | total += op->revocable_mem_size(_state); |
1019 | 2 | } |
1020 | 2 | return total; |
1021 | 2 | } |
1022 | | |
// Asks this task to revoke its memory on behalf of `spill_context`.
//
// - Finalized task, or less than SpillFile::MIN_SPILL_WRITE_BATCH_MEM revocable:
//   spill_context->on_task_finished() is called right away and OK is returned.
// - Otherwise a RevokableTask wrapping this task is submitted to the query's scheduler.
//
// @param spill_context must be non-null (DCHECK).
// @return the scheduler's submit error, or OK.
// NOTE(review): when submit() fails, on_task_finished() is not called from this function -
// confirm the caller handles that case.
1023 | 3 | Status PipelineTask::revoke_memory(const std::shared_ptr<SpillContext>& spill_context) { |
1024 | 3 | DCHECK(spill_context); |
1025 | 3 | if (is_finalized()) { |
1026 | 1 | spill_context->on_task_finished(); |
1027 | 1 | VLOG_DEBUG << "Query: " << print_id(_state->query_id()) << ", task: " << ((void*)this) |
1028 | 0 | << " finalized"; |
1029 | 1 | return Status::OK(); |
1030 | 1 | } |
1031 | | |
1032 | 2 | const auto revocable_size = get_revocable_size(); |
1033 | 2 | if (revocable_size >= SpillFile::MIN_SPILL_WRITE_BATCH_MEM) { |
1034 | 1 | auto revokable_task = std::make_shared<RevokableTask>(shared_from_this(), spill_context); |
1035 | | // Submit a revocable task to run; its run method will call revoke memory. At this point the |
1036 | | // underlying pipeline task is still blocked. |
1037 | 1 | RETURN_IF_ERROR(_state->get_query_ctx()->get_pipe_exec_scheduler()->submit(revokable_task)); |
1038 | 1 | } else { |
1039 | 1 | spill_context->on_task_finished(); |
1040 | 1 | VLOG_DEBUG << "Query: " << print_id(_state->query_id()) << ", task: " << ((void*)this) |
1041 | 0 | << " has not enough data to revoke: " << revocable_size; |
1042 | 1 | } |
1043 | 2 | return Status::OK(); |
1044 | 2 | } |
1045 | | |
// Wakes this task up after the dependency it was blocked on became ready: clears
// _blocked_dep, transitions to State::RUNNABLE and resubmits the task to the scheduler.
//
// @param dep must be the dependency stored in _blocked_dep (DCHECK_EQ).
// The second parameter (a lock) is unnamed and unused in the body; presumably it only
// documents that the caller holds the dependency's lock - confirm against the caller.
//
// A failed state transition or submit cancels the fragment (if it is still alive)
// instead of being returned.
1046 | 5.25M | void PipelineTask::wake_up(Dependency* dep, std::unique_lock<std::mutex>& /* dep_lock */) { |
1047 | 10.4M | auto cancel_if_error = [&](const Status& st) { |
1048 | 10.4M | if (!st.ok()) { |
1049 | 0 | if (auto frag = fragment_context().lock()) { |
1050 | 0 | frag->cancel(st); |
1051 | 0 | } |
1052 | 0 | } |
1053 | 10.4M | }; |
1054 | | // Called by the dependency. |
1055 | 5.25M | DCHECK_EQ(_blocked_dep, dep) << "dep : " << dep->debug_string(0) << "task: " << debug_string(); |
1056 | 5.25M | _blocked_dep = nullptr; |
// `holder` is an owning pointer to this task, created before the transition and submit below.
1057 | 5.25M | auto holder = std::dynamic_pointer_cast<PipelineTask>(shared_from_this()); |
1058 | 5.25M | cancel_if_error(_state_transition(PipelineTask::State::RUNNABLE)); |
1059 | | // Under _wake_up_early, FINISHED/FINALIZED → RUNNABLE is a legal no-op |
1060 | | // (_state_transition returns OK but state stays unchanged). We must not |
1061 | | // resubmit a terminated task: finalize() clears _sink/_operators, and |
1062 | | // submit() → is_blockable() would dereference them → SIGSEGV. |
1063 | 5.25M | if (_exec_state == State::FINISHED || _exec_state == State::FINALIZED) { |
1064 | 2 | return; |
1065 | 2 | } |
// Only resubmit while the fragment context is still alive.
1066 | 5.25M | if (auto f = _fragment_context.lock(); f) { |
1067 | 5.25M | cancel_if_error(_state->get_query_ctx()->get_pipe_exec_scheduler()->submit(holder)); |
1068 | 5.25M | } |
1069 | 5.25M | } |
1070 | | |
// Validates and applies a task state change.
//
// The legality table is WAKE_UP_EARLY_LEGAL_STATE_TRANSITION when _wake_up_early is set,
// LEGAL_STATE_TRANSITION otherwise.
//
// @param new_state the state to move to.
// @return InternalError (including debug_string()) when the current state is not allowed
//         to move to `new_state`; OK otherwise. OK is also returned for the
//         FINISHED/FINALIZED -> RUNNABLE case, in which the state is left unchanged.
1071 | 16.5M | Status PipelineTask::_state_transition(State new_state) { |
1072 | 16.5M | const auto& table = |
1073 | 16.5M | _wake_up_early ? WAKE_UP_EARLY_LEGAL_STATE_TRANSITION : LEGAL_STATE_TRANSITION; |
// table[new_state] holds the set of states from which new_state may be entered.
1074 | 16.5M | if (!table[(int)new_state].contains(_exec_state)) { |
1075 | 31 | return Status::InternalError( |
1076 | 31 | "Task state transition from {} to {} is not allowed! Task info: {}", |
1077 | 31 | _to_string(_exec_state), _to_string(new_state), debug_string()); |
1078 | 31 | } |
1079 | | // FINISHED/FINALIZED → RUNNABLE is legal under wake_up_early (a delayed wake_up() arriving |
1080 | | // after the task already terminated), but we must not actually move the state backwards |
1081 | | // or update profile info (which would misleadingly show RUNNABLE for a terminated task). |
1082 | 16.5M | bool need_move = !((_exec_state == State::FINISHED || _exec_state == State::FINALIZED) && |
1083 | 16.5M | new_state == State::RUNNABLE); |
1084 | 16.6M | if (need_move) { |
// Restart the "time since last state change" watcher only on a real change of state.
1085 | 16.6M | if (_exec_state != new_state) { |
1086 | 16.6M | _state_change_watcher.reset(); |
1087 | 16.6M | _state_change_watcher.start(); |
1088 | 16.6M | } |
1089 | 16.6M | _task_profile->add_info_string("TaskState", _to_string(new_state)); |
1090 | 16.6M | _task_profile->add_info_string("BlockedByDependency", |
1091 | 16.6M | _blocked_dep ? _blocked_dep->name() : ""); |
1092 | 16.6M | _exec_state = new_state; |
1093 | 16.6M | } |
1094 | 16.5M | return Status::OK(); |
1095 | 16.5M | } |
1096 | | |
1097 | | } // namespace doris |