/root/doris/be/src/pipeline/pipeline_task.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "pipeline_task.h" |
19 | | |
20 | | #include <fmt/core.h> |
21 | | #include <fmt/format.h> |
22 | | #include <gen_cpp/Metrics_types.h> |
23 | | #include <glog/logging.h> |
24 | | |
25 | | #include <ostream> |
26 | | #include <vector> |
27 | | |
28 | | #include "common/logging.h" |
29 | | #include "common/status.h" |
30 | | #include "pipeline/dependency.h" |
31 | | #include "pipeline/exec/operator.h" |
32 | | #include "pipeline/exec/scan_operator.h" |
33 | | #include "pipeline/pipeline.h" |
34 | | #include "pipeline/pipeline_fragment_context.h" |
35 | | #include "pipeline/task_queue.h" |
36 | | #include "pipeline/task_scheduler.h" |
37 | | #include "runtime/descriptors.h" |
38 | | #include "runtime/exec_env.h" |
39 | | #include "runtime/query_context.h" |
40 | | #include "runtime/thread_context.h" |
41 | | #include "runtime/workload_group/workload_group_manager.h" |
42 | | #include "util/container_util.hpp" |
43 | | #include "util/defer_op.h" |
44 | | #include "util/mem_info.h" |
45 | | #include "util/runtime_profile.h" |
46 | | #include "util/uid_util.h" |
47 | | #include "vec/core/block.h" |
48 | | #include "vec/spill/spill_stream.h" |
49 | | |
50 | | namespace doris { |
51 | | class RuntimeState; |
52 | | } // namespace doris |
53 | | |
54 | | namespace doris::pipeline { |
55 | | #include "common/compile_check_begin.h" |
56 | | |
57 | | PipelineTask::PipelineTask(PipelinePtr& pipeline, uint32_t task_id, RuntimeState* state, |
58 | | std::shared_ptr<PipelineFragmentContext> fragment_context, |
59 | | RuntimeProfile* parent_profile, |
60 | | std::map<int, std::pair<std::shared_ptr<BasicSharedState>, |
61 | | std::vector<std::shared_ptr<Dependency>>>> |
62 | | shared_state_map, |
63 | | int task_idx) |
64 | | : |
65 | | #ifdef BE_TEST |
66 | 234k | _query_id(fragment_context ? fragment_context->get_query_id() : TUniqueId()), |
67 | | #else |
68 | | _query_id(fragment_context->get_query_id()), |
69 | | #endif |
70 | 234k | _index(task_id), |
71 | 234k | _pipeline(pipeline), |
72 | 234k | _opened(false), |
73 | 234k | _state(state), |
74 | 234k | _fragment_context(fragment_context), |
75 | 234k | _parent_profile(parent_profile), |
76 | 234k | _operators(pipeline->operators()), |
77 | 234k | _source(_operators.front().get()), |
78 | 234k | _root(_operators.back().get()), |
79 | 234k | _sink(pipeline->sink_shared_pointer()), |
80 | 234k | _shared_state_map(std::move(shared_state_map)), |
81 | 234k | _task_idx(task_idx), |
82 | 234k | _memory_sufficient_dependency(state->get_query_ctx()->get_memory_sufficient_dependency()), |
83 | 234k | _pipeline_name(_pipeline->name()) { |
84 | 234k | _execution_dependencies.push_back(state->get_query_ctx()->get_execution_dependency()); |
85 | 234k | if (!_shared_state_map.contains(_sink->dests_id().front())) { |
86 | 234k | auto shared_state = _sink->create_shared_state(); |
87 | 234k | if (shared_state) { |
88 | 87 | _sink_shared_state = shared_state; |
89 | 87 | } |
90 | 234k | } |
91 | 234k | } |
92 | | |
93 | | Status PipelineTask::prepare(const std::vector<TScanRangeParams>& scan_range, const int sender_id, |
94 | 45 | const TDataSink& tsink) { |
95 | 45 | DCHECK(_sink); |
96 | 45 | _init_profile(); |
97 | 45 | SCOPED_TIMER(_task_profile->total_time_counter()); |
98 | 45 | SCOPED_CPU_TIMER(_task_cpu_timer); |
99 | 45 | SCOPED_TIMER(_prepare_timer); |
100 | 45 | DBUG_EXECUTE_IF("fault_inject::PipelineXTask::prepare", { |
101 | 45 | Status status = Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task prepare failed"); |
102 | 45 | return status; |
103 | 45 | }); |
104 | 45 | { |
105 | | // set sink local state |
106 | 45 | LocalSinkStateInfo info {_task_idx, _task_profile.get(), |
107 | 45 | sender_id, get_sink_shared_state().get(), |
108 | 45 | _shared_state_map, tsink}; |
109 | 45 | RETURN_IF_ERROR(_sink->setup_local_state(_state, info)); |
110 | 45 | } |
111 | | |
112 | 45 | _scan_ranges = scan_range; |
113 | 45 | auto* parent_profile = _state->get_sink_local_state()->operator_profile(); |
114 | | |
115 | 96 | for (int op_idx = cast_set<int>(_operators.size() - 1); op_idx >= 0; op_idx--) { |
116 | 51 | auto& op = _operators[op_idx]; |
117 | 51 | LocalStateInfo info {parent_profile, _scan_ranges, get_op_shared_state(op->operator_id()), |
118 | 51 | _shared_state_map, _task_idx}; |
119 | 51 | RETURN_IF_ERROR(op->setup_local_state(_state, info)); |
120 | 51 | parent_profile = _state->get_local_state(op->operator_id())->operator_profile(); |
121 | 51 | } |
122 | 45 | { |
123 | 45 | const auto& deps = |
124 | 45 | _state->get_local_state(_source->operator_id())->execution_dependencies(); |
125 | 45 | std::unique_lock<std::mutex> lc(_dependency_lock); |
126 | 45 | std::copy(deps.begin(), deps.end(), |
127 | 45 | std::inserter(_execution_dependencies, _execution_dependencies.end())); |
128 | 45 | } |
129 | 45 | if (auto fragment = _fragment_context.lock()) { |
130 | 42 | if (fragment->get_query_ctx()->is_cancelled()) { |
131 | 0 | terminate(); |
132 | 0 | return fragment->get_query_ctx()->exec_status(); |
133 | 0 | } |
134 | 42 | } else { |
135 | 3 | return Status::InternalError("Fragment already finished! Query: {}", print_id(_query_id)); |
136 | 3 | } |
137 | 42 | _block = doris::vectorized::Block::create_unique(); |
138 | 42 | return _state_transition(State::RUNNABLE); |
139 | 45 | } |
140 | | |
141 | 39 | Status PipelineTask::_extract_dependencies() { |
142 | 39 | std::vector<std::vector<Dependency*>> read_dependencies; |
143 | 39 | std::vector<Dependency*> write_dependencies; |
144 | 39 | std::vector<Dependency*> finish_dependencies; |
145 | 39 | std::vector<Dependency*> spill_dependencies; |
146 | 39 | read_dependencies.resize(_operators.size()); |
147 | 39 | size_t i = 0; |
148 | 45 | for (auto& op : _operators) { |
149 | 45 | auto result = _state->get_local_state_result(op->operator_id()); |
150 | 45 | if (!result) { |
151 | 3 | return result.error(); |
152 | 3 | } |
153 | 42 | auto* local_state = result.value(); |
154 | 42 | read_dependencies[i] = local_state->dependencies(); |
155 | 42 | auto* fin_dep = local_state->finishdependency(); |
156 | 42 | if (fin_dep) { |
157 | 21 | finish_dependencies.push_back(fin_dep); |
158 | 21 | } |
159 | 42 | if (auto* spill_dependency = local_state->spill_dependency()) { |
160 | 21 | spill_dependencies.push_back(spill_dependency); |
161 | 21 | } |
162 | 42 | i++; |
163 | 42 | } |
164 | 36 | DBUG_EXECUTE_IF("fault_inject::PipelineXTask::_extract_dependencies", { |
165 | 36 | Status status = Status::Error<INTERNAL_ERROR>( |
166 | 36 | "fault_inject pipeline_task _extract_dependencies failed"); |
167 | 36 | return status; |
168 | 36 | }); |
169 | 36 | { |
170 | 36 | auto* local_state = _state->get_sink_local_state(); |
171 | 36 | write_dependencies = local_state->dependencies(); |
172 | 36 | auto* fin_dep = local_state->finishdependency(); |
173 | 36 | if (fin_dep) { |
174 | 36 | finish_dependencies.push_back(fin_dep); |
175 | 36 | } |
176 | 36 | if (auto* spill_dependency = local_state->spill_dependency()) { |
177 | 21 | spill_dependencies.push_back(spill_dependency); |
178 | 21 | } |
179 | 36 | } |
180 | 36 | { |
181 | 36 | std::unique_lock<std::mutex> lc(_dependency_lock); |
182 | 36 | read_dependencies.swap(_read_dependencies); |
183 | 36 | write_dependencies.swap(_write_dependencies); |
184 | 36 | finish_dependencies.swap(_finish_dependencies); |
185 | 36 | spill_dependencies.swap(_spill_dependencies); |
186 | 36 | } |
187 | 36 | return Status::OK(); |
188 | 36 | } |
189 | | |
190 | 18 | bool PipelineTask::inject_shared_state(std::shared_ptr<BasicSharedState> shared_state) { |
191 | 18 | if (!shared_state) { |
192 | 3 | return false; |
193 | 3 | } |
194 | | // Shared state is created by upstream task's sink operator and shared by source operator of |
195 | | // this task. |
196 | 21 | for (auto& op : _operators) { |
197 | 21 | if (shared_state->related_op_ids.contains(op->operator_id())) { |
198 | 9 | _op_shared_states.insert({op->operator_id(), shared_state}); |
199 | 9 | return true; |
200 | 9 | } |
201 | 21 | } |
202 | | // Shared state is created by the first sink operator and shared by sink operator of this task. |
203 | | // For example, Set operations. |
204 | 6 | if (shared_state->related_op_ids.contains(_sink->dests_id().front())) { |
205 | 3 | DCHECK_EQ(_sink_shared_state, nullptr) |
206 | 0 | << " Sink: " << _sink->get_name() << " dest id: " << _sink->dests_id().front(); |
207 | 3 | _sink_shared_state = shared_state; |
208 | 3 | return true; |
209 | 3 | } |
210 | 3 | return false; |
211 | 6 | } |
212 | | |
213 | 45 | void PipelineTask::_init_profile() { |
214 | 45 | _task_profile = std::make_unique<RuntimeProfile>(fmt::format("PipelineTask(index={})", _index)); |
215 | 45 | _parent_profile->add_child(_task_profile.get(), true, nullptr); |
216 | 45 | _task_cpu_timer = ADD_TIMER(_task_profile, "TaskCpuTime"); |
217 | | |
218 | 45 | static const char* exec_time = "ExecuteTime"; |
219 | 45 | _exec_timer = ADD_TIMER(_task_profile, exec_time); |
220 | 45 | _prepare_timer = ADD_CHILD_TIMER(_task_profile, "PrepareTime", exec_time); |
221 | 45 | _open_timer = ADD_CHILD_TIMER(_task_profile, "OpenTime", exec_time); |
222 | 45 | _get_block_timer = ADD_CHILD_TIMER(_task_profile, "GetBlockTime", exec_time); |
223 | 45 | _get_block_counter = ADD_COUNTER(_task_profile, "GetBlockCounter", TUnit::UNIT); |
224 | 45 | _sink_timer = ADD_CHILD_TIMER(_task_profile, "SinkTime", exec_time); |
225 | 45 | _close_timer = ADD_CHILD_TIMER(_task_profile, "CloseTime", exec_time); |
226 | | |
227 | 45 | _wait_worker_timer = ADD_TIMER_WITH_LEVEL(_task_profile, "WaitWorkerTime", 1); |
228 | | |
229 | 45 | _schedule_counts = ADD_COUNTER(_task_profile, "NumScheduleTimes", TUnit::UNIT); |
230 | 45 | _yield_counts = ADD_COUNTER(_task_profile, "NumYieldTimes", TUnit::UNIT); |
231 | 45 | _core_change_times = ADD_COUNTER(_task_profile, "CoreChangeTimes", TUnit::UNIT); |
232 | 45 | _memory_reserve_times = ADD_COUNTER(_task_profile, "MemoryReserveTimes", TUnit::UNIT); |
233 | 45 | _memory_reserve_failed_times = |
234 | 45 | ADD_COUNTER(_task_profile, "MemoryReserveFailedTimes", TUnit::UNIT); |
235 | 45 | } |
236 | | |
237 | 18 | void PipelineTask::_fresh_profile_counter() { |
238 | 18 | COUNTER_SET(_schedule_counts, (int64_t)_schedule_time); |
239 | 18 | COUNTER_SET(_wait_worker_timer, (int64_t)_wait_worker_watcher.elapsed_time()); |
240 | 18 | } |
241 | | |
242 | 36 | Status PipelineTask::_open() { |
243 | 36 | SCOPED_TIMER(_task_profile->total_time_counter()); |
244 | 36 | SCOPED_CPU_TIMER(_task_cpu_timer); |
245 | 36 | SCOPED_TIMER(_open_timer); |
246 | 36 | _dry_run = _sink->should_dry_run(_state); |
247 | 42 | for (auto& o : _operators) { |
248 | 42 | RETURN_IF_ERROR(_state->get_local_state(o->operator_id())->open(_state)); |
249 | 42 | } |
250 | 36 | RETURN_IF_ERROR(_state->get_sink_local_state()->open(_state)); |
251 | 36 | RETURN_IF_ERROR(_extract_dependencies()); |
252 | 36 | DBUG_EXECUTE_IF("fault_inject::PipelineXTask::open", { |
253 | 36 | Status status = Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task open failed"); |
254 | 36 | return status; |
255 | 36 | }); |
256 | 36 | _opened = true; |
257 | 36 | return Status::OK(); |
258 | 36 | } |
259 | | |
260 | 180 | Status PipelineTask::_prepare() { |
261 | 180 | SCOPED_TIMER(_task_profile->total_time_counter()); |
262 | 180 | SCOPED_CPU_TIMER(_task_cpu_timer); |
263 | 216 | for (auto& o : _operators) { |
264 | 216 | RETURN_IF_ERROR(_state->get_local_state(o->operator_id())->prepare(_state)); |
265 | 216 | } |
266 | 180 | RETURN_IF_ERROR(_state->get_sink_local_state()->prepare(_state)); |
267 | 180 | return Status::OK(); |
268 | 180 | } |
269 | | |
270 | 123 | bool PipelineTask::_wait_to_start() { |
271 | | // Before task starting, we should make sure |
272 | | // 1. Execution dependency is ready (which is controlled by FE 2-phase commit) |
273 | | // 2. Runtime filter dependencies are ready |
274 | | // 3. All tablets are loaded into local storage |
275 | 123 | return std::any_of( |
276 | 123 | _execution_dependencies.begin(), _execution_dependencies.end(), |
277 | 162 | [&](Dependency* dep) -> bool { return dep->is_blocked_by(shared_from_this()); }); |
278 | 123 | } |
279 | | |
280 | 48 | bool PipelineTask::_is_pending_finish() { |
281 | | // Spilling may be in progress if eos is true. |
282 | 48 | return std::any_of(_spill_dependencies.begin(), _spill_dependencies.end(), |
283 | 48 | [&](Dependency* dep) -> bool { |
284 | 33 | return dep->is_blocked_by(shared_from_this()); |
285 | 33 | }) || |
286 | 48 | std::any_of( |
287 | 45 | _finish_dependencies.begin(), _finish_dependencies.end(), |
288 | 57 | [&](Dependency* dep) -> bool { return dep->is_blocked_by(shared_from_this()); }); |
289 | 48 | } |
290 | | |
291 | 6 | bool PipelineTask::is_revoking() const { |
292 | | // Spilling may be in progress if eos is true. |
293 | 6 | return std::any_of(_spill_dependencies.begin(), _spill_dependencies.end(), |
294 | 9 | [&](Dependency* dep) -> bool { return dep->is_blocked_by(); }); |
295 | 6 | } |
296 | | |
297 | 839k | bool PipelineTask::_is_blocked() { |
298 | | // `_dry_run = true` means we do not need data from source operator. |
299 | 839k | if (!_dry_run) { |
300 | 1.67M | for (int i = cast_set<int>(_read_dependencies.size() - 1); i >= 0; i--) { |
301 | | // `_read_dependencies` is organized according to operators. For each operator, running condition is met iff all dependencies are ready. |
302 | 839k | for (auto* dep : _read_dependencies[i]) { |
303 | 839k | if (dep->is_blocked_by(shared_from_this())) { |
304 | 42 | return true; |
305 | 42 | } |
306 | 839k | } |
307 | | // If all dependencies are ready for this operator, we can execute this task if no datum is needed from upstream operators. |
308 | 839k | if (!_operators[i]->need_more_input_data(_state)) { |
309 | 6 | break; |
310 | 6 | } |
311 | 839k | } |
312 | 839k | } |
313 | 839k | return std::any_of(_spill_dependencies.begin(), _spill_dependencies.end(), |
314 | 1.67M | [&](Dependency* dep) -> bool { |
315 | 1.67M | return dep->is_blocked_by(shared_from_this()); |
316 | 1.67M | }) || |
317 | 839k | _memory_sufficient_dependency->is_blocked_by(shared_from_this()) || |
318 | 839k | std::any_of( |
319 | 839k | _write_dependencies.begin(), _write_dependencies.end(), |
320 | 839k | [&](Dependency* dep) -> bool { return dep->is_blocked_by(shared_from_this()); }); |
321 | 839k | } |
322 | | |
323 | 15 | void PipelineTask::terminate() { |
324 | | // We use a lock to assure all dependencies are not deconstructed here. |
325 | 15 | std::unique_lock<std::mutex> lc(_dependency_lock); |
326 | 15 | auto fragment = _fragment_context.lock(); |
327 | 15 | if (!is_finalized() && fragment) { |
328 | 15 | try { |
329 | 15 | DCHECK(_wake_up_early || fragment->is_canceled()); |
330 | 15 | std::for_each(_spill_dependencies.begin(), _spill_dependencies.end(), |
331 | 30 | [&](Dependency* dep) { dep->set_always_ready(); }); |
332 | 15 | std::for_each(_write_dependencies.begin(), _write_dependencies.end(), |
333 | 15 | [&](Dependency* dep) { dep->set_always_ready(); }); |
334 | 15 | std::for_each(_finish_dependencies.begin(), _finish_dependencies.end(), |
335 | 30 | [&](Dependency* dep) { dep->set_always_ready(); }); |
336 | 15 | std::for_each(_read_dependencies.begin(), _read_dependencies.end(), |
337 | 15 | [&](std::vector<Dependency*>& deps) { |
338 | 15 | std::for_each(deps.begin(), deps.end(), |
339 | 15 | [&](Dependency* dep) { dep->set_always_ready(); }); |
340 | 15 | }); |
341 | | // All `_execution_deps` will never be set blocking from ready. So we just set ready here. |
342 | 15 | std::for_each(_execution_dependencies.begin(), _execution_dependencies.end(), |
343 | 30 | [&](Dependency* dep) { dep->set_ready(); }); |
344 | 15 | _memory_sufficient_dependency->set_ready(); |
345 | 15 | } catch (const doris::Exception& e) { |
346 | 0 | LOG(WARNING) << "Terminate failed: " << e.code() << ", " << e.to_string(); |
347 | 0 | } |
348 | 15 | } |
349 | 15 | } |
350 | | |
351 | | /** |
352 | | * `_eos` indicates whether the execution phase is done. `done` indicates whether we could close |
353 | | * this task. |
354 | | * |
355 | | * For example, |
356 | | * 1. if `_eos` is false which means we should continue to get next block so we cannot close (e.g. |
357 | | * `done` is false) |
358 | | * 2. if `_eos` is true which means all blocks from source are exhausted but `_is_pending_finish()` |
359 | | * is true which means we should wait for a pending dependency ready (maybe a running rpc), so we |
360 | | * cannot close (e.g. `done` is false) |
361 | | * 3. if `_eos` is true which means all blocks from source are exhausted and `_is_pending_finish()` |
362 | | * is false which means we can close immediately (e.g. `done` is true) |
363 | | * @param done |
364 | | * @return |
365 | | */ |
366 | 99 | Status PipelineTask::execute(bool* done) { |
367 | 99 | if (_exec_state != State::RUNNABLE || _blocked_dep != nullptr) [[unlikely]] { |
368 | 3 | return Status::InternalError("Pipeline task is not runnable! Task info: {}", |
369 | 3 | debug_string()); |
370 | 3 | } |
371 | 96 | auto fragment_context = _fragment_context.lock(); |
372 | 96 | DCHECK(fragment_context); |
373 | 96 | int64_t time_spent = 0; |
374 | 96 | ThreadCpuStopWatch cpu_time_stop_watch; |
375 | 96 | cpu_time_stop_watch.start(); |
376 | 96 | SCOPED_ATTACH_TASK(_state); |
377 | 96 | Defer running_defer {[&]() { |
378 | 96 | if (_task_queue) { |
379 | 96 | _task_queue->update_statistics(this, time_spent); |
380 | 96 | } |
381 | 96 | int64_t delta_cpu_time = cpu_time_stop_watch.elapsed_time(); |
382 | 96 | _task_cpu_timer->update(delta_cpu_time); |
383 | 96 | fragment_context->get_query_ctx()->resource_ctx()->cpu_context()->update_cpu_cost_ms( |
384 | 96 | delta_cpu_time); |
385 | | |
386 | | // If task is woke up early, we should terminate all operators, and this task could be closed immediately. |
387 | 96 | if (_wake_up_early) { |
388 | 9 | terminate(); |
389 | 9 | THROW_IF_ERROR(_root->terminate(_state)); |
390 | 9 | THROW_IF_ERROR(_sink->terminate(_state)); |
391 | 9 | _eos = true; |
392 | 9 | *done = true; |
393 | 87 | } else if (_eos && !_spilling && |
394 | 87 | (fragment_context->is_canceled() || !_is_pending_finish())) { |
395 | 24 | *done = true; |
396 | 24 | } |
397 | 96 | }}; |
398 | 96 | const auto query_id = _state->query_id(); |
399 | | // If this task is already EOS and block is empty (which means we already output all blocks), |
400 | | // just return here. |
401 | 96 | if (_eos && !_spilling) { |
402 | 3 | return Status::OK(); |
403 | 3 | } |
404 | | // If this task is blocked by a spilling request and waken up immediately, the spilling |
405 | | // dependency will not block this task and we should just run here. |
406 | 93 | if (!_block->empty()) { |
407 | 0 | LOG(INFO) << "Query: " << print_id(query_id) << " has pending block, size: " |
408 | 0 | << PrettyPrinter::print_bytes(_block->allocated_bytes()); |
409 | 0 | DCHECK(_spilling); |
410 | 0 | } |
411 | | |
412 | 93 | SCOPED_TIMER(_task_profile->total_time_counter()); |
413 | 93 | SCOPED_TIMER(_exec_timer); |
414 | | |
415 | 93 | if (!_wake_up_early) { |
416 | 93 | RETURN_IF_ERROR(_prepare()); |
417 | 93 | } |
418 | 93 | DBUG_EXECUTE_IF("fault_inject::PipelineXTask::execute", { |
419 | 93 | Status status = Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task execute failed"); |
420 | 93 | return status; |
421 | 93 | }); |
422 | | // `_wake_up_early` must be after `_wait_to_start()` |
423 | 93 | if (_wait_to_start() || _wake_up_early) { |
424 | 6 | return Status::OK(); |
425 | 6 | } |
426 | 93 | RETURN_IF_ERROR(_prepare()); |
427 | | |
428 | | // The status must be runnable |
429 | 87 | if (!_opened && !fragment_context->is_canceled()) { |
430 | 33 | DBUG_EXECUTE_IF("PipelineTask::execute.open_sleep", { |
431 | 33 | auto required_pipeline_id = |
432 | 33 | DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
433 | 33 | "PipelineTask::execute.open_sleep", "pipeline_id", -1); |
434 | 33 | auto required_task_id = DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
435 | 33 | "PipelineTask::execute.open_sleep", "task_id", -1); |
436 | 33 | if (required_pipeline_id == pipeline_id() && required_task_id == task_id()) { |
437 | 33 | LOG(WARNING) << "PipelineTask::execute.open_sleep sleep 5s"; |
438 | 33 | sleep(5); |
439 | 33 | } |
440 | 33 | }); |
441 | | |
442 | 33 | SCOPED_RAW_TIMER(&time_spent); |
443 | 33 | RETURN_IF_ERROR(_open()); |
444 | 33 | } |
445 | | |
446 | 839k | while (!fragment_context->is_canceled()) { |
447 | 839k | SCOPED_RAW_TIMER(&time_spent); |
448 | 839k | Defer defer {[&]() { |
449 | | // If this run is pended by a spilling request, the block will be output in next run. |
450 | 839k | if (!_spilling) { |
451 | 833k | _block->clear_column_data(_root->row_desc().num_materialized_slots()); |
452 | 833k | } |
453 | 839k | }}; |
454 | | // `_wake_up_early` must be after `_is_blocked()` |
455 | 839k | if (_is_blocked() || _wake_up_early) { |
456 | 48 | return Status::OK(); |
457 | 48 | } |
458 | | |
459 | | /// When a task is cancelled, |
460 | | /// its blocking state will be cleared and it will transition to a ready state (though it is not truly ready). |
461 | | /// Here, checking whether it is cancelled to prevent tasks in a blocking state from being re-executed. |
462 | 839k | if (fragment_context->is_canceled()) { |
463 | 0 | break; |
464 | 0 | } |
465 | | |
466 | 839k | if (time_spent > _exec_time_slice) { |
467 | 9 | COUNTER_UPDATE(_yield_counts, 1); |
468 | 9 | break; |
469 | 9 | } |
470 | 839k | auto* block = _block.get(); |
471 | | |
472 | 839k | DBUG_EXECUTE_IF("fault_inject::PipelineXTask::executing", { |
473 | 839k | Status status = |
474 | 839k | Status::Error<INTERNAL_ERROR>("fault_inject pipeline_task executing failed"); |
475 | 839k | return status; |
476 | 839k | }); |
477 | | |
478 | | // `_sink->is_finished(_state)` means sink operator should be finished |
479 | 839k | if (_sink->is_finished(_state)) { |
480 | 3 | set_wake_up_early(); |
481 | 3 | return Status::OK(); |
482 | 3 | } |
483 | | |
484 | | // `_dry_run` means sink operator need no more data |
485 | 839k | _eos = _dry_run || _eos; |
486 | 839k | _spilling = false; |
487 | 839k | auto workload_group = _state->workload_group(); |
488 | | // If last run is pended by a spilling request, `_block` is produced with some rows in last |
489 | | // run, so we will resume execution using the block. |
490 | 839k | if (!_eos && _block->empty()) { |
491 | 836k | SCOPED_TIMER(_get_block_timer); |
492 | 836k | if (_state->low_memory_mode()) { |
493 | 4.63k | _sink->set_low_memory_mode(_state); |
494 | 4.63k | _root->set_low_memory_mode(_state); |
495 | 4.63k | } |
496 | 836k | DEFER_RELEASE_RESERVED(); |
497 | 836k | _get_block_counter->update(1); |
498 | 836k | const auto reserve_size = _root->get_reserve_mem_size(_state); |
499 | 836k | _root->reset_reserve_mem_size(_state); |
500 | | |
501 | 836k | if (workload_group && |
502 | 836k | _state->get_query_ctx() |
503 | 7.82k | ->resource_ctx() |
504 | 7.82k | ->task_controller() |
505 | 7.82k | ->is_enable_reserve_memory() && |
506 | 836k | reserve_size > 0) { |
507 | 7.82k | if (!_try_to_reserve_memory(reserve_size, _root)) { |
508 | 3.18k | continue; |
509 | 3.18k | } |
510 | 7.82k | } |
511 | | |
512 | 833k | bool eos = false; |
513 | 833k | RETURN_IF_ERROR(_root->get_block_after_projects(_state, block, &eos)); |
514 | 833k | RETURN_IF_ERROR(block->check_type_and_column()); |
515 | 833k | _eos = eos; |
516 | 833k | } |
517 | | |
518 | 836k | if (!_block->empty() || _eos) { |
519 | 3.18k | SCOPED_TIMER(_sink_timer); |
520 | 3.18k | Status status = Status::OK(); |
521 | 3.18k | DEFER_RELEASE_RESERVED(); |
522 | 3.18k | COUNTER_UPDATE(_memory_reserve_times, 1); |
523 | 3.18k | if (_state->get_query_ctx() |
524 | 3.18k | ->resource_ctx() |
525 | 3.18k | ->task_controller() |
526 | 3.18k | ->is_enable_reserve_memory() && |
527 | 3.18k | workload_group && !(_wake_up_early || _dry_run)) { |
528 | 3.14k | const auto sink_reserve_size = _sink->get_reserve_mem_size(_state, _eos); |
529 | 3.14k | if (sink_reserve_size > 0 && |
530 | 3.14k | !_try_to_reserve_memory(sink_reserve_size, _sink.get())) { |
531 | 3.13k | continue; |
532 | 3.13k | } |
533 | 3.14k | } |
534 | | |
535 | 45 | if (_eos) { |
536 | 27 | RETURN_IF_ERROR(close(Status::OK(), false)); |
537 | 27 | } |
538 | | |
539 | 45 | DBUG_EXECUTE_IF("PipelineTask::execute.sink_eos_sleep", { |
540 | 45 | auto required_pipeline_id = |
541 | 45 | DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
542 | 45 | "PipelineTask::execute.sink_eos_sleep", "pipeline_id", -1); |
543 | 45 | auto required_task_id = |
544 | 45 | DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
545 | 45 | "PipelineTask::execute.sink_eos_sleep", "task_id", -1); |
546 | 45 | if (required_pipeline_id == pipeline_id() && required_task_id == task_id()) { |
547 | 45 | LOG(WARNING) << "PipelineTask::execute.sink_eos_sleep sleep 10s"; |
548 | 45 | sleep(10); |
549 | 45 | } |
550 | 45 | }); |
551 | | |
552 | 45 | DBUG_EXECUTE_IF("PipelineTask::execute.terminate", { |
553 | 45 | if (_eos) { |
554 | 45 | auto required_pipeline_id = |
555 | 45 | DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
556 | 45 | "PipelineTask::execute.terminate", "pipeline_id", -1); |
557 | 45 | auto required_task_id = |
558 | 45 | DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
559 | 45 | "PipelineTask::execute.terminate", "task_id", -1); |
560 | 45 | auto required_fragment_id = |
561 | 45 | DebugPoints::instance()->get_debug_param_or_default<int32_t>( |
562 | 45 | "PipelineTask::execute.terminate", "fragment_id", -1); |
563 | 45 | if (required_pipeline_id == pipeline_id() && required_task_id == task_id() && |
564 | 45 | fragment_context->get_fragment_id() == required_fragment_id) { |
565 | 45 | _wake_up_early = true; |
566 | 45 | terminate(); |
567 | 45 | } else if (required_pipeline_id == pipeline_id() && |
568 | 45 | fragment_context->get_fragment_id() == required_fragment_id) { |
569 | 45 | LOG(WARNING) << "PipelineTask::execute.terminate sleep 5s"; |
570 | 45 | sleep(5); |
571 | 45 | } |
572 | 45 | } |
573 | 45 | }); |
574 | 45 | RETURN_IF_ERROR(block->check_type_and_column()); |
575 | 45 | status = _sink->sink(_state, block, _eos); |
576 | | |
577 | 45 | if (status.is<ErrorCode::END_OF_FILE>()) { |
578 | 3 | set_wake_up_early(); |
579 | 3 | return Status::OK(); |
580 | 42 | } else if (!status) { |
581 | 0 | return status; |
582 | 0 | } |
583 | | |
584 | 42 | if (_eos) { // just return, the scheduler will do finish work |
585 | 24 | return Status::OK(); |
586 | 24 | } |
587 | 42 | } |
588 | 836k | } |
589 | | |
590 | 87 | RETURN_IF_ERROR(get_task_queue()->push_back(shared_from_this())); |
591 | 9 | return Status::OK(); |
592 | 9 | } |
593 | | |
594 | 10.9k | bool PipelineTask::_try_to_reserve_memory(const size_t reserve_size, OperatorBase* op) { |
595 | 10.9k | auto st = thread_context()->thread_mem_tracker_mgr->try_reserve(reserve_size); |
596 | 10.9k | COUNTER_UPDATE(_memory_reserve_times, 1); |
597 | 10.9k | auto sink_revocable_mem_size = _sink->revocable_mem_size(_state); |
598 | 10.9k | if (st.ok() && _state->enable_force_spill() && _sink->is_spillable() && |
599 | 10.9k | sink_revocable_mem_size >= vectorized::SpillStream::MIN_SPILL_WRITE_BATCH_MEM) { |
600 | 0 | st = Status(ErrorCode::QUERY_MEMORY_EXCEEDED, "Force Spill"); |
601 | 0 | } |
602 | 10.9k | if (!st.ok()) { |
603 | 10.9k | COUNTER_UPDATE(_memory_reserve_failed_times, 1); |
604 | 10.9k | auto debug_msg = fmt::format( |
605 | 10.9k | "Query: {} , try to reserve: {}, operator name: {}, operator " |
606 | 10.9k | "id: {}, task id: {}, root revocable mem size: {}, sink revocable mem" |
607 | 10.9k | "size: {}, failed: {}", |
608 | 10.9k | print_id(_query_id), PrettyPrinter::print_bytes(reserve_size), op->get_name(), |
609 | 10.9k | op->node_id(), _state->task_id(), |
610 | 10.9k | PrettyPrinter::print_bytes(op->revocable_mem_size(_state)), |
611 | 10.9k | PrettyPrinter::print_bytes(sink_revocable_mem_size), st.to_string()); |
612 | | // PROCESS_MEMORY_EXCEEDED error msg alread contains process_mem_log_str |
613 | 10.9k | if (!st.is<ErrorCode::PROCESS_MEMORY_EXCEEDED>()) { |
614 | 10.9k | debug_msg += |
615 | 10.9k | fmt::format(", debug info: {}", GlobalMemoryArbitrator::process_mem_log_str()); |
616 | 10.9k | } |
617 | 10.9k | LOG_EVERY_N(INFO, 100) << debug_msg; |
618 | | // If sink has enough revocable memory, trigger revoke memory |
619 | 10.9k | if (sink_revocable_mem_size >= vectorized::SpillStream::MIN_SPILL_WRITE_BATCH_MEM) { |
620 | 6.32k | LOG(INFO) << fmt::format( |
621 | 6.32k | "Query: {} sink: {}, node id: {}, task id: " |
622 | 6.32k | "{}, revocable mem size: {}", |
623 | 6.32k | print_id(_query_id), _sink->get_name(), _sink->node_id(), _state->task_id(), |
624 | 6.32k | PrettyPrinter::print_bytes(sink_revocable_mem_size)); |
625 | 6.32k | ExecEnv::GetInstance()->workload_group_mgr()->add_paused_query( |
626 | 6.32k | _state->get_query_ctx()->resource_ctx()->shared_from_this(), reserve_size, st); |
627 | 6.32k | _spilling = true; |
628 | 6.32k | return false; |
629 | 6.32k | } else { |
630 | | // If reserve failed, not add this query to paused list, because it is very small, will not |
631 | | // consume a lot of memory. But need set low memory mode to indicate that the system should |
632 | | // not use too much memory. |
633 | 4.64k | _state->get_query_ctx()->set_low_memory_mode(); |
634 | 4.64k | } |
635 | 10.9k | } |
636 | 4.64k | return true; |
637 | 10.9k | } |
638 | | |
639 | 459k | void PipelineTask::stop_if_finished() { |
640 | 459k | auto fragment = _fragment_context.lock(); |
641 | 459k | if (!fragment) { |
642 | 0 | return; |
643 | 0 | } |
644 | 459k | SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(fragment->get_query_ctx()->query_mem_tracker()); |
645 | 459k | if (auto sink = _sink) { |
646 | 459k | if (sink->is_finished(_state)) { |
647 | 3 | set_wake_up_early(); |
648 | 3 | terminate(); |
649 | 3 | } |
650 | 459k | } |
651 | 459k | } |
652 | | |
653 | 3 | Status PipelineTask::finalize() { |
654 | 3 | auto fragment = _fragment_context.lock(); |
655 | 3 | if (!fragment) { |
656 | 0 | return Status::OK(); |
657 | 0 | } |
658 | 3 | SCOPED_SWITCH_THREAD_MEM_TRACKER_LIMITER(fragment->get_query_ctx()->query_mem_tracker()); |
659 | 3 | std::unique_lock<std::mutex> lc(_dependency_lock); |
660 | 3 | RETURN_IF_ERROR(_state_transition(State::FINALIZED)); |
661 | 3 | _sink_shared_state.reset(); |
662 | 3 | _op_shared_states.clear(); |
663 | 3 | _shared_state_map.clear(); |
664 | 3 | _block.reset(); |
665 | 3 | _operators.clear(); |
666 | 3 | _sink.reset(); |
667 | 3 | _pipeline.reset(); |
668 | 3 | return Status::OK(); |
669 | 3 | } |
670 | | |
671 | 45 | Status PipelineTask::close(Status exec_status, bool close_sink) { |
672 | 45 | int64_t close_ns = 0; |
673 | 45 | Status s; |
674 | 45 | { |
675 | 45 | SCOPED_RAW_TIMER(&close_ns); |
676 | 45 | if (close_sink) { |
677 | 18 | s = _sink->close(_state, exec_status); |
678 | 18 | } |
679 | 57 | for (auto& op : _operators) { |
680 | 57 | auto tem = op->close(_state); |
681 | 57 | if (!tem.ok() && s.ok()) { |
682 | 0 | s = tem; |
683 | 0 | } |
684 | 57 | } |
685 | 45 | } |
686 | 45 | if (_opened) { |
687 | 45 | COUNTER_UPDATE(_close_timer, close_ns); |
688 | 45 | COUNTER_UPDATE(_task_profile->total_time_counter(), close_ns); |
689 | 45 | } |
690 | | |
691 | 45 | if (close_sink && _opened) { |
692 | 18 | _task_profile->add_info_string("WakeUpEarly", std::to_string(_wake_up_early.load())); |
693 | 18 | _fresh_profile_counter(); |
694 | 18 | } |
695 | | |
696 | 45 | if (_task_queue) { |
697 | 45 | _task_queue->update_statistics(this, close_ns); |
698 | 45 | } |
699 | 45 | if (close_sink) { |
700 | 18 | RETURN_IF_ERROR(_state_transition(State::FINISHED)); |
701 | 18 | } |
702 | 45 | return s; |
703 | 45 | } |
704 | | |
705 | 59.3k | std::string PipelineTask::debug_string() { |
706 | 59.3k | fmt::memory_buffer debug_string_buffer; |
707 | | |
708 | 59.3k | fmt::format_to(debug_string_buffer, "QueryId: {}\n", print_id(_query_id)); |
709 | 59.3k | fmt::format_to(debug_string_buffer, "InstanceId: {}\n", |
710 | 59.3k | print_id(_state->fragment_instance_id())); |
711 | | |
712 | 59.3k | fmt::format_to(debug_string_buffer, |
713 | 59.3k | "PipelineTask[id = {}, open = {}, eos = {}, state = {}, dry run = " |
714 | 59.3k | "{}, _wake_up_early = {}, time elapsed since last state changing = {}s, spilling" |
715 | 59.3k | " = {}, is running = {}]", |
716 | 59.3k | _index, _opened, _eos, _to_string(_exec_state), _dry_run, _wake_up_early.load(), |
717 | 59.3k | _state_change_watcher.elapsed_time() / NANOS_PER_SEC, _spilling, is_running()); |
718 | 59.3k | std::unique_lock<std::mutex> lc(_dependency_lock); |
719 | 59.3k | auto* cur_blocked_dep = _blocked_dep; |
720 | 59.3k | auto fragment = _fragment_context.lock(); |
721 | 59.3k | if (is_finalized() || !fragment) { |
722 | 15 | fmt::format_to(debug_string_buffer, " pipeline name = {}", _pipeline_name); |
723 | 15 | return fmt::to_string(debug_string_buffer); |
724 | 15 | } |
725 | 59.3k | auto elapsed = fragment->elapsed_time() / NANOS_PER_SEC; |
726 | 59.3k | fmt::format_to(debug_string_buffer, |
727 | 59.3k | " elapse time = {}s, block dependency = [{}]\noperators: ", elapsed, |
728 | 59.3k | cur_blocked_dep && !is_finalized() ? cur_blocked_dep->debug_string() : "NULL"); |
729 | | |
730 | 118k | for (size_t i = 0; i < _operators.size(); i++) { |
731 | 59.3k | fmt::format_to(debug_string_buffer, "\n{}", |
732 | 59.3k | _opened && !is_finalized() |
733 | 59.3k | ? _operators[i]->debug_string(_state, cast_set<int>(i)) |
734 | 59.3k | : _operators[i]->debug_string(cast_set<int>(i))); |
735 | 59.3k | } |
736 | 59.3k | fmt::format_to(debug_string_buffer, "\n{}\n", |
737 | 59.3k | _opened && !is_finalized() |
738 | 59.3k | ? _sink->debug_string(_state, cast_set<int>(_operators.size())) |
739 | 59.3k | : _sink->debug_string(cast_set<int>(_operators.size()))); |
740 | | |
741 | 59.3k | fmt::format_to(debug_string_buffer, "\nRead Dependency Information: \n"); |
742 | | |
743 | 59.3k | size_t i = 0; |
744 | 118k | for (; i < _read_dependencies.size(); i++) { |
745 | 118k | for (size_t j = 0; j < _read_dependencies[i].size(); j++) { |
746 | 59.3k | fmt::format_to(debug_string_buffer, "{}. {}\n", i, |
747 | 59.3k | _read_dependencies[i][j]->debug_string(cast_set<int>(i) + 1)); |
748 | 59.3k | } |
749 | 59.3k | } |
750 | | |
751 | 59.3k | fmt::format_to(debug_string_buffer, "{}. {}\n", i, |
752 | 59.3k | _memory_sufficient_dependency->debug_string(cast_set<int>(i++))); |
753 | | |
754 | 59.3k | fmt::format_to(debug_string_buffer, "\nWrite Dependency Information: \n"); |
755 | 118k | for (size_t j = 0; j < _write_dependencies.size(); j++, i++) { |
756 | 59.3k | fmt::format_to(debug_string_buffer, "{}. {}\n", i, |
757 | 59.3k | _write_dependencies[j]->debug_string(cast_set<int>(j) + 1)); |
758 | 59.3k | } |
759 | | |
760 | 59.3k | fmt::format_to(debug_string_buffer, "\nExecution Dependency Information: \n"); |
761 | 178k | for (size_t j = 0; j < _execution_dependencies.size(); j++, i++) { |
762 | 118k | fmt::format_to(debug_string_buffer, "{}. {}\n", i, |
763 | 118k | _execution_dependencies[j]->debug_string(cast_set<int>(i) + 1)); |
764 | 118k | } |
765 | | |
766 | 59.3k | fmt::format_to(debug_string_buffer, "\nSpill Dependency Information: \n"); |
767 | 178k | for (size_t j = 0; j < _spill_dependencies.size(); j++, i++) { |
768 | 118k | fmt::format_to(debug_string_buffer, "{}. {}\n", i, |
769 | 118k | _spill_dependencies[j]->debug_string(cast_set<int>(i) + 1)); |
770 | 118k | } |
771 | | |
772 | 59.3k | fmt::format_to(debug_string_buffer, "Finish Dependency Information: \n"); |
773 | 178k | for (size_t j = 0; j < _finish_dependencies.size(); j++, i++) { |
774 | 118k | fmt::format_to(debug_string_buffer, "{}. {}\n", i, |
775 | 118k | _finish_dependencies[j]->debug_string(cast_set<int>(i) + 1)); |
776 | 118k | } |
777 | 59.3k | return fmt::to_string(debug_string_buffer); |
778 | 59.3k | } |
779 | | |
780 | 0 | size_t PipelineTask::get_revocable_size() const { |
781 | 0 | if (is_finalized() || _running || (_eos && !_spilling)) { |
782 | 0 | return 0; |
783 | 0 | } |
784 | | |
785 | 0 | return _sink->revocable_mem_size(_state); |
786 | 0 | } |
787 | | |
788 | 0 | Status PipelineTask::revoke_memory(const std::shared_ptr<SpillContext>& spill_context) { |
789 | 0 | if (is_finalized()) { |
790 | 0 | if (spill_context) { |
791 | 0 | spill_context->on_task_finished(); |
792 | 0 | VLOG_DEBUG << "Query: " << print_id(_state->query_id()) << ", task: " << ((void*)this) |
793 | 0 | << " finalized"; |
794 | 0 | } |
795 | 0 | return Status::OK(); |
796 | 0 | } |
797 | | |
798 | 0 | const auto revocable_size = _sink->revocable_mem_size(_state); |
799 | 0 | if (revocable_size >= vectorized::SpillStream::MIN_SPILL_WRITE_BATCH_MEM) { |
800 | 0 | RETURN_IF_ERROR(_sink->revoke_memory(_state, spill_context)); |
801 | 0 | } else if (spill_context) { |
802 | 0 | spill_context->on_task_finished(); |
803 | 0 | LOG(INFO) << "Query: " << print_id(_state->query_id()) << ", task: " << ((void*)this) |
804 | 0 | << " has not enough data to revoke: " << revocable_size; |
805 | 0 | } |
806 | 0 | return Status::OK(); |
807 | 0 | } |
808 | | |
809 | 75 | Status PipelineTask::wake_up(Dependency* dep) { |
810 | | // call by dependency |
811 | 75 | DCHECK_EQ(_blocked_dep, dep) << "dep : " << dep->debug_string(0) << "task: " << debug_string(); |
812 | 75 | _blocked_dep = nullptr; |
813 | 75 | auto holder = std::dynamic_pointer_cast<PipelineTask>(shared_from_this()); |
814 | 75 | RETURN_IF_ERROR(_state_transition(PipelineTask::State::RUNNABLE)); |
815 | 75 | RETURN_IF_ERROR(get_task_queue()->push_back(holder)); |
816 | 75 | return Status::OK(); |
817 | 75 | } |
818 | | |
819 | 288 | Status PipelineTask::_state_transition(State new_state) { |
820 | 288 | if (_exec_state != new_state) { |
821 | 273 | _state_change_watcher.reset(); |
822 | 273 | _state_change_watcher.start(); |
823 | 273 | } |
824 | 288 | _task_profile->add_info_string("TaskState", _to_string(new_state)); |
825 | 288 | _task_profile->add_info_string("BlockedByDependency", _blocked_dep ? _blocked_dep->name() : ""); |
826 | 288 | if (!LEGAL_STATE_TRANSITION[(int)new_state].contains(_exec_state)) { |
827 | 51 | return Status::InternalError( |
828 | 51 | "Task state transition from {} to {} is not allowed! Task info: {}", |
829 | 51 | _to_string(_exec_state), _to_string(new_state), debug_string()); |
830 | 51 | } |
831 | 237 | _exec_state = new_state; |
832 | 237 | return Status::OK(); |
833 | 288 | } |
834 | | |
835 | | #include "common/compile_check_end.h" |
836 | | } // namespace doris::pipeline |