Coverage Report

Created: 2026-05-08 23:56

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/udf/python/python_server.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "udf/python/python_server.h"
19
20
#include <arrow/type_fwd.h>
21
#include <butil/fd_utility.h>
22
#include <dirent.h>
23
#include <fmt/core.h>
24
#include <sys/poll.h>
25
#include <sys/stat.h>
26
27
#include <boost/asio.hpp>
28
#include <boost/process.hpp>
29
#include <fstream>
30
#include <future>
31
32
#include "arrow/flight/client.h"
33
#include "common/config.h"
34
#include "udf/python/python_udaf_client.h"
35
#include "udf/python/python_udf_client.h"
36
#include "udf/python/python_udtf_client.h"
37
#include "util/cpu_info.h"
38
39
namespace doris {
40
41
std::shared_ptr<PythonServerManager::VersionedProcessPool>
42
27
PythonServerManager::_get_or_create_process_pool(const PythonVersion& version) {
43
27
    std::lock_guard<std::mutex> lock(_pools_mutex);
44
27
    auto& pool = _process_pools[version];
45
27
    if (!pool) {
46
17
        pool = std::make_shared<VersionedProcessPool>();
47
17
    }
48
27
    return pool;
49
27
}
50
51
std::vector<std::pair<PythonVersion, std::shared_ptr<PythonServerManager::VersionedProcessPool>>>
52
45
PythonServerManager::_snapshot_process_pools() {
53
45
    std::lock_guard<std::mutex> lock(_pools_mutex);
54
45
    std::vector<std::pair<PythonVersion, std::shared_ptr<VersionedProcessPool>>> snapshot;
55
45
    snapshot.reserve(_process_pools.size());
56
45
    for (const auto& [version, pool] : _process_pools) {
57
19
        snapshot.emplace_back(version, pool);
58
19
    }
59
45
    return snapshot;
60
45
}
61
62
#ifdef BE_TEST
63
void PythonServerManager::set_process_pool_for_test(const PythonVersion& version,
64
                                                    std::vector<ProcessPtr> processes,
65
4
                                                    bool initialized) {
66
4
    auto versioned_pool = _get_or_create_process_pool(version);
67
4
    std::lock_guard<std::mutex> lock(versioned_pool->mutex);
68
4
    versioned_pool->processes = std::move(processes);
69
4
    versioned_pool->initialized = initialized;
70
4
}
71
72
7
std::vector<ProcessPtr>& PythonServerManager::process_pool_for_test(const PythonVersion& version) {
73
7
    auto versioned_pool = _get_or_create_process_pool(version);
74
7
    return versioned_pool->processes;
75
7
}
76
#endif
77
78
template <typename ClientType>
79
Status PythonServerManager::get_client(const PythonUDFMeta& func_meta, const PythonVersion& version,
80
                                       std::shared_ptr<ClientType>* client,
81
1
                                       const std::shared_ptr<arrow::Schema>& data_schema) {
82
1
    std::shared_ptr<VersionedProcessPool> versioned_pool =
83
1
            DORIS_TRY(_ensure_pool_initialized(version));
84
85
0
    ProcessPtr process;
86
0
    RETURN_IF_ERROR(_get_process(version, versioned_pool, &process));
87
88
0
    if constexpr (std::is_same_v<ClientType, PythonUDAFClient>) {
89
0
        RETURN_IF_ERROR(ClientType::create(func_meta, std::move(process), data_schema, client));
90
0
    } else {
91
0
        RETURN_IF_ERROR(ClientType::create(func_meta, std::move(process), client));
92
0
    }
93
94
0
    return Status::OK();
95
0
}
_ZN5doris19PythonServerManager10get_clientINS_15PythonUDFClientEEENS_6StatusERKNS_13PythonUDFMetaERKNS_13PythonVersionEPSt10shared_ptrIT_ERKSA_IN5arrow6SchemaEE
Line
Count
Source
81
1
                                       const std::shared_ptr<arrow::Schema>& data_schema) {
82
1
    std::shared_ptr<VersionedProcessPool> versioned_pool =
83
1
            DORIS_TRY(_ensure_pool_initialized(version));
84
85
0
    ProcessPtr process;
86
0
    RETURN_IF_ERROR(_get_process(version, versioned_pool, &process));
87
88
    if constexpr (std::is_same_v<ClientType, PythonUDAFClient>) {
89
        RETURN_IF_ERROR(ClientType::create(func_meta, std::move(process), data_schema, client));
90
0
    } else {
91
0
        RETURN_IF_ERROR(ClientType::create(func_meta, std::move(process), client));
92
0
    }
93
94
0
    return Status::OK();
95
0
}
Unexecuted instantiation: _ZN5doris19PythonServerManager10get_clientINS_16PythonUDAFClientEEENS_6StatusERKNS_13PythonUDFMetaERKNS_13PythonVersionEPSt10shared_ptrIT_ERKSA_IN5arrow6SchemaEE
Unexecuted instantiation: _ZN5doris19PythonServerManager10get_clientINS_16PythonUDTFClientEEENS_6StatusERKNS_13PythonUDFMetaERKNS_13PythonVersionEPSt10shared_ptrIT_ERKSA_IN5arrow6SchemaEE
96
97
Result<std::shared_ptr<PythonServerManager::VersionedProcessPool>>
98
16
PythonServerManager::_ensure_pool_initialized(const PythonVersion& version) {
99
16
    auto versioned_pool = _get_or_create_process_pool(version);
100
16
    std::lock_guard<std::mutex> lock(versioned_pool->mutex);
101
102
    // Check if already initialized
103
16
    if (versioned_pool->initialized) return versioned_pool;
104
105
    // 0 means use CPU core count as default, otherwise use the specified value
106
13
    int max_pool_size = config::max_python_process_num > 0 ? config::max_python_process_num
107
13
                                                           : CpuInfo::num_cores();
108
109
13
    LOG(INFO) << "Initializing Python process pool for version " << version.to_string() << " with "
110
13
              << max_pool_size
111
13
              << " processes (config::max_python_process_num=" << config::max_python_process_num
112
13
              << ", CPU cores=" << CpuInfo::num_cores() << ")";
113
114
13
    std::vector<std::future<Status>> futures;
115
13
    std::vector<ProcessPtr> temp_processes(max_pool_size);
116
117
61
    for (int i = 0; i < max_pool_size; i++) {
118
48
        futures.push_back(std::async(std::launch::async, [this, &version, i, &temp_processes]() {
119
48
            ProcessPtr process;
120
48
            Status s = fork(version, &process);
121
48
            if (s.ok()) {
122
12
                temp_processes[i] = std::move(process);
123
12
            }
124
48
            return s;
125
48
        }));
126
48
    }
127
128
13
    int success_count = 0;
129
13
    int failure_count = 0;
130
61
    for (int i = 0; i < max_pool_size; i++) {
131
48
        Status s = futures[i].get();
132
48
        if (s.ok() && temp_processes[i]) {
133
12
            versioned_pool->processes.emplace_back(std::move(temp_processes[i]));
134
12
            success_count++;
135
36
        } else {
136
36
            failure_count++;
137
36
            LOG(WARNING) << "Failed to create Python process " << (i + 1) << "/" << max_pool_size
138
36
                         << ": " << s.to_string();
139
36
        }
140
48
    }
141
142
13
    if (versioned_pool->processes.empty()) {
143
3
        return ResultError(Status::Error<ErrorCode::SERVICE_UNAVAILABLE>(
144
3
                "Failed to initialize Python process pool: all {} process creation attempts failed",
145
3
                max_pool_size));
146
3
    }
147
148
13
    LOG(INFO) << "Python process pool initialized for version " << version.to_string()
149
10
              << ": created " << success_count << " processes"
150
10
              << (failure_count > 0 ? fmt::format(" ({} failed)", failure_count) : "");
151
152
10
    versioned_pool->initialized = true;
153
10
    _start_health_check_thread();
154
155
10
    return versioned_pool;
156
13
}
157
158
Status PythonServerManager::_get_process(
159
        const PythonVersion& version, const std::shared_ptr<VersionedProcessPool>& versioned_pool,
160
12
        ProcessPtr* process) {
161
12
    std::lock_guard<std::mutex> lock(versioned_pool->mutex);
162
12
    std::vector<ProcessPtr>& pool = versioned_pool->processes;
163
164
12
    if (UNLIKELY(pool.empty())) {
165
1
        return Status::InternalError("Python process pool is empty for version {}",
166
1
                                     version.to_string());
167
1
    }
168
169
    // Prefer an already-alive process and only use load balancing inside that alive subset.
170
    // keep dead entries stay in the pool for the background health checker
171
    // unless there is no alive process left for the current request.
172
11
    auto min_alive_iter = std::min_element(pool.begin(), pool.end(),
173
11
                                           [](const ProcessPtr& a, const ProcessPtr& b) {
174
6
                                               const bool a_alive = a && a->is_alive();
175
6
                                               const bool b_alive = b && b->is_alive();
176
6
                                               if (a_alive != b_alive) {
177
1
                                                   return a_alive > b_alive;
178
1
                                               }
179
5
                                               return a.use_count() < b.use_count();
180
6
                                           });
181
182
11
    if (min_alive_iter != pool.end() && *min_alive_iter && (*min_alive_iter)->is_alive()) {
183
10
        *process = *min_alive_iter;
184
10
        return Status::OK();
185
10
    }
186
187
    // Only reach here when the pool has no alive process at all. Try one foreground
188
    // recovery so the caller has a chance to proceed; leave batch repair to health check.
189
1
    auto& candidate = pool.front();
190
1
    ProcessPtr replacement;
191
1
    Status status = fork(version, &replacement);
192
1
    if (!status.ok()) {
193
0
        return Status::Error<ErrorCode::SERVICE_UNAVAILABLE>(
194
0
                "Python process pool has no available process for version {}, reason: {}",
195
0
                version.to_string(), status.to_string());
196
0
    }
197
198
1
    candidate = std::move(replacement);
199
1
    *process = candidate;
200
1
    return Status::OK();
201
1
}
202
203
63
Status PythonServerManager::fork(const PythonVersion& version, ProcessPtr* process) {
204
63
    std::string python_executable_path = version.get_executable_path();
205
63
    std::string fight_server_path = get_fight_server_path();
206
63
    std::string base_unix_socket_path = get_base_unix_socket_path();
207
63
    std::vector<std::string> args = {"-u", fight_server_path, base_unix_socket_path};
208
63
    boost::process::environment env = boost::this_process::environment();
209
63
    boost::process::ipstream child_output;
210
211
63
    try {
212
63
        boost::process::child c(
213
63
                python_executable_path, args, boost::process::std_out > child_output,
214
63
                boost::process::env = env,
215
63
                boost::process::on_exit([](int exit_code, const std::error_code& ec) {
216
0
                    if (ec) {
217
0
                        LOG(WARNING) << "Python UDF server exited with error: " << ec.message();
218
0
                    }
219
0
                }));
220
221
        // Wait for socket file to be created (indicates server is ready)
222
63
        std::string expected_socket_path = get_unix_socket_file_path(c.id());
223
63
        bool started_successfully = false;
224
63
        std::chrono::steady_clock::time_point start = std::chrono::steady_clock::now();
225
63
        const auto timeout = std::chrono::milliseconds(5000);
226
227
2.33k
        while (std::chrono::steady_clock::now() - start < timeout) {
228
2.29k
            struct stat buffer;
229
2.29k
            if (stat(expected_socket_path.c_str(), &buffer) == 0) {
230
21
                started_successfully = true;
231
21
                break;
232
21
            }
233
234
2.27k
            if (!c.running()) {
235
2
                break;
236
2
            }
237
2.27k
            std::this_thread::sleep_for(std::chrono::milliseconds(1));
238
2.27k
        }
239
240
63
        if (!started_successfully) {
241
2
            if (c.running()) {
242
0
                c.terminate();
243
0
                c.wait();
244
0
            }
245
2
            return Status::InternalError("Python server start failed: socket file not found at {}",
246
2
                                         expected_socket_path);
247
2
        }
248
249
61
        *process = std::make_shared<PythonUDFProcess>(std::move(c), std::move(child_output));
250
251
61
    } catch (const std::exception& e) {
252
40
        return Status::InternalError("Failed to start Python UDF server: {}", e.what());
253
40
    }
254
255
21
    return Status::OK();
256
63
}
257
258
10
void PythonServerManager::_start_health_check_thread() {
259
10
    std::lock_guard<std::mutex> lock(_health_check_mutex);
260
10
    if (_health_check_thread) return;
261
262
10
    LOG(INFO) << "Starting Python process health check thread (interval: 30 seconds)";
263
264
8
    _health_check_thread = std::make_unique<std::thread>([this]() {
265
        // Health check loop
266
8
        while (!_shutdown_flag.load(std::memory_order_acquire)) {
267
            // Wait for interval or shutdown signal
268
4
            {
269
4
                std::unique_lock<std::mutex> lock(_health_check_mutex);
270
8
                _health_check_cv.wait_for(lock, std::chrono::seconds(30), [this]() {
271
8
                    return _shutdown_flag.load(std::memory_order_acquire);
272
8
                });
273
4
            }
274
275
4
            if (_shutdown_flag.load(std::memory_order_acquire)) break;
276
277
0
            _check_and_recreate_processes();
278
0
            _refresh_memory_stats();
279
0
        }
280
281
8
        LOG(INFO) << "Python process health check thread exiting";
282
8
    });
283
8
}
284
285
2
void PythonServerManager::_check_and_recreate_processes() {
286
2
    int total_checked = 0;
287
2
    int total_dead = 0;
288
2
    int total_recreated = 0;
289
290
2
    for (auto& [version, versioned_pool] : _snapshot_process_pools()) {
291
2
        std::lock_guard<std::mutex> lock(versioned_pool->mutex);
292
2
        auto& pool = versioned_pool->processes;
293
7
        for (size_t i = 0; i < pool.size(); ++i) {
294
5
            auto& process = pool[i];
295
5
            if (!process) continue;
296
297
4
            total_checked++;
298
4
            if (!process->is_alive()) {
299
3
                total_dead++;
300
3
                LOG(WARNING) << "Detected dead Python process (pid=" << process->get_child_pid()
301
3
                             << ", version=" << version.to_string() << "), recreating...";
302
303
3
                ProcessPtr new_process;
304
3
                Status s = fork(version, &new_process);
305
3
                if (s.ok()) {
306
1
                    pool[i] = std::move(new_process);
307
1
                    total_recreated++;
308
1
                    LOG(INFO) << "Successfully recreated Python process for version "
309
1
                              << version.to_string();
310
2
                } else {
311
2
                    LOG(ERROR) << "Failed to recreate Python process for version "
312
2
                               << version.to_string() << ": " << s.to_string();
313
2
                    pool.erase(pool.begin() + i);
314
2
                    --i;
315
2
                }
316
3
            }
317
4
        }
318
2
    }
319
320
2
    if (total_dead > 0) {
321
2
        LOG(INFO) << "Health check completed: checked=" << total_checked << ", dead=" << total_dead
322
2
                  << ", recreated=" << total_recreated;
323
2
    }
324
2
}
325
326
43
void PythonServerManager::shutdown() {
327
    // Signal health check thread to stop
328
43
    _shutdown_flag.store(true, std::memory_order_release);
329
43
    _health_check_cv.notify_one();
330
331
43
    if (_health_check_thread && _health_check_thread->joinable()) {
332
8
        _health_check_thread->join();
333
8
        _health_check_thread.reset();
334
8
    }
335
336
    // Shutdown all processes
337
43
    for (auto& [version, versioned_pool] : _snapshot_process_pools()) {
338
17
        std::lock_guard<std::mutex> lock(versioned_pool->mutex);
339
17
        auto& pool = versioned_pool->processes;
340
17
        for (auto& process : pool) {
341
17
            if (process) {
342
16
                process->shutdown();
343
16
            }
344
17
        }
345
17
        pool.clear();
346
17
        versioned_pool->initialized = false;
347
17
    }
348
349
43
    {
350
43
        std::lock_guard<std::mutex> lock(_pools_mutex);
351
43
        _process_pools.clear();
352
43
    }
353
43
}
354
355
0
Status PythonServerManager::_read_process_memory(pid_t pid, size_t* rss_bytes) {
356
    // Read from /proc/{pid}/statm
357
    // Format: size resident shared text lib data dt
358
0
    std::string statm_path = fmt::format("/proc/{}/statm", pid);
359
0
    std::ifstream statm_file(statm_path);
360
361
0
    if (!statm_file.is_open()) {
362
0
        return Status::InternalError("Cannot open {}", statm_path);
363
0
    }
364
365
0
    size_t size_pages = 0, rss_pages = 0;
366
    // we only care about RSS, read and ignore the total size field
367
0
    statm_file >> size_pages >> rss_pages;
368
369
0
    if (statm_file.fail()) {
370
0
        return Status::InternalError("Failed to read {}", statm_path);
371
0
    }
372
373
    // Convert pages to bytes
374
0
    long page_size = sysconf(_SC_PAGESIZE);
375
0
    *rss_bytes = rss_pages * page_size;
376
377
0
    return Status::OK();
378
0
}
379
380
0
void PythonServerManager::_refresh_memory_stats() {
381
0
    int64_t total_rss = 0;
382
383
0
    for (const auto& [version, versioned_pool] : _snapshot_process_pools()) {
384
0
        std::lock_guard<std::mutex> lock(versioned_pool->mutex);
385
0
        const auto& pool = versioned_pool->processes;
386
0
        for (const auto& process : pool) {
387
0
            if (!process || !process->is_alive()) continue;
388
389
0
            size_t rss_bytes = 0;
390
0
            Status s = _read_process_memory(process->get_child_pid(), &rss_bytes);
391
392
0
            if (s.ok()) {
393
0
                total_rss += rss_bytes;
394
0
            } else [[unlikely]] {
395
0
                LOG(WARNING) << "Failed to read memory info for Python process (pid="
396
0
                             << process->get_child_pid() << "): " << s.to_string();
397
0
            }
398
0
        }
399
0
    }
400
0
    _mem_tracker.set_consumption(total_rss);
401
0
    LOG(INFO) << _mem_tracker.log_usage();
402
403
0
    if (config::python_udf_processes_memory_limit_bytes > 0 &&
404
0
        total_rss > config::python_udf_processes_memory_limit_bytes) {
405
0
        LOG(WARNING) << "Python UDF process memory usage exceeds limit: rss_bytes=" << total_rss
406
0
                     << ", limit_bytes=" << config::python_udf_processes_memory_limit_bytes;
407
0
    }
408
0
}
409
410
0
Status PythonServerManager::clear_module_cache(const std::string& location) {
411
0
    if (location.empty()) {
412
0
        return Status::InvalidArgument("Empty location for clear_module_cache");
413
0
    }
414
415
0
    std::string body = fmt::format(R"({{"location": "{}"}})", location);
416
417
0
    int success_count = 0;
418
0
    int fail_count = 0;
419
0
    bool has_active_process = false;
420
421
0
    for (auto& [version, versioned_pool] : _snapshot_process_pools()) {
422
0
        std::lock_guard<std::mutex> lock(versioned_pool->mutex);
423
0
        auto& pool = versioned_pool->processes;
424
0
        for (auto& process : pool) {
425
0
            if (!process || !process->is_alive()) {
426
0
                continue;
427
0
            }
428
0
            has_active_process = true;
429
0
            try {
430
0
                auto loc_result = arrow::flight::Location::Parse(process->get_uri());
431
0
                if (!loc_result.ok()) [[unlikely]] {
432
0
                    fail_count++;
433
0
                    continue;
434
0
                }
435
436
0
                auto client_result = arrow::flight::FlightClient::Connect(*loc_result);
437
0
                if (!client_result.ok()) [[unlikely]] {
438
0
                    fail_count++;
439
0
                    continue;
440
0
                }
441
0
                auto client = std::move(*client_result);
442
443
0
                arrow::flight::Action action;
444
0
                action.type = "clear_module_cache";
445
0
                action.body = arrow::Buffer::FromString(body);
446
447
0
                auto result_stream = client->DoAction(action);
448
0
                if (!result_stream.ok()) {
449
0
                    fail_count++;
450
0
                    continue;
451
0
                }
452
453
0
                auto result = (*result_stream)->Next();
454
0
                if (result.ok() && *result) {
455
0
                    success_count++;
456
0
                } else {
457
0
                    fail_count++;
458
0
                }
459
460
0
            } catch (...) {
461
0
                fail_count++;
462
0
            }
463
0
        }
464
0
    }
465
466
0
    if (!has_active_process) {
467
0
        return Status::OK();
468
0
    }
469
470
0
    LOG(INFO) << "clear_module_cache completed for location=" << location
471
0
              << ", success=" << success_count << ", failed=" << fail_count;
472
473
0
    if (fail_count > 0) {
474
0
        return Status::InternalError(
475
0
                "clear_module_cache failed for location={}, success={}, failed={}", location,
476
0
                success_count, fail_count);
477
0
    }
478
479
0
    return Status::OK();
480
0
}
481
482
// Explicit template instantiation for UDF, UDAF and UDTF clients
483
template Status PythonServerManager::get_client<PythonUDFClient>(
484
        const PythonUDFMeta& func_meta, const PythonVersion& version,
485
        std::shared_ptr<PythonUDFClient>* client,
486
        const std::shared_ptr<arrow::Schema>& data_schema);
487
488
template Status PythonServerManager::get_client<PythonUDAFClient>(
489
        const PythonUDFMeta& func_meta, const PythonVersion& version,
490
        std::shared_ptr<PythonUDAFClient>* client,
491
        const std::shared_ptr<arrow::Schema>& data_schema);
492
493
template Status PythonServerManager::get_client<PythonUDTFClient>(
494
        const PythonUDFMeta& func_meta, const PythonVersion& version,
495
        std::shared_ptr<PythonUDTFClient>* client,
496
        const std::shared_ptr<arrow::Schema>& data_schema);
497
498
} // namespace doris