be/src/udf/python/python_env.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
#include "udf/python/python_env.h"

#include <fmt/core.h>
#include <gen_cpp/BackendService_types.h>
#include <rapidjson/document.h>

#include <filesystem>
#include <memory>
#include <regex>
#include <system_error>
#include <vector>

#include "common/status.h"
#include "udf/python/python_server.h"
#include "util/string_util.h"
32 | | |
33 | | namespace doris { |
34 | | |
35 | | namespace fs = std::filesystem; |
36 | | |
37 | 2 | static std::string _python_env_type_to_string(PythonEnvType env_type) { |
38 | 2 | switch (env_type) { |
39 | 2 | case PythonEnvType::CONDA: |
40 | 2 | return "conda"; |
41 | 0 | case PythonEnvType::VENV: |
42 | 0 | return "venv"; |
43 | 2 | } |
44 | 0 | return "unknown"; |
45 | 2 | } |
46 | | |
47 | | // extract python version by executing `python --version` and extract "3.9.16" from "Python 3.9.16" |
48 | | // @param python_path: path to python executable, e.g. "/opt/miniconda3/envs/myenv/bin/python" |
49 | | // @param version: extracted python version, e.g. "3.9.16" |
50 | 36 | static Status extract_python_version(const std::string& python_path, std::string* version) { |
51 | 36 | static std::regex python_version_re(R"(^Python (\d+\.\d+\.\d+))"); |
52 | | |
53 | 36 | if (!fs::exists(python_path)) { |
54 | 1 | return Status::NotFound("Python executable not found: {}", python_path); |
55 | 1 | } |
56 | | |
57 | 35 | std::string cmd = fmt::format("\"{}\" --version", python_path); |
58 | 35 | FILE* pipe = popen(cmd.c_str(), "r"); |
59 | 35 | if (!pipe) { |
60 | 0 | return Status::InternalError("Failed to run: {}", cmd); |
61 | 0 | } |
62 | | |
63 | 35 | std::string result; |
64 | 35 | char buf[128]; |
65 | 70 | while (fgets(buf, sizeof(buf), pipe)) { |
66 | 35 | result += buf; |
67 | 35 | } |
68 | 35 | pclose(pipe); |
69 | | |
70 | 35 | std::smatch match; |
71 | 35 | if (std::regex_search(result, match, python_version_re)) { |
72 | 34 | *version = match[1].str(); |
73 | 34 | return Status::OK(); |
74 | 34 | } |
75 | | |
76 | 1 | return Status::InternalError("Failed to extract Python version from path: {}, result: {}", |
77 | 1 | python_path, result); |
78 | 35 | } |
79 | | |
80 | | PythonEnvironment::PythonEnvironment(const std::string& name, const PythonVersion& python_version) |
81 | 23 | : env_name(name), python_version(python_version) {} |
82 | | |
83 | 8 | std::string PythonEnvironment::to_string() const { |
84 | 8 | return fmt::format( |
85 | 8 | "[env_name: {}, env_base_path: {}, python_base_path: {}, python_full_version: {}]", |
86 | 8 | env_name, python_version.base_path, python_version.executable_path, |
87 | 8 | python_version.full_version); |
88 | 8 | } |
89 | | |
90 | 21 | bool PythonEnvironment::is_valid() const { |
91 | 21 | if (!python_version.is_valid()) return false; |
92 | | |
93 | 20 | auto perms = fs::status(python_version.executable_path).permissions(); |
94 | 20 | if ((perms & fs::perms::owner_exec) == fs::perms::none) { |
95 | 1 | return false; |
96 | 1 | } |
97 | | |
98 | 19 | std::string version; |
99 | 19 | if (!extract_python_version(python_version.executable_path, &version).ok()) { |
100 | 1 | LOG(WARNING) << "Failed to extract python version from path: " |
101 | 1 | << python_version.executable_path; |
102 | 1 | return false; |
103 | 1 | } |
104 | | |
105 | 18 | return python_version.full_version == version; |
106 | 19 | } |
107 | | |
108 | | // Scan for environments under the /{conda_root_path}/envs directory from the conda root. |
109 | | Status PythonEnvironment::scan_from_conda_root_path(const fs::path& conda_root_path, |
110 | 13 | std::vector<PythonEnvironment>* environments) { |
111 | 13 | DCHECK(!conda_root_path.empty() && environments != nullptr); |
112 | | |
113 | 13 | fs::path envs_dir = conda_root_path / "envs"; |
114 | 13 | if (!fs::exists(envs_dir) || !fs::is_directory(envs_dir)) { |
115 | 3 | return Status::NotFound("Conda envs directory not found: {}", envs_dir.string()); |
116 | 3 | } |
117 | | |
118 | 13 | for (const auto& entry : fs::directory_iterator(envs_dir)) { |
119 | 13 | if (!entry.is_directory()) continue; |
120 | | |
121 | 11 | std::string env_name = entry.path().filename(); // e.g. "myenv" |
122 | 11 | std::string env_base_path = entry.path(); // e.g. "/opt/miniconda3/envs/myenv" |
123 | 11 | std::string python_path = |
124 | 11 | env_base_path + "/bin/python"; // e.g. "/{env_base_path}/bin/python" |
125 | 11 | std::string python_full_version; // e.g. "3.9.16" |
126 | 11 | RETURN_IF_ERROR(extract_python_version(python_path, &python_full_version)); |
127 | 10 | size_t pos = python_full_version.find_last_of('.'); |
128 | | |
129 | 10 | if (UNLIKELY(pos == std::string::npos)) { |
130 | 0 | return Status::InvalidArgument("Invalid python version: {}", python_full_version); |
131 | 0 | } |
132 | | |
133 | 10 | PythonVersion python_version(python_full_version, env_base_path, python_path); |
134 | 10 | PythonEnvironment conda_env(env_name, python_version); |
135 | | |
136 | 10 | if (UNLIKELY(!conda_env.is_valid())) { |
137 | 0 | LOG(WARNING) << "Invalid conda environment: " << conda_env.to_string(); |
138 | 0 | continue; |
139 | 0 | } |
140 | | |
141 | 10 | environments->push_back(std::move(conda_env)); |
142 | 10 | } |
143 | | |
144 | 9 | if (environments->empty()) { |
145 | 1 | return Status::NotFound("No conda python environments found"); |
146 | 1 | } |
147 | | |
148 | 8 | return Status::OK(); |
149 | 9 | } |
150 | | |
151 | | Status PythonEnvironment::scan_from_venv_root_path( |
152 | | const fs::path& venv_root_path, const std::vector<std::string>& interpreter_paths, |
153 | 9 | std::vector<PythonEnvironment>* environments) { |
154 | 9 | DCHECK(!venv_root_path.empty() && environments != nullptr); |
155 | | |
156 | 9 | for (const auto& interpreter_path : interpreter_paths) { |
157 | 9 | if (!fs::exists(interpreter_path) || !fs::is_regular_file(interpreter_path)) { |
158 | 3 | return Status::NotFound("Interpreter path not found: {}", interpreter_path); |
159 | 3 | } |
160 | 6 | std::string python_full_version; |
161 | 6 | RETURN_IF_ERROR(extract_python_version(interpreter_path, &python_full_version)); |
162 | 6 | size_t pos = python_full_version.find_last_of('.'); |
163 | 6 | if (UNLIKELY(pos == std::string::npos)) { |
164 | 0 | return Status::InvalidArgument("Invalid python version: {}", python_full_version); |
165 | 0 | } |
166 | | // Extract major.minor version (e.g., "3.12" from "3.12.0") |
167 | 6 | std::string python_major_minor_version = python_full_version.substr(0, pos); |
168 | | |
169 | 6 | std::string env_name = fmt::format("python{}", python_full_version); // e.g. "python3.9.16" |
170 | 6 | std::string env_base_path = fmt::format("{}/{}", venv_root_path.string(), |
171 | 6 | env_name); // e.g. "/opt/venv/python3.9.16" |
172 | 6 | std::string python_path = |
173 | 6 | fmt::format("{}/bin/python", env_base_path); // e.g. "/{venv_base_path}/bin/python" |
174 | | |
175 | 6 | if (!fs::exists(env_base_path) || !fs::exists(python_path)) { |
176 | 0 | fs::create_directories(env_base_path); |
177 | | // Use --system-site-packages to inherit packages from system Python |
178 | | // This ensures pandas and pyarrow are available if installed in system |
179 | 0 | std::string create_venv_cmd = fmt::format("{} -m venv --system-site-packages {}", |
180 | 0 | interpreter_path, env_base_path); |
181 | |
|
182 | 0 | if (system(create_venv_cmd.c_str()) != 0 || !fs::exists(python_path)) { |
183 | 0 | return Status::RuntimeError("Failed to create python virtual environment, cmd: {}", |
184 | 0 | create_venv_cmd); |
185 | 0 | } |
186 | 0 | } |
187 | | |
188 | | // Use major.minor version for site-packages path (e.g., "python3.12") |
189 | 6 | std::string python_dependency_path = fmt::format("{}/lib/python{}/site-packages", |
190 | 6 | env_base_path, python_major_minor_version); |
191 | | |
192 | 6 | if (!fs::exists(python_dependency_path)) { |
193 | 0 | return Status::NotFound("Python dependency path not found: {}", python_dependency_path); |
194 | 0 | } |
195 | | |
196 | 6 | PythonVersion python_version(python_full_version, env_base_path, python_path); |
197 | 6 | PythonEnvironment venv_env(env_name, python_version); |
198 | | |
199 | 6 | if (UNLIKELY(!venv_env.is_valid())) { |
200 | 0 | LOG(WARNING) << "Invalid venv environment: " << venv_env.to_string(); |
201 | 0 | continue; |
202 | 0 | } |
203 | | |
204 | 6 | environments->push_back(std::move(venv_env)); |
205 | 6 | } |
206 | | |
207 | 6 | if (environments->empty()) { |
208 | 0 | return Status::NotFound("No venv python environments found"); |
209 | 0 | } |
210 | | |
211 | 6 | return Status::OK(); |
212 | 6 | } |
213 | | |
214 | 10 | Status PythonEnvScanner::get_versions(std::vector<PythonVersion>* versions) const { |
215 | 10 | DCHECK(versions != nullptr); |
216 | 10 | if (_envs.empty()) { |
217 | 1 | return Status::InternalError("not found available version"); |
218 | 1 | } |
219 | 10 | for (const auto& env : _envs) { |
220 | 10 | versions->push_back(env.python_version); |
221 | 10 | } |
222 | 9 | return Status::OK(); |
223 | 10 | } |
224 | | |
225 | | Status PythonEnvScanner::get_version(const std::string& runtime_version, |
226 | 3.47k | PythonVersion* version) const { |
227 | 3.47k | if (_envs.empty()) { |
228 | 1 | return Status::InternalError("not found available version"); |
229 | 1 | } |
230 | 3.46k | std::string_view runtime_version_view(runtime_version); |
231 | 3.46k | runtime_version_view = trim(runtime_version_view); |
232 | 3.47k | for (const auto& env : _envs) { |
233 | 3.47k | if (env.python_version.full_version == runtime_version_view) { |
234 | 3.47k | *version = env.python_version; |
235 | 3.47k | return Status::OK(); |
236 | 3.47k | } |
237 | 3.47k | } |
238 | 18.4E | return Status::NotFound("not found runtime version: {}", runtime_version); |
239 | 3.46k | } |
240 | | |
241 | 6 | Status CondaEnvScanner::scan() { |
242 | 6 | RETURN_IF_ERROR(PythonEnvironment::scan_from_conda_root_path(_env_root_path, &_envs)); |
243 | 5 | return Status::OK(); |
244 | 6 | } |
245 | | |
246 | 2 | std::string CondaEnvScanner::to_string() const { |
247 | 2 | std::stringstream ss; |
248 | 2 | ss << "Conda environments: "; |
249 | 2 | for (const auto& conda_env : _envs) { |
250 | 1 | ss << conda_env.to_string() << ", "; |
251 | 1 | } |
252 | 2 | return ss.str(); |
253 | 2 | } |
254 | | |
255 | 8 | Status VenvEnvScanner::scan() { |
256 | 8 | RETURN_IF_ERROR(PythonEnvironment::scan_from_venv_root_path(_env_root_path, _interpreter_paths, |
257 | 8 | &_envs)); |
258 | 6 | return Status::OK(); |
259 | 8 | } |
260 | | |
261 | 7 | std::string VenvEnvScanner::to_string() const { |
262 | 7 | std::stringstream ss; |
263 | 7 | ss << "Venv environments: "; |
264 | 7 | for (const auto& venv_env : _envs) { |
265 | 6 | ss << venv_env.to_string() << ", "; |
266 | 6 | } |
267 | 7 | return ss.str(); |
268 | 7 | } |
269 | | |
270 | | Status PythonEnvScannerHolder::init(PythonEnvType env_type, const fs::path& python_root_path, |
271 | 12 | const std::string& python_venv_interpreter_paths) { |
272 | 12 | switch (env_type) { |
273 | 3 | case PythonEnvType::CONDA: { |
274 | 3 | if (!fs::exists(python_root_path) || !fs::is_directory(python_root_path)) { |
275 | 1 | return Status::InvalidArgument("Invalid conda root path: {}", |
276 | 1 | python_root_path.string()); |
277 | 1 | } |
278 | 2 | _env_scanner = std::make_unique<CondaEnvScanner>(python_root_path); |
279 | 2 | break; |
280 | 3 | } |
281 | 8 | case PythonEnvType::VENV: { |
282 | 8 | if (!fs::exists(python_root_path) || !fs::is_directory(python_root_path)) { |
283 | 1 | return Status::InvalidArgument("Invalid venv root path: {}", python_root_path.string()); |
284 | 1 | } |
285 | 7 | std::vector<std::string> interpreter_paths = split(python_venv_interpreter_paths, ":"); |
286 | 7 | if (interpreter_paths.empty()) { |
287 | 0 | return Status::InvalidArgument("Invalid python interpreter paths: {}", |
288 | 0 | python_venv_interpreter_paths); |
289 | 0 | } |
290 | 7 | _env_scanner = std::make_unique<VenvEnvScanner>(python_root_path, interpreter_paths); |
291 | 7 | break; |
292 | 7 | } |
293 | 1 | default: |
294 | 1 | return Status::NotSupported("Unsupported python runtime type: {}", |
295 | 1 | static_cast<int>(env_type)); |
296 | 12 | } |
297 | 9 | std::vector<PythonVersion> versions; |
298 | 9 | RETURN_IF_ERROR(_env_scanner->scan()); |
299 | 8 | RETURN_IF_ERROR(_env_scanner->get_versions(&versions)); |
300 | 8 | return Status::OK(); |
301 | 8 | } |
302 | | |
303 | 2 | std::vector<TPythonEnvInfo> PythonVersionManager::env_infos_to_thrift() const { |
304 | 2 | std::vector<TPythonEnvInfo> infos; |
305 | 2 | const auto& envs = this->get_envs(); |
306 | 2 | infos.reserve(envs.size()); |
307 | | |
308 | 2 | const auto env_type_str = _python_env_type_to_string(this->env_type()); |
309 | 2 | for (const auto& env : envs) { |
310 | 2 | TPythonEnvInfo info; |
311 | 2 | info.__set_env_name(env.env_name); |
312 | 2 | info.__set_full_version(env.python_version.full_version); |
313 | 2 | info.__set_env_type(env_type_str); |
314 | 2 | info.__set_base_path(env.python_version.base_path); |
315 | 2 | info.__set_executable_path(env.python_version.executable_path); |
316 | 2 | infos.emplace_back(std::move(info)); |
317 | 2 | } |
318 | | |
319 | 2 | return infos; |
320 | 2 | } |
321 | | |
322 | | std::vector<TPythonPackageInfo> PythonVersionManager::package_infos_to_thrift( |
323 | 1 | const std::vector<std::pair<std::string, std::string>>& packages) const { |
324 | 1 | std::vector<TPythonPackageInfo> infos; |
325 | 1 | infos.reserve(packages.size()); |
326 | 11 | for (const auto& [name, ver] : packages) { |
327 | 11 | TPythonPackageInfo info; |
328 | 11 | info.__set_package_name(name); |
329 | 11 | info.__set_version(ver); |
330 | 11 | infos.emplace_back(std::move(info)); |
331 | 11 | } |
332 | 1 | return infos; |
333 | 1 | } |
334 | | |
335 | | Status list_installed_packages(const PythonVersion& version, |
336 | 7 | std::vector<std::pair<std::string, std::string>>* packages) { |
337 | 7 | DCHECK(packages != nullptr); |
338 | 7 | if (!version.is_valid()) { |
339 | 1 | return Status::InvalidArgument("Invalid python version: {}", version.to_string()); |
340 | 1 | } |
341 | | |
342 | | // Run pip list --format=json to get installed packages |
343 | 6 | std::string cmd = |
344 | 6 | fmt::format("\"{}\" -m pip list --format=json 2>/dev/null", version.executable_path); |
345 | 6 | FILE* pipe = popen(cmd.c_str(), "r"); |
346 | 6 | if (!pipe) { |
347 | 0 | return Status::InternalError("Failed to run pip list for python version: {}", |
348 | 0 | version.full_version); |
349 | 0 | } |
350 | | |
351 | 6 | std::string result; |
352 | 6 | char buf[4096]; |
353 | 12 | while (fgets(buf, sizeof(buf), pipe)) { |
354 | 6 | result += buf; |
355 | 6 | } |
356 | 6 | int ret = pclose(pipe); |
357 | 6 | if (ret != 0) { |
358 | 1 | return Status::InternalError( |
359 | 1 | "pip list failed for python version: {}, exit code: {}, output: {}", |
360 | 1 | version.full_version, ret, result); |
361 | 1 | } |
362 | | |
363 | | // Parse JSON output: [{"name": "pkg", "version": "1.0"}, ...] |
364 | | // Simple JSON parsing without external library |
365 | | // Each entry looks like: {"name": "package_name", "version": "1.2.3"} |
366 | 5 | rapidjson::Document doc; |
367 | 5 | if (doc.Parse(result.data(), result.size()).HasParseError() || !doc.IsArray()) [[unlikely]] { |
368 | 2 | return Status::InternalError("Failed to parse pip list json output for python version: {}", |
369 | 2 | version.full_version); |
370 | 2 | } |
371 | | |
372 | 3 | packages->reserve(packages->size() + doc.Size()); |
373 | 14 | for (const auto& item : doc.GetArray()) { |
374 | 14 | auto name_it = item.FindMember("name"); |
375 | 14 | auto version_it = item.FindMember("version"); |
376 | 14 | if (name_it == item.MemberEnd() || version_it == item.MemberEnd() || |
377 | 14 | !name_it->value.IsString() || !version_it->value.IsString()) [[unlikely]] { |
378 | 1 | return Status::InternalError("Invalid pip list json format for python version: {}", |
379 | 1 | version.full_version); |
380 | 1 | } |
381 | 13 | packages->emplace_back(name_it->value.GetString(), version_it->value.GetString()); |
382 | 13 | } |
383 | | |
384 | 2 | return Status::OK(); |
385 | 3 | } |
386 | | |
387 | | } // namespace doris |