be/src/io/file_factory.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "io/file_factory.h" |
19 | | |
20 | | #include <gen_cpp/PaloInternalService_types.h> |
21 | | #include <gen_cpp/PlanNodes_types.h> |
22 | | #include <gen_cpp/Types_types.h> |
23 | | |
24 | | #include <mutex> |
25 | | #include <utility> |
26 | | |
27 | | #include "common/cast_set.h" |
28 | | #include "common/config.h" |
29 | | #include "common/status.h" |
30 | | #include "io/fs/broker_file_system.h" |
31 | | #include "io/fs/broker_file_writer.h" |
32 | | #include "io/fs/file_reader.h" |
33 | | #include "io/fs/file_system.h" |
34 | | #include "io/fs/hdfs/hdfs_mgr.h" |
35 | | #include "io/fs/hdfs_file_reader.h" |
36 | | #include "io/fs/hdfs_file_system.h" |
37 | | #include "io/fs/hdfs_file_writer.h" |
38 | | #include "io/fs/http_file_reader.h" |
39 | | #include "io/fs/http_file_system.h" |
40 | | #include "io/fs/local_file_system.h" |
41 | | #include "io/fs/multi_table_pipe.h" |
42 | | #include "io/fs/s3_file_reader.h" |
43 | | #include "io/fs/s3_file_system.h" |
44 | | #include "io/fs/s3_file_writer.h" |
45 | | #include "io/fs/stream_load_pipe.h" |
46 | | #include "io/hdfs_builder.h" |
47 | | #include "io/hdfs_util.h" |
48 | | #include "load/stream_load/new_load_stream_mgr.h" |
49 | | #include "load/stream_load/stream_load_context.h" |
50 | | #include "runtime/exec_env.h" |
51 | | #include "runtime/runtime_state.h" |
52 | | #include "util/s3_uri.h" |
53 | | #include "util/s3_util.h" |
54 | | #include "util/uid_util.h" |
55 | | |
56 | | namespace doris { |
57 | | #include "common/compile_check_begin.h" |
58 | | |
59 | | constexpr std::string_view RANDOM_CACHE_BASE_PATH = "random"; |
60 | | |
61 | | io::FileReaderOptions FileFactory::get_reader_options(RuntimeState* state, |
62 | 1.08k | const io::FileDescription& fd) { |
63 | 1.08k | io::FileReaderOptions opts { |
64 | 1.08k | .cache_base_path {}, |
65 | 1.08k | .file_size = fd.file_size, |
66 | 1.08k | .mtime = fd.mtime, |
67 | 1.08k | }; |
68 | 1.08k | if (config::enable_file_cache && state != nullptr && |
69 | 1.08k | state->query_options().__isset.enable_file_cache && |
70 | 1.08k | state->query_options().enable_file_cache) { |
71 | 0 | opts.cache_type = io::FileCachePolicy::FILE_BLOCK_CACHE; |
72 | 0 | } |
73 | 1.08k | if (state != nullptr && state->query_options().__isset.file_cache_base_path && |
74 | 1.08k | state->query_options().file_cache_base_path != RANDOM_CACHE_BASE_PATH) { |
75 | 0 | opts.cache_base_path = state->query_options().file_cache_base_path; |
76 | 0 | } |
77 | 1.08k | return opts; |
78 | 1.08k | } |
79 | | |
80 | 0 | int32_t get_broker_index(const std::vector<TNetworkAddress>& brokers, const std::string& path) { |
81 | 0 | if (brokers.empty()) { |
82 | 0 | return -1; |
83 | 0 | } |
84 | | |
85 | | // firstly find local broker |
86 | 0 | const auto local_host = BackendOptions::get_localhost(); |
87 | 0 | for (int32_t i = 0; i < brokers.size(); ++i) { |
88 | 0 | if (brokers[i].hostname == local_host) { |
89 | 0 | return i; |
90 | 0 | } |
91 | 0 | } |
92 | | |
93 | | // secondly select broker by hash of file path |
94 | 0 | auto key = HashUtil::hash(path.data(), cast_set<uint32_t>(path.size()), 0); |
95 | |
|
96 | 0 | return key % brokers.size(); |
97 | 0 | } |
98 | | |
99 | | Result<io::FileSystemSPtr> FileFactory::create_fs(const io::FSPropertiesRef& fs_properties, |
100 | 0 | const io::FileDescription& file_description) { |
101 | 0 | switch (fs_properties.type) { |
102 | 0 | case TFileType::FILE_LOCAL: |
103 | 0 | return io::global_local_filesystem(); |
104 | 0 | case TFileType::FILE_BROKER: { |
105 | 0 | auto index = get_broker_index(*fs_properties.broker_addresses, file_description.path); |
106 | 0 | if (index < 0) { |
107 | 0 | return ResultError(Status::InternalError("empty broker_addresses")); |
108 | 0 | } |
109 | 0 | LOG_INFO("select broker: {} for file {}", (*fs_properties.broker_addresses)[index].hostname, |
110 | 0 | file_description.path); |
111 | 0 | return io::BrokerFileSystem::create((*fs_properties.broker_addresses)[index], |
112 | 0 | *fs_properties.properties, io::FileSystem::TMP_FS_ID); |
113 | 0 | } |
114 | 0 | case TFileType::FILE_S3: { |
115 | 0 | S3URI s3_uri(file_description.path); |
116 | 0 | RETURN_IF_ERROR_RESULT(s3_uri.parse()); |
117 | 0 | S3Conf s3_conf; |
118 | 0 | RETURN_IF_ERROR_RESULT(S3ClientFactory::convert_properties_to_s3_conf( |
119 | 0 | *fs_properties.properties, s3_uri, &s3_conf)); |
120 | 0 | return io::S3FileSystem::create(std::move(s3_conf), io::FileSystem::TMP_FS_ID); |
121 | 0 | } |
122 | 0 | case TFileType::FILE_HDFS: { |
123 | 0 | std::string fs_name = _get_fs_name(file_description); |
124 | 0 | return io::HdfsFileSystem::create(*fs_properties.properties, fs_name, |
125 | 0 | io::FileSystem::TMP_FS_ID, nullptr); |
126 | 0 | } |
127 | 0 | case TFileType::FILE_HTTP: { |
128 | 0 | const auto& kv = *fs_properties.properties; |
129 | 0 | auto it = kv.find("uri"); |
130 | 0 | if (it == kv.end() || it->second.empty()) { |
131 | 0 | return ResultError(Status::InternalError("http fs must set uri property")); |
132 | 0 | } |
133 | 0 | return io::HttpFileSystem::create(it->second, io::FileSystem::TMP_FS_ID, kv); |
134 | 0 | } |
135 | 0 | default: |
136 | 0 | return ResultError(Status::InternalError("unsupported fs type: {}", |
137 | 0 | std::to_string(fs_properties.type))); |
138 | 0 | } |
139 | 0 | } |
140 | | |
141 | 0 | std::string FileFactory::_get_fs_name(const io::FileDescription& file_description) { |
142 | | // If the destination path contains a schema, use the schema directly. |
143 | | // If not, use origin file_description.fs_name |
144 | | // Because the default fsname in file_description.fs_name maybe different from |
145 | | // file's. |
146 | | // example: |
147 | | // hdfs://host:port/path1/path2 --> hdfs://host:port |
148 | | // hdfs://nameservice/path1/path2 --> hdfs://nameservice |
149 | 0 | std::string fs_name = file_description.fs_name; |
150 | 0 | std::string::size_type idx = file_description.path.find("://"); |
151 | 0 | if (idx != std::string::npos) { |
152 | 0 | idx = file_description.path.find('/', idx + 3); |
153 | 0 | if (idx != std::string::npos) { |
154 | 0 | fs_name = file_description.path.substr(0, idx); |
155 | 0 | } |
156 | 0 | } |
157 | 0 | return fs_name; |
158 | 0 | } |
159 | | |
160 | | Result<io::FileWriterPtr> FileFactory::create_file_writer( |
161 | | TFileType::type type, ExecEnv* env, const std::vector<TNetworkAddress>& broker_addresses, |
162 | | const std::map<std::string, std::string>& properties, const std::string& path, |
163 | 347 | const io::FileWriterOptions& options) { |
164 | 347 | io::FileWriterPtr file_writer; |
165 | 347 | switch (type) { |
166 | 81 | case TFileType::FILE_LOCAL: { |
167 | 81 | RETURN_IF_ERROR_RESULT(io::global_local_filesystem()->create_file(path, &file_writer)); |
168 | 81 | return file_writer; |
169 | 81 | } |
170 | 0 | case TFileType::FILE_BROKER: { |
171 | 0 | auto index = get_broker_index(broker_addresses, path); |
172 | 0 | if (index < 0) { |
173 | 0 | return ResultError(Status::InternalError("empty broker_addresses")); |
174 | 0 | } |
175 | 0 | LOG_INFO("select broker: {} for file {}", broker_addresses[index].hostname, path); |
176 | 0 | return io::BrokerFileWriter::create(env, broker_addresses[index], properties, path); |
177 | 0 | } |
178 | 266 | case TFileType::FILE_S3: { |
179 | 266 | S3URI s3_uri(path); |
180 | 266 | RETURN_IF_ERROR_RESULT(s3_uri.parse()); |
181 | 266 | S3Conf s3_conf; |
182 | 266 | RETURN_IF_ERROR_RESULT( |
183 | 266 | S3ClientFactory::convert_properties_to_s3_conf(properties, s3_uri, &s3_conf)); |
184 | 266 | auto client = std::make_shared<io::ObjClientHolder>(std::move(s3_conf.client_conf)); |
185 | 266 | RETURN_IF_ERROR_RESULT(client->init()); |
186 | 266 | return std::make_unique<io::S3FileWriter>(std::move(client), std::move(s3_conf.bucket), |
187 | 266 | s3_uri.get_key(), &options); |
188 | 266 | } |
189 | 0 | case TFileType::FILE_HDFS: { |
190 | 0 | THdfsParams hdfs_params = parse_properties(properties); |
191 | 0 | std::shared_ptr<io::HdfsHandler> handler; |
192 | 0 | RETURN_IF_ERROR_RESULT(ExecEnv::GetInstance()->hdfs_mgr()->get_or_create_fs( |
193 | 0 | hdfs_params, hdfs_params.fs_name, &handler)); |
194 | 0 | return io::HdfsFileWriter::create(path, handler, hdfs_params.fs_name, &options); |
195 | 0 | } |
196 | 0 | default: |
197 | 0 | return ResultError( |
198 | 0 | Status::InternalError("unsupported file writer type: {}", std::to_string(type))); |
199 | 347 | } |
200 | 347 | } |
201 | | |
202 | | Result<io::FileReaderSPtr> FileFactory::create_file_reader( |
203 | | const io::FileSystemProperties& system_properties, |
204 | | const io::FileDescription& file_description, const io::FileReaderOptions& reader_options, |
205 | 1.08k | RuntimeProfile* profile) { |
206 | 1.08k | auto reader_res = _create_file_reader_internal(system_properties, file_description, |
207 | 1.08k | reader_options, profile); |
208 | 1.08k | if (!reader_res.has_value()) { |
209 | 0 | return unexpected(std::move(reader_res).error()); |
210 | 0 | } |
211 | 1.08k | return std::move(reader_res).value(); |
212 | 1.08k | } |
213 | | |
214 | | Result<io::FileReaderSPtr> FileFactory::_create_file_reader_internal( |
215 | | const io::FileSystemProperties& system_properties, |
216 | | const io::FileDescription& file_description, const io::FileReaderOptions& reader_options, |
217 | 1.08k | RuntimeProfile* profile) { |
218 | 1.08k | TFileType::type type = system_properties.system_type; |
219 | 1.08k | switch (type) { |
220 | 138 | case TFileType::FILE_LOCAL: { |
221 | 138 | io::FileReaderSPtr file_reader; |
222 | 138 | RETURN_IF_ERROR_RESULT(io::global_local_filesystem()->open_file( |
223 | 138 | file_description.path, &file_reader, &reader_options)); |
224 | 138 | return file_reader; |
225 | 138 | } |
226 | 944 | case TFileType::FILE_S3: { |
227 | 944 | S3URI s3_uri(file_description.path); |
228 | 944 | RETURN_IF_ERROR_RESULT(s3_uri.parse()); |
229 | 944 | S3Conf s3_conf; |
230 | 944 | RETURN_IF_ERROR_RESULT(S3ClientFactory::convert_properties_to_s3_conf( |
231 | 944 | system_properties.properties, s3_uri, &s3_conf)); |
232 | 944 | auto client_holder = std::make_shared<io::ObjClientHolder>(s3_conf.client_conf); |
233 | 944 | RETURN_IF_ERROR_RESULT(client_holder->init()); |
234 | 944 | return io::S3FileReader::create(std::move(client_holder), s3_conf.bucket, s3_uri.get_key(), |
235 | 944 | file_description.file_size, profile) |
236 | 945 | .and_then([&](auto&& reader) { |
237 | 945 | return io::create_cached_file_reader(std::move(reader), reader_options); |
238 | 945 | }); |
239 | 944 | } |
240 | 0 | case TFileType::FILE_HDFS: { |
241 | 0 | std::shared_ptr<io::HdfsHandler> handler; |
242 | | // FIXME(plat1ko): Explain the difference between `system_properties.hdfs_params.fs_name` |
243 | | // and `file_description.fs_name`, it's so confused. |
244 | 0 | const auto* fs_name = &file_description.fs_name; |
245 | 0 | if (fs_name->empty()) { |
246 | 0 | fs_name = &system_properties.hdfs_params.fs_name; |
247 | 0 | } |
248 | 0 | RETURN_IF_ERROR_RESULT(ExecEnv::GetInstance()->hdfs_mgr()->get_or_create_fs( |
249 | 0 | system_properties.hdfs_params, *fs_name, &handler)); |
250 | 0 | return io::HdfsFileReader::create(file_description.path, handler->hdfs_fs, *fs_name, |
251 | 0 | reader_options, profile) |
252 | 0 | .and_then([&](auto&& reader) { |
253 | 0 | return io::create_cached_file_reader(std::move(reader), reader_options); |
254 | 0 | }); |
255 | 0 | } |
256 | 0 | case TFileType::FILE_BROKER: { |
257 | 0 | auto index = get_broker_index(system_properties.broker_addresses, file_description.path); |
258 | 0 | if (index < 0) { |
259 | 0 | return ResultError(Status::InternalError("empty broker_addresses")); |
260 | 0 | } |
261 | 0 | LOG_INFO("select broker: {} for file {}", |
262 | 0 | system_properties.broker_addresses[index].hostname, file_description.path); |
263 | | // TODO(plat1ko): Create `FileReader` without FS |
264 | 0 | return io::BrokerFileSystem::create(system_properties.broker_addresses[index], |
265 | 0 | system_properties.properties, io::FileSystem::TMP_FS_ID) |
266 | 0 | .and_then([&](auto&& fs) -> Result<io::FileReaderSPtr> { |
267 | 0 | io::FileReaderSPtr file_reader; |
268 | 0 | RETURN_IF_ERROR_RESULT( |
269 | 0 | fs->open_file(file_description.path, &file_reader, &reader_options)); |
270 | 0 | return file_reader; |
271 | 0 | }); |
272 | 0 | } |
273 | 0 | case TFileType::FILE_HTTP: { |
274 | 0 | return io::HttpFileReader::create(file_description.path, system_properties.properties, |
275 | 0 | reader_options, profile) |
276 | 0 | .and_then([&](auto&& reader) { |
277 | 0 | return io::create_cached_file_reader(std::move(reader), reader_options); |
278 | 0 | }); |
279 | 0 | } |
280 | 0 | default: |
281 | 0 | return ResultError( |
282 | 0 | Status::InternalError("unsupported file reader type: {}", std::to_string(type))); |
283 | 1.08k | } |
284 | 1.08k | } |
285 | | |
286 | | // file scan node/stream load pipe |
287 | | Status FileFactory::create_pipe_reader(const TUniqueId& load_id, io::FileReaderSPtr* file_reader, |
288 | 2.26k | RuntimeState* runtime_state, bool need_schema) { |
289 | 2.26k | auto stream_load_ctx = ExecEnv::GetInstance()->new_load_stream_mgr()->get(load_id); |
290 | 2.26k | if (!stream_load_ctx) { |
291 | 0 | return Status::InternalError("unknown stream load id: {}", UniqueId(load_id).to_string()); |
292 | 0 | } |
293 | 2.26k | if (need_schema) { |
294 | 125 | RETURN_IF_ERROR(stream_load_ctx->allocate_schema_buffer()); |
295 | | // Here, a portion of the data is processed to parse column information |
296 | 125 | auto pipe = std::make_shared<io::StreamLoadPipe>( |
297 | 125 | io::kMaxPipeBufferedBytes /* max_buffered_bytes */, 64 * 1024 /* min_chunk_size */, |
298 | 125 | stream_load_ctx->schema_buffer()->pos /* total_length */); |
299 | 125 | stream_load_ctx->schema_buffer()->flip(); |
300 | 125 | RETURN_IF_ERROR(pipe->append(stream_load_ctx->schema_buffer())); |
301 | 125 | RETURN_IF_ERROR(pipe->finish()); |
302 | 125 | *file_reader = std::move(pipe); |
303 | 2.14k | } else { |
304 | 2.14k | *file_reader = stream_load_ctx->pipe; |
305 | 2.14k | } |
306 | | |
307 | 2.26k | return Status::OK(); |
308 | 2.26k | } |
309 | | #include "common/compile_check_end.h" |
310 | | |
311 | | } // namespace doris |