Coverage Report

Created: 2025-04-15 00:19

/root/doris/be/src/util/s3_util.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "util/s3_util.h"
19
20
#include <aws/core/auth/AWSAuthSigner.h>
21
#include <aws/core/auth/AWSCredentials.h>
22
#include <aws/core/auth/AWSCredentialsProviderChain.h>
23
#include <aws/core/client/DefaultRetryStrategy.h>
24
#include <aws/core/utils/logging/LogLevel.h>
25
#include <aws/core/utils/logging/LogSystemInterface.h>
26
#include <aws/core/utils/memory/stl/AWSStringStream.h>
27
#include <aws/s3/S3Client.h>
28
#include <bvar/reducer.h>
29
#include <util/string_util.h>
30
31
#include <atomic>
32
#ifdef USE_AZURE
33
#include <azure/storage/blobs/blob_container_client.hpp>
34
#endif
35
#include <cstdlib>
36
#include <filesystem>
37
#include <functional>
38
#include <memory>
39
#include <ostream>
40
#include <utility>
41
42
#include "common/config.h"
43
#include "common/logging.h"
44
#include "common/status.h"
45
#include "cpp/aws_logger.h"
46
#include "cpp/obj_retry_strategy.h"
47
#include "cpp/sync_point.h"
48
#ifdef USE_AZURE
49
#include "io/fs/azure_obj_storage_client.h"
50
#endif
51
#include "io/fs/obj_storage_client.h"
52
#include "io/fs/s3_obj_storage_client.h"
53
#include "runtime/exec_env.h"
54
#include "s3_uri.h"
55
#include "vec/exec/scan/scanner_scheduler.h"
56
57
namespace doris {
58
// Latency recorders for object-storage operations, one per operation kind. Each recorder is
// constructed with the metric name shown in its string argument. They are defined with
// external linkage here; the code that feeds samples into them is not in this file.
// NOTE(review): presumably updated by the S3 client wrappers — confirm against callers.
namespace s3_bvar {
59
bvar::LatencyRecorder s3_get_latency("s3_get");
60
bvar::LatencyRecorder s3_put_latency("s3_put");
61
bvar::LatencyRecorder s3_delete_object_latency("s3_delete_object");
62
bvar::LatencyRecorder s3_delete_objects_latency("s3_delete_objects");
63
bvar::LatencyRecorder s3_head_latency("s3_head");
64
bvar::LatencyRecorder s3_multi_part_upload_latency("s3_multi_part_upload");
65
bvar::LatencyRecorder s3_list_latency("s3_list");
66
bvar::LatencyRecorder s3_list_object_versions_latency("s3_list_object_versions");
67
bvar::LatencyRecorder s3_get_bucket_version_latency("s3_get_bucket_version");
68
bvar::LatencyRecorder s3_copy_object_latency("s3_copy_object");
69
}; // namespace s3_bvar
70
71
namespace {
72
73
2
// Verifies that the fields needed to build an object storage client are present.
//
// The fields are examined in the order endpoint, region, ak, sk. The first empty one
// decides which InvalidArgument status is returned; when all four are non-empty the
// result is Status::OK().
doris::Status is_s3_conf_valid(const S3ClientConf& conf) {
    doris::Status result = Status::OK();
    if (conf.endpoint.empty()) {
        result = Status::InvalidArgument<false>("Invalid s3 conf, empty endpoint");
    } else if (conf.region.empty()) {
        result = Status::InvalidArgument<false>("Invalid s3 conf, empty region");
    } else if (conf.ak.empty()) {
        result = Status::InvalidArgument<false>("Invalid s3 conf, empty ak");
    } else if (conf.sk.empty()) {
        result = Status::InvalidArgument<false>("Invalid s3 conf, empty sk");
    }
    return result;
}
88
89
// Parses the whole of `str` as a base-10 int.
//
// Returns true and stores the value in `res` only when every character of `str` was
// consumed and the value fits in an int. On any failure (empty input, non-numeric text,
// trailing characters such as "10ms", or out-of-range values) returns false and leaves
// `res` untouched, so a rejected property never leaves a half-parsed number behind.
bool to_int(std::string_view str, int& res) {
    const char* const end = str.data() + str.size();
    int parsed = 0;
    const auto [ptr, ec] = std::from_chars(str.data(), end, parsed);
    // from_chars stops at the first character it cannot use; a successful parse that did
    // not reach `end` means the input had trailing garbage.
    if (ec != std::errc {} || ptr != end) {
        return false;
    }
    res = parsed;
    return true;
}
94
95
// Property keys consumed by S3ClientFactory::convert_properties_to_s3_conf, which looks them
// up in a StringCaseMap built from the caller's property map. AZURE_PROVIDER_STRING is not a
// key: it is the value of the `provider` property that selects Azure, and it is compared
// with strcasecmp (case-insensitively).
constexpr char USE_PATH_STYLE[] = "use_path_style";
96
97
constexpr char AZURE_PROVIDER_STRING[] = "AZURE";
98
constexpr char S3_PROVIDER[] = "provider";
99
constexpr char S3_AK[] = "AWS_ACCESS_KEY";
100
constexpr char S3_SK[] = "AWS_SECRET_KEY";
101
constexpr char S3_ENDPOINT[] = "AWS_ENDPOINT";
102
constexpr char S3_REGION[] = "AWS_REGION";
103
constexpr char S3_TOKEN[] = "AWS_TOKEN";
104
constexpr char S3_MAX_CONN_SIZE[] = "AWS_MAX_CONN_SIZE";
105
constexpr char S3_REQUEST_TIMEOUT_MS[] = "AWS_REQUEST_TIMEOUT_MS";
106
constexpr char S3_CONN_TIMEOUT_MS[] = "AWS_CONNECTION_TIMEOUT_MS";
107
constexpr char S3_NEED_OVERRIDE_ENDPOINT[] = "AWS_NEED_OVERRIDE_ENDPOINT";
108
109
} // namespace
110
111
// Counters handed to metric_func_factory when the S3ClientFactory constructor builds its two
// rate limiters: the get_* pair goes with the limiter configured from the s3_get_* settings,
// the put_* pair with the one configured from the s3_put_* settings.
// NOTE(review): the names suggest "nanoseconds spent throttled" and "number of requests that
// exceeded the limit"; the exact semantics live in metric_func_factory — confirm there.
bvar::Adder<int64_t> get_rate_limit_ns("get_rate_limit_ns");
112
bvar::Adder<int64_t> get_rate_limit_exceed_req_num("get_rate_limit_exceed_req_num");
113
bvar::Adder<int64_t> put_rate_limit_ns("put_rate_limit_ns");
114
bvar::Adder<int64_t> put_rate_limit_exceed_req_num("put_rate_limit_exceed_req_num");
115
116
0
// Returns the rate limiter holder associated with `type`.
//
// Only GET and PUT are accepted; any other value trips the CHECK below. The enum value is
// used directly as the index into `_rate_limiters`. The returned pointer is non-owning: the
// holder stays owned by the factory.
S3RateLimiterHolder* S3ClientFactory::rate_limiter(S3RateLimitType type) {
    CHECK(type == S3RateLimitType::GET || type == S3RateLimitType::PUT) << to_string(type);
    const auto slot = static_cast<size_t>(type);
    return _rate_limiters[slot].get();
}
120
121
0
// Reconfigures the process-wide rate limiter selected by `type`.
//
// Returns -1 without touching anything when `type` is UNKNOWN. Otherwise the three limits
// are forwarded to the holder's reset() and its return value is passed back unchanged.
// Types other than GET/PUT/UNKNOWN are rejected by the CHECK inside rate_limiter().
int reset_s3_rate_limiter(S3RateLimitType type, size_t max_speed, size_t max_burst, size_t limit) {
    if (type != S3RateLimitType::UNKNOWN) {
        auto* limiter = S3ClientFactory::instance().rate_limiter(type);
        return limiter->reset(max_speed, max_burst, limit);
    }
    return -1;
}
127
128
1
// Sets up everything the factory needs before it can hand out clients:
//   1. initializes the AWS SDK with a DorisAWSLogger at the level given by
//      config::aws_log_level,
//   2. caches the first existing CA certificate file from config::ca_cert_file_paths,
//   3. builds the two rate limiters, the first from the s3_get_* settings and the second
//      from the s3_put_* settings, each wired to its pair of bvar counters.
S3ClientFactory::S3ClientFactory() {
    _aws_options = Aws::SDKOptions {};
    const auto level = static_cast<Aws::Utils::Logging::LogLevel>(config::aws_log_level);
    auto& logging = _aws_options.loggingOptions;
    logging.logLevel = level;
    logging.logger_create_fn = [level] { return std::make_shared<DorisAWSLogger>(level); };
    Aws::InitAPI(_aws_options);

    _ca_cert_file_path = get_valid_ca_cert_path();

    auto get_limiter = std::make_unique<S3RateLimiterHolder>(
            config::s3_get_token_per_second, config::s3_get_bucket_tokens,
            config::s3_get_token_limit,
            metric_func_factory(get_rate_limit_ns, get_rate_limit_exceed_req_num));
    auto put_limiter = std::make_unique<S3RateLimiterHolder>(
            config::s3_put_token_per_second, config::s3_put_bucket_tokens,
            config::s3_put_token_limit,
            metric_func_factory(put_rate_limit_ns, put_rate_limit_exceed_req_num));
    _rate_limiters = {std::move(get_limiter), std::move(put_limiter)};
}
147
148
2
std::string S3ClientFactory::get_valid_ca_cert_path() {
149
2
    auto vec_ca_file_path = doris::split(config::ca_cert_file_paths, ";");
150
2
    auto it = vec_ca_file_path.begin();
151
2
    for (; it != vec_ca_file_path.end(); ++it) {
152
2
        if (std::filesystem::exists(*it)) {
153
2
            return *it;
154
2
        }
155
2
    }
156
0
    return "";
157
2
}
158
159
1
// Shuts the AWS SDK down with the same options object that the constructor passed to
// Aws::InitAPI.
S3ClientFactory::~S3ClientFactory() {
160
1
    Aws::ShutdownAPI(_aws_options);
161
1
}
162
163
2
// Returns the process-wide factory, a function-local static that is constructed on the
// first call.
S3ClientFactory& S3ClientFactory::instance() {
    static S3ClientFactory factory;
    return factory;
}
167
168
2
std::shared_ptr<io::ObjStorageClient> S3ClientFactory::create(const S3ClientConf& s3_conf) {
169
2
    if (!is_s3_conf_valid(s3_conf).ok()) {
170
0
        return nullptr;
171
0
    }
172
173
2
    uint64_t hash = s3_conf.get_hash();
174
2
    {
175
2
        std::lock_guard l(_lock);
176
2
        auto it = _cache.find(hash);
177
2
        if (it != _cache.end()) {
178
0
            return it->second;
179
0
        }
180
2
    }
181
182
2
    auto obj_client = (s3_conf.provider == io::ObjStorageType::AZURE)
183
2
                              ? _create_azure_client(s3_conf)
184
2
                              : _create_s3_client(s3_conf);
185
186
2
    {
187
2
        uint64_t hash = s3_conf.get_hash();
188
2
        std::lock_guard l(_lock);
189
2
        _cache[hash] = obj_client;
190
2
    }
191
2
    return obj_client;
192
2
}
193
194
std::shared_ptr<io::ObjStorageClient> S3ClientFactory::_create_azure_client(
195
0
        const S3ClientConf& s3_conf) {
196
0
#ifdef USE_AZURE
197
0
    auto cred =
198
0
            std::make_shared<Azure::Storage::StorageSharedKeyCredential>(s3_conf.ak, s3_conf.sk);
199
200
0
    const std::string container_name = s3_conf.bucket;
201
0
    std::string uri;
202
0
    if (config::force_azure_blob_global_endpoint) {
203
0
        uri = fmt::format("https://{}.blob.core.windows.net/{}", s3_conf.ak, container_name);
204
0
    } else {
205
0
        uri = fmt::format("{}/{}", s3_conf.endpoint, container_name);
206
0
        if (s3_conf.endpoint.find("://") == std::string::npos) {
207
0
            uri = "https://" + uri;
208
0
        }
209
0
    }
210
211
0
    auto containerClient = std::make_shared<Azure::Storage::Blobs::BlobContainerClient>(uri, cred);
212
0
    LOG_INFO("create one azure client with {}", s3_conf.to_string());
213
0
    return std::make_shared<io::AzureObjStorageClient>(std::move(containerClient));
214
#else
215
    LOG_FATAL("BE is not compiled with azure support, export BUILD_AZURE=ON before building");
216
    return nullptr;
217
#endif
218
0
}
219
220
std::shared_ptr<io::ObjStorageClient> S3ClientFactory::_create_s3_client(
221
2
        const S3ClientConf& s3_conf) {
222
2
    TEST_SYNC_POINT_RETURN_WITH_VALUE(
223
1
            "s3_client_factory::create",
224
1
            std::make_shared<io::S3ObjStorageClient>(std::make_shared<Aws::S3::S3Client>()));
225
1
    Aws::Client::ClientConfiguration aws_config = S3ClientFactory::getClientConfiguration();
226
1
    if (s3_conf.need_override_endpoint) {
227
1
        aws_config.endpointOverride = s3_conf.endpoint;
228
1
    }
229
1
    aws_config.region = s3_conf.region;
230
1
    std::string ca_cert = get_valid_ca_cert_path();
231
1
    if ("" != _ca_cert_file_path) {
232
1
        aws_config.caFile = _ca_cert_file_path;
233
1
    } else {
234
        // config::ca_cert_file_paths is valmutable,get newest value if file path invaild
235
0
        _ca_cert_file_path = get_valid_ca_cert_path();
236
0
        if ("" != _ca_cert_file_path) {
237
0
            aws_config.caFile = _ca_cert_file_path;
238
0
        }
239
0
    }
240
1
    if (s3_conf.max_connections > 0) {
241
0
        aws_config.maxConnections = s3_conf.max_connections;
242
1
    } else {
243
1
#ifdef BE_TEST
244
        // the S3Client may shared by many threads.
245
        // So need to set the number of connections large enough.
246
1
        aws_config.maxConnections = config::doris_scanner_thread_pool_thread_num;
247
#else
248
        aws_config.maxConnections =
249
                ExecEnv::GetInstance()->scanner_scheduler()->remote_thread_pool_max_thread_num();
250
#endif
251
1
    }
252
253
1
    aws_config.requestTimeoutMs = 30000;
254
1
    if (s3_conf.request_timeout_ms > 0) {
255
0
        aws_config.requestTimeoutMs = s3_conf.request_timeout_ms;
256
0
    }
257
258
1
    if (s3_conf.connect_timeout_ms > 0) {
259
0
        aws_config.connectTimeoutMs = s3_conf.connect_timeout_ms;
260
0
    }
261
262
1
    if (config::s3_client_http_scheme == "http") {
263
1
        aws_config.scheme = Aws::Http::Scheme::HTTP;
264
1
    }
265
266
1
    aws_config.retryStrategy = std::make_shared<S3CustomRetryStrategy>(
267
1
            config::max_s3_client_retry /*scaleFactor = 25*/);
268
1
    std::shared_ptr<Aws::S3::S3Client> new_client;
269
1
    if (!s3_conf.ak.empty() && !s3_conf.sk.empty()) {
270
1
        Aws::Auth::AWSCredentials aws_cred(s3_conf.ak, s3_conf.sk);
271
1
        DCHECK(!aws_cred.IsExpiredOrEmpty());
272
1
        if (!s3_conf.token.empty()) {
273
0
            aws_cred.SetSessionToken(s3_conf.token);
274
0
        }
275
1
        new_client = std::make_shared<Aws::S3::S3Client>(
276
1
                std::move(aws_cred), std::move(aws_config),
277
1
                Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never,
278
1
                s3_conf.use_virtual_addressing);
279
1
    } else {
280
0
        std::shared_ptr<Aws::Auth::AWSCredentialsProvider> aws_provider_chain =
281
0
                std::make_shared<Aws::Auth::DefaultAWSCredentialsProviderChain>();
282
0
        new_client = std::make_shared<Aws::S3::S3Client>(
283
0
                std::move(aws_provider_chain), std::move(aws_config),
284
0
                Aws::Client::AWSAuthV4Signer::PayloadSigningPolicy::Never,
285
0
                s3_conf.use_virtual_addressing);
286
0
    }
287
288
1
    auto obj_client = std::make_shared<io::S3ObjStorageClient>(std::move(new_client));
289
1
    LOG_INFO("create one s3 client with {}", s3_conf.to_string());
290
1
    return obj_client;
291
2
}
292
293
// Fills `*s3_conf` from a property map and an S3 URI.
//
// Property names are matched through a StringCaseMap built from `prop`. Properties that are
// absent leave the corresponding field of `s3_conf->client_conf` untouched. Fields are
// written in a fixed order, so on an early error return the fields handled before the
// failing one have already been updated.
//
// Returns InvalidArgument when a numeric property cannot be parsed by to_int, when the URI
// has no bucket, or when the resulting client configuration fails is_s3_conf_valid;
// otherwise returns OK.
Status S3ClientFactory::convert_properties_to_s3_conf(
        const std::map<std::string, std::string>& prop, const S3URI& s3_uri, S3Conf* s3_conf) {
    StringCaseMap<std::string> properties(prop.begin(), prop.end());
    auto& client_conf = s3_conf->client_conf;

    // Yields a pointer to the value stored under `key`, or nullptr when the key is absent.
    auto lookup = [&properties](const char* key) -> const std::string* {
        auto it = properties.find(key);
        return it != properties.end() ? &it->second : nullptr;
    };

    // Plain string settings.
    if (const auto* value = lookup(S3_AK)) {
        client_conf.ak = *value;
    }
    if (const auto* value = lookup(S3_SK)) {
        client_conf.sk = *value;
    }
    if (const auto* value = lookup(S3_TOKEN)) {
        client_conf.token = *value;
    }
    if (const auto* value = lookup(S3_ENDPOINT)) {
        client_conf.endpoint = *value;
    }
    if (const auto* value = lookup(S3_NEED_OVERRIDE_ENDPOINT)) {
        // Only the exact string "true" enables the override.
        client_conf.need_override_endpoint = (*value == "true");
    }
    if (const auto* value = lookup(S3_REGION)) {
        client_conf.region = *value;
    }

    // Numeric settings: a present but unparsable value aborts the conversion.
    if (const auto* value = lookup(S3_MAX_CONN_SIZE);
        value != nullptr && !to_int(*value, client_conf.max_connections)) {
        return Status::InvalidArgument("invalid {} value \"{}\"", S3_MAX_CONN_SIZE, *value);
    }
    if (const auto* value = lookup(S3_REQUEST_TIMEOUT_MS);
        value != nullptr && !to_int(*value, client_conf.request_timeout_ms)) {
        return Status::InvalidArgument("invalid {} value \"{}\"", S3_REQUEST_TIMEOUT_MS,
                                       *value);
    }
    if (const auto* value = lookup(S3_CONN_TIMEOUT_MS);
        value != nullptr && !to_int(*value, client_conf.connect_timeout_ms)) {
        return Status::InvalidArgument("invalid {} value \"{}\"", S3_CONN_TIMEOUT_MS,
                                       *value);
    }

    // The provider value is compared case-insensitively; only Azure changes the provider.
    if (const auto* value = lookup(S3_PROVIDER);
        value != nullptr && strcasecmp(value->c_str(), AZURE_PROVIDER_STRING) == 0) {
        client_conf.provider = io::ObjStorageType::AZURE;
    }

    const auto& bucket = s3_uri.get_bucket();
    if (bucket.empty()) {
        return Status::InvalidArgument("Invalid S3 URI {}, bucket is not specified",
                                       s3_uri.to_string());
    }
    s3_conf->bucket = bucket;
    // The client configuration carries the bucket as well, for Azure's compatibility.
    client_conf.bucket = bucket;
    s3_conf->prefix = "";

    // See https://sdk.amazonaws.com/cpp/api/LATEST/class_aws_1_1_s3_1_1_s3_client.html
    // Virtual addressing stays on unless use_path_style is exactly "true".
    const auto* path_style = lookup(USE_PATH_STYLE);
    client_conf.use_virtual_addressing = (path_style == nullptr) || (*path_style != "true");

    return is_s3_conf_valid(client_conf);
}
358
359
0
// Builds an S3Conf from a cloud ObjectStoreInfoPB.
//
// Bucket, prefix, endpoint, region, ak, sk and sse_enabled are copied from `info`; the
// session token is left empty. Virtual addressing is enabled unless `info` explicitly sets
// use_path_style to true. The provider starts out as AWS and is then replaced by the value
// mapped from info.provider(); an unrecognized provider is reported through LOG_FATAL.
S3Conf S3Conf::get_s3_conf(const cloud::ObjectStoreInfoPB& info) {
    S3Conf ret {
            .bucket = info.bucket(),
            .prefix = info.prefix(),
            .client_conf = {.endpoint = info.endpoint(),
                            .region = info.region(),
                            .ak = info.ak(),
                            .sk = info.sk(),
                            .token {},
                            .bucket = info.bucket(),
                            .provider = io::ObjStorageType::AWS,
                            .use_virtual_addressing =
                                    !info.has_use_path_style() || !info.use_path_style()},
            .sse_enabled = info.sse_enabled(),
    };

    auto& provider = ret.client_conf.provider;
    switch (info.provider()) {
    case cloud::ObjectStoreInfoPB_Provider_S3:
        provider = io::ObjStorageType::AWS;
        break;
    case cloud::ObjectStoreInfoPB_Provider_OSS:
        provider = io::ObjStorageType::OSS;
        break;
    case cloud::ObjectStoreInfoPB_Provider_COS:
        provider = io::ObjStorageType::COS;
        break;
    case cloud::ObjectStoreInfoPB_Provider_OBS:
        provider = io::ObjStorageType::OBS;
        break;
    case cloud::ObjectStoreInfoPB_Provider_BOS:
        provider = io::ObjStorageType::BOS;
        break;
    case cloud::ObjectStoreInfoPB_Provider_GCP:
        provider = io::ObjStorageType::GCP;
        break;
    case cloud::ObjectStoreInfoPB_Provider_AZURE:
        provider = io::ObjStorageType::AZURE;
        break;
    default:
        // `ret` still carries the initial AWS provider when it is formatted here.
        LOG_FATAL("unknown provider type {}, info {}", info.provider(), ret.to_string());
        __builtin_unreachable();
    }
    return ret;
}
405
406
0
// Builds an S3Conf from a thrift TS3StorageParam.
//
// Bucket, root path (as prefix), endpoint, region, credentials, token, connection limit and
// both timeouts are copied from `param`. `use_virtual_addressing` is the negation of
// `param.use_path_style`. The provider starts out as AWS and is then replaced by the value
// mapped from param.provider: UNKNOWN is logged and treated as AWS, and an unrecognized
// value is reported through LOG_FATAL.
S3Conf S3Conf::get_s3_conf(const TS3StorageParam& param) {
    S3Conf ret {
            .bucket = param.bucket,
            .prefix = param.root_path,
            .client_conf = {
                    .endpoint = param.endpoint,
                    .region = param.region,
                    .ak = param.ak,
                    .sk = param.sk,
                    .token = param.token,
                    .bucket = param.bucket,
                    .provider = io::ObjStorageType::AWS,
                    .max_connections = param.max_conn,
                    .request_timeout_ms = param.request_timeout_ms,
                    .connect_timeout_ms = param.conn_timeout_ms,
                    // Cold/hot separation on minio may address the server by IP directly;
                    // the addressing mode therefore follows the caller's use_path_style.
                    .use_virtual_addressing = !param.use_path_style,
            }};

    auto& provider = ret.client_conf.provider;
    switch (param.provider) {
    case TObjStorageType::UNKNOWN:
        // Logged while `ret` still carries the initial AWS provider, then handled as AWS.
        LOG_INFO("Receive one legal storage resource, set provider type to aws, param detail {}",
                 ret.to_string());
        [[fallthrough]];
    case TObjStorageType::AWS:
        provider = io::ObjStorageType::AWS;
        break;
    case TObjStorageType::AZURE:
        provider = io::ObjStorageType::AZURE;
        break;
    case TObjStorageType::BOS:
        provider = io::ObjStorageType::BOS;
        break;
    case TObjStorageType::COS:
        provider = io::ObjStorageType::COS;
        break;
    case TObjStorageType::OBS:
        provider = io::ObjStorageType::OBS;
        break;
    case TObjStorageType::OSS:
        provider = io::ObjStorageType::OSS;
        break;
    case TObjStorageType::GCP:
        provider = io::ObjStorageType::GCP;
        break;
    default:
        LOG_FATAL("unknown provider type {}, info {}", param.provider, ret.to_string());
        __builtin_unreachable();
    }
    return ret;
}
460
461
} // end namespace doris