Coverage Report

Created: 2025-03-13 18:54

/root/doris/be/src/agent/utils.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "agent/utils.h"
19
20
// IWYU pragma: no_include <bthread/errno.h>
21
#include <errno.h> // IWYU pragma: keep
22
#include <fmt/format.h>
23
#include <gen_cpp/FrontendService.h>
24
#include <gen_cpp/HeartbeatService_types.h>
25
#include <gen_cpp/Types_types.h>
26
#include <glog/logging.h>
27
#include <rapidjson/document.h>
28
#include <rapidjson/encodings.h>
29
#include <rapidjson/rapidjson.h>
30
#include <rapidjson/stringbuffer.h>
31
#include <rapidjson/writer.h>
32
#include <stdint.h>
33
#include <stdlib.h>
34
#include <string.h>
35
#include <thrift/transport/TTransportException.h>
36
37
#include <cstdio>
38
#include <exception>
39
#include <fstream>
40
#include <memory>
41
#include <utility>
42
43
#include "common/config.h"
44
#include "common/status.h"
45
#include "runtime/client_cache.h"
46
#include "runtime/cluster_info.h"
47
48
namespace doris {
49
class TConfirmUnusedRemoteFilesRequest;
50
class TConfirmUnusedRemoteFilesResult;
51
class TFinishTaskRequest;
52
class TMasterResult;
53
class TReportRequest;
54
} // namespace doris
55
56
using std::map;
57
using std::string;
58
using apache::thrift::transport::TTransportException;
59
60
namespace doris {
61
62
static std::unique_ptr<MasterServerClient> s_client;
63
64
0
// Builds a new MasterServerClient bound to `cluster_info`, stores it in the file-local
// `s_client` holder (replacing whatever that holder owned before), and returns a non-owning
// pointer to the new instance.
MasterServerClient* MasterServerClient::create(const ClusterInfo* cluster_info) {
    s_client = std::unique_ptr<MasterServerClient>(new MasterServerClient(cluster_info));
    return s_client.get();
}
68
69
0
// Returns a non-owning pointer to the client currently held by the file-local `s_client`.
// The result is nullptr when `s_client` holds nothing.
MasterServerClient* MasterServerClient::instance() {
    MasterServerClient* const current = s_client.get();
    return current;
}
72
73
// Stores the cluster info pointer, creates the frontend client cache sized by
// `config::max_master_fe_client_cache_size`, and registers the cache's metrics under the
// "master_fe" name.
MasterServerClient::MasterServerClient(const ClusterInfo* cluster_info)
        : _cluster_info(cluster_info),
          _client_cache(std::make_unique<FrontendServiceClientCache>(
                  config::max_master_fe_client_cache_size)) {
    const char* const metrics_name = "master_fe";
    _client_cache->init_metrics(metrics_name);
}
79
80
0
// Sends a finishTask RPC to the master FE at `_cluster_info->master_fe_addr`.
//
// A connection is taken from `_client_cache`. If the first call throws a
// TTransportException, the connection is reopened and the call is retried once. Any exception
// escaping that logic is logged and converted into an error Status.
//
// @param request  request forwarded to finishTask().
// @param result   output argument handed to finishTask().
// @return Status::OK() when the RPC completes; InternalError when no client can be obtained or
//         the RPC throws; RpcError when the reopen before the retry fails.
Status MasterServerClient::finish_task(const TFinishTaskRequest& request, TMasterResult* result) {
    Status client_status;
    FrontendServiceConnection client(_client_cache.get(), _cluster_info->master_fe_addr,
                                     config::thrift_rpc_timeout_ms, &client_status);

    if (!client_status.ok()) {
        LOG(WARNING) << "fail to get master client from cache. "
                     << "host=" << _cluster_info->master_fe_addr.hostname
                     << ", port=" << _cluster_info->master_fe_addr.port
                     << ", code=" << client_status.code();
        return Status::InternalError("Failed to get master client");
    }

    try {
        try {
            client->finishTask(*result, request);
        } catch ([[maybe_unused]] TTransportException& e) {
#ifndef ADDRESS_SANITIZER
            LOG(WARNING) << "master client, retry finishTask: " << e.what();
#endif
            // Transport-level failure: reopen the connection and retry exactly once.
            client_status = client.reopen(config::thrift_rpc_timeout_ms);
            if (!client_status.ok()) {
#ifndef ADDRESS_SANITIZER
                LOG(WARNING) << "fail to get master client from cache. "
                             << "host=" << _cluster_info->master_fe_addr.hostname
                             << ", port=" << _cluster_info->master_fe_addr.port
                             << ", code=" << client_status.code();
#endif
                return Status::RpcError("Master client finish task failed");
            }
            client->finishTask(*result, request);
        }
    } catch (const std::exception& e) {
        // Log the original failure BEFORE reopening: RETURN_IF_ERROR returns early when the
        // reopen fails, which would otherwise drop the root cause without a trace.
        LOG(WARNING) << "fail to finish_task. "
                     << "host=" << _cluster_info->master_fe_addr.hostname
                     << ", port=" << _cluster_info->master_fe_addr.port << ", error=" << e.what();
        RETURN_IF_ERROR(client.reopen(config::thrift_rpc_timeout_ms));
        return Status::InternalError("Fail to finish task");
    }

    return Status::OK();
}
122
123
0
// Sends a report RPC to the master FE at `_cluster_info->master_fe_addr`.
//
// A connection is taken from `_client_cache`. If the first call throws a
// TTransportException whose type is not TIMED_OUT, the connection is reopened and the call is
// retried once. A TIMED_OUT transport exception is not retried. Any exception escaping that
// logic is logged and converted into an error Status.
//
// @param request  request forwarded to report().
// @param result   output argument handed to report().
// @return Status::OK() when the RPC completes, InternalError otherwise (or the reopen error
//         when the connection cannot be reopened after an unexpected exception).
Status MasterServerClient::report(const TReportRequest& request, TMasterResult* result) {
    Status client_status;
    FrontendServiceConnection client(_client_cache.get(), _cluster_info->master_fe_addr,
                                     config::thrift_rpc_timeout_ms, &client_status);

    if (!client_status.ok()) {
        LOG(WARNING) << "fail to get master client from cache. "
                     << "host=" << _cluster_info->master_fe_addr.hostname
                     << ", port=" << _cluster_info->master_fe_addr.port
                     << ", code=" << client_status;
        return Status::InternalError("Fail to get master client from cache");
    }

    try {
        try {
            client->report(*result, request);
        } catch (TTransportException& e) {
            TTransportException::TTransportExceptionType type = e.getType();
            if (type != TTransportException::TTransportExceptionType::TIMED_OUT) {
#ifndef ADDRESS_SANITIZER
                // if not TIMED_OUT, retry
                LOG(WARNING) << "master client, retry report: " << e.what();
#endif

                client_status = client.reopen(config::thrift_rpc_timeout_ms);
                if (!client_status.ok()) {
#ifndef ADDRESS_SANITIZER
                    LOG(WARNING) << "fail to get master client from cache. "
                                 << "host=" << _cluster_info->master_fe_addr.hostname
                                 << ", port=" << _cluster_info->master_fe_addr.port
                                 << ", code=" << client_status.code();
#endif
                    return Status::InternalError("Fail to get master client from cache");
                }

                client->report(*result, request);
            } else {
                // TIMED_OUT exception. do not retry
                // actually we don't care what FE returns.
#ifndef ADDRESS_SANITIZER
                LOG(WARNING) << "fail to report to master: " << e.what();
#endif
                return Status::InternalError("Fail to report to master");
            }
        }
    } catch (const std::exception& e) {
        // Log the original failure BEFORE reopening: RETURN_IF_ERROR returns early when the
        // reopen fails, which would otherwise drop the root cause without a trace.
        LOG(WARNING) << "fail to report to master. "
                     << "host=" << _cluster_info->master_fe_addr.hostname
                     << ", port=" << _cluster_info->master_fe_addr.port
                     << ", code=" << client_status.code() << ", reason=" << e.what();
        RETURN_IF_ERROR(client.reopen(config::thrift_rpc_timeout_ms));
        return Status::InternalError("Fail to report to master");
    }

    return Status::OK();
}
179
180
// Sends a confirmUnusedRemoteFiles RPC to the master FE at `_cluster_info->master_fe_addr`.
//
// A connection is taken from `_client_cache`. If the first call throws a
// TTransportException whose type is not TIMED_OUT, the connection is reopened and the call is
// retried once. A TIMED_OUT transport exception is not retried. Failures are reported through
// the returned Status, whose message carries host, port, status code and (where available)
// the exception text.
//
// @param request  request forwarded to confirmUnusedRemoteFiles().
// @param result   output argument handed to confirmUnusedRemoteFiles().
// @return Status::OK() when the RPC completes, InternalError otherwise (or the reopen error
//         when the connection cannot be reopened after an unexpected exception).
Status MasterServerClient::confirm_unused_remote_files(
        const TConfirmUnusedRemoteFilesRequest& request, TConfirmUnusedRemoteFilesResult* result) {
    Status client_status;
    FrontendServiceConnection client(_client_cache.get(), _cluster_info->master_fe_addr,
                                     config::thrift_rpc_timeout_ms, &client_status);

    if (!client_status.ok()) {
        return Status::InternalError(
                "fail to get master client from cache. host={}, port={}, code={}",
                _cluster_info->master_fe_addr.hostname, _cluster_info->master_fe_addr.port,
                client_status.code());
    }
    try {
        try {
            client->confirmUnusedRemoteFiles(*result, request);
        } catch (TTransportException& e) {
            TTransportException::TTransportExceptionType type = e.getType();
            if (type != TTransportException::TTransportExceptionType::TIMED_OUT) {
#ifndef ADDRESS_SANITIZER
                // if not TIMED_OUT, retry
                LOG(WARNING) << "master client, retry confirmUnusedRemoteFiles: " << e.what();
#endif

                client_status = client.reopen(config::thrift_rpc_timeout_ms);
                if (!client_status.ok()) {
                    return Status::InternalError(
                            "fail to get master client from cache. host={}, port={}, code={}",
                            _cluster_info->master_fe_addr.hostname,
                            _cluster_info->master_fe_addr.port, client_status.code());
                }

                client->confirmUnusedRemoteFiles(*result, request);
            } else {
                // TIMED_OUT exception. do not retry
                // actually we don't care what FE returns.
                return Status::InternalError(
                        "fail to confirm unused remote files. host={}, port={}, code={}, reason={}",
                        _cluster_info->master_fe_addr.hostname, _cluster_info->master_fe_addr.port,
                        client_status.code(), e.what());
            }
        }
    } catch (const std::exception& e) {
        // Reopen so the connection is not left in the state that produced the exception; if
        // the reopen itself fails, its Status is returned instead.
        RETURN_IF_ERROR(client.reopen(config::thrift_rpc_timeout_ms));
        return Status::InternalError(
                "fail to confirm unused remote files. host={}, port={}, code={}, reason={}",
                _cluster_info->master_fe_addr.hostname, _cluster_info->master_fe_addr.port,
                client_status.code(), e.what());
    }

    return Status::OK();
}
231
232
0
bool AgentUtils::exec_cmd(const string& command, string* errmsg, bool redirect_stderr) {
233
    // The exit status of the command.
234
0
    uint32_t rc = 0;
235
236
    // Redirect stderr to stdout to get error message.
237
0
    string cmd = command;
238
0
    if (redirect_stderr) {
239
0
        cmd += " 2>&1";
240
0
    }
241
242
    // Execute command.
243
0
    FILE* fp = popen(cmd.c_str(), "r");
244
0
    if (fp == nullptr) {
245
0
        *errmsg = fmt::format("popen failed. {}, with errno: {}.\n", strerror(errno), errno);
246
0
        return false;
247
0
    }
248
249
    // Get command output.
250
0
    char result[1024] = {'\0'};
251
0
    while (fgets(result, sizeof(result), fp) != nullptr) {
252
0
        *errmsg += result;
253
0
    }
254
255
    // Waits for the associated process to terminate and returns.
256
0
    rc = pclose(fp);
257
0
    if (rc == -1) {
258
0
        if (errno == ECHILD) {
259
0
            *errmsg += "pclose cannot obtain the child status.\n";
260
0
        } else {
261
0
            *errmsg += fmt::format("Close popen failed. {}, with errno: {}.\n", strerror(errno),
262
0
                                   errno);
263
0
        }
264
0
        return false;
265
0
    }
266
267
    // Get return code of command.
268
0
    int32_t status_child = WEXITSTATUS(rc);
269
0
    if (status_child == 0) {
270
0
        return true;
271
0
    } else {
272
0
        return false;
273
0
    }
274
0
}
275
276
0
bool AgentUtils::write_json_to_file(const map<string, string>& info, const string& path) {
277
0
    rapidjson::Document json_info(rapidjson::kObjectType);
278
0
    for (auto& it : info) {
279
0
        json_info.AddMember(rapidjson::Value(it.first.c_str(), json_info.GetAllocator()).Move(),
280
0
                            rapidjson::Value(it.second.c_str(), json_info.GetAllocator()).Move(),
281
0
                            json_info.GetAllocator());
282
0
    }
283
0
    rapidjson::StringBuffer json_info_str;
284
0
    rapidjson::Writer<rapidjson::StringBuffer> writer(json_info_str);
285
0
    json_info.Accept(writer);
286
0
    std::ofstream fp(path);
287
0
    if (!fp) {
288
0
        return false;
289
0
    }
290
0
    fp << json_info_str.GetString() << std::endl;
291
0
    fp.close();
292
293
0
    return true;
294
0
}
295
296
} // namespace doris