/root/doris/cloud/src/recycler/checker.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #if defined(USE_LIBCPP) && _LIBCPP_ABI_VERSION <= 1 |
21 | | #define _LIBCPP_ABI_INCOMPLETE_TYPES_IN_DEQUE |
22 | | #endif |
23 | | #include <atomic> |
24 | | #include <condition_variable> |
25 | | #include <deque> |
26 | | #include <functional> |
27 | | #include <thread> |
28 | | #include <unordered_map> |
29 | | #include <unordered_set> |
30 | | |
31 | | #include "recycler/storage_vault_accessor.h" |
32 | | #include "recycler/white_black_list.h" |
33 | | #include "snapshot/snapshot_manager.h" |
34 | | |
// Forward declaration only: in this header the type is referenced solely as
// the const-reference argument of the collect_tablet_rowsets() callback.
35 | | namespace doris { |
36 | | class RowsetMetaCloudPB; |
37 | | } // namespace doris |
38 | | |
39 | | namespace doris::cloud { |
// Forward declarations. In this header these types are used behind pointers,
// references or std::shared_ptr, except InstanceInfoPB, which is also the
// element type of Checker::pending_instance_queue_ (a std::deque of a type
// that is incomplete here; see the _LIBCPP_ABI_INCOMPLETE_TYPES_IN_DEQUE
// define near the top of the file).
// NOTE(review): recycler/storage_vault_accessor.h is already included above,
// so the StorageVaultAccessor forward declaration looks redundant -- confirm.
40 | | class StorageVaultAccessor; |
41 | | class InstanceChecker; |
42 | | class TxnKv; |
43 | | class InstanceInfoPB; |
44 | | |
// Checker holds a shared TxnKv handle, a set of worker threads, and the
// bookkeeping for instances waiting to be checked (pending_instance_queue_ /
// pending_instance_map_) and instances currently mapped to an InstanceChecker
// (working_instance_map_). All non-inline members are defined outside this
// header.
// NOTE(review): std::shared_ptr, std::mutex, std::string and std::vector are
// used below, but <memory>, <mutex>, <string> and <vector> are not among the
// direct includes of this header; presumably they arrive transitively --
// consider including them directly.
45 | | class Checker { |
46 | | public: |
47 | | explicit Checker(std::shared_ptr<TxnKv> txn_kv); |
48 | | ~Checker(); |
49 | | |
// Return convention is not documented in this header; presumably 0 on
// success, as with InstanceChecker::init -- confirm in the implementation.
50 | | int start(); |
51 | | |
52 | | void stop(); |
// Reads stopped_ with acquire ordering.
53 | 317 | bool stopped() const { return stopped_.load(std::memory_order_acquire); } |
54 | | |
55 | | private: |
// The three routines below are defined elsewhere. Presumably they are the
// bodies run by the threads in workers_ (lease renewal, periodic inspection)
// and the per-instance inspection step -- confirm in the implementation.
56 | | void lease_check_jobs(); |
57 | | void inspect_instance_check_interval(); |
58 | | void do_inspect(const InstanceInfoPB& instance); |
59 | | |
60 | | private: |
// Grants RecyclerServiceImpl access to the private members of this class.
61 | | friend class RecyclerServiceImpl; |
62 | | |
63 | | std::shared_ptr<TxnKv> txn_kv_; |
// Stop flag; read through stopped().
64 | | std::atomic_bool stopped_ {false}; |
65 | | std::string ip_port_; |
66 | | std::vector<std::thread> workers_; |
67 | | |
// NOTE(review): presumably guards the pending/working containers below and is
// the mutex waited on with both condition variables -- confirm.
68 | | std::mutex mtx_; |
69 | | // Notifies the check workers. |
70 | | std::condition_variable pending_instance_cond_; |
71 | | std::deque<InstanceInfoPB> pending_instance_queue_; |
72 | | // instance_id -> enqueue_timestamp (time unit is not visible in this header) |
73 | | std::unordered_map<std::string, long> pending_instance_map_; |
// Keyed by std::string, presumably the instance_id as in the map above -- confirm.
74 | | std::unordered_map<std::string, std::shared_ptr<InstanceChecker>> working_instance_map_; |
75 | | // Notifies the instance scanner and the lease thread. |
76 | | std::condition_variable notifier_; |
77 | | |
78 | | WhiteBlackList instance_filter_; |
79 | | }; |
80 | | |
// InstanceChecker bundles the consistency checks for a single instance. It
// holds the instance id, a shared TxnKv handle, a map of storage vault
// accessors keyed by id, and a SnapshotManager. Most check entry points use
// the convention: 0 = success, 1 = an inconsistency was identified,
// negative = temporary error during the check. Where a method below has no
// return comment, its convention is not stated in this header.
81 | | class InstanceChecker { |
82 | | public: |
83 | | explicit InstanceChecker(std::shared_ptr<TxnKv> txn_kv, const std::string& instance_id); |
84 | | // Return 0 if success, otherwise error |
85 | | int init(const InstanceInfoPB& instance); |
86 | | // Return 0 if success. |
87 | | // Return 1 if data leak is identified. |
88 | | // Return negative if a temporary error occurred during the check process. |
89 | | int do_inverted_check(); |
90 | | |
91 | | // Return 0 if success. |
92 | | // Return 1 if data loss is identified. |
93 | | // Return negative if a temporary error occurred during the check process. |
94 | | int do_check(); |
95 | | |
96 | | // Return 0 if success. |
97 | | // Return 1 if delete bitmap leak is identified. |
98 | | // Return negative if a temporary error occurred during the check process. |
99 | | int do_delete_bitmap_inverted_check(); |
100 | | |
101 | | // version = 1 : https://github.com/apache/doris/pull/40204 |
102 | | // checks if https://github.com/apache/doris/pull/40204 works as expected |
103 | | // the stale delete bitmap will be cleared in MS when BE deletes expired stale rowsets |
104 | | // NOTE: stale rowsets will be lost after BE restarts, so there may be some stale delete bitmaps |
105 | | // which will not be cleared. |
106 | | // version = 2 : https://github.com/apache/doris/pull/49822 |
107 | | int do_delete_bitmap_storage_optimize_check(int version = 2); |
108 | | |
// NOTE(review): the next four checks (mow job key, tablet stats key, restore
// job, txn key) have no documented return convention in this header;
// presumably they follow the 0 / 1 / negative scheme used above -- confirm.
109 | | int do_mow_job_key_check(); |
110 | | |
111 | | int do_tablet_stats_key_check(); |
112 | | |
113 | | int do_restore_job_check(); |
114 | | |
115 | | int do_txn_key_check(); |
116 | | |
117 | | // check table and partition version key |
118 | | // table version should be greater than the versions of all its partitions |
119 | | // Return 0 if success, otherwise error |
120 | | int do_version_key_check(); |
121 | | |
122 | | // Return 0 if success. |
123 | | // Return 1 if meta rowset key leak or loss is identified. |
124 | | // Return negative if a temporary error occurred during the check process. |
125 | | int do_meta_rowset_key_check(); |
126 | | |
// Return convention is not documented in this header -- confirm.
127 | | int do_snapshots_check(); |
128 | | |
129 | | // If there are multiple buckets, return the minimum lifecycle; if there are no buckets (i.e. |
130 | | // all accessors are HdfsAccessor), return INT64_MAX. |
131 | | // The lifecycle is written to *lifecycle_days. Return 0 if success, otherwise error |
132 | | int get_bucket_lifecycle(int64_t* lifecycle_days); |
// Sets the stop flag with release ordering; pairs with the acquire load in stopped().
133 | 0 | void stop() { stopped_.store(true, std::memory_order_release); } |
134 | 5.81k | bool stopped() const { return stopped_.load(std::memory_order_acquire); } |
// Returns a view of the instance_id_ member; the view refers to storage owned
// by this object, so it must not be used after this InstanceChecker is gone.
135 | 0 | std::string_view instance_id() const { return instance_id_; } |
136 | | |
137 | | private: |
// Per-rowset cache passed to check_inverted_index_file_storage_format_v1:
// a rowset id plus sets of segment ids and index ids.
138 | | struct RowsetIndexesFormatV1 { |
139 | | std::string rowset_id; |
140 | | std::unordered_set<int64_t> segment_ids; |
141 | | std::unordered_set<std::string> index_ids; |
142 | | }; |
143 | | |
// Per-rowset cache passed to check_inverted_index_file_storage_format_v2:
// a rowset id plus a set of segment ids (no index ids, unlike V1).
144 | | struct RowsetIndexesFormatV2 { |
145 | | std::string rowset_id; |
146 | | std::unordered_set<int64_t> segment_ids; |
147 | | }; |
148 | | |
149 | | private: |
150 | | // returns 0 for success otherwise error |
151 | | int init_obj_store_accessors(const InstanceInfoPB& instance); |
152 | | |
153 | | // returns 0 for success otherwise error |
154 | | int init_storage_vault_accessors(const InstanceInfoPB& instance); |
155 | | |
// NOTE(review): the callback arguments are presumably (tablet_id,
// has_sequence_col), judging by check_delete_bitmap_storage_optimize_v2
// below -- confirm.
156 | | int traverse_mow_tablet(const std::function<int(int64_t, bool)>& check_func); |
// NOTE(review): the meaning of the four callback arguments is not visible in
// this header -- confirm in the implementation.
157 | | int traverse_rowset_delete_bitmaps( |
158 | | int64_t tablet_id, std::string rowset_id, |
159 | | const std::function<int(int64_t, std::string_view, int64_t, int64_t)>& callback); |
// collect_cb receives each rowset meta by const reference.
160 | | int collect_tablet_rowsets( |
161 | | int64_t tablet_id, |
162 | | const std::function<void(const doris::RowsetMetaCloudPB&)>& collect_cb); |
// pending_delete_bitmaps is an output/in-out set of keys (non-const reference).
163 | | int get_pending_delete_bitmap_keys(int64_t tablet_id, |
164 | | std::unordered_set<std::string>& pending_delete_bitmaps); |
// abnormal_rowsets_num is a non-const reference, presumably a running count
// updated by this call -- confirm.
165 | | int check_delete_bitmap_storage_optimize_v2(int64_t tablet_id, bool has_sequence_col, |
166 | | int64_t& abnormal_rowsets_num); |
167 | | |
168 | | int check_inverted_index_file_storage_format_v1(int64_t tablet_id, const std::string& file_path, |
169 | | const std::string& rowset_info, |
170 | | RowsetIndexesFormatV1& rowset_index_cache_v1); |
171 | | |
172 | | int check_inverted_index_file_storage_format_v2(int64_t tablet_id, const std::string& file_path, |
173 | | const std::string& rowset_info, |
174 | | RowsetIndexesFormatV2& rowset_index_cache_v2); |
175 | | |
176 | | // Return 0 if success. |
177 | | // Return 1 if key loss is abnormal. |
178 | | // Return negative if a temporary error occurred during the check process. |
179 | | int check_stats_tablet_key(std::string_view key, std::string_view value); |
180 | | |
181 | | // Return 0 if success. |
182 | | // Return 1 if key loss is identified. |
183 | | // Return negative if a temporary error occurred during the check process. |
184 | | int check_stats_tablet_key_exists(std::string_view key, std::string_view value); |
185 | | |
186 | | // Return 0 if success. |
187 | | // Return 1 if key leak is identified. |
188 | | // Return negative if a temporary error occurred during the check process. |
189 | | int check_stats_tablet_key_leaked(std::string_view key, std::string_view value); |
// The four check_txn_*_key handlers below share the (key, value) signature
// accepted by scan_and_handle_kv; their return convention is not documented
// in this header -- confirm.
190 | | int check_txn_info_key(std::string_view key, std::string_view value); |
191 | | |
192 | | int check_txn_label_key(std::string_view key, std::string_view value); |
193 | | |
194 | | int check_txn_index_key(std::string_view key, std::string_view value); |
195 | | |
196 | | int check_txn_running_key(std::string_view key, std::string_view value); |
197 | | |
198 | | // Only checks whether the meta rowset key is leaked. |
199 | | // Key loss is checked in do_inverted_check() by comparing data files with keys. |
200 | | // Return 0 if success. |
201 | | // Return 1 if meta rowset key leak is identified. |
202 | | // Return negative if a temporary error occurred during the check process. |
203 | | int check_meta_rowset_key(std::string_view key, std::string_view value); |
204 | | |
205 | | // if TxnInfoKey's finish time > current time, it should not find tmp rowset |
206 | | // Return 0 if success. |
207 | | // Return 1 if meta tmp rowset key is abnormal. |
208 | | // Return negative if a temporary error occurred during the check process. |
209 | | int check_meta_tmp_rowset_key(std::string_view key, std::string_view value); |
210 | | |
211 | | /** |
212 | | * Scans the keys in the range from start_key to end_key |
213 | | * and performs the handle operation on each group of kv. |
214 | | * |
215 | | * @param start_key Range beginning. Note that this function will modify the `start_key` |
216 | | * @param end_key Range ending |
217 | | * @param handle_kv Operations on kv |
218 | | * @return 0 for success to scan and handle, 1 for success to scan but handle abnormally, -1 for failed to handle |
219 | | */ |
220 | | int scan_and_handle_kv(std::string& start_key, const std::string& end_key, |
221 | | std::function<int(std::string_view, std::string_view)> handle_kv); |
222 | | |
// Returns a non-owning pointer. NOTE(review): presumably looks up id in
// accessor_map_ and yields nullptr when absent -- confirm.
223 | | StorageVaultAccessor* get_accessor(const std::string& id); |
224 | | |
// Output goes through the accessors pointer as non-owning raw pointers.
225 | | void get_all_accessor(std::vector<StorageVaultAccessor*>* accessors); |
226 | | |
// Stop flag written by stop() and read by stopped().
227 | | std::atomic_bool stopped_ {false}; |
228 | | std::shared_ptr<TxnKv> txn_kv_; |
229 | | std::string instance_id_; |
230 | | // id -> accessor |
231 | | std::unordered_map<std::string, std::shared_ptr<StorageVaultAccessor>> accessor_map_; |
232 | | std::shared_ptr<SnapshotManager> snapshot_manager_; |
233 | | }; |
234 | | |
235 | | } // namespace doris::cloud |