be/src/storage/segment/row_ranges.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <roaring/roaring.hh> |
21 | | #include <string> |
22 | | #include <vector> |
23 | | |
24 | | #include "absl/strings/substitute.h" |
25 | | #include "common/cast_set.h" |
26 | | #include "common/logging.h" |
27 | | #include "storage/segment/common.h" |
28 | | |
29 | | namespace doris { |
30 | | namespace segment_v2 { |
31 | | |
32 | | // RowRange stands for range[From, To), From is inclusive, |
33 | | // To is exclusive. It is used for row id range calculation. |
34 | | class RowRange { |
35 | | public: |
36 | | // Returns true if two ranges are overlapped or false. |
37 | | // The union range will be returned through range. |
38 | 105k | static bool range_union(const RowRange& left, const RowRange& right, RowRange* range) { |
39 | 105k | if (left._from <= right._from) { |
40 | 105k | if (left._to >= right._from) { |
41 | 104k | range->_from = left._from; |
42 | 104k | range->_to = std::max(left._to, right._to); |
43 | 104k | return true; |
44 | 104k | } |
45 | 18.4E | } else if (right._to >= left._from) { |
46 | 0 | range->_from = right._from; |
47 | 0 | range->_to = std::max(left._to, right._to); |
48 | 0 | return true; |
49 | 0 | } |
50 | | // return a invalid range |
51 | 1.09k | range->_from = 0; |
52 | 1.09k | range->_to = 0; |
53 | 1.09k | return false; |
54 | 105k | } |
55 | | |
56 | | // Returns true if the two ranges are intersected or false. |
57 | | // The intersection of the two ranges is returned through range. |
58 | 373k | static bool range_intersection(const RowRange& left, const RowRange& right, RowRange* range) { |
59 | 373k | if (left._from <= right._from) { |
60 | 373k | if (left._to > right._from) { |
61 | 373k | range->_from = right._from; |
62 | 373k | range->_to = std::min(left._to, right._to); |
63 | 373k | return true; |
64 | 373k | } |
65 | 18.4E | } else if (right._to > left._from) { |
66 | 62 | range->_from = left._from; |
67 | 62 | range->_to = std::min(left._to, right._to); |
68 | 62 | return true; |
69 | 62 | } |
70 | | // return a invalid range |
71 | 18.4E | range->_from = 0; |
72 | 18.4E | range->_to = 0; |
73 | 18.4E | return false; |
74 | 373k | } |
75 | | |
76 | 478k | RowRange() : _from(0), _to(0) {} |
77 | | |
78 | | // Creates a range of [from, to) (from inclusive and to exclusive; empty ranges are invalid) |
79 | 2.87M | RowRange(int64_t from, int64_t to) : _from(from), _to(to) {} |
80 | | |
81 | | bool is_valid() const { return _from < _to; } |
82 | | |
83 | 6.49M | size_t count() const { return _to - _from; } |
84 | | |
85 | 373k | bool is_before(const RowRange& other) const { return _to <= other._from; } |
86 | | |
87 | 583k | bool is_after(const RowRange& other) const { return _from >= other._to; } |
88 | | |
89 | 942k | int64_t from() const { return _from; } |
90 | | |
91 | 1.64M | int64_t to() const { return _to; } |
92 | | |
93 | 0 | std::string to_string() const { return absl::Substitute("[$0-$1)", _from, _to); } |
94 | | |
95 | 743k | uint64_t get_digest(uint64_t seed) const { |
96 | 743k | uint64_t hash = seed; |
97 | 743k | hash = hash * 31 + _from; |
98 | 743k | hash = hash * 31 + _to; |
99 | 743k | return hash; |
100 | 743k | } |
101 | | |
102 | | private: |
103 | | int64_t _from; |
104 | | int64_t _to; |
105 | | }; |
106 | | |
107 | | class RowRanges { |
108 | | public: |
109 | 16.5M | RowRanges() : _count(0) {} |
110 | | |
111 | 872k | void clear() { |
112 | 872k | _ranges.clear(); |
113 | 872k | _count = 0; |
114 | 872k | } |
115 | | |
116 | | // Creates a new RowRanges object with the single range [0, row_count). |
117 | 441k | static RowRanges create_single(uint64_t row_count) { |
118 | 441k | RowRanges ranges; |
119 | 441k | ranges.add(RowRange(0, row_count)); |
120 | 441k | return ranges; |
121 | 441k | } |
122 | | |
123 | | // Creates a new RowRanges object with the single range [from, to). |
124 | 1.67M | static RowRanges create_single(int64_t from, int64_t to) { |
125 | 1.67M | DCHECK(from <= to); |
126 | 1.67M | RowRanges ranges; |
127 | 1.67M | ranges.add(RowRange(from, to)); |
128 | 1.67M | return ranges; |
129 | 1.67M | } |
130 | | |
131 | | // Calculates the union of the two specified RowRanges object. The union of two range is calculated if there are |
132 | | // elements between them. Otherwise, the two disjunct ranges are stored separately. |
133 | | // For example: |
134 | | // [113, 241) ∪ [221, 340) = [113, 340) |
135 | | // [113, 230) ∪ [230, 340) = [113, 340] |
136 | | // while |
137 | | // [113, 230) ∪ [231, 340) = [113, 230), [231, 340) |
138 | 1.67M | static void ranges_union(const RowRanges& left, const RowRanges& right, RowRanges* result) { |
139 | 1.67M | RowRanges tmp_range; |
140 | 1.67M | auto it1 = left._ranges.begin(); |
141 | 1.67M | auto it2 = right._ranges.begin(); |
142 | | // merge and add |
143 | 1.78M | while (it1 != left._ranges.end() && it2 != right._ranges.end()) { |
144 | 104k | if (it1->is_after(*it2)) { |
145 | 0 | tmp_range.add(*it2); |
146 | 0 | ++it2; |
147 | 104k | } else { |
148 | 104k | tmp_range.add(*it1); |
149 | 104k | ++it1; |
150 | 104k | } |
151 | 104k | } |
152 | 1.83M | while (it1 != left._ranges.end()) { |
153 | 162k | tmp_range.add(*it1); |
154 | 162k | ++it1; |
155 | 162k | } |
156 | 2.02M | while (it2 != right._ranges.end()) { |
157 | 348k | tmp_range.add(*it2); |
158 | 348k | ++it2; |
159 | 348k | } |
160 | 1.67M | *result = std::move(tmp_range); |
161 | 1.67M | } |
162 | | |
163 | | // Calculates the intersection of the two specified RowRanges object. Two ranges intersect if they have common |
164 | | // elements otherwise the result is empty. |
165 | | // For example: |
166 | | // [113, 241) ∩ [221, 340) = [221, 241) |
167 | | // while |
168 | | // [113, 230) ∩ [230, 340) = <EMPTY> |
169 | | // |
170 | | // The result RowRanges object will contain all the row indexes there were contained in both of the specified objects |
171 | | static void ranges_intersection(const RowRanges& left, const RowRanges& right, |
172 | 373k | RowRanges* result) { |
173 | 373k | RowRanges tmp_range; |
174 | 373k | int right_index = 0; |
175 | 747k | for (auto it1 = left._ranges.begin(); it1 != left._ranges.end(); ++it1) { |
176 | 373k | const RowRange& range1 = *it1; |
177 | 747k | for (int i = right_index; i < right._ranges.size(); ++i) { |
178 | 373k | const RowRange& range2 = right._ranges[i]; |
179 | 373k | if (range1.is_before(range2)) { |
180 | 72 | break; |
181 | 373k | } else if (range1.is_after(range2)) { |
182 | 159 | right_index = i + 1; |
183 | 159 | continue; |
184 | 159 | } |
185 | 373k | RowRange merge_range; |
186 | 373k | bool ret = RowRange::range_intersection(range1, range2, &merge_range); |
187 | 373k | DCHECK(ret); |
188 | 373k | tmp_range.add(merge_range); |
189 | 373k | } |
190 | 373k | } |
191 | 373k | *result = std::move(tmp_range); |
192 | 373k | } |
193 | | |
194 | | // Calculates the exception (set difference) of the two specified RowRanges objects: left \ right. |
195 | | // The result contains all row indexes that are in the left ranges but NOT in the right ranges. |
196 | | // For example: |
197 | | // [100, 300) \ [150, 200) = [100, 150), [200, 300) |
198 | | // [100, 300) \ [0, 150) = [150, 300) |
199 | | // [100, 300) \ [250, 400) = [100, 250) |
200 | | // [100, 200) \ [200, 300) = [100, 200) |
201 | | // [100, 300) \ [0, 400) = <EMPTY> |
202 | | // [100, 200), [300, 400) \ [150, 350) = [100, 150), [350, 400) |
203 | 34 | static void ranges_exception(const RowRanges& left, const RowRanges& right, RowRanges* result) { |
204 | 34 | RowRanges tmp_range; |
205 | 34 | int right_index = 0; |
206 | 73 | for (auto it1 = left._ranges.begin(); it1 != left._ranges.end(); ++it1) { |
207 | 39 | int64_t current_from = it1->from(); |
208 | 39 | int64_t current_to = it1->to(); |
209 | 70 | for (int i = right_index; i < right._ranges.size(); ++i) { |
210 | 36 | const RowRange& range2 = right._ranges[i]; |
211 | 36 | if (current_from >= current_to) { |
212 | | // Current range fully consumed |
213 | 1 | break; |
214 | 1 | } |
215 | 35 | if (current_to <= range2.from()) { |
216 | | // Current remaining range is entirely before range2, no more subtraction needed |
217 | 4 | break; |
218 | 4 | } |
219 | 31 | if (current_from >= range2.to()) { |
220 | | // range2 is entirely before the current remaining range, advance right_index |
221 | 6 | right_index = i + 1; |
222 | 6 | continue; |
223 | 6 | } |
224 | | // There is overlap between [current_from, current_to) and range2 |
225 | 25 | if (current_from < range2.from()) { |
226 | | // Left portion before the overlap: [current_from, range2.from()) |
227 | 14 | tmp_range.add(RowRange(current_from, range2.from())); |
228 | 14 | } |
229 | | // Advance current_from past the overlap |
230 | 25 | current_from = range2.to(); |
231 | 25 | } |
232 | | // Add whatever remains of the current left range |
233 | 39 | if (current_from < current_to) { |
234 | 26 | tmp_range.add(RowRange(current_from, current_to)); |
235 | 26 | } |
236 | 39 | } |
237 | 34 | *result = std::move(tmp_range); |
238 | 34 | } |
239 | | |
240 | 2.00M | static roaring::Roaring ranges_to_roaring(const RowRanges& ranges) { |
241 | 2.00M | roaring::Roaring result; |
242 | 2.94M | for (auto it = ranges._ranges.begin(); it != ranges._ranges.end(); ++it) { |
243 | 940k | result.addRange(it->from(), it->to()); |
244 | 940k | } |
245 | 2.00M | return result; |
246 | 2.00M | } |
247 | | |
248 | 419k | size_t count() { return _count; } |
249 | | |
250 | 2.99M | bool is_empty() { return _count == 0; } |
251 | | |
252 | 0 | bool contain(rowid_t from, rowid_t to) { |
253 | 0 | // binary search |
254 | 0 | RowRange tmp_range = RowRange(from, to); |
255 | 0 | size_t start = 0; |
256 | 0 | size_t end = _ranges.size(); |
257 | 0 | while (start <= end) { |
258 | 0 | size_t mid = (start + end) / 2; |
259 | 0 | if (_ranges[mid].is_before(tmp_range)) { |
260 | 0 | start = mid; |
261 | 0 | } else if (_ranges[mid].is_after(tmp_range)) { |
262 | 0 | end = mid - 1; |
263 | 0 | } else { |
264 | 0 | return true; |
265 | 0 | } |
266 | 0 | } |
267 | 0 | return false; |
268 | 0 | } |
269 | | |
270 | | int64_t from() { |
271 | | DCHECK(!is_empty()); |
272 | | return _ranges[0].from(); |
273 | | } |
274 | | |
275 | 698k | int64_t to() { |
276 | 698k | DCHECK(!is_empty()); |
277 | 698k | return _ranges[_ranges.size() - 1].to(); |
278 | 698k | } |
279 | | |
280 | 2.95k | size_t range_size() const { return _ranges.size(); } |
281 | | |
282 | 911 | RowRange get_range(size_t index) const { return _ranges[index]; } |
283 | | |
284 | 572 | int64_t get_range_from(size_t range_index) const { return _ranges[range_index].from(); } |
285 | | |
286 | 1.01k | int64_t get_range_to(size_t range_index) const { return _ranges[range_index].to(); } |
287 | | |
288 | | size_t get_range_count(size_t range_index) const { return _ranges[range_index].count(); } |
289 | | |
290 | 0 | std::string to_string() { |
291 | 0 | std::string result; |
292 | 0 | for (auto range : _ranges) { |
293 | 0 | result += range.to_string() + " "; |
294 | 0 | } |
295 | 0 | return result; |
296 | 0 | } |
297 | | |
298 | | // Adds a range to the end of the list of ranges. It maintains the disjunct ascending order(*) of the ranges by |
299 | | // trying to union the specified range to the last ranges in the list. The specified range shall be larger(*) than |
300 | | // the last one or might be overlapped with some of the last ones. |
301 | 3.85M | void add(const RowRange& range) { |
302 | 3.85M | if (range.count() == 0) { |
303 | 1.32M | return; |
304 | 1.32M | } |
305 | 2.53M | RowRange range_to_add = range; |
306 | 2.63M | for (int i = cast_set<int>(_ranges.size()) - 1; i >= 0; --i) { |
307 | 105k | const RowRange last = _ranges[i]; |
308 | 105k | DCHECK(!last.is_after(range)); |
309 | 105k | RowRange u; |
310 | 105k | bool ret = RowRange::range_union(last, range_to_add, &u); |
311 | 105k | if (!ret) { |
312 | | // range do not intersect with the last |
313 | 1.08k | break; |
314 | 1.08k | } |
315 | 104k | range_to_add = u; |
316 | 104k | _ranges.erase(_ranges.begin() + i); |
317 | 104k | _count -= last.count(); |
318 | 104k | } |
319 | 2.53M | _ranges.emplace_back(range_to_add); |
320 | 2.53M | _count += range_to_add.count(); |
321 | 2.53M | } |
322 | | |
323 | | // Returns the row index (within the original row space) of the pos-th element |
324 | | // across all ranges. For example, if ranges are [0,3000) and [8000,11000), |
325 | | // pos=0 returns 0, pos=2999 returns 2999, pos=3000 returns 8000. |
326 | 98 | int64_t get_row_index_by_pos(int64_t pos) const { |
327 | 98 | DORIS_CHECK(pos < _count); |
328 | 98 | size_t remaining = pos; |
329 | 98 | for (const auto& range : _ranges) { |
330 | 98 | size_t range_len = range.count(); |
331 | 98 | if (remaining < range_len) { |
332 | 98 | return range.from() + remaining; |
333 | 98 | } |
334 | 0 | remaining -= range_len; |
335 | 0 | } |
336 | | // pos is out of bounds; return -1 to indicate invalid |
337 | 98 | DCHECK(false) << "pos " << pos << " is out of bounds for RowRanges with count " << _count; |
338 | 0 | return -1; |
339 | 98 | } |
340 | | |
341 | 743k | uint64_t get_digest(uint64_t seed) const { |
342 | 743k | for (auto range : _ranges) { |
343 | 743k | seed = range.get_digest(seed); |
344 | 743k | } |
345 | 743k | return seed; |
346 | 743k | } |
347 | | |
348 | | private: |
349 | | std::vector<RowRange> _ranges; |
350 | | size_t _count; |
351 | | }; |
352 | | |
353 | | } // namespace segment_v2 |
354 | | } // namespace doris |