be/src/storage/segment/binary_prefix_page.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <glog/logging.h> |
21 | | #include <stddef.h> |
22 | | #include <stdint.h> |
23 | | |
24 | | #include <vector> |
25 | | |
26 | | #include "common/status.h" |
27 | | #include "core/column/column.h" |
28 | | #include "storage/segment/options.h" |
29 | | #include "storage/segment/page_builder.h" |
30 | | #include "storage/segment/page_decoder.h" |
31 | | #include "util/coding.h" |
32 | | #include "util/faststring.h" |
33 | | #include "util/slice.h" |
34 | | |
35 | | namespace doris { |
36 | | namespace segment_v2 { |
37 | | |
38 | | // prefix encoding for string dictionary |
39 | | // |
40 | | // BinaryPrefixPage := Entry^EntryNum, Trailer |
41 | | // Entry := SharedPrefixLength(vint), UnsharedLength(vint), Byte^UnsharedLength |
42 | | // Trailer := NumEntry(uint32_t), RESTART_POINT_INTERVAL(uint8_t) |
43 | | // RestartPointStartOffset(uint32_t)^NumRestartPoints,NumRestartPoints(uint32_t) |
44 | | class BinaryPrefixPageBuilder : public PageBuilderHelper<BinaryPrefixPageBuilder> { |
45 | | public: |
46 | | using Self = BinaryPrefixPageBuilder; |
47 | | friend class PageBuilderHelper<Self>; |
48 | | |
49 | 210 | Status init() override { return reset(); } |
50 | | |
51 | 269k | bool is_page_full() override { return size() >= _options.data_page_size; } |
52 | | |
53 | | Status add(const uint8_t* vals, size_t* add_count) override; |
54 | | |
55 | | Status finish(OwnedSlice* slice) override; |
56 | | |
57 | 453 | Status reset() override { |
58 | 453 | _restart_points_offset.clear(); |
59 | 453 | _last_entry.clear(); |
60 | 453 | _count = 0; |
61 | 453 | _buffer.clear(); |
62 | 453 | _finished = false; |
63 | 453 | _raw_data_size = 0; |
64 | 453 | return Status::OK(); |
65 | 453 | } |
66 | | |
67 | 269k | uint64_t size() const override { |
68 | 269k | if (_finished) { |
69 | 0 | return _buffer.size(); |
70 | 269k | } else { |
71 | 269k | return _buffer.size() + (_restart_points_offset.size() + 2) * sizeof(uint32_t); |
72 | 269k | } |
73 | 269k | } |
74 | | |
75 | 134k | size_t count() const override { return _count; } |
76 | | |
77 | 0 | uint64_t get_raw_data_size() const override { return _raw_data_size; } |
78 | | |
79 | | private: |
80 | 210 | BinaryPrefixPageBuilder(const PageBuilderOptions& options) : _options(options) {} |
81 | | |
82 | | PageBuilderOptions _options; |
83 | | std::vector<uint32_t> _restart_points_offset; |
84 | | // Holds the most recently added entry; used by add() to compute the |
85 | | // shared-prefix length against the next entry. |
86 | | faststring _last_entry; |
87 | | size_t _count = 0; |
88 | | bool _finished = false; |
89 | | faststring _buffer; |
90 | | uint64_t _raw_data_size = 0; |
91 | | // This is a empirical value, Kudu and LevelDB use this default value |
92 | | static const uint8_t RESTART_POINT_INTERVAL = 16; |
93 | | }; |
94 | | |
95 | | class BinaryPrefixPageDecoder : public PageDecoder { |
96 | | public: |
97 | | BinaryPrefixPageDecoder(Slice data, const PageDecoderOptions& options) |
98 | 203 | : _data(data), _parsed(false) {} |
99 | | |
100 | | Status init() override; |
101 | | |
102 | | Status seek_to_position_in_page(size_t pos) override; |
103 | | |
104 | | Status seek_at_or_after_value(const void* value, bool* exact_match) override; |
105 | | |
106 | | Status next_batch(size_t* n, MutableColumnPtr& dst) override; |
107 | | |
108 | 0 | size_t count() const override { |
109 | 0 | DCHECK(_parsed); |
110 | 0 | return _num_values; |
111 | 0 | } |
112 | | |
113 | 5.45k | size_t current_index() const override { |
114 | 5.45k | DCHECK(_parsed); |
115 | 5.45k | return _cur_pos; |
116 | 5.45k | } |
117 | | |
118 | | private: |
119 | | // decode shared and non-shared entry length from `ptr`. |
120 | | // return ptr past the parsed value when success. |
121 | | // return nullptr on failure |
122 | | const uint8_t* _decode_value_lengths(const uint8_t* ptr, uint32_t* shared, |
123 | | uint32_t* non_shared); |
124 | | |
125 | | // return start pointer of the restart point at index `restart_point_index` |
126 | 46.0k | const uint8_t* _get_restart_point(size_t restart_point_index) const { |
127 | 46.0k | return reinterpret_cast<const uint8_t*>(_data.get_data()) + |
128 | 46.0k | decode_fixed32_le(_restarts_ptr + restart_point_index * sizeof(uint32_t)); |
129 | 46.0k | } |
130 | | |
131 | | // read next value at `_cur_pos` and `_next_ptr` into `_current_value`. |
132 | | // return OK and advance `_next_ptr` on success. `_cur_pos` is not modified. |
133 | | // return EndOfFile when no more entry can be read. |
134 | | // return other error status otherwise. |
135 | | Status _read_next_value(); |
136 | | |
137 | | // seek to the first value at the given restart point |
138 | | Status _seek_to_restart_point(size_t restart_point_index); |
139 | | |
140 | | Slice _data; |
141 | | bool _parsed = false; |
142 | | size_t _num_values = 0; |
143 | | uint8_t _restart_point_internal = 0; |
144 | | uint32_t _num_restarts = 0; |
145 | | // pointer to _footer start |
146 | | const uint8_t* _footer_start = nullptr; |
147 | | // pointer to restart offsets array |
148 | | const uint8_t* _restarts_ptr = nullptr; |
149 | | // ordinal of the first value to return in next_batch() |
150 | | uint32_t _cur_pos = 0; |
151 | | // first value to return in next_batch() |
152 | | faststring _current_value; |
153 | | // pointer to the start of next value to read, advanced by `_read_next_value` |
154 | | const uint8_t* _next_ptr = nullptr; |
155 | | }; |
156 | | |
157 | | } // namespace segment_v2 |
158 | | } // namespace doris |