be/src/util/bit_stream_utils.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/bit-stream-utils.h |
19 | | // and modified by Doris |
20 | | |
21 | | #pragma once |
22 | | |
23 | | #include "util/bit_packing.h" |
24 | | #include "util/bit_util.h" |
25 | | #include "util/faststring.h" |
26 | | |
27 | | using doris::BitUtil; |
28 | | #include "common/compile_check_begin.h" |
29 | | namespace doris { |
30 | | |
31 | | // Utility class to write bit/byte streams. This class can write data that is either |
32 | | // bit packed or byte aligned (a single stream may contain a mix of both). |
33 | | class BitWriter { |
34 | | public: |
35 | | // buffer: buffer to write bits to. It is cleared on construction (via Clear()). |
36 | 11.6M | explicit BitWriter(faststring* buffer) : buffer_(buffer) { Clear(); } |
37 | | |
38 | 23.8M | void Clear() { |
39 | 23.8M | buffered_values_ = 0; |
40 | 23.8M | byte_offset_ = 0; |
41 | 23.8M | bit_offset_ = 0; |
42 | 23.8M | buffer_->clear(); |
43 | 23.8M | } |
44 | | |
45 | | // Returns a pointer to the underlying buffer. |
46 | 1.47M | faststring* buffer() const { return buffer_; } |
47 | | |
48 | | // The number of bytes written so far, including the current byte (i.e. may include a |
49 | | // fraction of a byte). Includes buffered values. |
50 | 11.4M | int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); } |
51 | | |
52 | | // Writes a value to buffered_values_, flushing to buffer_ if necessary. This is bit |
53 | | // packed. |
54 | | void PutValue(uint64_t v, int num_bits); |
55 | | |
56 | | // Writes v to the next aligned byte using num_bits. If T is larger than num_bits, the |
57 | | // extra high-order bits will be ignored. |
58 | | template <typename T> |
59 | | void PutAligned(T v, int num_bits); |
60 | | |
61 | | // Writes a VLQ-encoded int to the buffer. The value is written byte aligned. |
62 | | // For more details on VLQ: https://en.wikipedia.org/wiki/Variable-length_quantity |
63 | | void PutVlqInt(int32_t v); |
64 | | |
65 | | // Get the index (offset from the start of buffer_) of the next aligned byte and advance the underlying buffer by num_bytes. |
66 | 1.47M | size_t GetByteIndexAndAdvance(int num_bytes) { |
67 | 1.47M | uint8_t* ptr = GetNextBytePtr(num_bytes); |
68 | 1.47M | return ptr - buffer_->data(); |
69 | 1.47M | } |
70 | | |
71 | | // Get a pointer to the next aligned byte and advance the underlying buffer by num_bytes. |
72 | | uint8_t* GetNextBytePtr(int num_bytes); |
73 | | |
74 | | // Flushes all buffered values to the buffer. Call this when done writing to the buffer. |
75 | | // If 'align' is true, buffered_values_ is reset and any future writes will be written |
76 | | // to the next byte boundary. |
77 | | void Flush(bool align = false); |
78 | | |
79 | | private: |
80 | | // Bit-packed values are initially written to this variable before being memcpy'd to |
81 | | // buffer_. This is faster than writing values byte by byte directly to buffer_. |
82 | | uint64_t buffered_values_; |
83 | | |
84 | | faststring* buffer_ = nullptr; |
85 | | int byte_offset_; // Byte offset in buffer_ |
86 | | int bit_offset_; // Bit offset in buffered_values_ |
87 | | }; |
88 | | |
89 | | // Utility class to read bit/byte stream. This class can read bits or bytes |
90 | | // that are either byte aligned or not. It also has utilities to read multiple |
91 | | // bytes in one read (e.g. encoded int). |
92 | | class BitReader { |
93 | | public: |
94 | | // 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. |
95 | | BitReader(const uint8_t* buffer, int buffer_len); |
96 | | // Creates a reader with no buffer; is_initialized() returns false for it. |
97 | 29.7M | BitReader() : buffer_(nullptr), max_bytes_(0) {} |
98 | | |
99 | | // Gets the next value from the buffer. Returns true if 'v' could be read or false if |
100 | | // there are not enough bytes left. num_bits must be <= 32. |
101 | | template <typename T> |
102 | | bool GetValue(int num_bits, T* v); |
103 | | |
104 | | // Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T needs to be a |
105 | | // little-endian native type and big enough to store 'num_bytes'. The value is assumed |
106 | | // to be byte-aligned so the stream will be advanced to the start of the next byte |
107 | | // before 'v' is read. Returns false if there are not enough bytes left. |
108 | | template <typename T> |
109 | | bool GetAligned(int num_bytes, T* v); |
110 | | |
111 | | // Reads a vlq encoded 32-bit int from the stream. The encoded int must start at the |
112 | | // beginning of a byte. Returns false if there were not enough bytes in the buffer. |
113 | | bool GetVlqInt(uint32_t* v); |
114 | | // Reads a zigzag encoded 32-bit int into 'v'. |
115 | | bool GetZigZagVlqInt(int32_t* v); |
116 | | |
117 | | // Reads a vlq encoded 64-bit int from the stream. The encoded int must start at the |
118 | | // beginning of a byte. Returns false if there were not enough bytes in the buffer. |
119 | | bool GetVlqInt(uint64_t* v); |
120 | | // Reads a zigzag encoded 64-bit int into 'v'. |
121 | | bool GetZigZagVlqInt(int64_t* v); |
122 | | |
123 | | // Returns the number of bytes left in the stream, not including the current byte (i.e., |
124 | | // there may be an additional fraction of a byte). |
125 | | int bytes_left() const { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); } |
126 | | |
127 | | // Current position in the stream, in bits. |
128 | | int position() const { return byte_offset_ * 8 + bit_offset_; } |
129 | | |
130 | | // Rewind the stream by 'num_bits' bits. |
131 | | void Rewind(int num_bits); |
132 | | |
133 | | // Advance the stream by 'num_bits' bits. |
134 | | bool Advance(int64_t num_bits); |
135 | | |
136 | | // Seek to a specific bit in the buffer. |
137 | | void SeekToBit(unsigned int stream_position); |
138 | | |
139 | | // Maximum byte length of a vlq encoded int32 (ceil(32 / 7) bytes). |
140 | | static const int MAX_VLQ_BYTE_LEN = 5; |
141 | | |
142 | | // Maximum byte length of a vlq encoded int64 (ceil(64 / 7) bytes). |
143 | | static const int MAX_VLQ_BYTE_LEN_FOR_INT64 = 10; |
144 | | // Returns true if the reader has a non-null buffer. |
145 | 43.5M | bool is_initialized() const { return buffer_ != nullptr; } |
146 | | // Returns the buffer being read from (nullptr for a default-constructed reader). |
147 | 0 | const uint8_t* buffer() const { return buffer_; } |
148 | | // Returns max_bytes_, the byte bound that bytes_left() measures against. |
149 | 0 | int max_bytes() const { return max_bytes_; } |
150 | | |
151 | | private: |
152 | | // Used by SeekToBit() and GetValue() to fetch the next word from buffer_ into |
153 | | // buffered_values_. |
154 | | void BufferValues(); |
155 | | // All state below has a default member initializer so that a default-constructed reader never holds indeterminate values. |
156 | | const uint8_t* buffer_ = nullptr; |
157 | | int max_bytes_ = 0; |
158 | | |
159 | | // Bytes are memcpy'd from buffer_ and values are read from this variable. This is |
160 | | // faster than reading values byte by byte directly from buffer_. |
161 | | uint64_t buffered_values_ = 0; |
162 | | |
163 | | int byte_offset_ = 0; // Byte offset in buffer_ |
164 | | int bit_offset_ = 0; // Bit offset in buffered_values_ |
165 | | }; |
166 | | |
167 | | /// Utility class to read bit/byte stream. This class can read bits or bytes that are |
168 | | /// either byte aligned or not. It also has utilities to read multiple bytes in one |
169 | | /// read (e.g. encoded int). Exposes a batch-oriented interface to allow efficient |
170 | | /// processing of multiple values at a time. |
171 | | class BatchedBitReader { |
172 | | public: |
173 | | /// 'buffer' is the buffer to read from. The buffer's length is 'buffer_len'. |
174 | | /// Does not take ownership of the buffer. |
175 | 0 | BatchedBitReader(const uint8_t* buffer, int64_t buffer_len) { Reset(buffer, buffer_len); } |
176 | | |
177 | 1.03k | BatchedBitReader() {} |
178 | | |
179 | | // The implicit copy constructor is left defined. If a BatchedBitReader is copied, the |
180 | | // two copies do not share any state. Invoking functions on either copy continues |
181 | | // reading from the current read position without modifying the state of the other |
182 | | // copy. |
183 | | |
184 | | /// Resets the reader to start reading from the start of 'buffer'. The buffer's |
185 | | /// length is 'buffer_len'. Does not take ownership of the buffer. |
186 | 1.03k | void Reset(const uint8_t* buffer, int64_t buffer_len) { |
187 | 1.03k | DCHECK(buffer != nullptr); |
188 | 1.03k | DCHECK_GE(buffer_len, 0); |
189 | 1.03k | buffer_pos_ = buffer; |
190 | 1.03k | buffer_end_ = buffer + buffer_len; |
191 | 1.03k | } |
192 | | |
193 | | /// Gets up to 'num_values' bit-packed values, starting from the current byte in the |
194 | | /// buffer and advance the read position. 'bit_width' must be <= 64. |
195 | | /// If 'bit_width' * 'num_values' is not a multiple of 8, the trailing bits are |
196 | | /// skipped and the next UnpackBatch() call will start reading from the next byte. |
197 | | /// |
198 | | /// If the caller does not want to drop trailing bits, 'num_values' must be exactly the |
199 | | /// total number of values the caller wants to read from a run of bit-packed values, or |
200 | | /// 'bit_width' * 'num_values' must be a multiple of 8. This condition is always |
201 | | /// satisfied if 'num_values' is a multiple of 32. |
202 | | /// |
203 | | /// The output type 'T' must be an unsigned integer. |
204 | | /// |
205 | | /// Returns the number of values read. |
206 | | template <typename T> |
207 | | int UnpackBatch(int bit_width, int num_values, T* v); |
208 | | |
209 | | /// Skip 'num_values_to_skip' bit-packed values. |
210 | | /// 'num_values_to_skip * bit_width' is either divisible by 8, or |
211 | | /// 'num_values_to_skip' equals the count of the remaining bit-packed values. |
212 | | bool SkipBatch(int bit_width, int num_values_to_skip); |
213 | | |
214 | | /// Unpack bit-packed values in the same way as UnpackBatch() and decode them using the |
215 | | /// dictionary 'dict' with 'dict_len' entries. Return -1 if a decoding error is |
216 | | /// encountered, i.e. if the bit-packed values are not valid indices in 'dict'. |
217 | | /// Otherwise returns the number of values decoded. The values are written to 'v' with |
218 | | /// a stride of 'stride' bytes. |
219 | | template <typename T> |
220 | | int UnpackAndDecodeBatch(int bit_width, T* dict, int64_t dict_len, int num_values, T* v, |
221 | | int64_t stride); |
222 | | |
223 | | /// Reads an unpacked 'num_bytes'-sized value from the buffer and stores it in 'v'. T |
224 | | /// needs to be a little-endian native type and big enough to store 'num_bytes'. |
225 | | /// Returns false if there are not enough bytes left. |
226 | | template <typename T> |
227 | | bool GetBytes(int num_bytes, T* v); |
228 | | |
229 | | /// Read an unsigned ULEB-128 encoded int from the stream. The encoded int must start |
230 | | /// at the beginning of a byte. Return false if there were not enough bytes in the |
231 | | /// buffer or the int is invalid. For more details on ULEB-128: |
232 | | /// https://en.wikipedia.org/wiki/LEB128 |
233 | | /// UINT_T must be an unsigned integer type. |
234 | | template <typename UINT_T> |
235 | | bool GetUleb128(UINT_T* v); |
236 | | |
237 | | /// Returns the number of bytes left in the stream. |
238 | 22.8k | int bytes_left() { return static_cast<int>(buffer_end_ - buffer_pos_); } |
239 | | |
240 | | /// Maximum byte length of a vlq encoded integer of type T. |
241 | | template <typename T> |
242 | 12.0k | static constexpr int max_vlq_byte_len() { |
243 | 12.0k | return BitUtil::Ceil(sizeof(T) * 8, 7); |
244 | 12.0k | } |
245 | | |
246 | | /// Maximum supported bitwidth for reader. |
247 | | static const int MAX_BITWIDTH = BitPacking::MAX_BITWIDTH; |
248 | | |
249 | | private: |
250 | | /// Current read position in the buffer. |
251 | | const uint8_t* buffer_pos_ = nullptr; |
252 | | |
253 | | /// Pointer to the byte after the end of the buffer. |
254 | | const uint8_t* buffer_end_ = nullptr; |
255 | | }; |
256 | | #include "common/compile_check_end.h" |
257 | | } // namespace doris |