Coverage Report

Created: 2026-04-14 13:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/bit_stream_utils.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/apache/impala/blob/branch-2.9.0/be/src/util/bit-stream-utils.h
19
// and modified by Doris
20
21
#pragma once
22
23
#include "util/bit_packing.h"
24
#include "util/bit_util.h"
25
#include "util/faststring.h"
26
27
using doris::BitUtil;
28
namespace doris {
29
30
// Utility class to write bit/byte streams.  This class can write data to either be
31
// bit packed or byte aligned (and a single stream that has a mix of both).
32
class BitWriter {
33
public:
34
    // buffer: buffer to write bits to.
35
11.5M
    // Stores 'buffer' as the write target and resets the writer via Clear(), which also
    // clears 'buffer'. 'buffer' must not be null because Clear() dereferences it.
    explicit BitWriter(faststring* buffer) : buffer_(buffer) {
        DCHECK(buffer_ != nullptr);
        Clear();
    }
36
37
23.6M
    // Resets the writer to an empty state: zeroes the buffered word and both offsets,
    // then clears the underlying buffer.
    void Clear() {
38
23.6M
        buffered_values_ = 0;
39
23.6M
        byte_offset_ = 0;
40
23.6M
        bit_offset_ = 0;
41
23.6M
        buffer_->clear();
42
23.6M
    }
43
44
    // Returns a pointer to the underlying buffer
45
1.39M
    faststring* buffer() const { return buffer_; }
46
47
    // The number of current bytes written, including the current byte (i.e. may include a
48
    // fraction of a byte). Includes buffered values.
49
11.3M
    int bytes_written() const { return byte_offset_ + BitUtil::Ceil(bit_offset_, 8); }
50
51
    // Writes a value to buffered_values_, flushing to buffer_ if necessary.  This is bit
52
    // packed.
53
    void PutValue(uint64_t v, int num_bits);
54
55
    // Writes v to the next aligned byte using num_bits. If T is larger than num_bits, the
56
    // extra high-order bits will be ignored.
57
    template <typename T>
58
    void PutAligned(T v, int num_bits);
59
60
    // Write a Vlq encoded int to the buffer. The value is written byte aligned.
61
    // For more details on vlq: en.wikipedia.org/wiki/Variable-length_quantity
62
    void PutVlqInt(int32_t v);
63
64
    // Get the index to the next aligned byte and advance the underlying buffer by num_bytes.
65
1.39M
    size_t GetByteIndexAndAdvance(int num_bytes) {
66
1.39M
        uint8_t* ptr = GetNextBytePtr(num_bytes);
67
1.39M
        // Express the returned pointer as an offset from the start of the buffer's data.
        return ptr - buffer_->data();
68
1.39M
    }
69
70
    // Get a pointer to the next aligned byte and advance the underlying buffer by num_bytes.
71
    uint8_t* GetNextBytePtr(int num_bytes);
72
73
    // Flushes all buffered values to the buffer. Call this when done writing to the buffer.
74
    // If 'align' is true, buffered_values_ is reset and any future writes will be written
75
    // to the next byte boundary.
76
    void Flush(bool align = false);
77
78
private:
79
    // Bit-packed values are initially written to this variable before being memcpy'd to
80
    // buffer_. This is faster than writing values byte by byte directly to buffer_.
81
    uint64_t buffered_values_;
82
83
    faststring* buffer_ = nullptr;
84
    int byte_offset_; // Offset in buffer_
85
    int bit_offset_;  // Offset in buffered_values_
86
};
87
88
// Utility class to read bit/byte stream.  This class can read bits or bytes
89
// that are either byte aligned or not.  It also has utilities to read multiple
90
// bytes in one read (e.g. encoded int).
91
class BitReader {
92
public:
93
    // 'buffer' is the buffer to read from.  The buffer's length is 'buffer_len'.
94
    BitReader(const uint8_t* buffer, int buffer_len);
95
96
38.0M
    // Creates a reader with no buffer attached (is_initialized() returns false). Every
    // member gets a defined value so that copying the reader, or calling position() or
    // bytes_left() on it, never reads indeterminate data.
    BitReader()
            : buffer_(nullptr),
              max_bytes_(0),
              buffered_values_(0),
              byte_offset_(0),
              bit_offset_(0) {}
97
98
    // Gets the next value from the buffer.  Returns true if 'v' could be read or false if
99
    // there are not enough bytes left. num_bits must be <= 32.
100
    template <typename T>
101
    bool GetValue(int num_bits, T* v);
102
103
    // Reads a 'num_bytes'-sized value from the buffer and stores it in 'v'. T needs to be a
104
    // little-endian native type and big enough to store 'num_bytes'. The value is assumed
105
    // to be byte-aligned so the stream will be advanced to the start of the next byte
106
    // before 'v' is read. Returns false if there are not enough bytes left.
107
    template <typename T>
108
    bool GetAligned(int num_bytes, T* v);
109
110
    // Reads a vlq encoded int from the stream.  The encoded int must start at the
111
    // beginning of a byte. Return false if there were not enough bytes in the buffer.
112
    bool GetVlqInt(uint32_t* v);
113
    // Reads a zigzag-encoded vlq int into 'v'.
114
    bool GetZigZagVlqInt(int32_t* v);
115
116
    // Reads a vlq encoded int from the stream.  The encoded int must start at the
117
    // beginning of a byte. Return false if there were not enough bytes in the buffer.
118
    bool GetVlqInt(uint64_t* v);
119
    // Reads a zigzag-encoded vlq int into 'v'.
120
    bool GetZigZagVlqInt(int64_t* v);
121
122
    // Returns the number of bytes left in the stream, not including the current byte (i.e.,
123
    // there may be an additional fraction of a byte).
124
    int bytes_left() { return max_bytes_ - (byte_offset_ + BitUtil::Ceil(bit_offset_, 8)); }
125
126
    // Current position in the stream, by bit.
127
    int position() const { return byte_offset_ * 8 + bit_offset_; }
128
129
    // Rewind the stream by 'num_bits' bits
130
    void Rewind(int num_bits);
131
132
    // Advance the stream by 'num_bits' bits
133
    bool Advance(int64_t num_bits);
134
135
    // Seek to a specific bit in the buffer
136
    void SeekToBit(unsigned int stream_position);
137
138
    // Maximum byte length of a vlq encoded int
139
    static const int MAX_VLQ_BYTE_LEN = 5;
140
141
    // Maximum byte length of a vlq encoded int64
142
    static const int MAX_VLQ_BYTE_LEN_FOR_INT64 = 10;
143
144
270M
    bool is_initialized() const { return buffer_ != nullptr; }
145
146
0
    const uint8_t* buffer() const { return buffer_; }
147
148
0
    int max_bytes() const { return max_bytes_; }
149
150
private:
151
    // Used by SeekToBit() and GetValue() to fetch the
152
    // next word from buffer_ into buffered_values_.
153
    void BufferValues();
154
155
    const uint8_t* buffer_ = nullptr;
156
    int max_bytes_;
157
158
    // Bytes are memcpy'd from buffer_ and values are read from this variable. This is
159
    // faster than reading values byte by byte directly from buffer_.
160
    uint64_t buffered_values_;
161
162
    int byte_offset_; // Offset in buffer_
163
    int bit_offset_;  // Offset in buffered_values_
164
};
165
166
/// Utility class to read bit/byte stream. This class can read bits or bytes that are
167
/// either byte aligned or not. It also has utilities to read multiple bytes in one
168
/// read (e.g. encoded int). Exposes a batch-oriented interface to allow efficient
169
/// processing of multiple values at a time.
170
class BatchedBitReader {
171
public:
172
    /// 'buffer' is the buffer to read from.  The buffer's length is 'buffer_len'.
173
    /// Does not take ownership of the buffer.
174
0
    BatchedBitReader(const uint8_t* buffer, int64_t buffer_len) { Reset(buffer, buffer_len); }
175
176
341k
    BatchedBitReader() {}
177
178
    // The implicit copy constructor is left defined. If a BatchedBitReader is copied, the
179
    // two copies do not share any state. Invoking functions on either copy continues
180
    // reading from the current read position without modifying the state of the other
181
    // copy.
182
183
    /// Resets the reader to start reading from the start of 'buffer'. The buffer's
184
    /// length is 'buffer_len'. Does not take ownership of the buffer.
185
396k
    void Reset(const uint8_t* buffer, int64_t buffer_len) {
186
396k
        // Preconditions: a non-null buffer and a non-negative length.
        DCHECK(buffer != nullptr);
187
396k
        DCHECK_GE(buffer_len, 0);
188
396k
        buffer_pos_ = buffer;
189
396k
        // One past the last byte of the buffer.
        buffer_end_ = buffer + buffer_len;
190
396k
    }
191
192
    /// Gets up to 'num_values' bit-packed values, starting from the current byte in the
193
    /// buffer and advance the read position. 'bit_width' must be <= 64.
194
    /// If 'bit_width' * 'num_values' is not a multiple of 8, the trailing bits are
195
    /// skipped and the next UnpackBatch() call will start reading from the next byte.
196
    ///
197
    /// If the caller does not want to drop trailing bits, 'num_values' must be exactly the
198
    /// total number of values the caller wants to read from a run of bit-packed values, or
199
    /// 'bit_width' * 'num_values' must be a multiple of 8. This condition is always
200
    /// satisfied if 'num_values' is a multiple of 32.
201
    ///
202
    /// The output type 'T' must be an unsigned integer.
203
    ///
204
    /// Returns the number of values read.
205
    template <typename T>
206
    int UnpackBatch(int bit_width, int num_values, T* v);
207
208
    /// Skip 'num_values_to_skip' bit-packed values.
209
    /// 'num_values_to_skip * bit_width' is either divisible by 8, or
210
    /// 'num_values_to_skip' equals the count of the remaining bit-packed values.
211
    bool SkipBatch(int bit_width, int num_values_to_skip);
212
213
    /// Unpack bit-packed values in the same way as UnpackBatch() and decode them using the
214
    /// dictionary 'dict' with 'dict_len' entries. Return -1 if a decoding error is
215
    /// encountered, i.e. if the bit-packed values are not valid indices in 'dict'.
216
    /// Otherwise returns the number of values decoded. The values are written to 'v' with
217
    /// a stride of 'stride' bytes.
218
    template <typename T>
219
    int UnpackAndDecodeBatch(int bit_width, T* dict, int64_t dict_len, int num_values, T* v,
220
                             int64_t stride);
221
222
    /// Reads an unpacked 'num_bytes'-sized value from the buffer and stores it in 'v'. T
223
    /// needs to be a little-endian native type and big enough to store 'num_bytes'.
224
    /// Returns false if there are not enough bytes left.
225
    template <typename T>
226
    bool GetBytes(int num_bytes, T* v);
227
228
    /// Read an unsigned ULEB-128 encoded int from the stream. The encoded int must start
229
    /// at the beginning of a byte. Return false if there were not enough bytes in the
230
    /// buffer or the int is invalid. For more details on ULEB-128:
231
    /// https://en.wikipedia.org/wiki/LEB128
232
    /// UINT_T must be an unsigned integer type.
233
    template <typename UINT_T>
234
    bool GetUleb128(UINT_T* v);
235
236
    /// Returns the number of bytes left in the stream.
237
6.22M
    int bytes_left() { return static_cast<int>(buffer_end_ - buffer_pos_); }
238
239
    /// Maximum byte length of a vlq encoded integer of type T.
240
    template <typename T>
241
7.36M
    static constexpr int max_vlq_byte_len() {
242
7.36M
        // Number of bits in T divided by 7 (rounded up, assuming BitUtil::Ceil is a
        // ceiling division); presumably each encoded byte carries 7 payload bits.
        return BitUtil::Ceil(sizeof(T) * 8, 7);
243
7.36M
    }
244
245
    /// Maximum supported bitwidth for reader.
246
    static const int MAX_BITWIDTH = BitPacking::MAX_BITWIDTH;
247
248
private:
249
    /// Current read position in the buffer.
250
    const uint8_t* buffer_pos_ = nullptr;
251
252
    /// Pointer to the byte after the end of the buffer.
253
    const uint8_t* buffer_end_ = nullptr;
254
};
255
} // namespace doris