Coverage Report

Created: 2026-03-16 19:58

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/segment/binary_dict_page.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <parallel_hashmap/phmap.h>
21
#include <stddef.h>
22
#include <stdint.h>
23
24
#include <memory>
25
#include <vector>
26
27
#include "common/status.h"
28
#include "core/arena.h"
29
#include "core/data_type/data_type.h"
30
#include "core/string_ref.h"
31
#include "storage/olap_common.h"
32
#include "storage/segment/binary_plain_page.h"
33
#include "storage/segment/common.h"
34
#include "storage/segment/options.h"
35
#include "storage/segment/page_builder.h"
36
#include "storage/segment/page_decoder.h"
37
#include "util/faststring.h"
38
#include "util/slice.h"
39
40
namespace doris {
41
struct StringRef;
42
43
namespace segment_v2 {
44
// Opaque declaration of the encoding enum with a fixed underlying type of int;
// the enumerators themselves are defined elsewhere.
enum EncodingTypePB : int;
45
// Forward declaration only: within this header the template is used solely
// as a pointer type (BinaryDictPageDecoder::_bit_shuffle_ptr).
template <FieldType Type>
46
class BitShufflePageDecoder;
47
48
// Size, in bytes, of the header that starts every dictionary-encoded data page.
// Per the layout notes that follow, that header holds the page's EncodingTypePB.
enum { BINARY_DICT_PAGE_HEADER_SIZE = 4 };
49
50
// This type of page uses dictionary encoding for strings.
51
// There is only one dictionary page for all the data pages within a column.
52
//
53
// Layout for dictionary encoded data page:
54
// The data page starts with a 4-byte header (EncodingTypePB) followed by the encoded data.
55
// There are three possible encoding formats:
56
//
57
// 1. header(4 bytes) + bitshuffle encoded codeword page, when mode_ = DICT_ENCODING.
58
//    The codeword page contains integer codes referencing the dictionary, compressed with bitshuffle+lz4.
59
//
60
// 2. header(4 bytes) + BinaryPlainPageV2, when mode_ = PLAIN_ENCODING_V2.
61
//    Used as a fallback when the dictionary is full. Stores raw strings with varint-encoded lengths.
62
//
63
// 3. header(4 bytes) + BinaryPlainPage, when mode_ = PLAIN_ENCODING.
64
//    Used as a fallback when the dictionary is full. Stores raw strings with an offset array.
65
//
66
// Data pages start with mode_ = DICT_ENCODING. When the size of dictionary page
67
// goes beyond options.dict_page_size, subsequent data pages will switch to plain
68
// encoding (either PLAIN_ENCODING_V2 or PLAIN_ENCODING based on config) automatically.
69
//
70
// The dictionary page itself is encoded as either BinaryPlainPage (PLAIN_ENCODING) or
71
// BinaryPlainPageV2 (PLAIN_ENCODING_V2), determined by config::binary_plain_encoding_default_impl.
72
// Page builder for dictionary-encoded string pages; the page layout and the
// fallback-to-plain rules are described in the comment block directly above.
// Only declarations live in this header: every member function except
// HashOfSlice::operator() is defined out of line.
class BinaryDictPageBuilder : public PageBuilderHelper<BinaryDictPageBuilder> {
73
public:
74
    using Self = BinaryDictPageBuilder;
75
    // The constructor below is private; this friendship is what lets
    // PageBuilderHelper<Self> reach it.
    // NOTE(review): the helper itself is not shown here — confirm it is the
    // only intended construction path.
    friend class PageBuilderHelper<Self>;
76
77
    // --- Overrides of the page-builder interface -------------------------
    // Bodies are not visible in this header, so no per-method contract is
    // asserted here beyond what the signatures show.
    Status init() override;
78
79
    bool is_page_full() override;
80
81
    // `count` is passed by pointer, so it is presumably in/out (values
    // offered vs. values actually consumed) — TODO confirm against the .cpp.
    Status add(const uint8_t* vals, size_t* count) override;
82
83
    Status finish(OwnedSlice* slice) override;
84
85
    Status reset() override;
86
87
    size_t count() const override;
88
89
    uint64_t size() const override;
90
91
    // Dictionary-page accessors. Per the class notes, the dictionary page is
    // encoded as either PLAIN_ENCODING or PLAIN_ENCODING_V2.
    Status get_dictionary_page(OwnedSlice* dictionary_page) override;
92
93
    Status get_dictionary_page_encoding(EncodingTypePB* encoding) const override;
94
95
    // `value` is an untyped out-pointer; the expected pointee type is not
    // visible here — presumably a Slice, verify against the implementation.
    Status get_first_value(void* value) const override;
96
97
    Status get_last_value(void* value) const override;
98
99
    uint64_t get_raw_data_size() const override;
100
101
private:
102
    // NOTE(review): single-argument constructor that is not marked
    // `explicit`; it is private, so the exposure is limited to friends.
    BinaryDictPageBuilder(const PageBuilderOptions& options);
103
104
    PageBuilderOptions _options;
105
    // NOTE(review): no in-class initializer — presumably assigned by the
    // constructor and/or reset(); confirm it is never read before that.
    bool _finished;
106
107
    // Presumably the builder for the data page currently being filled
    // (codeword page or plain fallback) — confirm.
    std::unique_ptr<PageBuilder> _data_page_builder;
108
109
    // Starts out null. Presumably the builder for the dictionary page — confirm.
    std::unique_ptr<PageBuilder> _dict_builder = nullptr;
110
111
    // Presumably the mode of the current data page (DICT_ENCODING or one of
    // the plain fallbacks) — confirm. No in-class initializer.
    EncodingTypePB _encoding_type;
112
113
    EncodingTypePB
114
            _dict_word_page_encoding_type; // currently only support PLAIN_ENCODING and PLAIN_ENCODING_V2
115
    EncodingTypePB
116
            _fallback_binary_encoding_type; // currently only support PLAIN_ENCODING and PLAIN_ENCODING_V2
117
118
    // Hash functor used by _dictionary: hashes the slice's bytes by passing
    // (slice.data, slice.size) to crc32_hash.
    struct HashOfSlice {
119
1.15M
        size_t operator()(const Slice& slice) const { return crc32_hash(slice.data, slice.size); }
120
    };
121
    // query for dict item -> dict id
122
    phmap::flat_hash_map<Slice, uint32_t, HashOfSlice> _dictionary;
123
    // TODO(zc): rethink about this arena
124
    // NOTE(review): Slice keys in _dictionary do not own their bytes, so the
    // arena presumably backs them — confirm.
    Arena _arena;
125
    faststring _buffer;
126
    faststring _first_value;
127
    // Starts at zero; presumably what get_raw_data_size() reports — confirm.
    uint64_t _raw_data_size = 0;
128
129
    // NOTE(review): looks like a cached dictionary code for the empty string
    // (flag + code) — confirm against the .cpp.
    bool _has_empty = false;
130
    uint32_t _empty_code = 0;
131
};
132
133
// Decoder counterpart for dictionary-encoded string data pages. It wraps an
// inner PageDecoder (_data_page_decoder) and forwards count()/current_index()
// to it; all other member functions are defined out of line.
class BinaryDictPageDecoder : public PageDecoder {
134
public:
135
    // Takes the page bytes (Slice, by value) and the decoder options.
    // Presumably stored in _data / _options — confirm.
    BinaryDictPageDecoder(Slice data, const PageDecoderOptions& options);
136
137
    Status init() override;
138
139
    Status seek_to_position_in_page(size_t pos) override;
140
141
    // `n` is passed by pointer, so it is presumably in/out (rows requested
    // vs. rows produced) — TODO confirm.
    Status next_batch(size_t* n, MutableColumnPtr& dst) override;
142
143
    Status read_by_rowids(const rowid_t* rowids, ordinal_t page_first_ordinal, size_t* n,
144
                          MutableColumnPtr& dst) override;
145
146
3
    // Forwards to the wrapped data-page decoder. There is no null check, so
    // this assumes _data_page_decoder is already set (presumably by init())
    // — TODO confirm callers respect that ordering.
    size_t count() const override { return _data_page_decoder->count(); }
147
148
199
    // Forwards to the wrapped data-page decoder; same non-null assumption as
    // count().
    size_t current_index() const override { return _data_page_decoder->current_index(); }
149
150
    // Not part of the PageDecoder interface (no `override`).
    bool is_dict_encoding() const;
151
152
    // Supplies the dictionary as a raw StringRef pointer plus an item count.
    // NOTE(review): presumably non-owning, so the pointed-to array must
    // outlive this decoder — confirm.
    void set_dict_decoder(uint32_t num_dict_items, StringRef* dict_word_info);
153
154
    ~BinaryDictPageDecoder() override;
155
156
private:
157
    Slice _data;
158
    PageDecoderOptions _options;
159
    std::unique_ptr<PageDecoder> _data_page_decoder;
160
    // Raw pointer, null by default.
    // NOTE(review): likely a non-owning, typed alias of _data_page_decoder
    // used when the page is dictionary-encoded — confirm.
    BitShufflePageDecoder<FieldType::OLAP_FIELD_TYPE_INT>* _bit_shuffle_ptr = nullptr;
161
    // NOTE(review): no in-class initializer — presumably set by the
    // constructor; confirm it is never read uninitialized.
    bool _parsed;
162
    // No in-class initializer; presumably read from the page header in
    // init() — confirm.
    EncodingTypePB _encoding_type;
163
164
    // Dictionary handle and size; both default to "no dictionary" (null / 0).
    // Presumably filled in by set_dict_decoder() — confirm.
    StringRef* _dict_word_info = nullptr;
165
    uint32_t _num_dict_items = 0;
166
167
    // Scratch buffer of int32 values — presumably decoded dictionary codes;
    // confirm.
    std::vector<int32_t> _buffer;
168
};
169
170
} // namespace segment_v2
171
} // namespace doris