Coverage Report

Created: 2026-04-27 15:08

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/storage/segment/binary_dict_page.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <parallel_hashmap/phmap.h>
21
#include <stddef.h>
22
#include <stdint.h>
23
24
#include <memory>
25
#include <vector>
26
27
#include "common/status.h"
28
#include "core/arena.h"
29
#include "core/data_type/data_type.h"
30
#include "core/string_ref.h"
31
#include "storage/olap_common.h"
32
#include "storage/segment/binary_plain_page.h"
33
#include "storage/segment/common.h"
34
#include "storage/segment/options.h"
35
#include "storage/segment/page_builder.h"
36
#include "storage/segment/page_decoder.h"
37
#include "util/faststring.h"
38
#include "util/slice.h"
39
40
namespace doris {
41
// Forward declaration of StringRef; within this header it is only used as a pointer type
// (see BinaryDictPageDecoder). "core/string_ref.h" is also included above.
struct StringRef;
42
43
namespace segment_v2 {
44
// Opaque declaration of the encoding enum with a fixed underlying type of int, so it can be
// used by value in this header without the enumerator list being visible here.
enum EncodingTypePB : int;
45
// Forward declaration only: in this header the template is referenced solely as a pointer
// type (BinaryDictPageDecoder::_bit_shuffle_ptr), so its definition is not required here.
template <FieldType Type>
46
class BitShufflePageDecoder;
47
48
// Size, in bytes, of the header that precedes the encoded payload of a dictionary-encoded
// data page. The layout comment in this file describes that header as a 4-byte EncodingTypePB.
enum { BINARY_DICT_PAGE_HEADER_SIZE = 4 };
49
50
// This type of page uses dictionary encoding for strings.
51
// There is only one dictionary page for all the data pages within a column.
52
//
53
// Layout for dictionary encoded data page:
54
// The data page starts with a 4-byte header (EncodingTypePB) followed by the encoded data.
55
// There are three possible encoding formats:
56
//
57
// 1. header(4 bytes) + bitshuffle encoded codeword page, when mode_ = DICT_ENCODING.
58
//    The codeword page contains integer codes referencing the dictionary, compressed with bitshuffle+lz4.
59
//
60
// 2. header(4 bytes) + BinaryPlainPageV2, when mode_ = PLAIN_ENCODING_V2.
61
//    Used as a fallback when the dictionary is full. Stores raw strings with varint-encoded lengths.
62
//
63
// 3. header(4 bytes) + BinaryPlainPage, when mode_ = PLAIN_ENCODING.
64
//    Used as a fallback when the dictionary is full. Stores raw strings with an offset array.
65
//
66
// Data pages start with mode_ = DICT_ENCODING. When the size of the dictionary page
67
// goes beyond options.dict_page_size, subsequent data pages will switch to plain
68
// encoding (either PLAIN_ENCODING_V2 or PLAIN_ENCODING based on config) automatically.
69
//
70
// The dictionary page itself is encoded as either BinaryPlainPage (PLAIN_ENCODING) or
71
// BinaryPlainPageV2 (PLAIN_ENCODING_V2), determined by config::binary_plain_encoding_default_impl.
72
class BinaryDictPageBuilder : public PageBuilderHelper<BinaryDictPageBuilder> {
73
public:
74
    using Self = BinaryDictPageBuilder;
75
    // The helper base is befriended because the constructor below is private.
    // NOTE(review): presumably PageBuilderHelper provides the factory that constructs
    // instances -- confirm in page_builder.h.
    friend class PageBuilderHelper<Self>;
76
77
    // Prepares the builder for use; returns a non-OK Status on failure.
    // NOTE(review): presumably creates the dictionary and data-page sub-builders -- confirm in the .cpp.
    Status init() override;
78
79
    // Reports whether the data page currently being built should be closed.
    bool is_page_full() override;
80
81
    // Appends values to the current data page.
    // NOTE(review): `vals` is presumably an array of Slice and `*count` an in/out value
    // (requested on entry, actually consumed on return) -- confirm against the PageBuilder contract.
    Status add(const uint8_t* vals, size_t* count) override;
82
83
    // Finalises the current data page and hands the encoded bytes to `slice`.
    Status finish(OwnedSlice* slice) override;
84
85
    // Clears per-page state so that another data page can be built.
    // NOTE(review): the dictionary is presumably kept across resets, since one dictionary
    // page serves all data pages of a column -- confirm in the .cpp.
    Status reset() override;
86
87
    // Number of values in the data page under construction.
    size_t count() const override;
88
89
    // Size in bytes of the data page under construction.
    // NOTE(review): exact accounting (header included or not) is defined in the .cpp.
    uint64_t size() const override;
90
91
    // Outputs the dictionary page that is shared by the data pages of the column.
    Status get_dictionary_page(OwnedSlice* dictionary_page) override;
92
93
    // Outputs the encoding used for the dictionary page itself; per the header comment of
    // this file that is either PLAIN_ENCODING or PLAIN_ENCODING_V2.
    Status get_dictionary_page_encoding(EncodingTypePB* encoding) const override;
94
95
    // Returns the raw (pre-encoding) data size tracked in _raw_data_size.
    // NOTE(review): assumed from the member name -- confirm in the .cpp.
    uint64_t get_raw_data_size() const override;
96
97
private:
98
    // Private: instances are not constructible directly by client code.
    BinaryDictPageBuilder(const PageBuilderOptions& options);
99
100
    // Copy of the options supplied at construction.
    PageBuilderOptions _options;
101
    // No in-class initialiser; must be set by the constructor.
    // NOTE(review): presumably true once finish() has run for the current page.
    bool _finished;
102
103
    // Builder that produces the payload of the current data page.
    // NOTE(review): presumably a bitshuffle builder in dictionary mode and a plain builder
    // after fallback -- confirm in the .cpp.
    std::unique_ptr<PageBuilder> _data_page_builder;
104
105
    // Builder for the dictionary page.
    std::unique_ptr<PageBuilder> _dict_builder = nullptr;
106
107
    // Encoding of the data page being built.
    // NOTE(review): presumably the "mode_" referred to in the header comment of this file.
    EncodingTypePB _encoding_type;
108
109
    EncodingTypePB
110
            _dict_word_page_encoding_type; // currently only supports PLAIN_ENCODING and PLAIN_ENCODING_V2
111
    EncodingTypePB
112
            _fallback_binary_encoding_type; // currently only supports PLAIN_ENCODING and PLAIN_ENCODING_V2
113
114
    // Hash functor for Slice keys: hashes the slice's bytes (data, size) with crc32_hash.
    struct HashOfSlice {
115
1.15M
        size_t operator()(const Slice& slice) const { return crc32_hash(slice.data, slice.size); }
116
    };
117
    // query for dict item -> dict id
118
    phmap::flat_hash_map<Slice, uint32_t, HashOfSlice> _dictionary;
119
    // TODO(zc): rethink about this arena
    // NOTE(review): presumably owns the bytes referenced by the Slice keys of _dictionary.
120
    Arena _arena;
121
    // Scratch byte buffer. NOTE(review): usage is defined in the .cpp.
    faststring _buffer;
122
    // Running total of raw data size; starts at zero.
    uint64_t _raw_data_size = 0;
123
124
    // NOTE(review): presumably records whether the empty string already has a dictionary
    // code, and which code that is -- confirm in the .cpp.
    bool _has_empty = false;
125
    uint32_t _empty_code = 0;
126
};
127
128
// Decoder for data pages written in the dictionary page format described at the top of this
// file. Page-level queries (count, current_index) are forwarded to an inner PageDecoder held
// in _data_page_decoder.
class BinaryDictPageDecoder : public PageDecoder {
129
public:
130
    // `data` is the encoded page; `options` are the decoder options.
    // NOTE(review): Slice is presumably a non-owning view, so the underlying bytes must
    // outlive this decoder -- confirm.
    BinaryDictPageDecoder(Slice data, const PageDecoderOptions& options);
131
132
    // Prepares the decoder for reading; returns a non-OK Status on failure.
    // NOTE(review): presumably parses the 4-byte encoding header and creates
    // _data_page_decoder -- confirm in the .cpp.
    Status init() override;
133
134
    // Positions the decoder at value index `pos` within this page.
    Status seek_to_position_in_page(size_t pos) override;
135
136
    // Reads values into `dst`.
    // NOTE(review): `*n` is presumably in/out (requested on entry, actually read on
    // return) -- confirm against the PageDecoder contract.
    Status next_batch(size_t* n, MutableColumnPtr& dst) override;
137
138
    // Reads the values identified by `rowids` into `dst`.
    // NOTE(review): `page_first_ordinal` is presumably subtracted from each row id to obtain
    // an in-page index, and `*n` is presumably in/out -- confirm in the .cpp.
    Status read_by_rowids(const rowid_t* rowids, ordinal_t page_first_ordinal, size_t* n,
139
                          MutableColumnPtr& dst) override;
140
141
3
    // Forwards to the inner decoder; dereferences _data_page_decoder, which must be non-null.
    size_t count() const override { return _data_page_decoder->count(); }
142
143
382
    // Forwards to the inner decoder; dereferences _data_page_decoder, which must be non-null.
    size_t current_index() const override { return _data_page_decoder->current_index(); }
144
145
    // NOTE(review): presumably true when this page's header says DICT_ENCODING rather than
    // one of the plain fallbacks -- confirm in the .cpp.
    bool is_dict_encoding() const;
146
147
    // Supplies the dictionary used to resolve codes.
    // NOTE(review): `dict_word_info` presumably points to `num_dict_items` entries that are
    // not owned by this decoder -- confirm lifetime with the caller.
    void set_dict_decoder(uint32_t num_dict_items, StringRef* dict_word_info);
148
149
    ~BinaryDictPageDecoder() override;
150
151
private:
152
    // The encoded page passed to the constructor.
    Slice _data;
153
    PageDecoderOptions _options;
154
    // Inner decoder that count() and current_index() forward to.
    std::unique_ptr<PageDecoder> _data_page_decoder;
155
    // NOTE(review): presumably a non-owning, typed alias of _data_page_decoder that is set
    // only for dictionary-encoded pages -- confirm in the .cpp.
    BitShufflePageDecoder<FieldType::OLAP_FIELD_TYPE_INT>* _bit_shuffle_ptr = nullptr;
156
    // No in-class initialiser; must be set by the constructor.
    bool _parsed;
157
    // No in-class initialiser. NOTE(review): presumably read from the page header in init().
    EncodingTypePB _encoding_type;
158
159
    // Dictionary entries and their count, as supplied through set_dict_decoder().
    StringRef* _dict_word_info = nullptr;
160
    uint32_t _num_dict_items = 0;
161
162
    // Scratch buffer of 32-bit values.
    // NOTE(review): presumably holds decoded dictionary codes -- confirm in the .cpp.
    std::vector<int32_t> _buffer;
163
};
164
165
} // namespace segment_v2
166
} // namespace doris