be/src/storage/index/indexed_column_writer.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <butil/macros.h> |
21 | | #include <gen_cpp/segment_v2.pb.h> |
22 | | #include <stdint.h> |
23 | | |
24 | | #include <cstddef> |
25 | | #include <cstdint> |
26 | | #include <memory> |
27 | | |
28 | | #include "common/status.h" |
29 | | #include "core/arena.h" |
30 | | #include "storage/segment/common.h" |
31 | | #include "storage/segment/page_pointer.h" |
32 | | #include "util/faststring.h" |
33 | | |
34 | | namespace doris { |
35 | | |
36 | | class BlockCompressionCodec; |
37 | | class KeyCoder; |
38 | | class TypeInfo; |
39 | | |
40 | | namespace io { |
41 | | class FileWriter; |
42 | | } |
43 | | |
44 | | namespace segment_v2 { |
45 | | |
46 | | class IndexPageBuilder; |
47 | | class PageBuilder; |
48 | | |
49 | | struct IndexedColumnWriterOptions { /* Settings passed by const reference to the IndexedColumnWriter constructor; every field has a default. */ |
50 | | size_t index_page_size = 64 * 1024; /* Default 64 * 1024. NOTE(review): presumably a byte threshold for index pages -- confirm in the .cpp. */ |
51 | | size_t data_page_size = 1024 * 1024; /* Default 1024 * 1024. NOTE(review): presumably a byte threshold for data pages -- confirm in the .cpp. */ |
52 | | bool write_ordinal_index = false; /* Off by default; the writer keeps its ordinal index builder null when this is false. */ |
53 | | bool write_value_index = false; /* Off by default; the writer keeps its value index builder null when this is false. */ |
54 | | EncodingTypePB encoding = DEFAULT_ENCODING; /* Encoding enum from segment_v2.pb.h. NOTE(review): presumably applied to data pages -- confirm. */ |
55 | | CompressionTypePB compression = NO_COMPRESSION; /* Compression enum from segment_v2.pb.h; defaults to no compression. */ |
56 | | double compression_min_space_saving = 0.1; /* Default 0.1. NOTE(review): presumably the minimum saved fraction for a compressed page to be kept -- confirm. */ |
57 | | }; |
58 | | |
59 | | // IndexedColumn is a column with an optional "ordinal index" and an optional "value index". |
60 | | // - "ordinal index" enables us to seek to a particular rowid within the column |
61 | | // - "value index" enables us to seek to a particular value but requires IndexedColumn to store ordered values |
62 | | // |
63 | | // IndexedColumn can be used as the building blocks for implementing other data structures. For example, |
64 | | // - a bitmap index can be represented by two indexed columns, one for the term dictionary, one for the posting lists. |
65 | | // the "dictionary" IndexedColumn contains ordered terms and a value index. |
66 | | // the "posting" IndexedColumn contains bitmap for each term and an ordinal index. |
67 | | // - a bloom filter index can be represented by one indexed column containing bloom filters with an ordinal index |
68 | | // |
69 | | // IndexedColumn currently has the following restrictions, which could be lifted in the future: |
70 | | // 1. value can't be null |
71 | | // 2. duplicated values are not supported/tested when storing ordered values |
72 | | // TODO test with empty input |
73 | | class IndexedColumnWriter { /* Writer for one IndexedColumn: construct, init(), add() values one at a time, then finish(). */ |
74 | | public: |
75 | | explicit IndexedColumnWriter(const IndexedColumnWriterOptions& options, /* NOTE(review): presumably copied into _options -- confirm in the .cpp. */ |
76 | | const TypeInfo* type_info, io::FileWriter* file_writer); /* NOTE(review): raw pointers, presumably non-owning and required to outlive the writer -- confirm. */ |
77 | | |
78 | | ~IndexedColumnWriter(); /* Defined out of line; the page builder types are only forward-declared in this header. */ |
79 | | |
80 | | Status init(); /* Per the member comments below, the page builders, key coder and compression codec are set up here. */ |
81 | | |
82 | | // Adds a single value to the column; `value` must not be null (IndexedColumn does not support null values). |
83 | | Status add(const void* value); |
84 | | |
85 | | Status finish(IndexedColumnMetaPB* meta); /* NOTE(review): presumably flushes pending pages and indexes and fills *meta -- confirm in the .cpp. */ |
86 | | |
87 | 75 | uint64_t disk_size() const { return _disk_size; } /* Returns the current value of _disk_size. */ |
88 | | |
89 | 2 | uint32_t data_page_num() const { return _num_data_pages + 1; } /* Returns _num_data_pages + 1. NOTE(review): the +1 presumably counts the page still being built -- confirm. */ |
90 | | |
91 | | private: |
92 | | Status _finish_current_data_page(size_t& num_val); /* NOTE(review): num_val is a non-const reference, presumably receiving the value count of the finished page -- confirm. */ |
93 | | |
94 | | Status _flush_index(IndexPageBuilder* index_builder, BTreeMetaPB* meta); /* NOTE(review): presumably writes the pages of index_builder and records the result in *meta -- confirm. */ |
95 | | |
96 | | IndexedColumnWriterOptions _options; /* Held by value, not by reference. */ |
97 | | const TypeInfo* _type_info = nullptr; |
98 | | io::FileWriter* _file_writer = nullptr; |
99 | | // Arena that is only used for `_first_value`. |
100 | | Arena _arena; |
101 | | |
102 | | ordinal_t _num_values; /* NOTE(review): no in-class initializer; presumably set in the constructor -- confirm. */ |
103 | | uint32_t _num_data_pages; /* NOTE(review): no in-class initializer; presumably set in the constructor -- confirm. */ |
104 | | uint64_t _disk_size; /* NOTE(review): no in-class initializer; presumably set in the constructor -- confirm. */ |
105 | | // Remembers the first value in the current page. |
106 | | faststring _first_value; |
107 | | PagePointer _last_data_page; /* NOTE(review): by its name, the pointer to the most recently written data page -- confirm. */ |
108 | | |
109 | | // The following members are initialized in init(), not in the constructor. |
110 | | // ----- |
111 | | // Builder for data pages. |
112 | | std::unique_ptr<PageBuilder> _data_page_builder; |
113 | | // Builder for index pages of the ordinal index; null if write_ordinal_index == false. |
114 | | std::unique_ptr<IndexPageBuilder> _ordinal_index_builder; |
115 | | // Builder for index pages of the value index; null if write_value_index == false. |
116 | | std::unique_ptr<IndexPageBuilder> _value_index_builder; |
117 | | // Encoder for the keys of the value index. |
118 | | const KeyCoder* _value_key_coder = nullptr; |
119 | | BlockCompressionCodec* _compress_codec = nullptr; /* Raw pointer. NOTE(review): presumably non-owning and left null when compression is NO_COMPRESSION -- confirm. */ |
120 | | |
121 | | DISALLOW_COPY_AND_ASSIGN(IndexedColumnWriter); /* Macro from butil/macros.h; by its name it disables copy construction and copy assignment. */ |
122 | | }; |
123 | | |
124 | | } // namespace segment_v2 |
125 | | } // namespace doris |