be/src/format/table/equality_delete.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "core/block/block.h" |
19 | | #include "exprs/hybrid_set.h" |
20 | | #include "runtime/runtime_profile.h" |
21 | | |
22 | | namespace doris { |
23 | | #include "common/compile_check_begin.h" |
24 | | |
25 | | /** |
26 | | * Support Iceberg equality delete. |
27 | | * If there's only one delete column in delete file, use `SimpleEqualityDelete`, |
28 | | * which uses optimized `HybridSetBase` to build the hash set. |
29 | | * If there are more delete columns in delete file, use `MultiEqualityDelete`, |
30 | | * which generates a hash column from all delete columns, and only compare the values |
31 | | * when the hash values are the same. |
32 | | */ |
33 | | class EqualityDeleteBase { |
34 | | protected: |
35 | | RuntimeProfile::Counter* num_delete_rows; |
36 | | RuntimeProfile::Counter* build_set_time; |
37 | | RuntimeProfile::Counter* equality_delete_time; |
38 | | |
39 | | const Block* _delete_block; |
40 | | std::vector<int> _delete_col_ids; |
41 | | |
42 | | virtual Status _build_set() = 0; |
43 | | |
44 | | public: |
45 | | EqualityDeleteBase(const Block* delete_block, const std::vector<int> delete_col_ids) |
46 | 0 | : _delete_block(delete_block), _delete_col_ids(delete_col_ids) {} |
47 | 0 | virtual ~EqualityDeleteBase() = default; |
48 | | |
49 | 0 | Status init(RuntimeProfile* profile) { |
50 | 0 | static const char* delete_profile = "EqualityDelete"; |
51 | 0 | ADD_TIMER_WITH_LEVEL(profile, delete_profile, 1); |
52 | 0 | num_delete_rows = ADD_CHILD_COUNTER_WITH_LEVEL(profile, "NumRowsInDeleteFile", TUnit::UNIT, |
53 | 0 | delete_profile, 1); |
54 | 0 | build_set_time = ADD_CHILD_TIMER_WITH_LEVEL(profile, "BuildHashSetTime", delete_profile, 1); |
55 | 0 | equality_delete_time = |
56 | 0 | ADD_CHILD_TIMER_WITH_LEVEL(profile, "EqualityDeleteFilterTime", delete_profile, 1); |
57 | 0 | SCOPED_TIMER(build_set_time); |
58 | 0 | return _build_set(); |
59 | 0 | } |
60 | | |
61 | | virtual Status filter_data_block( |
62 | | Block* data_block, |
63 | | const std::unordered_map<std::string, uint32_t>* col_name_to_block_idx, |
64 | | const std::unordered_map<int, std::string>& id_to_block_column_name, |
65 | | IColumn::Filter& filter) = 0; |
66 | | |
67 | | static std::unique_ptr<EqualityDeleteBase> get_delete_impl( |
68 | | const Block* delete_block, const std::vector<int>& delete_col_ids); |
69 | | }; |
70 | | |
71 | | class SimpleEqualityDelete : public EqualityDeleteBase { |
72 | | protected: |
73 | | std::shared_ptr<HybridSetBase> _hybrid_set; |
74 | | std::unique_ptr<IColumn::Filter> _single_filter; |
75 | | |
76 | | Status _build_set() override; |
77 | | |
78 | | public: |
79 | | SimpleEqualityDelete(const Block* delete_block, const std::vector<int>& delete_col_ids) |
80 | 0 | : EqualityDeleteBase(delete_block, delete_col_ids) {} |
81 | | |
82 | | Status filter_data_block(Block* data_block, |
83 | | const std::unordered_map<std::string, uint32_t>* col_name_to_block_idx, |
84 | | const std::unordered_map<int, std::string>& id_to_block_column_name, |
85 | | IColumn::Filter& filter) override; |
86 | | }; |
87 | | |
88 | | /** |
89 | | * `MultiEqualityDelete` will generate the hash column for delete block and data block. |
90 | | */ |
91 | | class MultiEqualityDelete : public EqualityDeleteBase { |
92 | | protected: |
93 | | // hash column for delete block |
94 | | std::vector<uint64_t> _delete_hashes; |
95 | | // hash column for data block |
96 | | std::vector<uint64_t> _data_hashes; |
97 | | // hash code => row index |
98 | | // if hash values are equal, then compare the real values |
99 | | // the row index records the row number of the delete row in delete block |
100 | | std::multimap<uint64_t, size_t> _delete_hash_map; |
101 | | // the delete column indexes in data block |
102 | | std::vector<size_t> _data_column_index; |
103 | | |
104 | | Status _build_set() override; |
105 | | |
106 | | bool _equal(Block* data_block, size_t data_row_index, size_t delete_row_index); |
107 | | |
108 | | public: |
109 | | MultiEqualityDelete(const Block* delete_block, const std::vector<int>& delete_col_ids) |
110 | 0 | : EqualityDeleteBase(delete_block, delete_col_ids) {} |
111 | | |
112 | | Status filter_data_block(Block* data_block, |
113 | | const std::unordered_map<std::string, uint32_t>* col_name_to_block_idx, |
114 | | const std::unordered_map<int, std::string>& id_to_block_column_name, |
115 | | IColumn::Filter& filter) override; |
116 | | }; |
117 | | |
118 | | #include "common/compile_check_end.h" |
119 | | } // namespace doris |