be/src/format/table/equality_delete.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "core/block/block.h" |
19 | | #include "exprs/hybrid_set.h" |
20 | | #include "runtime/runtime_profile.h" |
21 | | |
22 | | namespace doris { |
23 | | |
24 | | /** |
25 | | * Support Iceberg equality delete. |
26 | | * If there's only one delete column in delete file, use `SimpleEqualityDelete`, |
27 | | * which uses optimized `HybridSetBase` to build the hash set. |
28 | | * If there are more delete columns in delete file, use `MultiEqualityDelete`, |
29 | | * which generates a hash column from all delete columns, and only compare the values |
30 | | * when the hash values are the same. |
31 | | */ |
32 | | class EqualityDeleteBase { |
33 | | protected: |
34 | | RuntimeProfile::Counter* num_delete_rows; |
35 | | RuntimeProfile::Counter* build_set_time; |
36 | | RuntimeProfile::Counter* equality_delete_time; |
37 | | |
38 | | const Block* _delete_block; |
39 | | std::vector<int> _delete_col_ids; |
40 | | |
41 | | virtual Status _build_set() = 0; |
42 | | |
43 | | public: |
44 | | EqualityDeleteBase(const Block* delete_block, const std::vector<int> delete_col_ids) |
45 | 1.70k | : _delete_block(delete_block), _delete_col_ids(delete_col_ids) {} |
46 | 1.70k | virtual ~EqualityDeleteBase() = default; |
47 | | |
48 | 1.70k | Status init(RuntimeProfile* profile) { |
49 | 1.70k | static const char* delete_profile = "EqualityDelete"; |
50 | 1.70k | ADD_TIMER_WITH_LEVEL(profile, delete_profile, 1); |
51 | 1.70k | num_delete_rows = ADD_CHILD_COUNTER_WITH_LEVEL(profile, "NumRowsInDeleteFile", TUnit::UNIT, |
52 | 1.70k | delete_profile, 1); |
53 | 1.70k | build_set_time = ADD_CHILD_TIMER_WITH_LEVEL(profile, "BuildHashSetTime", delete_profile, 1); |
54 | 1.70k | equality_delete_time = |
55 | 1.70k | ADD_CHILD_TIMER_WITH_LEVEL(profile, "EqualityDeleteFilterTime", delete_profile, 1); |
56 | 1.70k | SCOPED_TIMER(build_set_time); |
57 | 1.70k | return _build_set(); |
58 | 1.70k | } |
59 | | |
60 | | virtual Status filter_data_block( |
61 | | Block* data_block, |
62 | | const std::unordered_map<std::string, uint32_t>* col_name_to_block_idx, |
63 | | const std::unordered_map<int, std::string>& id_to_block_column_name, |
64 | | IColumn::Filter& filter) = 0; |
65 | | |
66 | | static std::unique_ptr<EqualityDeleteBase> get_delete_impl( |
67 | | const Block* delete_block, const std::vector<int>& delete_col_ids); |
68 | | }; |
69 | | |
70 | | class SimpleEqualityDelete : public EqualityDeleteBase { |
71 | | protected: |
72 | | std::shared_ptr<HybridSetBase> _hybrid_set; |
73 | | std::unique_ptr<IColumn::Filter> _single_filter; |
74 | | |
75 | | Status _build_set() override; |
76 | | |
77 | | public: |
78 | | SimpleEqualityDelete(const Block* delete_block, const std::vector<int>& delete_col_ids) |
79 | 776 | : EqualityDeleteBase(delete_block, delete_col_ids) {} |
80 | | |
81 | | Status filter_data_block(Block* data_block, |
82 | | const std::unordered_map<std::string, uint32_t>* col_name_to_block_idx, |
83 | | const std::unordered_map<int, std::string>& id_to_block_column_name, |
84 | | IColumn::Filter& filter) override; |
85 | | }; |
86 | | |
87 | | /** |
88 | | * `MultiEqualityDelete` will generate the hash column for delete block and data block. |
89 | | */ |
90 | | class MultiEqualityDelete : public EqualityDeleteBase { |
91 | | protected: |
92 | | // hash column for delete block |
93 | | std::vector<uint64_t> _delete_hashes; |
94 | | // hash column for data block |
95 | | std::vector<uint64_t> _data_hashes; |
96 | | // hash code => row index |
97 | | // if hash values are equal, then compare the real values |
98 | | // the row index records the row number of the delete row in delete block |
99 | | std::multimap<uint64_t, size_t> _delete_hash_map; |
100 | | // the delete column indexes in data block |
101 | | std::vector<size_t> _data_column_index; |
102 | | |
103 | | Status _build_set() override; |
104 | | |
105 | | bool _equal(Block* data_block, size_t data_row_index, size_t delete_row_index); |
106 | | |
107 | | public: |
108 | | MultiEqualityDelete(const Block* delete_block, const std::vector<int>& delete_col_ids) |
109 | 928 | : EqualityDeleteBase(delete_block, delete_col_ids) {} |
110 | | |
111 | | Status filter_data_block(Block* data_block, |
112 | | const std::unordered_map<std::string, uint32_t>* col_name_to_block_idx, |
113 | | const std::unordered_map<int, std::string>& id_to_block_column_name, |
114 | | IColumn::Filter& filter) override; |
115 | | }; |
116 | | |
117 | | } // namespace doris |