be/src/format/table/hive_reader.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "format/table/hive_reader.h" |
19 | | |
20 | | #include <vector> |
21 | | |
22 | | #include "common/status.h" |
23 | | #include "format/table/hive/hive_orc_nested_column_utils.h" |
24 | | #include "format/table/hive/hive_parquet_nested_column_utils.h" |
25 | | #include "format/table/nested_column_access_helper.h" |
26 | | #include "runtime/runtime_state.h" |
27 | | |
28 | | namespace doris { |
29 | | #include "common/compile_check_begin.h" |
30 | | |
31 | 127k | Status HiveReader::get_next_block_inner(Block* block, size_t* read_rows, bool* eof) { |
32 | 127k | RETURN_IF_ERROR(_file_format_reader->get_next_block(block, read_rows, eof)); |
33 | 127k | return Status::OK(); |
34 | 127k | }; |
35 | | |
36 | | Status HiveOrcReader::init_reader( |
37 | | const std::vector<std::string>& read_table_col_names, |
38 | | std::unordered_map<std::string, uint32_t>* col_name_to_block_idx, |
39 | | const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor, |
40 | | const RowDescriptor* row_descriptor, |
41 | | const VExprContextSPtrs* not_single_slot_filter_conjuncts, |
42 | 23.6k | const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) { |
43 | 23.6k | auto* orc_reader = static_cast<OrcReader*>(_file_format_reader.get()); |
44 | | |
45 | 23.6k | const orc::Type* orc_type_ptr = nullptr; |
46 | 23.6k | RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr)); |
47 | 23.6k | bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr); |
48 | | |
49 | 23.6k | if (_state->query_options().hive_orc_use_column_names && !is_hive_col_name) { |
50 | | // Directly use the table column name to match the file column name, but pay attention to the case issue. |
51 | 23.3k | RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(tuple_descriptor, orc_type_ptr, |
52 | 23.3k | table_info_node_ptr, _is_file_slot)); |
53 | 23.3k | } else { |
54 | | // hive1 / use index |
55 | 328 | std::map<std::string, const SlotDescriptor*> slot_map; // table_name to slot |
56 | 1.09k | for (const auto& slot : tuple_descriptor->slots()) { |
57 | 1.09k | slot_map.emplace(slot->col_name_lower_case(), slot); |
58 | 1.09k | } |
59 | | |
60 | | // For top-level columns, use indexes to match, and for sub-columns, still use name to match columns. |
61 | 1.23k | for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) { |
62 | 908 | auto table_column_name = read_table_col_names[idx]; |
63 | 908 | auto file_index = _params.column_idxs[idx]; |
64 | | |
65 | 908 | if (file_index >= orc_type_ptr->getSubtypeCount()) { |
66 | 112 | table_info_node_ptr->add_not_exist_children(table_column_name); |
67 | 796 | } else { |
68 | 796 | auto field_node = std::make_shared<Node>(); |
69 | | // For sub-columns, still use name to match columns. |
70 | 796 | RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name( |
71 | 796 | slot_map[table_column_name]->type(), orc_type_ptr->getSubtype(file_index), |
72 | 796 | field_node)); |
73 | 796 | table_info_node_ptr->add_children( |
74 | 796 | table_column_name, orc_type_ptr->getFieldName(file_index), field_node); |
75 | 796 | } |
76 | 908 | slot_map.erase(table_column_name); |
77 | 908 | } |
78 | 328 | for (const auto& [partition_col_name, _] : slot_map) { |
79 | 176 | table_info_node_ptr->add_not_exist_children(partition_col_name); |
80 | 176 | } |
81 | 328 | } |
82 | | |
83 | 23.6k | auto column_id_result = ColumnIdResult(); |
84 | 23.6k | if (_state->query_options().hive_orc_use_column_names && !is_hive_col_name) { |
85 | 23.2k | column_id_result = _create_column_ids(orc_type_ptr, tuple_descriptor); |
86 | 23.2k | } else { |
87 | 416 | column_id_result = |
88 | 416 | _create_column_ids_by_top_level_col_index(orc_type_ptr, tuple_descriptor); |
89 | 416 | } |
90 | | |
91 | 23.6k | const auto& column_ids = column_id_result.column_ids; |
92 | 23.6k | const auto& filter_column_ids = column_id_result.filter_column_ids; |
93 | | |
94 | 23.6k | return orc_reader->init_reader(&read_table_col_names, col_name_to_block_idx, conjuncts, false, |
95 | 23.6k | tuple_descriptor, row_descriptor, |
96 | 23.6k | not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts, |
97 | 23.6k | table_info_node_ptr, column_ids, filter_column_ids); |
98 | 23.6k | } |
99 | | |
100 | | ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type, |
101 | 23.1k | const TupleDescriptor* tuple_descriptor) { |
102 | | // map top-level table column name (lower-cased) -> orc::Type* |
103 | 23.1k | std::unordered_map<std::string, const orc::Type*> table_col_name_to_orc_type_map; |
104 | 2.48M | for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) { |
105 | 2.46M | auto orc_sub_type = orc_type->getSubtype(i); |
106 | 2.46M | if (!orc_sub_type) continue; |
107 | | |
108 | 2.46M | std::string table_col_name = to_lower(orc_type->getFieldName(i)); |
109 | 2.46M | table_col_name_to_orc_type_map[table_col_name] = orc_sub_type; |
110 | 2.46M | } |
111 | | |
112 | 23.1k | std::set<uint64_t> column_ids; |
113 | 23.1k | std::set<uint64_t> filter_column_ids; |
114 | | |
115 | | // helper to process access paths for a given top-level orc field |
116 | 23.1k | auto process_access_paths = [](const orc::Type* orc_field, |
117 | 23.1k | const std::vector<TColumnAccessPath>& access_paths, |
118 | 23.1k | std::set<uint64_t>& out_ids) { |
119 | 13.5k | process_nested_access_paths( |
120 | 13.5k | orc_field, access_paths, out_ids, |
121 | 13.6k | [](const orc::Type* type) { return type->getColumnId(); }, |
122 | 13.5k | [](const orc::Type* type) { return type->getMaximumColumnId(); }, |
123 | 13.5k | HiveOrcNestedColumnUtils::extract_nested_column_ids); |
124 | 13.5k | }; |
125 | | |
126 | 98.2k | for (const auto* slot : tuple_descriptor->slots()) { |
127 | 98.2k | auto it = table_col_name_to_orc_type_map.find(slot->col_name_lower_case()); |
128 | 98.2k | if (it == table_col_name_to_orc_type_map.end()) { |
129 | | // Column not found in file |
130 | 17.1k | continue; |
131 | 17.1k | } |
132 | 81.0k | const orc::Type* orc_field = it->second; |
133 | | |
134 | | // primitive (non-nested) types |
135 | 81.0k | if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && |
136 | 81.0k | slot->col_type() != TYPE_MAP)) { |
137 | 67.4k | column_ids.insert(orc_field->getColumnId()); |
138 | 67.4k | if (slot->is_predicate()) { |
139 | 16.6k | filter_column_ids.insert(orc_field->getColumnId()); |
140 | 16.6k | } |
141 | 67.4k | continue; |
142 | 67.4k | } |
143 | | |
144 | | // complex types |
145 | 13.5k | const auto& all_access_paths = slot->all_access_paths(); |
146 | 13.5k | process_access_paths(orc_field, all_access_paths, column_ids); |
147 | | |
148 | 13.5k | const auto& predicate_access_paths = slot->predicate_access_paths(); |
149 | 13.5k | if (!predicate_access_paths.empty()) { |
150 | 214 | process_access_paths(orc_field, predicate_access_paths, filter_column_ids); |
151 | 214 | } |
152 | 13.5k | } |
153 | | |
154 | 23.1k | return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); |
155 | 23.1k | } |
156 | | |
157 | | ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index( |
158 | 342 | const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) { |
159 | | // map top-level table column position -> orc::Type* |
160 | 342 | std::unordered_map<uint64_t, const orc::Type*> table_col_pos_to_orc_type_map; |
161 | 1.59k | for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) { |
162 | 1.25k | auto orc_sub_type = orc_type->getSubtype(i); |
163 | 1.25k | if (!orc_sub_type) continue; |
164 | | |
165 | 1.25k | table_col_pos_to_orc_type_map[i] = orc_sub_type; |
166 | 1.25k | } |
167 | | |
168 | 342 | std::set<uint64_t> column_ids; |
169 | 342 | std::set<uint64_t> filter_column_ids; |
170 | | |
171 | | // helper to process access paths for a given top-level orc field |
172 | 342 | auto process_access_paths = [](const orc::Type* orc_field, |
173 | 342 | const std::vector<TColumnAccessPath>& access_paths, |
174 | 342 | std::set<uint64_t>& out_ids) { |
175 | 13 | process_nested_access_paths( |
176 | 13 | orc_field, access_paths, out_ids, |
177 | 13 | [](const orc::Type* type) { return type->getColumnId(); }, |
178 | 13 | [](const orc::Type* type) { return type->getMaximumColumnId(); }, |
179 | 13 | HiveOrcNestedColumnUtils::extract_nested_column_ids); |
180 | 13 | }; |
181 | | |
182 | 1.10k | for (const auto* slot : tuple_descriptor->slots()) { |
183 | 1.10k | auto it = table_col_pos_to_orc_type_map.find(slot->col_pos()); |
184 | 1.10k | if (it == table_col_pos_to_orc_type_map.end()) { |
185 | | // Column not found in file |
186 | 1.09k | continue; |
187 | 1.09k | } |
188 | 11 | const orc::Type* orc_field = it->second; |
189 | | |
190 | | // primitive (non-nested) types |
191 | 11 | if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && |
192 | 11 | slot->col_type() != TYPE_MAP)) { |
193 | 6 | column_ids.insert(orc_field->getColumnId()); |
194 | 6 | if (slot->is_predicate()) { |
195 | 0 | filter_column_ids.insert(orc_field->getColumnId()); |
196 | 0 | } |
197 | 6 | continue; |
198 | 6 | } |
199 | | |
200 | 5 | const auto& all_access_paths = slot->all_access_paths(); |
201 | | // complex types |
202 | 5 | process_access_paths(orc_field, all_access_paths, column_ids); |
203 | | |
204 | 5 | const auto& predicate_access_paths = slot->predicate_access_paths(); |
205 | 6 | if (!predicate_access_paths.empty()) { |
206 | 6 | process_access_paths(orc_field, predicate_access_paths, filter_column_ids); |
207 | 6 | } |
208 | 5 | } |
209 | | |
210 | 342 | return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); |
211 | 342 | } |
212 | | |
213 | | Status HiveParquetReader::init_reader( |
214 | | const std::vector<std::string>& read_table_col_names, |
215 | | std::unordered_map<std::string, uint32_t>* col_name_to_block_idx, |
216 | | const VExprContextSPtrs& conjuncts, |
217 | | phmap::flat_hash_map<int, std::vector<std::shared_ptr<ColumnPredicate>>>& |
218 | | slot_id_to_predicates, |
219 | | const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor, |
220 | | const std::unordered_map<std::string, int>* colname_to_slot_id, |
221 | | const VExprContextSPtrs* not_single_slot_filter_conjuncts, |
222 | 12.3k | const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) { |
223 | 12.3k | auto* parquet_reader = static_cast<ParquetReader*>(_file_format_reader.get()); |
224 | 12.3k | const FieldDescriptor* field_desc = nullptr; |
225 | 12.3k | RETURN_IF_ERROR(parquet_reader->get_file_metadata_schema(&field_desc)); |
226 | 12.3k | DCHECK(field_desc != nullptr); |
227 | | |
228 | 12.3k | if (_state->query_options().hive_parquet_use_column_names) { |
229 | | // Directly use the table column name to match the file column name, but pay attention to the case issue. |
230 | 12.1k | RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(tuple_descriptor, *field_desc, |
231 | 12.1k | table_info_node_ptr, _is_file_slot)); |
232 | 12.1k | } else { // use idx |
233 | 282 | std::map<std::string, const SlotDescriptor*> slot_map; //table_name to slot |
234 | 996 | for (const auto& slot : tuple_descriptor->slots()) { |
235 | 996 | slot_map.emplace(slot->col_name_lower_case(), slot); |
236 | 996 | } |
237 | | |
238 | | // For top-level columns, use indexes to match, and for sub-columns, still use name to match columns. |
239 | 282 | auto parquet_fields_schema = field_desc->get_fields_schema(); |
240 | 1.10k | for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) { |
241 | 824 | auto table_column_name = read_table_col_names[idx]; |
242 | 824 | auto file_index = _params.column_idxs[idx]; |
243 | | |
244 | 824 | if (file_index >= parquet_fields_schema.size()) { |
245 | | // Non-partitioning columns, which may be columns added later. |
246 | 112 | table_info_node_ptr->add_not_exist_children(table_column_name); |
247 | 712 | } else { |
248 | | // Non-partitioning columns, columns that exist in both the table and the file. |
249 | 712 | auto field_node = std::make_shared<Node>(); |
250 | | // for sub-columns, still use name to match columns. |
251 | 712 | RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name( |
252 | 712 | slot_map[table_column_name]->type(), parquet_fields_schema[file_index], |
253 | 712 | field_node)); |
254 | 712 | table_info_node_ptr->add_children( |
255 | 712 | table_column_name, parquet_fields_schema[file_index].name, field_node); |
256 | 712 | } |
257 | | |
258 | 824 | slot_map.erase(table_column_name); |
259 | 824 | } |
260 | | /* |
261 | | * `_params.column_idxs` only have `isIsFileSlot()`, so we need add `partition slot`. |
262 | | * eg: |
263 | | * Table : A, B, C, D (D: partition column) |
264 | | * Parquet file : A, B |
265 | | * Column C is obtained by add column. |
266 | | * |
267 | | * sql : select * from table; |
268 | | * slot : A, B, C, D |
269 | | * _params.column_idxs: 0, 1, 2 (There is no 3, because column D is the partition column) |
270 | | * |
271 | | */ |
272 | 282 | for (const auto& [partition_col_name, _] : slot_map) { |
273 | 176 | table_info_node_ptr->add_not_exist_children(partition_col_name); |
274 | 176 | } |
275 | 282 | } |
276 | | |
277 | 12.3k | auto column_id_result = ColumnIdResult(); |
278 | 12.3k | if (_state->query_options().hive_parquet_use_column_names) { |
279 | 12.0k | column_id_result = _create_column_ids(field_desc, tuple_descriptor); |
280 | 12.0k | } else { |
281 | 302 | column_id_result = _create_column_ids_by_top_level_col_index(field_desc, tuple_descriptor); |
282 | 302 | } |
283 | | |
284 | 12.3k | const auto& column_ids = column_id_result.column_ids; |
285 | 12.3k | const auto& filter_column_ids = column_id_result.filter_column_ids; |
286 | | |
287 | 12.3k | RETURN_IF_ERROR(init_row_filters()); |
288 | | |
289 | 12.3k | return parquet_reader->init_reader( |
290 | 12.3k | read_table_col_names, col_name_to_block_idx, conjuncts, slot_id_to_predicates, |
291 | 12.3k | tuple_descriptor, row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts, |
292 | 12.3k | slot_id_to_filter_conjuncts, table_info_node_ptr, true, column_ids, filter_column_ids); |
293 | 12.3k | } |
294 | | |
295 | | ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc, |
296 | 12.0k | const TupleDescriptor* tuple_descriptor) { |
297 | | // First, assign column IDs to the field descriptor |
298 | 12.0k | auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc); |
299 | 12.0k | mutable_field_desc->assign_ids(); |
300 | | |
301 | | // map top-level table column name (lower-cased) -> FieldSchema* |
302 | 12.0k | std::unordered_map<std::string, const FieldSchema*> table_col_name_to_field_schema_map; |
303 | 109k | for (int i = 0; i < field_desc->size(); ++i) { |
304 | 97.2k | auto field_schema = field_desc->get_column(i); |
305 | 97.2k | if (!field_schema) continue; |
306 | | |
307 | 97.2k | table_col_name_to_field_schema_map[field_schema->lower_case_name] = field_schema; |
308 | 97.2k | } |
309 | | |
310 | 12.0k | std::set<uint64_t> column_ids; |
311 | 12.0k | std::set<uint64_t> filter_column_ids; |
312 | | |
313 | | // helper to process access paths for a given top-level parquet field |
314 | 12.0k | auto process_access_paths = [](const FieldSchema* parquet_field, |
315 | 12.0k | const std::vector<TColumnAccessPath>& access_paths, |
316 | 16.7k | std::set<uint64_t>& out_ids) { |
317 | 16.7k | process_nested_access_paths( |
318 | 16.7k | parquet_field, access_paths, out_ids, |
319 | 16.7k | [](const FieldSchema* field) { return field->get_column_id(); }, |
320 | 16.7k | [](const FieldSchema* field) { return field->get_max_column_id(); }, |
321 | 16.7k | HiveParquetNestedColumnUtils::extract_nested_column_ids); |
322 | 16.7k | }; |
323 | | |
324 | 62.7k | for (const auto* slot : tuple_descriptor->slots()) { |
325 | 62.7k | auto it = table_col_name_to_field_schema_map.find(slot->col_name_lower_case()); |
326 | 62.7k | if (it == table_col_name_to_field_schema_map.end()) { |
327 | | // Column not found in file |
328 | 7.39k | continue; |
329 | 7.39k | } |
330 | 55.3k | auto field_schema = it->second; |
331 | | |
332 | | // primitive (non-nested) types |
333 | 55.3k | if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && |
334 | 55.3k | slot->col_type() != TYPE_MAP)) { |
335 | 38.8k | column_ids.insert(field_schema->column_id); |
336 | | |
337 | 38.8k | if (slot->is_predicate()) { |
338 | 5.89k | filter_column_ids.insert(field_schema->column_id); |
339 | 5.89k | } |
340 | 38.8k | continue; |
341 | 38.8k | } |
342 | | |
343 | | // complex types |
344 | 16.5k | const auto& all_access_paths = slot->all_access_paths(); |
345 | 16.5k | process_access_paths(field_schema, all_access_paths, column_ids); |
346 | | |
347 | 16.5k | const auto& predicate_access_paths = slot->predicate_access_paths(); |
348 | 16.5k | if (!predicate_access_paths.empty()) { |
349 | 290 | process_access_paths(field_schema, predicate_access_paths, filter_column_ids); |
350 | 290 | } |
351 | 16.5k | } |
352 | | |
353 | 12.0k | return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); |
354 | 12.0k | } |
355 | | |
356 | | ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index( |
357 | 298 | const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) { |
358 | | // First, assign column IDs to the field descriptor |
359 | 298 | auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc); |
360 | 298 | mutable_field_desc->assign_ids(); |
361 | | |
362 | | // map top-level table column position -> FieldSchema* |
363 | 298 | std::unordered_map<uint64_t, const FieldSchema*> table_col_pos_to_field_schema_map; |
364 | 1.42k | for (int i = 0; i < field_desc->size(); ++i) { |
365 | 1.12k | auto field_schema = field_desc->get_column(i); |
366 | 1.12k | if (!field_schema) continue; |
367 | | |
368 | 1.12k | table_col_pos_to_field_schema_map[i] = field_schema; |
369 | 1.12k | } |
370 | | |
371 | 298 | std::set<uint64_t> column_ids; |
372 | 298 | std::set<uint64_t> filter_column_ids; |
373 | | |
374 | | // helper to process access paths for a given top-level parquet field |
375 | 298 | auto process_access_paths = [](const FieldSchema* parquet_field, |
376 | 298 | const std::vector<TColumnAccessPath>& access_paths, |
377 | 298 | std::set<uint64_t>& out_ids) { |
378 | 13 | process_nested_access_paths( |
379 | 13 | parquet_field, access_paths, out_ids, |
380 | 13 | [](const FieldSchema* field) { return field->get_column_id(); }, |
381 | 13 | [](const FieldSchema* field) { return field->get_max_column_id(); }, |
382 | 13 | HiveParquetNestedColumnUtils::extract_nested_column_ids); |
383 | 13 | }; |
384 | | |
385 | 1.01k | for (const auto* slot : tuple_descriptor->slots()) { |
386 | 1.01k | auto it = table_col_pos_to_field_schema_map.find(slot->col_pos()); |
387 | 1.01k | if (it == table_col_pos_to_field_schema_map.end()) { |
388 | | // Column not found in file |
389 | 1.00k | continue; |
390 | 1.00k | } |
391 | 13 | auto field_schema = it->second; |
392 | | |
393 | | // primitive (non-nested) types |
394 | 13 | if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && |
395 | 13 | slot->col_type() != TYPE_MAP)) { |
396 | 6 | column_ids.insert(field_schema->column_id); |
397 | | |
398 | 6 | if (slot->is_predicate()) { |
399 | 0 | filter_column_ids.insert(field_schema->column_id); |
400 | 0 | } |
401 | 6 | continue; |
402 | 6 | } |
403 | | |
404 | | // complex types |
405 | 7 | const auto& all_access_paths = slot->all_access_paths(); |
406 | 7 | process_access_paths(field_schema, all_access_paths, column_ids); |
407 | | |
408 | 7 | const auto& predicate_access_paths = slot->predicate_access_paths(); |
409 | 7 | if (!predicate_access_paths.empty()) { |
410 | 6 | process_access_paths(field_schema, predicate_access_paths, filter_column_ids); |
411 | 6 | } |
412 | 7 | } |
413 | | |
414 | 298 | return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); |
415 | 298 | } |
416 | | |
417 | | #include "common/compile_check_end.h" |
418 | | } // namespace doris |