be/src/format/table/hive_reader.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "format/table/hive_reader.h" |
19 | | |
20 | | #include <vector> |
21 | | |
22 | | #include "common/status.h" |
23 | | #include "format/table/hive/hive_orc_nested_column_utils.h" |
24 | | #include "format/table/hive/hive_parquet_nested_column_utils.h" |
25 | | #include "format/table/nested_column_access_helper.h" |
26 | | #include "runtime/runtime_state.h" |
27 | | |
28 | | namespace doris { |
29 | | |
30 | 15.3k | Status HiveOrcReader::on_before_init_reader(ReaderInitContext* ctx) { |
31 | 15.3k | _column_descs = ctx->column_descs; |
32 | 15.3k | _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; |
33 | 15.3k | RETURN_IF_ERROR(_extract_partition_values(*ctx->range, ctx->tuple_descriptor, |
34 | 15.3k | _fill_partition_values, |
35 | 15.3k | &_fill_partition_value_is_null)); |
36 | 63.5k | for (const auto& desc : *ctx->column_descs) { |
37 | 63.5k | if (desc.category == ColumnCategory::REGULAR || |
38 | 63.5k | desc.category == ColumnCategory::GENERATED) { |
39 | 53.7k | ctx->column_names.push_back(desc.name); |
40 | 53.7k | } else if (desc.category == ColumnCategory::SYNTHESIZED && |
41 | 9.82k | desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) { |
42 | 4.98k | auto topn_row_id_column_iter = _create_topn_row_id_column_iterator(); |
43 | 4.98k | this->register_synthesized_column_handler( |
44 | 4.98k | desc.name, |
45 | 4.98k | [iter = std::move(topn_row_id_column_iter), this, &desc]( |
46 | 27.2k | Block* block, size_t rows) -> Status { |
47 | 27.2k | return fill_topn_row_id(iter, desc.name, block, rows); |
48 | 27.2k | }); |
49 | 4.98k | continue; |
50 | 4.98k | } |
51 | 63.5k | } |
52 | | |
53 | | // Get file type (available because _create_file_reader() runs before this hook) |
54 | 15.3k | const orc::Type* orc_type_ptr = nullptr; |
55 | 15.3k | RETURN_IF_ERROR(get_file_type(&orc_type_ptr)); |
56 | 15.3k | bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr); |
57 | | |
58 | | // Build table_info_node based on config |
59 | 15.3k | if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) { |
60 | 15.0k | RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(ctx->tuple_descriptor, orc_type_ptr, |
61 | 15.0k | ctx->table_info_node, _is_file_slot)); |
62 | 15.0k | } else { |
63 | 348 | ctx->table_info_node = std::make_shared<StructNode>(); |
64 | 348 | std::map<std::string, const SlotDescriptor*> slot_map; |
65 | 974 | for (const auto& slot : ctx->tuple_descriptor->slots()) { |
66 | 974 | slot_map.emplace(slot->col_name_lower_case(), slot); |
67 | 974 | } |
68 | | |
69 | 1.20k | for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) { |
70 | 852 | auto table_column_name = ctx->column_names[idx]; |
71 | 852 | auto file_index = get_scan_params().column_idxs[idx]; |
72 | | |
73 | 852 | if (file_index >= orc_type_ptr->getSubtypeCount()) { |
74 | 112 | ctx->table_info_node->add_not_exist_children(table_column_name); |
75 | 740 | } else { |
76 | 740 | auto field_node = std::make_shared<Node>(); |
77 | 740 | RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name( |
78 | 740 | slot_map[table_column_name]->type(), orc_type_ptr->getSubtype(file_index), |
79 | 740 | field_node)); |
80 | 740 | ctx->table_info_node->add_children( |
81 | 740 | table_column_name, orc_type_ptr->getFieldName(file_index), field_node); |
82 | 740 | } |
83 | 852 | slot_map.erase(table_column_name); |
84 | 852 | } |
85 | 348 | for (const auto& [partition_col_name, _] : slot_map) { |
86 | 116 | ctx->table_info_node->add_not_exist_children(partition_col_name); |
87 | 116 | } |
88 | 348 | } |
89 | | |
90 | | // Compute column_ids |
91 | 15.3k | auto column_id_result = ColumnIdResult(); |
92 | 15.3k | if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) { |
93 | 14.9k | column_id_result = _create_column_ids(orc_type_ptr, ctx->tuple_descriptor); |
94 | 14.9k | } else { |
95 | 390 | column_id_result = |
96 | 390 | _create_column_ids_by_top_level_col_index(orc_type_ptr, ctx->tuple_descriptor); |
97 | 390 | } |
98 | 15.3k | ctx->column_ids = std::move(column_id_result.column_ids); |
99 | 15.3k | ctx->filter_column_ids = std::move(column_id_result.filter_column_ids); |
100 | | |
101 | | // _is_acid is false by default, no need to set explicitly |
102 | 15.3k | return Status::OK(); |
103 | 15.3k | } |
104 | | |
105 | | ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type, |
106 | 14.9k | const TupleDescriptor* tuple_descriptor) { |
107 | | // map top-level table column name (lower-cased) -> orc::Type* |
108 | 14.9k | std::unordered_map<std::string, const orc::Type*> table_col_name_to_orc_type_map; |
109 | 1.26M | for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) { |
110 | 1.25M | auto orc_sub_type = orc_type->getSubtype(i); |
111 | 1.25M | if (!orc_sub_type) continue; |
112 | | |
113 | 1.25M | std::string table_col_name = to_lower(orc_type->getFieldName(i)); |
114 | 1.25M | table_col_name_to_orc_type_map[table_col_name] = orc_sub_type; |
115 | 1.25M | } |
116 | | |
117 | 14.9k | std::set<uint64_t> column_ids; |
118 | 14.9k | std::set<uint64_t> filter_column_ids; |
119 | | |
120 | | // helper to process access paths for a given top-level orc field |
121 | 14.9k | auto process_access_paths = [](const orc::Type* orc_field, |
122 | 14.9k | const std::vector<TColumnAccessPath>& access_paths, |
123 | 14.9k | std::set<uint64_t>& out_ids) { |
124 | 10.1k | process_nested_access_paths( |
125 | 10.1k | orc_field, access_paths, out_ids, |
126 | 10.1k | [](const orc::Type* type) { return type->getColumnId(); }, |
127 | 10.1k | [](const orc::Type* type) { return type->getMaximumColumnId(); }, |
128 | 10.1k | HiveOrcNestedColumnUtils::extract_nested_column_ids); |
129 | 10.1k | }; |
130 | | |
131 | 62.5k | for (const auto* slot : tuple_descriptor->slots()) { |
132 | 62.5k | auto it = table_col_name_to_orc_type_map.find(slot->col_name_lower_case()); |
133 | 62.5k | if (it == table_col_name_to_orc_type_map.end()) { |
134 | | // Column not found in file |
135 | 10.4k | continue; |
136 | 10.4k | } |
137 | 52.1k | const orc::Type* orc_field = it->second; |
138 | | |
139 | | // primitive (non-nested) types |
140 | 52.1k | if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && |
141 | 52.1k | slot->col_type() != TYPE_MAP)) { |
142 | 41.9k | column_ids.insert(orc_field->getColumnId()); |
143 | 41.9k | if (slot->is_predicate()) { |
144 | 10.7k | filter_column_ids.insert(orc_field->getColumnId()); |
145 | 10.7k | } |
146 | 41.9k | continue; |
147 | 41.9k | } |
148 | | |
149 | | // complex types |
150 | 10.1k | const auto& all_access_paths = slot->all_access_paths(); |
151 | 10.1k | process_access_paths(orc_field, all_access_paths, column_ids); |
152 | | |
153 | 10.1k | const auto& predicate_access_paths = slot->predicate_access_paths(); |
154 | 10.1k | if (!predicate_access_paths.empty()) { |
155 | 94 | process_access_paths(orc_field, predicate_access_paths, filter_column_ids); |
156 | 94 | } |
157 | 10.1k | } |
158 | | |
159 | 14.9k | return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); |
160 | 14.9k | } |
161 | | |
162 | | ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index( |
163 | 312 | const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) { |
164 | | // map top-level table column position -> orc::Type* |
165 | 312 | std::unordered_map<uint64_t, const orc::Type*> table_col_pos_to_orc_type_map; |
166 | 1.49k | for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) { |
167 | 1.18k | auto orc_sub_type = orc_type->getSubtype(i); |
168 | 1.18k | if (!orc_sub_type) continue; |
169 | | |
170 | 1.18k | table_col_pos_to_orc_type_map[i] = orc_sub_type; |
171 | 1.18k | } |
172 | | |
173 | 312 | std::set<uint64_t> column_ids; |
174 | 312 | std::set<uint64_t> filter_column_ids; |
175 | | |
176 | | // helper to process access paths for a given top-level orc field |
177 | 312 | auto process_access_paths = [](const orc::Type* orc_field, |
178 | 312 | const std::vector<TColumnAccessPath>& access_paths, |
179 | 312 | std::set<uint64_t>& out_ids) { |
180 | 13 | process_nested_access_paths( |
181 | 13 | orc_field, access_paths, out_ids, |
182 | 13 | [](const orc::Type* type) { return type->getColumnId(); }, |
183 | 13 | [](const orc::Type* type) { return type->getMaximumColumnId(); }, |
184 | 13 | HiveOrcNestedColumnUtils::extract_nested_column_ids); |
185 | 13 | }; |
186 | | |
187 | 989 | for (const auto* slot : tuple_descriptor->slots()) { |
188 | 989 | auto it = table_col_pos_to_orc_type_map.find(slot->col_pos()); |
189 | 989 | if (it == table_col_pos_to_orc_type_map.end()) { |
190 | | // Column not found in file |
191 | 972 | continue; |
192 | 972 | } |
193 | 17 | const orc::Type* orc_field = it->second; |
194 | | |
195 | | // primitive (non-nested) types |
196 | 17 | if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && |
197 | 17 | slot->col_type() != TYPE_MAP)) { |
198 | 6 | column_ids.insert(orc_field->getColumnId()); |
199 | 6 | if (slot->is_predicate()) { |
200 | 0 | filter_column_ids.insert(orc_field->getColumnId()); |
201 | 0 | } |
202 | 6 | continue; |
203 | 6 | } |
204 | | |
205 | 11 | const auto& all_access_paths = slot->all_access_paths(); |
206 | | // complex types |
207 | 11 | process_access_paths(orc_field, all_access_paths, column_ids); |
208 | | |
209 | 11 | const auto& predicate_access_paths = slot->predicate_access_paths(); |
210 | 11 | if (!predicate_access_paths.empty()) { |
211 | 6 | process_access_paths(orc_field, predicate_access_paths, filter_column_ids); |
212 | 6 | } |
213 | 11 | } |
214 | | |
215 | 312 | return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); |
216 | 312 | } |
217 | | |
218 | 10.4k | Status HiveParquetReader::on_before_init_reader(ReaderInitContext* ctx) { |
219 | 10.4k | _column_descs = ctx->column_descs; |
220 | 10.4k | _fill_col_name_to_block_idx = ctx->col_name_to_block_idx; |
221 | 10.4k | RETURN_IF_ERROR(_extract_partition_values(*ctx->range, ctx->tuple_descriptor, |
222 | 10.4k | _fill_partition_values, |
223 | 10.4k | &_fill_partition_value_is_null)); |
224 | 56.1k | for (const auto& desc : *ctx->column_descs) { |
225 | 56.1k | if (desc.category == ColumnCategory::REGULAR || |
226 | 56.1k | desc.category == ColumnCategory::GENERATED) { |
227 | 49.9k | ctx->column_names.push_back(desc.name); |
228 | 49.9k | } else if (desc.category == ColumnCategory::SYNTHESIZED && |
229 | 6.17k | desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) { |
230 | 1.83k | auto topn_row_id_column_iter = _create_topn_row_id_column_iterator(); |
231 | 1.83k | this->register_synthesized_column_handler( |
232 | 1.83k | desc.name, |
233 | 1.83k | [iter = std::move(topn_row_id_column_iter), this, &desc]( |
234 | 7.71k | Block* block, size_t rows) -> Status { |
235 | 7.71k | return fill_topn_row_id(iter, desc.name, block, rows); |
236 | 7.71k | }); |
237 | 1.83k | continue; |
238 | 1.83k | } |
239 | 56.1k | } |
240 | | |
241 | | // Get file metadata schema (available because _open_file() runs before this hook) |
242 | 10.4k | const FieldDescriptor* field_desc = nullptr; |
243 | 10.4k | RETURN_IF_ERROR(get_file_metadata_schema(&field_desc)); |
244 | 10.4k | DCHECK(field_desc != nullptr); |
245 | | |
246 | | // Build table_info_node based on config |
247 | 10.4k | if (get_state()->query_options().hive_parquet_use_column_names) { |
248 | 10.2k | RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(ctx->tuple_descriptor, *field_desc, |
249 | 10.2k | ctx->table_info_node, _is_file_slot)); |
250 | 10.2k | } else { |
251 | 264 | ctx->table_info_node = std::make_shared<StructNode>(); |
252 | 264 | std::map<std::string, const SlotDescriptor*> slot_map; |
253 | 880 | for (const auto& slot : ctx->tuple_descriptor->slots()) { |
254 | 880 | slot_map.emplace(slot->col_name_lower_case(), slot); |
255 | 880 | } |
256 | | |
257 | 264 | auto parquet_fields_schema = field_desc->get_fields_schema(); |
258 | 1.02k | for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) { |
259 | 762 | auto table_column_name = ctx->column_names[idx]; |
260 | 762 | auto file_index = get_scan_params().column_idxs[idx]; |
261 | | |
262 | 762 | if (file_index >= parquet_fields_schema.size()) { |
263 | 112 | ctx->table_info_node->add_not_exist_children(table_column_name); |
264 | 650 | } else { |
265 | 650 | auto field_node = std::make_shared<Node>(); |
266 | 650 | RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name( |
267 | 650 | slot_map[table_column_name]->type(), parquet_fields_schema[file_index], |
268 | 650 | field_node)); |
269 | 650 | ctx->table_info_node->add_children( |
270 | 650 | table_column_name, parquet_fields_schema[file_index].name, field_node); |
271 | 650 | } |
272 | 762 | slot_map.erase(table_column_name); |
273 | 762 | } |
274 | 264 | for (const auto& [partition_col_name, _] : slot_map) { |
275 | 116 | ctx->table_info_node->add_not_exist_children(partition_col_name); |
276 | 116 | } |
277 | 264 | } |
278 | | |
279 | | // Compute column_ids for lazy materialization |
280 | 10.4k | auto column_id_result = ColumnIdResult(); |
281 | 10.4k | if (get_state()->query_options().hive_parquet_use_column_names) { |
282 | 10.2k | column_id_result = _create_column_ids(field_desc, ctx->tuple_descriptor); |
283 | 10.2k | } else { |
284 | 274 | column_id_result = |
285 | 274 | _create_column_ids_by_top_level_col_index(field_desc, ctx->tuple_descriptor); |
286 | 274 | } |
287 | 10.4k | ctx->column_ids = std::move(column_id_result.column_ids); |
288 | 10.4k | ctx->filter_column_ids = std::move(column_id_result.filter_column_ids); |
289 | | |
290 | 10.4k | _filter_groups = true; |
291 | 10.4k | return Status::OK(); |
292 | 10.4k | } |
293 | | |
294 | | ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc, |
295 | 10.2k | const TupleDescriptor* tuple_descriptor) { |
296 | | // First, assign column IDs to the field descriptor |
297 | 10.2k | auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc); |
298 | 10.2k | mutable_field_desc->assign_ids(); |
299 | | |
300 | | // map top-level table column name (lower-cased) -> FieldSchema* |
301 | 10.2k | std::unordered_map<std::string, const FieldSchema*> table_col_name_to_field_schema_map; |
302 | 87.7k | for (int i = 0; i < field_desc->size(); ++i) { |
303 | 77.5k | auto field_schema = field_desc->get_column(i); |
304 | 77.5k | if (!field_schema) continue; |
305 | | |
306 | 77.5k | table_col_name_to_field_schema_map[field_schema->lower_case_name] = field_schema; |
307 | 77.5k | } |
308 | | |
309 | 10.2k | std::set<uint64_t> column_ids; |
310 | 10.2k | std::set<uint64_t> filter_column_ids; |
311 | | |
312 | | // helper to process access paths for a given top-level parquet field |
313 | 10.2k | auto process_access_paths = [](const FieldSchema* parquet_field, |
314 | 10.2k | const std::vector<TColumnAccessPath>& access_paths, |
315 | 15.7k | std::set<uint64_t>& out_ids) { |
316 | 15.7k | process_nested_access_paths( |
317 | 15.7k | parquet_field, access_paths, out_ids, |
318 | 15.8k | [](const FieldSchema* field) { return field->get_column_id(); }, |
319 | 15.7k | [](const FieldSchema* field) { return field->get_max_column_id(); }, |
320 | 15.7k | HiveParquetNestedColumnUtils::extract_nested_column_ids); |
321 | 15.7k | }; |
322 | | |
323 | 55.1k | for (const auto* slot : tuple_descriptor->slots()) { |
324 | 55.1k | auto it = table_col_name_to_field_schema_map.find(slot->col_name_lower_case()); |
325 | 55.1k | if (it == table_col_name_to_field_schema_map.end()) { |
326 | | // Column not found in file |
327 | 6.67k | continue; |
328 | 6.67k | } |
329 | 48.4k | auto field_schema = it->second; |
330 | | |
331 | | // primitive (non-nested) types |
332 | 48.4k | if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && |
333 | 48.4k | slot->col_type() != TYPE_MAP)) { |
334 | 32.7k | column_ids.insert(field_schema->column_id); |
335 | | |
336 | 32.7k | if (slot->is_predicate()) { |
337 | 5.14k | filter_column_ids.insert(field_schema->column_id); |
338 | 5.14k | } |
339 | 32.7k | continue; |
340 | 32.7k | } |
341 | | |
342 | | // complex types |
343 | 15.7k | const auto& all_access_paths = slot->all_access_paths(); |
344 | 15.7k | process_access_paths(field_schema, all_access_paths, column_ids); |
345 | | |
346 | 15.7k | const auto& predicate_access_paths = slot->predicate_access_paths(); |
347 | 15.7k | if (!predicate_access_paths.empty()) { |
348 | 98 | process_access_paths(field_schema, predicate_access_paths, filter_column_ids); |
349 | 98 | } |
350 | 15.7k | } |
351 | | |
352 | 10.2k | return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); |
353 | 10.2k | } |
354 | | |
355 | | ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index( |
356 | 268 | const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) { |
357 | | // First, assign column IDs to the field descriptor |
358 | 268 | auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc); |
359 | 268 | mutable_field_desc->assign_ids(); |
360 | | |
361 | | // map top-level table column position -> FieldSchema* |
362 | 268 | std::unordered_map<uint64_t, const FieldSchema*> table_col_pos_to_field_schema_map; |
363 | 1.31k | for (int i = 0; i < field_desc->size(); ++i) { |
364 | 1.05k | auto field_schema = field_desc->get_column(i); |
365 | 1.05k | if (!field_schema) continue; |
366 | | |
367 | 1.05k | table_col_pos_to_field_schema_map[i] = field_schema; |
368 | 1.05k | } |
369 | | |
370 | 268 | std::set<uint64_t> column_ids; |
371 | 268 | std::set<uint64_t> filter_column_ids; |
372 | | |
373 | | // helper to process access paths for a given top-level parquet field |
374 | 268 | auto process_access_paths = [](const FieldSchema* parquet_field, |
375 | 268 | const std::vector<TColumnAccessPath>& access_paths, |
376 | 268 | std::set<uint64_t>& out_ids) { |
377 | 13 | process_nested_access_paths( |
378 | 13 | parquet_field, access_paths, out_ids, |
379 | 13 | [](const FieldSchema* field) { return field->get_column_id(); }, |
380 | 13 | [](const FieldSchema* field) { return field->get_max_column_id(); }, |
381 | 13 | HiveParquetNestedColumnUtils::extract_nested_column_ids); |
382 | 13 | }; |
383 | | |
384 | 885 | for (const auto* slot : tuple_descriptor->slots()) { |
385 | 885 | auto it = table_col_pos_to_field_schema_map.find(slot->col_pos()); |
386 | 885 | if (it == table_col_pos_to_field_schema_map.end()) { |
387 | | // Column not found in file |
388 | 872 | continue; |
389 | 872 | } |
390 | 13 | auto field_schema = it->second; |
391 | | |
392 | | // primitive (non-nested) types |
393 | 13 | if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY && |
394 | 13 | slot->col_type() != TYPE_MAP)) { |
395 | 6 | column_ids.insert(field_schema->column_id); |
396 | | |
397 | 6 | if (slot->is_predicate()) { |
398 | 0 | filter_column_ids.insert(field_schema->column_id); |
399 | 0 | } |
400 | 6 | continue; |
401 | 6 | } |
402 | | |
403 | | // complex types |
404 | 7 | const auto& all_access_paths = slot->all_access_paths(); |
405 | 7 | process_access_paths(field_schema, all_access_paths, column_ids); |
406 | | |
407 | 7 | const auto& predicate_access_paths = slot->predicate_access_paths(); |
408 | 7 | if (!predicate_access_paths.empty()) { |
409 | 6 | process_access_paths(field_schema, predicate_access_paths, filter_column_ids); |
410 | 6 | } |
411 | 7 | } |
412 | | |
413 | 268 | return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids)); |
414 | 268 | } |
415 | | |
416 | | } // namespace doris |