Coverage Report

Created: 2026-04-10 12:12

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format/table/hive_reader.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "format/table/hive_reader.h"
19
20
#include <vector>
21
22
#include "common/status.h"
23
#include "format/table/hive/hive_orc_nested_column_utils.h"
24
#include "format/table/hive/hive_parquet_nested_column_utils.h"
25
#include "format/table/nested_column_access_helper.h"
26
#include "runtime/runtime_state.h"
27
28
namespace doris {
29
30
81.4k
Status HiveReader::get_next_block_inner(Block* block, size_t* read_rows, bool* eof) {
31
81.4k
    RETURN_IF_ERROR(_file_format_reader->get_next_block(block, read_rows, eof));
32
81.4k
    return Status::OK();
33
81.4k
};
34
35
Status HiveOrcReader::init_reader(
36
        const std::vector<std::string>& read_table_col_names,
37
        std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
38
        const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor,
39
        const RowDescriptor* row_descriptor,
40
        const VExprContextSPtrs* not_single_slot_filter_conjuncts,
41
15.3k
        const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
42
15.3k
    auto* orc_reader = static_cast<OrcReader*>(_file_format_reader.get());
43
44
15.3k
    const orc::Type* orc_type_ptr = nullptr;
45
15.3k
    RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr));
46
15.3k
    bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);
47
48
15.3k
    if (_state->query_options().hive_orc_use_column_names && !is_hive_col_name) {
49
        // Directly use the table column name to match the file column name, but pay attention to the case issue.
50
15.0k
        RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(tuple_descriptor, orc_type_ptr,
51
15.0k
                                                        table_info_node_ptr, _is_file_slot));
52
15.0k
    } else {
53
        // hive1 / use index
54
278
        std::map<std::string, const SlotDescriptor*> slot_map; // table_name to slot
55
972
        for (const auto& slot : tuple_descriptor->slots()) {
56
972
            slot_map.emplace(slot->col_name_lower_case(), slot);
57
972
        }
58
59
        // For top-level columns, use indexes to match, and for sub-columns, still use name to match columns.
60
1.13k
        for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) {
61
852
            auto table_column_name = read_table_col_names[idx];
62
852
            auto file_index = _params.column_idxs[idx];
63
64
852
            if (file_index >= orc_type_ptr->getSubtypeCount()) {
65
112
                table_info_node_ptr->add_not_exist_children(table_column_name);
66
740
            } else {
67
740
                auto field_node = std::make_shared<Node>();
68
                // For sub-columns, still use name to match columns.
69
740
                RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
70
740
                        slot_map[table_column_name]->type(), orc_type_ptr->getSubtype(file_index),
71
740
                        field_node));
72
740
                table_info_node_ptr->add_children(
73
740
                        table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
74
740
            }
75
852
            slot_map.erase(table_column_name);
76
852
        }
77
278
        for (const auto& [partition_col_name, _] : slot_map) {
78
114
            table_info_node_ptr->add_not_exist_children(partition_col_name);
79
114
        }
80
278
    }
81
82
15.3k
    auto column_id_result = ColumnIdResult();
83
15.3k
    if (_state->query_options().hive_orc_use_column_names && !is_hive_col_name) {
84
14.9k
        column_id_result = _create_column_ids(orc_type_ptr, tuple_descriptor);
85
14.9k
    } else {
86
386
        column_id_result =
87
386
                _create_column_ids_by_top_level_col_index(orc_type_ptr, tuple_descriptor);
88
386
    }
89
90
15.3k
    const auto& column_ids = column_id_result.column_ids;
91
15.3k
    const auto& filter_column_ids = column_id_result.filter_column_ids;
92
93
15.3k
    return orc_reader->init_reader(&read_table_col_names, col_name_to_block_idx, conjuncts, false,
94
15.3k
                                   tuple_descriptor, row_descriptor,
95
15.3k
                                   not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts,
96
15.3k
                                   table_info_node_ptr, column_ids, filter_column_ids);
97
15.3k
}
98
99
ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
100
14.9k
                                                 const TupleDescriptor* tuple_descriptor) {
101
    // map top-level table column name (lower-cased) -> orc::Type*
102
14.9k
    std::unordered_map<std::string, const orc::Type*> table_col_name_to_orc_type_map;
103
1.27M
    for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
104
1.25M
        auto orc_sub_type = orc_type->getSubtype(i);
105
1.25M
        if (!orc_sub_type) continue;
106
107
1.25M
        std::string table_col_name = to_lower(orc_type->getFieldName(i));
108
1.25M
        table_col_name_to_orc_type_map[table_col_name] = orc_sub_type;
109
1.25M
    }
110
111
14.9k
    std::set<uint64_t> column_ids;
112
14.9k
    std::set<uint64_t> filter_column_ids;
113
114
    // helper to process access paths for a given top-level orc field
115
14.9k
    auto process_access_paths = [](const orc::Type* orc_field,
116
14.9k
                                   const std::vector<TColumnAccessPath>& access_paths,
117
14.9k
                                   std::set<uint64_t>& out_ids) {
118
10.1k
        process_nested_access_paths(
119
10.1k
                orc_field, access_paths, out_ids,
120
10.1k
                [](const orc::Type* type) { return type->getColumnId(); },
121
10.1k
                [](const orc::Type* type) { return type->getMaximumColumnId(); },
122
10.1k
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
123
10.1k
    };
124
125
62.0k
    for (const auto* slot : tuple_descriptor->slots()) {
126
62.0k
        auto it = table_col_name_to_orc_type_map.find(slot->col_name_lower_case());
127
62.0k
        if (it == table_col_name_to_orc_type_map.end()) {
128
            // Column not found in file
129
10.4k
            continue;
130
10.4k
        }
131
51.6k
        const orc::Type* orc_field = it->second;
132
133
        // primitive (non-nested) types
134
51.6k
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
135
51.6k
             slot->col_type() != TYPE_MAP)) {
136
41.5k
            column_ids.insert(orc_field->getColumnId());
137
41.5k
            if (slot->is_predicate()) {
138
10.3k
                filter_column_ids.insert(orc_field->getColumnId());
139
10.3k
            }
140
41.5k
            continue;
141
41.5k
        }
142
143
        // complex types
144
10.0k
        const auto& all_access_paths = slot->all_access_paths();
145
10.0k
        process_access_paths(orc_field, all_access_paths, column_ids);
146
147
10.0k
        const auto& predicate_access_paths = slot->predicate_access_paths();
148
10.0k
        if (!predicate_access_paths.empty()) {
149
150
            process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
150
150
        }
151
10.0k
    }
152
153
14.9k
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
154
14.9k
}
155
156
ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
157
312
        const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
158
    // map top-level table column position -> orc::Type*
159
312
    std::unordered_map<uint64_t, const orc::Type*> table_col_pos_to_orc_type_map;
160
1.49k
    for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
161
1.18k
        auto orc_sub_type = orc_type->getSubtype(i);
162
1.18k
        if (!orc_sub_type) continue;
163
164
1.18k
        table_col_pos_to_orc_type_map[i] = orc_sub_type;
165
1.18k
    }
166
167
312
    std::set<uint64_t> column_ids;
168
312
    std::set<uint64_t> filter_column_ids;
169
170
    // helper to process access paths for a given top-level orc field
171
312
    auto process_access_paths = [](const orc::Type* orc_field,
172
312
                                   const std::vector<TColumnAccessPath>& access_paths,
173
312
                                   std::set<uint64_t>& out_ids) {
174
13
        process_nested_access_paths(
175
13
                orc_field, access_paths, out_ids,
176
13
                [](const orc::Type* type) { return type->getColumnId(); },
177
13
                [](const orc::Type* type) { return type->getMaximumColumnId(); },
178
13
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
179
13
    };
180
181
967
    for (const auto* slot : tuple_descriptor->slots()) {
182
967
        auto it = table_col_pos_to_orc_type_map.find(slot->col_pos());
183
967
        if (it == table_col_pos_to_orc_type_map.end()) {
184
            // Column not found in file
185
952
            continue;
186
952
        }
187
15
        const orc::Type* orc_field = it->second;
188
189
        // primitive (non-nested) types
190
15
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
191
15
             slot->col_type() != TYPE_MAP)) {
192
6
            column_ids.insert(orc_field->getColumnId());
193
6
            if (slot->is_predicate()) {
194
0
                filter_column_ids.insert(orc_field->getColumnId());
195
0
            }
196
6
            continue;
197
6
        }
198
199
9
        const auto& all_access_paths = slot->all_access_paths();
200
        // complex types
201
9
        process_access_paths(orc_field, all_access_paths, column_ids);
202
203
9
        const auto& predicate_access_paths = slot->predicate_access_paths();
204
9
        if (!predicate_access_paths.empty()) {
205
6
            process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
206
6
        }
207
9
    }
208
209
312
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
210
312
}
211
212
Status HiveParquetReader::init_reader(
213
        const std::vector<std::string>& read_table_col_names,
214
        std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
215
        const VExprContextSPtrs& conjuncts,
216
        phmap::flat_hash_map<int, std::vector<std::shared_ptr<ColumnPredicate>>>&
217
                slot_id_to_predicates,
218
        const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor,
219
        const std::unordered_map<std::string, int>* colname_to_slot_id,
220
        const VExprContextSPtrs* not_single_slot_filter_conjuncts,
221
10.4k
        const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
222
10.4k
    auto* parquet_reader = static_cast<ParquetReader*>(_file_format_reader.get());
223
10.4k
    const FieldDescriptor* field_desc = nullptr;
224
10.4k
    RETURN_IF_ERROR(parquet_reader->get_file_metadata_schema(&field_desc));
225
10.4k
    DCHECK(field_desc != nullptr);
226
227
10.4k
    if (_state->query_options().hive_parquet_use_column_names) {
228
        // Directly use the table column name to match the file column name, but pay attention to the case issue.
229
10.2k
        RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(tuple_descriptor, *field_desc,
230
10.2k
                                                            table_info_node_ptr, _is_file_slot));
231
10.2k
    } else {                                                   // use idx
232
274
        std::map<std::string, const SlotDescriptor*> slot_map; //table_name to slot
233
880
        for (const auto& slot : tuple_descriptor->slots()) {
234
880
            slot_map.emplace(slot->col_name_lower_case(), slot);
235
880
        }
236
237
        // For top-level columns, use indexes to match, and for sub-columns, still use name to match columns.
238
274
        auto parquet_fields_schema = field_desc->get_fields_schema();
239
1.03k
        for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) {
240
760
            auto table_column_name = read_table_col_names[idx];
241
760
            auto file_index = _params.column_idxs[idx];
242
243
760
            if (file_index >= parquet_fields_schema.size()) {
244
                // Non-partitioning columns, which may be columns added later.
245
112
                table_info_node_ptr->add_not_exist_children(table_column_name);
246
648
            } else {
247
                // Non-partitioning columns, columns that exist in both the table and the file.
248
648
                auto field_node = std::make_shared<Node>();
249
                // for sub-columns, still use name to match columns.
250
648
                RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
251
648
                        slot_map[table_column_name]->type(), parquet_fields_schema[file_index],
252
648
                        field_node));
253
648
                table_info_node_ptr->add_children(
254
648
                        table_column_name, parquet_fields_schema[file_index].name, field_node);
255
648
            }
256
257
760
            slot_map.erase(table_column_name);
258
760
        }
259
        /*
260
         * `_params.column_idxs` only have `isIsFileSlot()`, so we need add `partition slot`.
261
         * eg:
262
         * Table : A, B, C, D     (D: partition column)
263
         * Parquet file : A, B
264
         * Column C is obtained by add column.
265
         *
266
         * sql : select * from table;
267
         * slot : A, B, C, D
268
         * _params.column_idxs: 0, 1, 2 (There is no 3, because column D is the partition column)
269
         *
270
         */
271
274
        for (const auto& [partition_col_name, _] : slot_map) {
272
116
            table_info_node_ptr->add_not_exist_children(partition_col_name);
273
116
        }
274
274
    }
275
276
10.4k
    auto column_id_result = ColumnIdResult();
277
10.4k
    if (_state->query_options().hive_parquet_use_column_names) {
278
10.1k
        column_id_result = _create_column_ids(field_desc, tuple_descriptor);
279
10.1k
    } else {
280
302
        column_id_result = _create_column_ids_by_top_level_col_index(field_desc, tuple_descriptor);
281
302
    }
282
283
10.4k
    const auto& column_ids = column_id_result.column_ids;
284
10.4k
    const auto& filter_column_ids = column_id_result.filter_column_ids;
285
286
10.4k
    RETURN_IF_ERROR(init_row_filters());
287
288
10.4k
    return parquet_reader->init_reader(
289
10.4k
            read_table_col_names, col_name_to_block_idx, conjuncts, slot_id_to_predicates,
290
10.4k
            tuple_descriptor, row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts,
291
10.4k
            slot_id_to_filter_conjuncts, table_info_node_ptr, true, column_ids, filter_column_ids);
292
10.4k
}
293
294
ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
295
10.2k
                                                     const TupleDescriptor* tuple_descriptor) {
296
    // First, assign column IDs to the field descriptor
297
10.2k
    auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
298
10.2k
    mutable_field_desc->assign_ids();
299
300
    // map top-level table column name (lower-cased) -> FieldSchema*
301
10.2k
    std::unordered_map<std::string, const FieldSchema*> table_col_name_to_field_schema_map;
302
87.6k
    for (int i = 0; i < field_desc->size(); ++i) {
303
77.4k
        auto field_schema = field_desc->get_column(i);
304
77.4k
        if (!field_schema) continue;
305
306
77.4k
        table_col_name_to_field_schema_map[field_schema->lower_case_name] = field_schema;
307
77.4k
    }
308
309
10.2k
    std::set<uint64_t> column_ids;
310
10.2k
    std::set<uint64_t> filter_column_ids;
311
312
    // helper to process access paths for a given top-level parquet field
313
10.2k
    auto process_access_paths = [](const FieldSchema* parquet_field,
314
10.2k
                                   const std::vector<TColumnAccessPath>& access_paths,
315
15.8k
                                   std::set<uint64_t>& out_ids) {
316
15.8k
        process_nested_access_paths(
317
15.8k
                parquet_field, access_paths, out_ids,
318
15.8k
                [](const FieldSchema* field) { return field->get_column_id(); },
319
15.8k
                [](const FieldSchema* field) { return field->get_max_column_id(); },
320
15.8k
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
321
15.8k
    };
322
323
55.2k
    for (const auto* slot : tuple_descriptor->slots()) {
324
55.2k
        auto it = table_col_name_to_field_schema_map.find(slot->col_name_lower_case());
325
55.2k
        if (it == table_col_name_to_field_schema_map.end()) {
326
            // Column not found in file
327
6.65k
            continue;
328
6.65k
        }
329
48.5k
        auto field_schema = it->second;
330
331
        // primitive (non-nested) types
332
48.5k
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
333
48.5k
             slot->col_type() != TYPE_MAP)) {
334
32.8k
            column_ids.insert(field_schema->column_id);
335
336
32.8k
            if (slot->is_predicate()) {
337
4.97k
                filter_column_ids.insert(field_schema->column_id);
338
4.97k
            }
339
32.8k
            continue;
340
32.8k
        }
341
342
        // complex types
343
15.7k
        const auto& all_access_paths = slot->all_access_paths();
344
15.7k
        process_access_paths(field_schema, all_access_paths, column_ids);
345
346
15.7k
        const auto& predicate_access_paths = slot->predicate_access_paths();
347
15.7k
        if (!predicate_access_paths.empty()) {
348
188
            process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
349
188
        }
350
15.7k
    }
351
352
10.2k
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
353
10.2k
}
354
355
ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
356
270
        const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
357
    // First, assign column IDs to the field descriptor
358
270
    auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
359
270
    mutable_field_desc->assign_ids();
360
361
    // map top-level table column position -> FieldSchema*
362
270
    std::unordered_map<uint64_t, const FieldSchema*> table_col_pos_to_field_schema_map;
363
1.32k
    for (int i = 0; i < field_desc->size(); ++i) {
364
1.05k
        auto field_schema = field_desc->get_column(i);
365
1.05k
        if (!field_schema) continue;
366
367
1.05k
        table_col_pos_to_field_schema_map[i] = field_schema;
368
1.05k
    }
369
370
270
    std::set<uint64_t> column_ids;
371
270
    std::set<uint64_t> filter_column_ids;
372
373
    // helper to process access paths for a given top-level parquet field
374
270
    auto process_access_paths = [](const FieldSchema* parquet_field,
375
270
                                   const std::vector<TColumnAccessPath>& access_paths,
376
270
                                   std::set<uint64_t>& out_ids) {
377
13
        process_nested_access_paths(
378
13
                parquet_field, access_paths, out_ids,
379
13
                [](const FieldSchema* field) { return field->get_column_id(); },
380
13
                [](const FieldSchema* field) { return field->get_max_column_id(); },
381
13
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
382
13
    };
383
384
893
    for (const auto* slot : tuple_descriptor->slots()) {
385
893
        auto it = table_col_pos_to_field_schema_map.find(slot->col_pos());
386
893
        if (it == table_col_pos_to_field_schema_map.end()) {
387
            // Column not found in file
388
880
            continue;
389
880
        }
390
13
        auto field_schema = it->second;
391
392
        // primitive (non-nested) types
393
13
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
394
13
             slot->col_type() != TYPE_MAP)) {
395
6
            column_ids.insert(field_schema->column_id);
396
397
6
            if (slot->is_predicate()) {
398
0
                filter_column_ids.insert(field_schema->column_id);
399
0
            }
400
6
            continue;
401
6
        }
402
403
        // complex types
404
7
        const auto& all_access_paths = slot->all_access_paths();
405
7
        process_access_paths(field_schema, all_access_paths, column_ids);
406
407
7
        const auto& predicate_access_paths = slot->predicate_access_paths();
408
7
        if (!predicate_access_paths.empty()) {
409
6
            process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
410
6
        }
411
7
    }
412
413
270
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
414
270
}
415
416
} // namespace doris