Coverage Report

Created: 2026-03-12 17:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format/table/hive_reader.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "format/table/hive_reader.h"
19
20
#include <vector>
21
22
#include "common/status.h"
23
#include "format/table/hive/hive_orc_nested_column_utils.h"
24
#include "format/table/hive/hive_parquet_nested_column_utils.h"
25
#include "format/table/nested_column_access_helper.h"
26
#include "runtime/runtime_state.h"
27
28
namespace doris {
29
#include "common/compile_check_begin.h"
30
31
127k
Status HiveReader::get_next_block_inner(Block* block, size_t* read_rows, bool* eof) {
32
127k
    RETURN_IF_ERROR(_file_format_reader->get_next_block(block, read_rows, eof));
33
127k
    return Status::OK();
34
127k
};
35
36
Status HiveOrcReader::init_reader(
37
        const std::vector<std::string>& read_table_col_names,
38
        std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
39
        const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor,
40
        const RowDescriptor* row_descriptor,
41
        const VExprContextSPtrs* not_single_slot_filter_conjuncts,
42
23.6k
        const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
43
23.6k
    auto* orc_reader = static_cast<OrcReader*>(_file_format_reader.get());
44
45
23.6k
    const orc::Type* orc_type_ptr = nullptr;
46
23.6k
    RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr));
47
23.6k
    bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);
48
49
23.6k
    if (_state->query_options().hive_orc_use_column_names && !is_hive_col_name) {
50
        // Directly use the table column name to match the file column name, but pay attention to the case issue.
51
23.3k
        RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(tuple_descriptor, orc_type_ptr,
52
23.3k
                                                        table_info_node_ptr, _is_file_slot));
53
23.3k
    } else {
54
        // hive1 / use index
55
328
        std::map<std::string, const SlotDescriptor*> slot_map; // table_name to slot
56
1.09k
        for (const auto& slot : tuple_descriptor->slots()) {
57
1.09k
            slot_map.emplace(slot->col_name_lower_case(), slot);
58
1.09k
        }
59
60
        // For top-level columns, use indexes to match, and for sub-columns, still use name to match columns.
61
1.23k
        for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) {
62
908
            auto table_column_name = read_table_col_names[idx];
63
908
            auto file_index = _params.column_idxs[idx];
64
65
908
            if (file_index >= orc_type_ptr->getSubtypeCount()) {
66
112
                table_info_node_ptr->add_not_exist_children(table_column_name);
67
796
            } else {
68
796
                auto field_node = std::make_shared<Node>();
69
                // For sub-columns, still use name to match columns.
70
796
                RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
71
796
                        slot_map[table_column_name]->type(), orc_type_ptr->getSubtype(file_index),
72
796
                        field_node));
73
796
                table_info_node_ptr->add_children(
74
796
                        table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
75
796
            }
76
908
            slot_map.erase(table_column_name);
77
908
        }
78
328
        for (const auto& [partition_col_name, _] : slot_map) {
79
176
            table_info_node_ptr->add_not_exist_children(partition_col_name);
80
176
        }
81
328
    }
82
83
23.6k
    auto column_id_result = ColumnIdResult();
84
23.6k
    if (_state->query_options().hive_orc_use_column_names && !is_hive_col_name) {
85
23.2k
        column_id_result = _create_column_ids(orc_type_ptr, tuple_descriptor);
86
23.2k
    } else {
87
416
        column_id_result =
88
416
                _create_column_ids_by_top_level_col_index(orc_type_ptr, tuple_descriptor);
89
416
    }
90
91
23.6k
    const auto& column_ids = column_id_result.column_ids;
92
23.6k
    const auto& filter_column_ids = column_id_result.filter_column_ids;
93
94
23.6k
    return orc_reader->init_reader(&read_table_col_names, col_name_to_block_idx, conjuncts, false,
95
23.6k
                                   tuple_descriptor, row_descriptor,
96
23.6k
                                   not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts,
97
23.6k
                                   table_info_node_ptr, column_ids, filter_column_ids);
98
23.6k
}
99
100
ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
101
23.1k
                                                 const TupleDescriptor* tuple_descriptor) {
102
    // map top-level table column name (lower-cased) -> orc::Type*
103
23.1k
    std::unordered_map<std::string, const orc::Type*> table_col_name_to_orc_type_map;
104
2.48M
    for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
105
2.46M
        auto orc_sub_type = orc_type->getSubtype(i);
106
2.46M
        if (!orc_sub_type) continue;
107
108
2.46M
        std::string table_col_name = to_lower(orc_type->getFieldName(i));
109
2.46M
        table_col_name_to_orc_type_map[table_col_name] = orc_sub_type;
110
2.46M
    }
111
112
23.1k
    std::set<uint64_t> column_ids;
113
23.1k
    std::set<uint64_t> filter_column_ids;
114
115
    // helper to process access paths for a given top-level orc field
116
23.1k
    auto process_access_paths = [](const orc::Type* orc_field,
117
23.1k
                                   const std::vector<TColumnAccessPath>& access_paths,
118
23.1k
                                   std::set<uint64_t>& out_ids) {
119
13.5k
        process_nested_access_paths(
120
13.5k
                orc_field, access_paths, out_ids,
121
13.6k
                [](const orc::Type* type) { return type->getColumnId(); },
122
13.5k
                [](const orc::Type* type) { return type->getMaximumColumnId(); },
123
13.5k
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
124
13.5k
    };
125
126
98.2k
    for (const auto* slot : tuple_descriptor->slots()) {
127
98.2k
        auto it = table_col_name_to_orc_type_map.find(slot->col_name_lower_case());
128
98.2k
        if (it == table_col_name_to_orc_type_map.end()) {
129
            // Column not found in file
130
17.1k
            continue;
131
17.1k
        }
132
81.0k
        const orc::Type* orc_field = it->second;
133
134
        // primitive (non-nested) types
135
81.0k
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
136
81.0k
             slot->col_type() != TYPE_MAP)) {
137
67.4k
            column_ids.insert(orc_field->getColumnId());
138
67.4k
            if (slot->is_predicate()) {
139
16.6k
                filter_column_ids.insert(orc_field->getColumnId());
140
16.6k
            }
141
67.4k
            continue;
142
67.4k
        }
143
144
        // complex types
145
13.5k
        const auto& all_access_paths = slot->all_access_paths();
146
13.5k
        process_access_paths(orc_field, all_access_paths, column_ids);
147
148
13.5k
        const auto& predicate_access_paths = slot->predicate_access_paths();
149
13.5k
        if (!predicate_access_paths.empty()) {
150
214
            process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
151
214
        }
152
13.5k
    }
153
154
23.1k
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
155
23.1k
}
156
157
ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
158
342
        const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
159
    // map top-level table column position -> orc::Type*
160
342
    std::unordered_map<uint64_t, const orc::Type*> table_col_pos_to_orc_type_map;
161
1.59k
    for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
162
1.25k
        auto orc_sub_type = orc_type->getSubtype(i);
163
1.25k
        if (!orc_sub_type) continue;
164
165
1.25k
        table_col_pos_to_orc_type_map[i] = orc_sub_type;
166
1.25k
    }
167
168
342
    std::set<uint64_t> column_ids;
169
342
    std::set<uint64_t> filter_column_ids;
170
171
    // helper to process access paths for a given top-level orc field
172
342
    auto process_access_paths = [](const orc::Type* orc_field,
173
342
                                   const std::vector<TColumnAccessPath>& access_paths,
174
342
                                   std::set<uint64_t>& out_ids) {
175
13
        process_nested_access_paths(
176
13
                orc_field, access_paths, out_ids,
177
13
                [](const orc::Type* type) { return type->getColumnId(); },
178
13
                [](const orc::Type* type) { return type->getMaximumColumnId(); },
179
13
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
180
13
    };
181
182
1.10k
    for (const auto* slot : tuple_descriptor->slots()) {
183
1.10k
        auto it = table_col_pos_to_orc_type_map.find(slot->col_pos());
184
1.10k
        if (it == table_col_pos_to_orc_type_map.end()) {
185
            // Column not found in file
186
1.09k
            continue;
187
1.09k
        }
188
11
        const orc::Type* orc_field = it->second;
189
190
        // primitive (non-nested) types
191
11
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
192
11
             slot->col_type() != TYPE_MAP)) {
193
6
            column_ids.insert(orc_field->getColumnId());
194
6
            if (slot->is_predicate()) {
195
0
                filter_column_ids.insert(orc_field->getColumnId());
196
0
            }
197
6
            continue;
198
6
        }
199
200
5
        const auto& all_access_paths = slot->all_access_paths();
201
        // complex types
202
5
        process_access_paths(orc_field, all_access_paths, column_ids);
203
204
5
        const auto& predicate_access_paths = slot->predicate_access_paths();
205
6
        if (!predicate_access_paths.empty()) {
206
6
            process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
207
6
        }
208
5
    }
209
210
342
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
211
342
}
212
213
Status HiveParquetReader::init_reader(
214
        const std::vector<std::string>& read_table_col_names,
215
        std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
216
        const VExprContextSPtrs& conjuncts,
217
        phmap::flat_hash_map<int, std::vector<std::shared_ptr<ColumnPredicate>>>&
218
                slot_id_to_predicates,
219
        const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor,
220
        const std::unordered_map<std::string, int>* colname_to_slot_id,
221
        const VExprContextSPtrs* not_single_slot_filter_conjuncts,
222
12.3k
        const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
223
12.3k
    auto* parquet_reader = static_cast<ParquetReader*>(_file_format_reader.get());
224
12.3k
    const FieldDescriptor* field_desc = nullptr;
225
12.3k
    RETURN_IF_ERROR(parquet_reader->get_file_metadata_schema(&field_desc));
226
12.3k
    DCHECK(field_desc != nullptr);
227
228
12.3k
    if (_state->query_options().hive_parquet_use_column_names) {
229
        // Directly use the table column name to match the file column name, but pay attention to the case issue.
230
12.1k
        RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(tuple_descriptor, *field_desc,
231
12.1k
                                                            table_info_node_ptr, _is_file_slot));
232
12.1k
    } else {                                                   // use idx
233
282
        std::map<std::string, const SlotDescriptor*> slot_map; //table_name to slot
234
996
        for (const auto& slot : tuple_descriptor->slots()) {
235
996
            slot_map.emplace(slot->col_name_lower_case(), slot);
236
996
        }
237
238
        // For top-level columns, use indexes to match, and for sub-columns, still use name to match columns.
239
282
        auto parquet_fields_schema = field_desc->get_fields_schema();
240
1.10k
        for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) {
241
824
            auto table_column_name = read_table_col_names[idx];
242
824
            auto file_index = _params.column_idxs[idx];
243
244
824
            if (file_index >= parquet_fields_schema.size()) {
245
                // Non-partitioning columns, which may be columns added later.
246
112
                table_info_node_ptr->add_not_exist_children(table_column_name);
247
712
            } else {
248
                // Non-partitioning columns, columns that exist in both the table and the file.
249
712
                auto field_node = std::make_shared<Node>();
250
                // for sub-columns, still use name to match columns.
251
712
                RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
252
712
                        slot_map[table_column_name]->type(), parquet_fields_schema[file_index],
253
712
                        field_node));
254
712
                table_info_node_ptr->add_children(
255
712
                        table_column_name, parquet_fields_schema[file_index].name, field_node);
256
712
            }
257
258
824
            slot_map.erase(table_column_name);
259
824
        }
260
        /*
261
         * `_params.column_idxs` only have `isIsFileSlot()`, so we need add `partition slot`.
262
         * eg:
263
         * Table : A, B, C, D     (D: partition column)
264
         * Parquet file : A, B
265
         * Column C is obtained by add column.
266
         *
267
         * sql : select * from table;
268
         * slot : A, B, C, D
269
         * _params.column_idxs: 0, 1, 2 (There is no 3, because column D is the partition column)
270
         *
271
         */
272
282
        for (const auto& [partition_col_name, _] : slot_map) {
273
176
            table_info_node_ptr->add_not_exist_children(partition_col_name);
274
176
        }
275
282
    }
276
277
12.3k
    auto column_id_result = ColumnIdResult();
278
12.3k
    if (_state->query_options().hive_parquet_use_column_names) {
279
12.0k
        column_id_result = _create_column_ids(field_desc, tuple_descriptor);
280
12.0k
    } else {
281
302
        column_id_result = _create_column_ids_by_top_level_col_index(field_desc, tuple_descriptor);
282
302
    }
283
284
12.3k
    const auto& column_ids = column_id_result.column_ids;
285
12.3k
    const auto& filter_column_ids = column_id_result.filter_column_ids;
286
287
12.3k
    RETURN_IF_ERROR(init_row_filters());
288
289
12.3k
    return parquet_reader->init_reader(
290
12.3k
            read_table_col_names, col_name_to_block_idx, conjuncts, slot_id_to_predicates,
291
12.3k
            tuple_descriptor, row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts,
292
12.3k
            slot_id_to_filter_conjuncts, table_info_node_ptr, true, column_ids, filter_column_ids);
293
12.3k
}
294
295
ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
296
12.0k
                                                     const TupleDescriptor* tuple_descriptor) {
297
    // First, assign column IDs to the field descriptor
298
12.0k
    auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
299
12.0k
    mutable_field_desc->assign_ids();
300
301
    // map top-level table column name (lower-cased) -> FieldSchema*
302
12.0k
    std::unordered_map<std::string, const FieldSchema*> table_col_name_to_field_schema_map;
303
109k
    for (int i = 0; i < field_desc->size(); ++i) {
304
97.2k
        auto field_schema = field_desc->get_column(i);
305
97.2k
        if (!field_schema) continue;
306
307
97.2k
        table_col_name_to_field_schema_map[field_schema->lower_case_name] = field_schema;
308
97.2k
    }
309
310
12.0k
    std::set<uint64_t> column_ids;
311
12.0k
    std::set<uint64_t> filter_column_ids;
312
313
    // helper to process access paths for a given top-level parquet field
314
12.0k
    auto process_access_paths = [](const FieldSchema* parquet_field,
315
12.0k
                                   const std::vector<TColumnAccessPath>& access_paths,
316
16.7k
                                   std::set<uint64_t>& out_ids) {
317
16.7k
        process_nested_access_paths(
318
16.7k
                parquet_field, access_paths, out_ids,
319
16.7k
                [](const FieldSchema* field) { return field->get_column_id(); },
320
16.7k
                [](const FieldSchema* field) { return field->get_max_column_id(); },
321
16.7k
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
322
16.7k
    };
323
324
62.7k
    for (const auto* slot : tuple_descriptor->slots()) {
325
62.7k
        auto it = table_col_name_to_field_schema_map.find(slot->col_name_lower_case());
326
62.7k
        if (it == table_col_name_to_field_schema_map.end()) {
327
            // Column not found in file
328
7.39k
            continue;
329
7.39k
        }
330
55.3k
        auto field_schema = it->second;
331
332
        // primitive (non-nested) types
333
55.3k
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
334
55.3k
             slot->col_type() != TYPE_MAP)) {
335
38.8k
            column_ids.insert(field_schema->column_id);
336
337
38.8k
            if (slot->is_predicate()) {
338
5.89k
                filter_column_ids.insert(field_schema->column_id);
339
5.89k
            }
340
38.8k
            continue;
341
38.8k
        }
342
343
        // complex types
344
16.5k
        const auto& all_access_paths = slot->all_access_paths();
345
16.5k
        process_access_paths(field_schema, all_access_paths, column_ids);
346
347
16.5k
        const auto& predicate_access_paths = slot->predicate_access_paths();
348
16.5k
        if (!predicate_access_paths.empty()) {
349
290
            process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
350
290
        }
351
16.5k
    }
352
353
12.0k
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
354
12.0k
}
355
356
ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
357
298
        const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
358
    // First, assign column IDs to the field descriptor
359
298
    auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
360
298
    mutable_field_desc->assign_ids();
361
362
    // map top-level table column position -> FieldSchema*
363
298
    std::unordered_map<uint64_t, const FieldSchema*> table_col_pos_to_field_schema_map;
364
1.42k
    for (int i = 0; i < field_desc->size(); ++i) {
365
1.12k
        auto field_schema = field_desc->get_column(i);
366
1.12k
        if (!field_schema) continue;
367
368
1.12k
        table_col_pos_to_field_schema_map[i] = field_schema;
369
1.12k
    }
370
371
298
    std::set<uint64_t> column_ids;
372
298
    std::set<uint64_t> filter_column_ids;
373
374
    // helper to process access paths for a given top-level parquet field
375
298
    auto process_access_paths = [](const FieldSchema* parquet_field,
376
298
                                   const std::vector<TColumnAccessPath>& access_paths,
377
298
                                   std::set<uint64_t>& out_ids) {
378
13
        process_nested_access_paths(
379
13
                parquet_field, access_paths, out_ids,
380
13
                [](const FieldSchema* field) { return field->get_column_id(); },
381
13
                [](const FieldSchema* field) { return field->get_max_column_id(); },
382
13
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
383
13
    };
384
385
1.01k
    for (const auto* slot : tuple_descriptor->slots()) {
386
1.01k
        auto it = table_col_pos_to_field_schema_map.find(slot->col_pos());
387
1.01k
        if (it == table_col_pos_to_field_schema_map.end()) {
388
            // Column not found in file
389
1.00k
            continue;
390
1.00k
        }
391
13
        auto field_schema = it->second;
392
393
        // primitive (non-nested) types
394
13
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
395
13
             slot->col_type() != TYPE_MAP)) {
396
6
            column_ids.insert(field_schema->column_id);
397
398
6
            if (slot->is_predicate()) {
399
0
                filter_column_ids.insert(field_schema->column_id);
400
0
            }
401
6
            continue;
402
6
        }
403
404
        // complex types
405
7
        const auto& all_access_paths = slot->all_access_paths();
406
7
        process_access_paths(field_schema, all_access_paths, column_ids);
407
408
7
        const auto& predicate_access_paths = slot->predicate_access_paths();
409
7
        if (!predicate_access_paths.empty()) {
410
6
            process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
411
6
        }
412
7
    }
413
414
298
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
415
298
}
416
417
#include "common/compile_check_end.h"
418
} // namespace doris