Coverage Report

Created: 2026-04-09 21:24

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format/table/hive_reader.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "format/table/hive_reader.h"
19
20
#include <vector>
21
22
#include "common/status.h"
23
#include "format/table/hive/hive_orc_nested_column_utils.h"
24
#include "format/table/hive/hive_parquet_nested_column_utils.h"
25
#include "format/table/nested_column_access_helper.h"
26
#include "runtime/runtime_state.h"
27
28
namespace doris {
29
#include "common/compile_check_begin.h"
30
31
11
Status HiveOrcReader::on_before_init_reader(ReaderInitContext* ctx) {
32
11
    _column_descs = ctx->column_descs;
33
11
    _fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
34
11
    RETURN_IF_ERROR(
35
11
            _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values));
36
99
    for (auto& desc : *ctx->column_descs) {
37
99
        if (desc.category == ColumnCategory::REGULAR ||
38
99
            desc.category == ColumnCategory::GENERATED) {
39
99
            ctx->column_names.push_back(desc.name);
40
99
        }
41
99
    }
42
43
    // Get file type (available because _create_file_reader() runs before this hook)
44
11
    const orc::Type* orc_type_ptr = nullptr;
45
11
    RETURN_IF_ERROR(get_file_type(&orc_type_ptr));
46
11
    bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);
47
48
    // Build table_info_node based on config
49
11
    if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) {
50
11
        RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(ctx->tuple_descriptor, orc_type_ptr,
51
11
                                                        ctx->table_info_node, _is_file_slot));
52
11
    } else {
53
0
        ctx->table_info_node = std::make_shared<StructNode>();
54
0
        std::map<std::string, const SlotDescriptor*> slot_map;
55
0
        for (const auto& slot : ctx->tuple_descriptor->slots()) {
56
0
            slot_map.emplace(slot->col_name_lower_case(), slot);
57
0
        }
58
59
0
        for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) {
60
0
            auto table_column_name = ctx->column_names[idx];
61
0
            auto file_index = get_scan_params().column_idxs[idx];
62
63
0
            if (file_index >= orc_type_ptr->getSubtypeCount()) {
64
0
                ctx->table_info_node->add_not_exist_children(table_column_name);
65
0
            } else {
66
0
                auto field_node = std::make_shared<Node>();
67
0
                RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
68
0
                        slot_map[table_column_name]->type(), orc_type_ptr->getSubtype(file_index),
69
0
                        field_node));
70
0
                ctx->table_info_node->add_children(
71
0
                        table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
72
0
            }
73
0
            slot_map.erase(table_column_name);
74
0
        }
75
0
        for (const auto& [partition_col_name, _] : slot_map) {
76
0
            ctx->table_info_node->add_not_exist_children(partition_col_name);
77
0
        }
78
0
    }
79
80
    // Compute column_ids
81
11
    auto column_id_result = ColumnIdResult();
82
11
    if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) {
83
11
        column_id_result = _create_column_ids(orc_type_ptr, ctx->tuple_descriptor);
84
11
    } else {
85
0
        column_id_result =
86
0
                _create_column_ids_by_top_level_col_index(orc_type_ptr, ctx->tuple_descriptor);
87
0
    }
88
11
    ctx->column_ids = std::move(column_id_result.column_ids);
89
11
    ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);
90
91
    // _is_acid is false by default, no need to set explicitly
92
11
    return Status::OK();
93
11
}
94
95
ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
96
17
                                                 const TupleDescriptor* tuple_descriptor) {
97
    // map top-level table column name (lower-cased) -> orc::Type*
98
17
    std::unordered_map<std::string, const orc::Type*> table_col_name_to_orc_type_map;
99
164
    for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
100
147
        auto orc_sub_type = orc_type->getSubtype(i);
101
147
        if (!orc_sub_type) continue;
102
103
147
        std::string table_col_name = to_lower(orc_type->getFieldName(i));
104
147
        table_col_name_to_orc_type_map[table_col_name] = orc_sub_type;
105
147
    }
106
107
17
    std::set<uint64_t> column_ids;
108
17
    std::set<uint64_t> filter_column_ids;
109
110
    // helper to process access paths for a given top-level orc field
111
17
    auto process_access_paths = [](const orc::Type* orc_field,
112
17
                                   const std::vector<TColumnAccessPath>& access_paths,
113
17
                                   std::set<uint64_t>& out_ids) {
114
13
        process_nested_access_paths(
115
13
                orc_field, access_paths, out_ids,
116
13
                [](const orc::Type* type) { return type->getColumnId(); },
117
13
                [](const orc::Type* type) { return type->getMaximumColumnId(); },
118
13
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
119
13
    };
120
121
112
    for (const auto* slot : tuple_descriptor->slots()) {
122
112
        auto it = table_col_name_to_orc_type_map.find(slot->col_name_lower_case());
123
112
        if (it == table_col_name_to_orc_type_map.end()) {
124
            // Column not found in file
125
0
            continue;
126
0
        }
127
112
        const orc::Type* orc_field = it->second;
128
129
        // primitive (non-nested) types
130
112
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
131
112
             slot->col_type() != TYPE_MAP)) {
132
105
            column_ids.insert(orc_field->getColumnId());
133
105
            if (slot->is_predicate()) {
134
0
                filter_column_ids.insert(orc_field->getColumnId());
135
0
            }
136
105
            continue;
137
105
        }
138
139
        // complex types
140
7
        const auto& all_access_paths = slot->all_access_paths();
141
7
        process_access_paths(orc_field, all_access_paths, column_ids);
142
143
7
        const auto& predicate_access_paths = slot->predicate_access_paths();
144
7
        if (!predicate_access_paths.empty()) {
145
6
            process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
146
6
        }
147
7
    }
148
149
17
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
150
17
}
151
152
ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
153
6
        const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
154
    // map top-level table column position -> orc::Type*
155
6
    std::unordered_map<uint64_t, const orc::Type*> table_col_pos_to_orc_type_map;
156
54
    for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
157
48
        auto orc_sub_type = orc_type->getSubtype(i);
158
48
        if (!orc_sub_type) continue;
159
160
48
        table_col_pos_to_orc_type_map[i] = orc_sub_type;
161
48
    }
162
163
6
    std::set<uint64_t> column_ids;
164
6
    std::set<uint64_t> filter_column_ids;
165
166
    // helper to process access paths for a given top-level orc field
167
6
    auto process_access_paths = [](const orc::Type* orc_field,
168
6
                                   const std::vector<TColumnAccessPath>& access_paths,
169
13
                                   std::set<uint64_t>& out_ids) {
170
13
        process_nested_access_paths(
171
13
                orc_field, access_paths, out_ids,
172
13
                [](const orc::Type* type) { return type->getColumnId(); },
173
13
                [](const orc::Type* type) { return type->getMaximumColumnId(); },
174
13
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
175
13
    };
176
177
13
    for (const auto* slot : tuple_descriptor->slots()) {
178
13
        auto it = table_col_pos_to_orc_type_map.find(slot->col_pos());
179
13
        if (it == table_col_pos_to_orc_type_map.end()) {
180
            // Column not found in file
181
0
            continue;
182
0
        }
183
13
        const orc::Type* orc_field = it->second;
184
185
        // primitive (non-nested) types
186
13
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
187
13
             slot->col_type() != TYPE_MAP)) {
188
6
            column_ids.insert(orc_field->getColumnId());
189
6
            if (slot->is_predicate()) {
190
0
                filter_column_ids.insert(orc_field->getColumnId());
191
0
            }
192
6
            continue;
193
6
        }
194
195
7
        const auto& all_access_paths = slot->all_access_paths();
196
        // complex types
197
7
        process_access_paths(orc_field, all_access_paths, column_ids);
198
199
7
        const auto& predicate_access_paths = slot->predicate_access_paths();
200
7
        if (!predicate_access_paths.empty()) {
201
6
            process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
202
6
        }
203
7
    }
204
205
6
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
206
6
}
207
208
14
Status HiveParquetReader::on_before_init_reader(ReaderInitContext* ctx) {
209
14
    _column_descs = ctx->column_descs;
210
14
    _fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
211
14
    RETURN_IF_ERROR(
212
14
            _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values));
213
51
    for (auto& desc : *ctx->column_descs) {
214
51
        if (desc.category == ColumnCategory::REGULAR ||
215
51
            desc.category == ColumnCategory::GENERATED) {
216
51
            ctx->column_names.push_back(desc.name);
217
51
        }
218
51
    }
219
220
    // Get file metadata schema (available because _open_file() runs before this hook)
221
14
    const FieldDescriptor* field_desc = nullptr;
222
14
    RETURN_IF_ERROR(get_file_metadata_schema(&field_desc));
223
14
    DCHECK(field_desc != nullptr);
224
225
    // Build table_info_node based on config
226
14
    if (get_state()->query_options().hive_parquet_use_column_names) {
227
14
        RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(ctx->tuple_descriptor, *field_desc,
228
14
                                                            ctx->table_info_node, _is_file_slot));
229
14
    } else {
230
0
        ctx->table_info_node = std::make_shared<StructNode>();
231
0
        std::map<std::string, const SlotDescriptor*> slot_map;
232
0
        for (const auto& slot : ctx->tuple_descriptor->slots()) {
233
0
            slot_map.emplace(slot->col_name_lower_case(), slot);
234
0
        }
235
236
0
        auto parquet_fields_schema = field_desc->get_fields_schema();
237
0
        for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) {
238
0
            auto table_column_name = ctx->column_names[idx];
239
0
            auto file_index = get_scan_params().column_idxs[idx];
240
241
0
            if (file_index >= parquet_fields_schema.size()) {
242
0
                ctx->table_info_node->add_not_exist_children(table_column_name);
243
0
            } else {
244
0
                auto field_node = std::make_shared<Node>();
245
0
                RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
246
0
                        slot_map[table_column_name]->type(), parquet_fields_schema[file_index],
247
0
                        field_node));
248
0
                ctx->table_info_node->add_children(
249
0
                        table_column_name, parquet_fields_schema[file_index].name, field_node);
250
0
            }
251
0
            slot_map.erase(table_column_name);
252
0
        }
253
0
        for (const auto& [partition_col_name, _] : slot_map) {
254
0
            ctx->table_info_node->add_not_exist_children(partition_col_name);
255
0
        }
256
0
    }
257
258
    // Compute column_ids for lazy materialization
259
14
    auto column_id_result = ColumnIdResult();
260
14
    if (get_state()->query_options().hive_parquet_use_column_names) {
261
14
        column_id_result = _create_column_ids(field_desc, ctx->tuple_descriptor);
262
14
    } else {
263
0
        column_id_result =
264
0
                _create_column_ids_by_top_level_col_index(field_desc, ctx->tuple_descriptor);
265
0
    }
266
14
    ctx->column_ids = std::move(column_id_result.column_ids);
267
14
    ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);
268
269
14
    _filter_groups = true;
270
14
    return Status::OK();
271
14
}
272
273
ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
274
20
                                                     const TupleDescriptor* tuple_descriptor) {
275
    // First, assign column IDs to the field descriptor
276
20
    auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
277
20
    mutable_field_desc->assign_ids();
278
279
    // map top-level table column name (lower-cased) -> FieldSchema*
280
20
    std::unordered_map<std::string, const FieldSchema*> table_col_name_to_field_schema_map;
281
278
    for (int i = 0; i < field_desc->size(); ++i) {
282
258
        auto field_schema = field_desc->get_column(i);
283
258
        if (!field_schema) continue;
284
285
258
        table_col_name_to_field_schema_map[field_schema->lower_case_name] = field_schema;
286
258
    }
287
288
20
    std::set<uint64_t> column_ids;
289
20
    std::set<uint64_t> filter_column_ids;
290
291
    // helper to process access paths for a given top-level parquet field
292
20
    auto process_access_paths = [](const FieldSchema* parquet_field,
293
20
                                   const std::vector<TColumnAccessPath>& access_paths,
294
20
                                   std::set<uint64_t>& out_ids) {
295
13
        process_nested_access_paths(
296
13
                parquet_field, access_paths, out_ids,
297
13
                [](const FieldSchema* field) { return field->get_column_id(); },
298
13
                [](const FieldSchema* field) { return field->get_max_column_id(); },
299
13
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
300
13
    };
301
302
64
    for (const auto* slot : tuple_descriptor->slots()) {
303
64
        auto it = table_col_name_to_field_schema_map.find(slot->col_name_lower_case());
304
64
        if (it == table_col_name_to_field_schema_map.end()) {
305
            // Column not found in file
306
0
            continue;
307
0
        }
308
64
        auto field_schema = it->second;
309
310
        // primitive (non-nested) types
311
64
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
312
64
             slot->col_type() != TYPE_MAP)) {
313
57
            column_ids.insert(field_schema->column_id);
314
315
57
            if (slot->is_predicate()) {
316
0
                filter_column_ids.insert(field_schema->column_id);
317
0
            }
318
57
            continue;
319
57
        }
320
321
        // complex types
322
7
        const auto& all_access_paths = slot->all_access_paths();
323
7
        process_access_paths(field_schema, all_access_paths, column_ids);
324
325
7
        const auto& predicate_access_paths = slot->predicate_access_paths();
326
7
        if (!predicate_access_paths.empty()) {
327
6
            process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
328
6
        }
329
7
    }
330
331
20
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
332
20
}
333
334
ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
335
6
        const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
336
    // First, assign column IDs to the field descriptor
337
6
    auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
338
6
    mutable_field_desc->assign_ids();
339
340
    // map top-level table column position -> FieldSchema*
341
6
    std::unordered_map<uint64_t, const FieldSchema*> table_col_pos_to_field_schema_map;
342
54
    for (int i = 0; i < field_desc->size(); ++i) {
343
48
        auto field_schema = field_desc->get_column(i);
344
48
        if (!field_schema) continue;
345
346
48
        table_col_pos_to_field_schema_map[i] = field_schema;
347
48
    }
348
349
6
    std::set<uint64_t> column_ids;
350
6
    std::set<uint64_t> filter_column_ids;
351
352
    // helper to process access paths for a given top-level parquet field
353
6
    auto process_access_paths = [](const FieldSchema* parquet_field,
354
6
                                   const std::vector<TColumnAccessPath>& access_paths,
355
13
                                   std::set<uint64_t>& out_ids) {
356
13
        process_nested_access_paths(
357
13
                parquet_field, access_paths, out_ids,
358
13
                [](const FieldSchema* field) { return field->get_column_id(); },
359
13
                [](const FieldSchema* field) { return field->get_max_column_id(); },
360
13
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
361
13
    };
362
363
13
    for (const auto* slot : tuple_descriptor->slots()) {
364
13
        auto it = table_col_pos_to_field_schema_map.find(slot->col_pos());
365
13
        if (it == table_col_pos_to_field_schema_map.end()) {
366
            // Column not found in file
367
0
            continue;
368
0
        }
369
13
        auto field_schema = it->second;
370
371
        // primitive (non-nested) types
372
13
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
373
13
             slot->col_type() != TYPE_MAP)) {
374
6
            column_ids.insert(field_schema->column_id);
375
376
6
            if (slot->is_predicate()) {
377
0
                filter_column_ids.insert(field_schema->column_id);
378
0
            }
379
6
            continue;
380
6
        }
381
382
        // complex types
383
7
        const auto& all_access_paths = slot->all_access_paths();
384
7
        process_access_paths(field_schema, all_access_paths, column_ids);
385
386
7
        const auto& predicate_access_paths = slot->predicate_access_paths();
387
7
        if (!predicate_access_paths.empty()) {
388
6
            process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
389
6
        }
390
7
    }
391
392
6
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
393
6
}
394
395
#include "common/compile_check_end.h"
396
} // namespace doris