Coverage Report

Created: 2026-04-20 07:27

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format/table/hive_reader.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "format/table/hive_reader.h"
19
20
#include <vector>
21
22
#include "common/status.h"
23
#include "format/table/hive/hive_orc_nested_column_utils.h"
24
#include "format/table/hive/hive_parquet_nested_column_utils.h"
25
#include "format/table/nested_column_access_helper.h"
26
#include "runtime/runtime_state.h"
27
28
namespace doris {
29
30
15.3k
Status HiveOrcReader::on_before_init_reader(ReaderInitContext* ctx) {
31
15.3k
    _column_descs = ctx->column_descs;
32
15.3k
    _fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
33
15.3k
    RETURN_IF_ERROR(
34
15.3k
            _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values));
35
63.2k
    for (auto& desc : *ctx->column_descs) {
36
63.2k
        if (desc.category == ColumnCategory::REGULAR ||
37
63.2k
            desc.category == ColumnCategory::GENERATED) {
38
53.3k
            ctx->column_names.push_back(desc.name);
39
53.3k
        } else if (desc.category == ColumnCategory::SYNTHESIZED &&
40
9.85k
                   desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) {
41
4.99k
            auto topn_row_id_column_iter = _create_topn_row_id_column_iterator();
42
4.99k
            this->register_synthesized_column_handler(
43
4.99k
                    desc.name,
44
4.99k
                    [iter = std::move(topn_row_id_column_iter), this, &desc](
45
4.99k
                            Block* block, size_t rows) -> Status {
46
4.23k
                        return fill_topn_row_id(iter, desc.name, block, rows);
47
4.23k
                    });
48
4.99k
            continue;
49
4.99k
        }
50
63.2k
    }
51
52
    // Get file type (available because _create_file_reader() runs before this hook)
53
15.3k
    const orc::Type* orc_type_ptr = nullptr;
54
15.3k
    RETURN_IF_ERROR(get_file_type(&orc_type_ptr));
55
15.3k
    bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);
56
57
    // Build table_info_node based on config
58
15.3k
    if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) {
59
15.0k
        RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(ctx->tuple_descriptor, orc_type_ptr,
60
15.0k
                                                        ctx->table_info_node, _is_file_slot));
61
15.0k
    } else {
62
320
        ctx->table_info_node = std::make_shared<StructNode>();
63
320
        std::map<std::string, const SlotDescriptor*> slot_map;
64
966
        for (const auto& slot : ctx->tuple_descriptor->slots()) {
65
966
            slot_map.emplace(slot->col_name_lower_case(), slot);
66
966
        }
67
68
1.17k
        for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) {
69
850
            auto table_column_name = ctx->column_names[idx];
70
850
            auto file_index = get_scan_params().column_idxs[idx];
71
72
850
            if (file_index >= orc_type_ptr->getSubtypeCount()) {
73
112
                ctx->table_info_node->add_not_exist_children(table_column_name);
74
738
            } else {
75
738
                auto field_node = std::make_shared<Node>();
76
738
                RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
77
738
                        slot_map[table_column_name]->type(), orc_type_ptr->getSubtype(file_index),
78
738
                        field_node));
79
738
                ctx->table_info_node->add_children(
80
738
                        table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
81
738
            }
82
850
            slot_map.erase(table_column_name);
83
850
        }
84
320
        for (const auto& [partition_col_name, _] : slot_map) {
85
112
            ctx->table_info_node->add_not_exist_children(partition_col_name);
86
112
        }
87
320
    }
88
89
    // Compute column_ids
90
15.3k
    auto column_id_result = ColumnIdResult();
91
15.3k
    if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) {
92
15.0k
        column_id_result = _create_column_ids(orc_type_ptr, ctx->tuple_descriptor);
93
15.0k
    } else {
94
362
        column_id_result =
95
362
                _create_column_ids_by_top_level_col_index(orc_type_ptr, ctx->tuple_descriptor);
96
362
    }
97
15.3k
    ctx->column_ids = std::move(column_id_result.column_ids);
98
15.3k
    ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);
99
100
    // _is_acid is false by default, no need to set explicitly
101
15.3k
    return Status::OK();
102
15.3k
}
103
104
ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
105
14.9k
                                                 const TupleDescriptor* tuple_descriptor) {
106
    // map top-level table column name (lower-cased) -> orc::Type*
107
14.9k
    std::unordered_map<std::string, const orc::Type*> table_col_name_to_orc_type_map;
108
1.27M
    for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
109
1.25M
        auto orc_sub_type = orc_type->getSubtype(i);
110
1.25M
        if (!orc_sub_type) continue;
111
112
1.25M
        std::string table_col_name = to_lower(orc_type->getFieldName(i));
113
1.25M
        table_col_name_to_orc_type_map[table_col_name] = orc_sub_type;
114
1.25M
    }
115
116
14.9k
    std::set<uint64_t> column_ids;
117
14.9k
    std::set<uint64_t> filter_column_ids;
118
119
    // helper to process access paths for a given top-level orc field
120
14.9k
    auto process_access_paths = [](const orc::Type* orc_field,
121
14.9k
                                   const std::vector<TColumnAccessPath>& access_paths,
122
14.9k
                                   std::set<uint64_t>& out_ids) {
123
10.1k
        process_nested_access_paths(
124
10.1k
                orc_field, access_paths, out_ids,
125
10.1k
                [](const orc::Type* type) { return type->getColumnId(); },
126
10.1k
                [](const orc::Type* type) { return type->getMaximumColumnId(); },
127
10.1k
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
128
10.1k
    };
129
130
62.2k
    for (const auto* slot : tuple_descriptor->slots()) {
131
62.2k
        auto it = table_col_name_to_orc_type_map.find(slot->col_name_lower_case());
132
62.2k
        if (it == table_col_name_to_orc_type_map.end()) {
133
            // Column not found in file
134
10.4k
            continue;
135
10.4k
        }
136
51.8k
        const orc::Type* orc_field = it->second;
137
138
        // primitive (non-nested) types
139
51.8k
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
140
51.8k
             slot->col_type() != TYPE_MAP)) {
141
41.8k
            column_ids.insert(orc_field->getColumnId());
142
41.8k
            if (slot->is_predicate()) {
143
10.3k
                filter_column_ids.insert(orc_field->getColumnId());
144
10.3k
            }
145
41.8k
            continue;
146
41.8k
        }
147
148
        // complex types
149
10.0k
        const auto& all_access_paths = slot->all_access_paths();
150
10.0k
        process_access_paths(orc_field, all_access_paths, column_ids);
151
152
10.0k
        const auto& predicate_access_paths = slot->predicate_access_paths();
153
10.0k
        if (!predicate_access_paths.empty()) {
154
150
            process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
155
150
        }
156
10.0k
    }
157
158
14.9k
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
159
14.9k
}
160
161
ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
162
314
        const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
163
    // map top-level table column position -> orc::Type*
164
314
    std::unordered_map<uint64_t, const orc::Type*> table_col_pos_to_orc_type_map;
165
1.50k
    for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
166
1.18k
        auto orc_sub_type = orc_type->getSubtype(i);
167
1.18k
        if (!orc_sub_type) continue;
168
169
1.18k
        table_col_pos_to_orc_type_map[i] = orc_sub_type;
170
1.18k
    }
171
172
314
    std::set<uint64_t> column_ids;
173
314
    std::set<uint64_t> filter_column_ids;
174
175
    // helper to process access paths for a given top-level orc field
176
314
    auto process_access_paths = [](const orc::Type* orc_field,
177
314
                                   const std::vector<TColumnAccessPath>& access_paths,
178
314
                                   std::set<uint64_t>& out_ids) {
179
13
        process_nested_access_paths(
180
13
                orc_field, access_paths, out_ids,
181
13
                [](const orc::Type* type) { return type->getColumnId(); },
182
13
                [](const orc::Type* type) { return type->getMaximumColumnId(); },
183
13
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
184
13
    };
185
186
973
    for (const auto* slot : tuple_descriptor->slots()) {
187
973
        auto it = table_col_pos_to_orc_type_map.find(slot->col_pos());
188
973
        if (it == table_col_pos_to_orc_type_map.end()) {
189
            // Column not found in file
190
960
            continue;
191
960
        }
192
13
        const orc::Type* orc_field = it->second;
193
194
        // primitive (non-nested) types
195
13
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
196
13
             slot->col_type() != TYPE_MAP)) {
197
6
            column_ids.insert(orc_field->getColumnId());
198
6
            if (slot->is_predicate()) {
199
0
                filter_column_ids.insert(orc_field->getColumnId());
200
0
            }
201
6
            continue;
202
6
        }
203
204
7
        const auto& all_access_paths = slot->all_access_paths();
205
        // complex types
206
7
        process_access_paths(orc_field, all_access_paths, column_ids);
207
208
7
        const auto& predicate_access_paths = slot->predicate_access_paths();
209
7
        if (!predicate_access_paths.empty()) {
210
6
            process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
211
6
        }
212
7
    }
213
214
314
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
215
314
}
216
217
10.4k
Status HiveParquetReader::on_before_init_reader(ReaderInitContext* ctx) {
218
10.4k
    _column_descs = ctx->column_descs;
219
10.4k
    _fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
220
10.4k
    RETURN_IF_ERROR(
221
10.4k
            _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values));
222
56.0k
    for (auto& desc : *ctx->column_descs) {
223
56.0k
        if (desc.category == ColumnCategory::REGULAR ||
224
56.0k
            desc.category == ColumnCategory::GENERATED) {
225
49.9k
            ctx->column_names.push_back(desc.name);
226
49.9k
        } else if (desc.category == ColumnCategory::SYNTHESIZED &&
227
6.14k
                   desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) {
228
1.84k
            auto topn_row_id_column_iter = _create_topn_row_id_column_iterator();
229
1.84k
            this->register_synthesized_column_handler(
230
1.84k
                    desc.name,
231
1.84k
                    [iter = std::move(topn_row_id_column_iter), this, &desc](
232
2.33k
                            Block* block, size_t rows) -> Status {
233
2.33k
                        return fill_topn_row_id(iter, desc.name, block, rows);
234
2.33k
                    });
235
1.84k
            continue;
236
1.84k
        }
237
56.0k
    }
238
239
    // Get file metadata schema (available because _open_file() runs before this hook)
240
10.4k
    const FieldDescriptor* field_desc = nullptr;
241
10.4k
    RETURN_IF_ERROR(get_file_metadata_schema(&field_desc));
242
10.4k
    DCHECK(field_desc != nullptr);
243
244
    // Build table_info_node based on config
245
10.4k
    if (get_state()->query_options().hive_parquet_use_column_names) {
246
10.2k
        RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(ctx->tuple_descriptor, *field_desc,
247
10.2k
                                                            ctx->table_info_node, _is_file_slot));
248
10.2k
    } else {
249
266
        ctx->table_info_node = std::make_shared<StructNode>();
250
266
        std::map<std::string, const SlotDescriptor*> slot_map;
251
880
        for (const auto& slot : ctx->tuple_descriptor->slots()) {
252
880
            slot_map.emplace(slot->col_name_lower_case(), slot);
253
880
        }
254
255
266
        auto parquet_fields_schema = field_desc->get_fields_schema();
256
1.03k
        for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) {
257
764
            auto table_column_name = ctx->column_names[idx];
258
764
            auto file_index = get_scan_params().column_idxs[idx];
259
260
764
            if (file_index >= parquet_fields_schema.size()) {
261
112
                ctx->table_info_node->add_not_exist_children(table_column_name);
262
652
            } else {
263
652
                auto field_node = std::make_shared<Node>();
264
652
                RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
265
652
                        slot_map[table_column_name]->type(), parquet_fields_schema[file_index],
266
652
                        field_node));
267
652
                ctx->table_info_node->add_children(
268
652
                        table_column_name, parquet_fields_schema[file_index].name, field_node);
269
652
            }
270
764
            slot_map.erase(table_column_name);
271
764
        }
272
266
        for (const auto& [partition_col_name, _] : slot_map) {
273
116
            ctx->table_info_node->add_not_exist_children(partition_col_name);
274
116
        }
275
266
    }
276
277
    // Compute column_ids for lazy materialization
278
10.4k
    auto column_id_result = ColumnIdResult();
279
10.4k
    if (get_state()->query_options().hive_parquet_use_column_names) {
280
10.1k
        column_id_result = _create_column_ids(field_desc, ctx->tuple_descriptor);
281
10.1k
    } else {
282
304
        column_id_result =
283
304
                _create_column_ids_by_top_level_col_index(field_desc, ctx->tuple_descriptor);
284
304
    }
285
10.4k
    ctx->column_ids = std::move(column_id_result.column_ids);
286
10.4k
    ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);
287
288
10.4k
    _filter_groups = true;
289
10.4k
    return Status::OK();
290
10.4k
}
291
292
ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
293
10.2k
                                                     const TupleDescriptor* tuple_descriptor) {
294
    // First, assign column IDs to the field descriptor
295
10.2k
    auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
296
10.2k
    mutable_field_desc->assign_ids();
297
298
    // map top-level table column name (lower-cased) -> FieldSchema*
299
10.2k
    std::unordered_map<std::string, const FieldSchema*> table_col_name_to_field_schema_map;
300
87.7k
    for (int i = 0; i < field_desc->size(); ++i) {
301
77.5k
        auto field_schema = field_desc->get_column(i);
302
77.5k
        if (!field_schema) continue;
303
304
77.5k
        table_col_name_to_field_schema_map[field_schema->lower_case_name] = field_schema;
305
77.5k
    }
306
307
10.2k
    std::set<uint64_t> column_ids;
308
10.2k
    std::set<uint64_t> filter_column_ids;
309
310
    // helper to process access paths for a given top-level parquet field
311
10.2k
    auto process_access_paths = [](const FieldSchema* parquet_field,
312
10.2k
                                   const std::vector<TColumnAccessPath>& access_paths,
313
15.8k
                                   std::set<uint64_t>& out_ids) {
314
15.8k
        process_nested_access_paths(
315
15.8k
                parquet_field, access_paths, out_ids,
316
15.8k
                [](const FieldSchema* field) { return field->get_column_id(); },
317
15.8k
                [](const FieldSchema* field) { return field->get_max_column_id(); },
318
15.8k
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
319
15.8k
    };
320
321
55.1k
    for (const auto* slot : tuple_descriptor->slots()) {
322
55.1k
        auto it = table_col_name_to_field_schema_map.find(slot->col_name_lower_case());
323
55.1k
        if (it == table_col_name_to_field_schema_map.end()) {
324
            // Column not found in file
325
6.62k
            continue;
326
6.62k
        }
327
48.5k
        auto field_schema = it->second;
328
329
        // primitive (non-nested) types
330
48.5k
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
331
48.5k
             slot->col_type() != TYPE_MAP)) {
332
32.8k
            column_ids.insert(field_schema->column_id);
333
334
32.8k
            if (slot->is_predicate()) {
335
4.97k
                filter_column_ids.insert(field_schema->column_id);
336
4.97k
            }
337
32.8k
            continue;
338
32.8k
        }
339
340
        // complex types
341
15.7k
        const auto& all_access_paths = slot->all_access_paths();
342
15.7k
        process_access_paths(field_schema, all_access_paths, column_ids);
343
344
15.7k
        const auto& predicate_access_paths = slot->predicate_access_paths();
345
15.7k
        if (!predicate_access_paths.empty()) {
346
188
            process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
347
188
        }
348
15.7k
    }
349
350
10.2k
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
351
10.2k
}
352
353
ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
354
270
        const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
355
    // First, assign column IDs to the field descriptor
356
270
    auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
357
270
    mutable_field_desc->assign_ids();
358
359
    // map top-level table column position -> FieldSchema*
360
270
    std::unordered_map<uint64_t, const FieldSchema*> table_col_pos_to_field_schema_map;
361
1.32k
    for (int i = 0; i < field_desc->size(); ++i) {
362
1.05k
        auto field_schema = field_desc->get_column(i);
363
1.05k
        if (!field_schema) continue;
364
365
1.05k
        table_col_pos_to_field_schema_map[i] = field_schema;
366
1.05k
    }
367
368
270
    std::set<uint64_t> column_ids;
369
270
    std::set<uint64_t> filter_column_ids;
370
371
    // helper to process access paths for a given top-level parquet field
372
270
    auto process_access_paths = [](const FieldSchema* parquet_field,
373
270
                                   const std::vector<TColumnAccessPath>& access_paths,
374
270
                                   std::set<uint64_t>& out_ids) {
375
13
        process_nested_access_paths(
376
13
                parquet_field, access_paths, out_ids,
377
13
                [](const FieldSchema* field) { return field->get_column_id(); },
378
13
                [](const FieldSchema* field) { return field->get_max_column_id(); },
379
13
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
380
13
    };
381
382
893
    for (const auto* slot : tuple_descriptor->slots()) {
383
893
        auto it = table_col_pos_to_field_schema_map.find(slot->col_pos());
384
893
        if (it == table_col_pos_to_field_schema_map.end()) {
385
            // Column not found in file
386
880
            continue;
387
880
        }
388
13
        auto field_schema = it->second;
389
390
        // primitive (non-nested) types
391
13
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
392
13
             slot->col_type() != TYPE_MAP)) {
393
6
            column_ids.insert(field_schema->column_id);
394
395
6
            if (slot->is_predicate()) {
396
0
                filter_column_ids.insert(field_schema->column_id);
397
0
            }
398
6
            continue;
399
6
        }
400
401
        // complex types
402
7
        const auto& all_access_paths = slot->all_access_paths();
403
7
        process_access_paths(field_schema, all_access_paths, column_ids);
404
405
7
        const auto& predicate_access_paths = slot->predicate_access_paths();
406
7
        if (!predicate_access_paths.empty()) {
407
6
            process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
408
6
        }
409
7
    }
410
411
270
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
412
270
}
413
414
} // namespace doris