Coverage Report

Created: 2026-06-02 16:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format/table/hive_reader.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "format/table/hive_reader.h"
19
20
#include <vector>
21
22
#include "common/status.h"
23
#include "format/table/hive/hive_orc_nested_column_utils.h"
24
#include "format/table/hive/hive_parquet_nested_column_utils.h"
25
#include "format/table/nested_column_access_helper.h"
26
#include "runtime/runtime_state.h"
27
28
namespace doris {
29
30
15.3k
Status HiveOrcReader::on_before_init_reader(ReaderInitContext* ctx) {
31
15.3k
    _column_descs = ctx->column_descs;
32
15.3k
    _fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
33
15.3k
    RETURN_IF_ERROR(_extract_partition_values(*ctx->range, ctx->tuple_descriptor,
34
15.3k
                                              _fill_partition_values,
35
15.3k
                                              &_fill_partition_value_is_null));
36
63.5k
    for (const auto& desc : *ctx->column_descs) {
37
63.5k
        if (desc.category == ColumnCategory::REGULAR ||
38
63.5k
            desc.category == ColumnCategory::GENERATED) {
39
53.7k
            ctx->column_names.push_back(desc.name);
40
53.7k
        } else if (desc.category == ColumnCategory::SYNTHESIZED &&
41
9.82k
                   desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) {
42
4.98k
            auto topn_row_id_column_iter = _create_topn_row_id_column_iterator();
43
4.98k
            this->register_synthesized_column_handler(
44
4.98k
                    desc.name,
45
4.98k
                    [iter = std::move(topn_row_id_column_iter), this, &desc](
46
27.2k
                            Block* block, size_t rows) -> Status {
47
27.2k
                        return fill_topn_row_id(iter, desc.name, block, rows);
48
27.2k
                    });
49
4.98k
            continue;
50
4.98k
        }
51
63.5k
    }
52
53
    // Get file type (available because _create_file_reader() runs before this hook)
54
15.3k
    const orc::Type* orc_type_ptr = nullptr;
55
15.3k
    RETURN_IF_ERROR(get_file_type(&orc_type_ptr));
56
15.3k
    bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);
57
58
    // Build table_info_node based on config
59
15.3k
    if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) {
60
15.0k
        RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(ctx->tuple_descriptor, orc_type_ptr,
61
15.0k
                                                        ctx->table_info_node, _is_file_slot));
62
15.0k
    } else {
63
348
        ctx->table_info_node = std::make_shared<StructNode>();
64
348
        std::map<std::string, const SlotDescriptor*> slot_map;
65
974
        for (const auto& slot : ctx->tuple_descriptor->slots()) {
66
974
            slot_map.emplace(slot->col_name_lower_case(), slot);
67
974
        }
68
69
1.20k
        for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) {
70
852
            auto table_column_name = ctx->column_names[idx];
71
852
            auto file_index = get_scan_params().column_idxs[idx];
72
73
852
            if (file_index >= orc_type_ptr->getSubtypeCount()) {
74
112
                ctx->table_info_node->add_not_exist_children(table_column_name);
75
740
            } else {
76
740
                auto field_node = std::make_shared<Node>();
77
740
                RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
78
740
                        slot_map[table_column_name]->type(), orc_type_ptr->getSubtype(file_index),
79
740
                        field_node));
80
740
                ctx->table_info_node->add_children(
81
740
                        table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
82
740
            }
83
852
            slot_map.erase(table_column_name);
84
852
        }
85
348
        for (const auto& [partition_col_name, _] : slot_map) {
86
116
            ctx->table_info_node->add_not_exist_children(partition_col_name);
87
116
        }
88
348
    }
89
90
    // Compute column_ids
91
15.3k
    auto column_id_result = ColumnIdResult();
92
15.3k
    if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) {
93
14.9k
        column_id_result = _create_column_ids(orc_type_ptr, ctx->tuple_descriptor);
94
14.9k
    } else {
95
390
        column_id_result =
96
390
                _create_column_ids_by_top_level_col_index(orc_type_ptr, ctx->tuple_descriptor);
97
390
    }
98
15.3k
    ctx->column_ids = std::move(column_id_result.column_ids);
99
15.3k
    ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);
100
101
    // _is_acid is false by default, no need to set explicitly
102
15.3k
    return Status::OK();
103
15.3k
}
104
105
ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
106
14.9k
                                                 const TupleDescriptor* tuple_descriptor) {
107
    // map top-level table column name (lower-cased) -> orc::Type*
108
14.9k
    std::unordered_map<std::string, const orc::Type*> table_col_name_to_orc_type_map;
109
1.26M
    for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
110
1.25M
        auto orc_sub_type = orc_type->getSubtype(i);
111
1.25M
        if (!orc_sub_type) continue;
112
113
1.25M
        std::string table_col_name = to_lower(orc_type->getFieldName(i));
114
1.25M
        table_col_name_to_orc_type_map[table_col_name] = orc_sub_type;
115
1.25M
    }
116
117
14.9k
    std::set<uint64_t> column_ids;
118
14.9k
    std::set<uint64_t> filter_column_ids;
119
120
    // helper to process access paths for a given top-level orc field
121
14.9k
    auto process_access_paths = [](const orc::Type* orc_field,
122
14.9k
                                   const std::vector<TColumnAccessPath>& access_paths,
123
14.9k
                                   std::set<uint64_t>& out_ids) {
124
10.1k
        process_nested_access_paths(
125
10.1k
                orc_field, access_paths, out_ids,
126
10.1k
                [](const orc::Type* type) { return type->getColumnId(); },
127
10.1k
                [](const orc::Type* type) { return type->getMaximumColumnId(); },
128
10.1k
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
129
10.1k
    };
130
131
62.5k
    for (const auto* slot : tuple_descriptor->slots()) {
132
62.5k
        auto it = table_col_name_to_orc_type_map.find(slot->col_name_lower_case());
133
62.5k
        if (it == table_col_name_to_orc_type_map.end()) {
134
            // Column not found in file
135
10.4k
            continue;
136
10.4k
        }
137
52.1k
        const orc::Type* orc_field = it->second;
138
139
        // primitive (non-nested) types
140
52.1k
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
141
52.1k
             slot->col_type() != TYPE_MAP)) {
142
41.9k
            column_ids.insert(orc_field->getColumnId());
143
41.9k
            if (slot->is_predicate()) {
144
10.7k
                filter_column_ids.insert(orc_field->getColumnId());
145
10.7k
            }
146
41.9k
            continue;
147
41.9k
        }
148
149
        // complex types
150
10.1k
        const auto& all_access_paths = slot->all_access_paths();
151
10.1k
        process_access_paths(orc_field, all_access_paths, column_ids);
152
153
10.1k
        const auto& predicate_access_paths = slot->predicate_access_paths();
154
10.1k
        if (!predicate_access_paths.empty()) {
155
94
            process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
156
94
        }
157
10.1k
    }
158
159
14.9k
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
160
14.9k
}
161
162
ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
163
312
        const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
164
    // map top-level table column position -> orc::Type*
165
312
    std::unordered_map<uint64_t, const orc::Type*> table_col_pos_to_orc_type_map;
166
1.49k
    for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
167
1.18k
        auto orc_sub_type = orc_type->getSubtype(i);
168
1.18k
        if (!orc_sub_type) continue;
169
170
1.18k
        table_col_pos_to_orc_type_map[i] = orc_sub_type;
171
1.18k
    }
172
173
312
    std::set<uint64_t> column_ids;
174
312
    std::set<uint64_t> filter_column_ids;
175
176
    // helper to process access paths for a given top-level orc field
177
312
    auto process_access_paths = [](const orc::Type* orc_field,
178
312
                                   const std::vector<TColumnAccessPath>& access_paths,
179
312
                                   std::set<uint64_t>& out_ids) {
180
13
        process_nested_access_paths(
181
13
                orc_field, access_paths, out_ids,
182
13
                [](const orc::Type* type) { return type->getColumnId(); },
183
13
                [](const orc::Type* type) { return type->getMaximumColumnId(); },
184
13
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
185
13
    };
186
187
989
    for (const auto* slot : tuple_descriptor->slots()) {
188
989
        auto it = table_col_pos_to_orc_type_map.find(slot->col_pos());
189
989
        if (it == table_col_pos_to_orc_type_map.end()) {
190
            // Column not found in file
191
972
            continue;
192
972
        }
193
17
        const orc::Type* orc_field = it->second;
194
195
        // primitive (non-nested) types
196
17
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
197
17
             slot->col_type() != TYPE_MAP)) {
198
6
            column_ids.insert(orc_field->getColumnId());
199
6
            if (slot->is_predicate()) {
200
0
                filter_column_ids.insert(orc_field->getColumnId());
201
0
            }
202
6
            continue;
203
6
        }
204
205
11
        const auto& all_access_paths = slot->all_access_paths();
206
        // complex types
207
11
        process_access_paths(orc_field, all_access_paths, column_ids);
208
209
11
        const auto& predicate_access_paths = slot->predicate_access_paths();
210
11
        if (!predicate_access_paths.empty()) {
211
6
            process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
212
6
        }
213
11
    }
214
215
312
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
216
312
}
217
218
10.4k
Status HiveParquetReader::on_before_init_reader(ReaderInitContext* ctx) {
219
10.4k
    _column_descs = ctx->column_descs;
220
10.4k
    _fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
221
10.4k
    RETURN_IF_ERROR(_extract_partition_values(*ctx->range, ctx->tuple_descriptor,
222
10.4k
                                              _fill_partition_values,
223
10.4k
                                              &_fill_partition_value_is_null));
224
56.1k
    for (const auto& desc : *ctx->column_descs) {
225
56.1k
        if (desc.category == ColumnCategory::REGULAR ||
226
56.1k
            desc.category == ColumnCategory::GENERATED) {
227
49.9k
            ctx->column_names.push_back(desc.name);
228
49.9k
        } else if (desc.category == ColumnCategory::SYNTHESIZED &&
229
6.17k
                   desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) {
230
1.83k
            auto topn_row_id_column_iter = _create_topn_row_id_column_iterator();
231
1.83k
            this->register_synthesized_column_handler(
232
1.83k
                    desc.name,
233
1.83k
                    [iter = std::move(topn_row_id_column_iter), this, &desc](
234
7.71k
                            Block* block, size_t rows) -> Status {
235
7.71k
                        return fill_topn_row_id(iter, desc.name, block, rows);
236
7.71k
                    });
237
1.83k
            continue;
238
1.83k
        }
239
56.1k
    }
240
241
    // Get file metadata schema (available because _open_file() runs before this hook)
242
10.4k
    const FieldDescriptor* field_desc = nullptr;
243
10.4k
    RETURN_IF_ERROR(get_file_metadata_schema(&field_desc));
244
10.4k
    DCHECK(field_desc != nullptr);
245
246
    // Build table_info_node based on config
247
10.4k
    if (get_state()->query_options().hive_parquet_use_column_names) {
248
10.2k
        RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(ctx->tuple_descriptor, *field_desc,
249
10.2k
                                                            ctx->table_info_node, _is_file_slot));
250
10.2k
    } else {
251
264
        ctx->table_info_node = std::make_shared<StructNode>();
252
264
        std::map<std::string, const SlotDescriptor*> slot_map;
253
880
        for (const auto& slot : ctx->tuple_descriptor->slots()) {
254
880
            slot_map.emplace(slot->col_name_lower_case(), slot);
255
880
        }
256
257
264
        auto parquet_fields_schema = field_desc->get_fields_schema();
258
1.02k
        for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) {
259
762
            auto table_column_name = ctx->column_names[idx];
260
762
            auto file_index = get_scan_params().column_idxs[idx];
261
262
762
            if (file_index >= parquet_fields_schema.size()) {
263
112
                ctx->table_info_node->add_not_exist_children(table_column_name);
264
650
            } else {
265
650
                auto field_node = std::make_shared<Node>();
266
650
                RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
267
650
                        slot_map[table_column_name]->type(), parquet_fields_schema[file_index],
268
650
                        field_node));
269
650
                ctx->table_info_node->add_children(
270
650
                        table_column_name, parquet_fields_schema[file_index].name, field_node);
271
650
            }
272
762
            slot_map.erase(table_column_name);
273
762
        }
274
264
        for (const auto& [partition_col_name, _] : slot_map) {
275
116
            ctx->table_info_node->add_not_exist_children(partition_col_name);
276
116
        }
277
264
    }
278
279
    // Compute column_ids for lazy materialization
280
10.4k
    auto column_id_result = ColumnIdResult();
281
10.4k
    if (get_state()->query_options().hive_parquet_use_column_names) {
282
10.2k
        column_id_result = _create_column_ids(field_desc, ctx->tuple_descriptor);
283
10.2k
    } else {
284
274
        column_id_result =
285
274
                _create_column_ids_by_top_level_col_index(field_desc, ctx->tuple_descriptor);
286
274
    }
287
10.4k
    ctx->column_ids = std::move(column_id_result.column_ids);
288
10.4k
    ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);
289
290
10.4k
    _filter_groups = true;
291
10.4k
    return Status::OK();
292
10.4k
}
293
294
ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
295
10.2k
                                                     const TupleDescriptor* tuple_descriptor) {
296
    // First, assign column IDs to the field descriptor
297
10.2k
    auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
298
10.2k
    mutable_field_desc->assign_ids();
299
300
    // map top-level table column name (lower-cased) -> FieldSchema*
301
10.2k
    std::unordered_map<std::string, const FieldSchema*> table_col_name_to_field_schema_map;
302
87.7k
    for (int i = 0; i < field_desc->size(); ++i) {
303
77.5k
        auto field_schema = field_desc->get_column(i);
304
77.5k
        if (!field_schema) continue;
305
306
77.5k
        table_col_name_to_field_schema_map[field_schema->lower_case_name] = field_schema;
307
77.5k
    }
308
309
10.2k
    std::set<uint64_t> column_ids;
310
10.2k
    std::set<uint64_t> filter_column_ids;
311
312
    // helper to process access paths for a given top-level parquet field
313
10.2k
    auto process_access_paths = [](const FieldSchema* parquet_field,
314
10.2k
                                   const std::vector<TColumnAccessPath>& access_paths,
315
15.7k
                                   std::set<uint64_t>& out_ids) {
316
15.7k
        process_nested_access_paths(
317
15.7k
                parquet_field, access_paths, out_ids,
318
15.8k
                [](const FieldSchema* field) { return field->get_column_id(); },
319
15.7k
                [](const FieldSchema* field) { return field->get_max_column_id(); },
320
15.7k
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
321
15.7k
    };
322
323
55.1k
    for (const auto* slot : tuple_descriptor->slots()) {
324
55.1k
        auto it = table_col_name_to_field_schema_map.find(slot->col_name_lower_case());
325
55.1k
        if (it == table_col_name_to_field_schema_map.end()) {
326
            // Column not found in file
327
6.67k
            continue;
328
6.67k
        }
329
48.4k
        auto field_schema = it->second;
330
331
        // primitive (non-nested) types
332
48.4k
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
333
48.4k
             slot->col_type() != TYPE_MAP)) {
334
32.7k
            column_ids.insert(field_schema->column_id);
335
336
32.7k
            if (slot->is_predicate()) {
337
5.14k
                filter_column_ids.insert(field_schema->column_id);
338
5.14k
            }
339
32.7k
            continue;
340
32.7k
        }
341
342
        // complex types
343
15.7k
        const auto& all_access_paths = slot->all_access_paths();
344
15.7k
        process_access_paths(field_schema, all_access_paths, column_ids);
345
346
15.7k
        const auto& predicate_access_paths = slot->predicate_access_paths();
347
15.7k
        if (!predicate_access_paths.empty()) {
348
98
            process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
349
98
        }
350
15.7k
    }
351
352
10.2k
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
353
10.2k
}
354
355
ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
356
268
        const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
357
    // First, assign column IDs to the field descriptor
358
268
    auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
359
268
    mutable_field_desc->assign_ids();
360
361
    // map top-level table column position -> FieldSchema*
362
268
    std::unordered_map<uint64_t, const FieldSchema*> table_col_pos_to_field_schema_map;
363
1.31k
    for (int i = 0; i < field_desc->size(); ++i) {
364
1.05k
        auto field_schema = field_desc->get_column(i);
365
1.05k
        if (!field_schema) continue;
366
367
1.05k
        table_col_pos_to_field_schema_map[i] = field_schema;
368
1.05k
    }
369
370
268
    std::set<uint64_t> column_ids;
371
268
    std::set<uint64_t> filter_column_ids;
372
373
    // helper to process access paths for a given top-level parquet field
374
268
    auto process_access_paths = [](const FieldSchema* parquet_field,
375
268
                                   const std::vector<TColumnAccessPath>& access_paths,
376
268
                                   std::set<uint64_t>& out_ids) {
377
13
        process_nested_access_paths(
378
13
                parquet_field, access_paths, out_ids,
379
13
                [](const FieldSchema* field) { return field->get_column_id(); },
380
13
                [](const FieldSchema* field) { return field->get_max_column_id(); },
381
13
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
382
13
    };
383
384
885
    for (const auto* slot : tuple_descriptor->slots()) {
385
885
        auto it = table_col_pos_to_field_schema_map.find(slot->col_pos());
386
885
        if (it == table_col_pos_to_field_schema_map.end()) {
387
            // Column not found in file
388
872
            continue;
389
872
        }
390
13
        auto field_schema = it->second;
391
392
        // primitive (non-nested) types
393
13
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
394
13
             slot->col_type() != TYPE_MAP)) {
395
6
            column_ids.insert(field_schema->column_id);
396
397
6
            if (slot->is_predicate()) {
398
0
                filter_column_ids.insert(field_schema->column_id);
399
0
            }
400
6
            continue;
401
6
        }
402
403
        // complex types
404
7
        const auto& all_access_paths = slot->all_access_paths();
405
7
        process_access_paths(field_schema, all_access_paths, column_ids);
406
407
7
        const auto& predicate_access_paths = slot->predicate_access_paths();
408
7
        if (!predicate_access_paths.empty()) {
409
6
            process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
410
6
        }
411
7
    }
412
413
268
    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
414
268
}
415
416
} // namespace doris