be/src/format/table/hive_reader.cpp

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "format/table/hive_reader.h"

#include <vector>

#include "common/status.h"
#include "format/table/hive/hive_orc_nested_column_utils.h"
#include "format/table/hive/hive_parquet_nested_column_utils.h"
#include "format/table/nested_column_access_helper.h"
#include "runtime/runtime_state.h"

namespace doris {

// Prepares `ctx` for reading a Hive ORC file before the underlying reader is initialized.
//
// Steps, in order:
//   1. Remember the column descriptors and the name->block-index map from `ctx`, and extract
//      the partition values for the current range.
//   2. Append the name of every REGULAR / GENERATED column to ctx->column_names, and register
//      a synthesized-column handler for every SYNTHESIZED column whose name starts with
//      BeConsts::GLOBAL_ROWID_COL.
//   3. Build ctx->table_info_node, either by column name or by the positional
//      `column_idxs` scan parameter (see below).
//   4. Fill ctx->column_ids / ctx->filter_column_ids using the same name-vs-position choice.
//
// Returns the first error produced by a helper, or InternalError when the positional mapping
// is inconsistent (more column indexes than column names, or a column name without a slot).
Status HiveOrcReader::on_before_init_reader(ReaderInitContext* ctx) {
    _column_descs = ctx->column_descs;
    _fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
    RETURN_IF_ERROR(
            _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values));
    for (auto& desc : *ctx->column_descs) {
        if (desc.category == ColumnCategory::REGULAR ||
            desc.category == ColumnCategory::GENERATED) {
            ctx->column_names.push_back(desc.name);
        } else if (desc.category == ColumnCategory::SYNTHESIZED &&
                   desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) {
            auto topn_row_id_column_iter = _create_topn_row_id_column_iterator();
            // `desc` is captured by reference: it aliases an element of *ctx->column_descs.
            // NOTE(review): that container must outlive the registered handler -- confirm.
            this->register_synthesized_column_handler(
                    desc.name,
                    [iter = std::move(topn_row_id_column_iter), this, &desc](
                            Block* block, size_t rows) -> Status {
                        return fill_topn_row_id(iter, desc.name, block, rows);
                    });
        }
    }

    // Get file type (available because _create_file_reader() runs before this hook)
    const orc::Type* orc_type_ptr = nullptr;
    RETURN_IF_ERROR(get_file_type(&orc_type_ptr));
    const bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);

    // Columns are matched by name only when the query option asks for it and
    // is_hive1_col_name() reports false for this file; otherwise they are matched by position.
    // Evaluated once so that both decisions below are guaranteed to agree.
    const bool match_by_name =
            get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name;

    // Build table_info_node based on config
    if (match_by_name) {
        RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(ctx->tuple_descriptor, orc_type_ptr,
                                                        ctx->table_info_node, _is_file_slot));
    } else {
        ctx->table_info_node = std::make_shared<StructNode>();
        std::map<std::string, const SlotDescriptor*> slot_map;
        for (const auto& slot : ctx->tuple_descriptor->slots()) {
            slot_map.emplace(slot->col_name_lower_case(), slot);
        }

        const auto& column_idxs = get_scan_params().column_idxs;
        // column_idxs[idx] is paired with ctx->column_names[idx]; refuse to index past the end.
        if (column_idxs.size() > ctx->column_names.size()) {
            return Status::InternalError(
                    "Hive ORC reader: {} column indexes but only {} column names",
                    column_idxs.size(), ctx->column_names.size());
        }

        for (size_t idx = 0; idx < column_idxs.size(); idx++) {
            const auto& table_column_name = ctx->column_names[idx];
            auto file_index = column_idxs[idx];

            if (file_index >= orc_type_ptr->getSubtypeCount()) {
                // The position is beyond the fields present in this file.
                ctx->table_info_node->add_not_exist_children(table_column_name);
            } else {
                // Look the slot up with find(): operator[] would insert a null entry for an
                // unknown name and the dereference below would crash.
                const auto slot_it = slot_map.find(table_column_name);
                if (slot_it == slot_map.end()) {
                    return Status::InternalError(
                            "Hive ORC reader: column '{}' has no matching slot in the tuple "
                            "descriptor",
                            table_column_name);
                }
                auto field_node = std::make_shared<Node>();
                RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
                        slot_it->second->type(), orc_type_ptr->getSubtype(file_index),
                        field_node));
                ctx->table_info_node->add_children(
                        table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
            }
            slot_map.erase(table_column_name);
        }
        // Slots that were not paired with any column index are registered as not existing in
        // the file (the loop variable name suggests these are partition columns).
        for (const auto& [partition_col_name, _] : slot_map) {
            ctx->table_info_node->add_not_exist_children(partition_col_name);
        }
    }

    // Compute column_ids
    ColumnIdResult column_id_result =
            match_by_name
                    ? _create_column_ids(orc_type_ptr, ctx->tuple_descriptor)
                    : _create_column_ids_by_top_level_col_index(orc_type_ptr,
                                                                ctx->tuple_descriptor);
    ctx->column_ids = std::move(column_id_result.column_ids);
    ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);

    // _is_acid is false by default, no need to set explicitly
    return Status::OK();
}

// Resolves the ORC column ids to read by matching slots to top-level ORC fields by
// lower-cased name.
//
// `orc_type` is the file's root type; `tuple_descriptor` supplies the slots. The result holds
// the ids collected for every matched slot and, separately, the ids collected for predicates.
// Slots with no same-named top-level field contribute nothing.
ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
                                                 const TupleDescriptor* tuple_descriptor) {
    // Index the file's top-level fields by lower-cased field name.
    std::unordered_map<std::string, const orc::Type*> orc_fields_by_name;
    const uint64_t top_level_count = orc_type->getSubtypeCount();
    for (uint64_t pos = 0; pos < top_level_count; ++pos) {
        const orc::Type* child = orc_type->getSubtype(pos);
        if (child != nullptr) {
            orc_fields_by_name[to_lower(orc_type->getFieldName(pos))] = child;
        }
    }

    // Forwards one top-level field plus a set of access paths to the shared helper, which
    // adds the resulting ids to `sink`.
    auto collect_nested_ids = [](const orc::Type* field,
                                 const std::vector<TColumnAccessPath>& paths,
                                 std::set<uint64_t>& sink) {
        process_nested_access_paths(
                field, paths, sink, [](const orc::Type* t) { return t->getColumnId(); },
                [](const orc::Type* t) { return t->getMaximumColumnId(); },
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
    };

    std::set<uint64_t> read_ids;
    std::set<uint64_t> predicate_ids;

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = orc_fields_by_name.find(slot->col_name_lower_case());
        if (found == orc_fields_by_name.end()) {
            // No top-level field with this name in the file.
            continue;
        }
        const orc::Type* file_field = found->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Struct / array / map: ids come from the slot's access paths.
            collect_nested_ids(file_field, slot->all_access_paths(), read_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, predicate_ids);
            }
        } else {
            // Any other type: the field's own column id is all that is recorded.
            const uint64_t field_id = file_field->getColumnId();
            read_ids.insert(field_id);
            if (slot->is_predicate()) {
                predicate_ids.insert(field_id);
            }
        }
    }

    return ColumnIdResult(std::move(read_ids), std::move(predicate_ids));
}

// Resolves the ORC column ids to read by matching each slot's col_pos() to the position of a
// top-level ORC field.
//
// `orc_type` is the file's root type; `tuple_descriptor` supplies the slots. The result holds
// the ids collected for every matched slot and, separately, the ids collected for predicates.
// Slots whose position has no top-level field contribute nothing.
ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
        const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
    // Index the file's top-level fields by their position under the root type.
    std::unordered_map<uint64_t, const orc::Type*> orc_fields_by_pos;
    const uint64_t top_level_count = orc_type->getSubtypeCount();
    for (uint64_t pos = 0; pos < top_level_count; ++pos) {
        const orc::Type* child = orc_type->getSubtype(pos);
        if (child != nullptr) {
            orc_fields_by_pos[pos] = child;
        }
    }

    // Forwards one top-level field plus a set of access paths to the shared helper, which
    // adds the resulting ids to `sink`.
    auto collect_nested_ids = [](const orc::Type* field,
                                 const std::vector<TColumnAccessPath>& paths,
                                 std::set<uint64_t>& sink) {
        process_nested_access_paths(
                field, paths, sink, [](const orc::Type* t) { return t->getColumnId(); },
                [](const orc::Type* t) { return t->getMaximumColumnId(); },
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
    };

    std::set<uint64_t> read_ids;
    std::set<uint64_t> predicate_ids;

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = orc_fields_by_pos.find(slot->col_pos());
        if (found == orc_fields_by_pos.end()) {
            // No top-level field at this position in the file.
            continue;
        }
        const orc::Type* file_field = found->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Struct / array / map: ids come from the slot's access paths.
            collect_nested_ids(file_field, slot->all_access_paths(), read_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, predicate_ids);
            }
        } else {
            // Any other type: the field's own column id is all that is recorded.
            const uint64_t field_id = file_field->getColumnId();
            read_ids.insert(field_id);
            if (slot->is_predicate()) {
                predicate_ids.insert(field_id);
            }
        }
    }

    return ColumnIdResult(std::move(read_ids), std::move(predicate_ids));
}

// Prepares `ctx` for reading a Hive Parquet file before the underlying reader is initialized.
//
// Steps, in order:
//   1. Remember the column descriptors and the name->block-index map from `ctx`, and extract
//      the partition values for the current range.
//   2. Append the name of every REGULAR / GENERATED column to ctx->column_names, and register
//      a synthesized-column handler for every SYNTHESIZED column whose name starts with
//      BeConsts::GLOBAL_ROWID_COL.
//   3. Build ctx->table_info_node, by column name when the `hive_parquet_use_column_names`
//      query option is set, otherwise by the positional `column_idxs` scan parameter.
//   4. Fill ctx->column_ids / ctx->filter_column_ids using the same choice, then set
//      _filter_groups.
//
// Returns the first error produced by a helper, or InternalError when the positional mapping
// is inconsistent (more column indexes than column names, or a column name without a slot).
Status HiveParquetReader::on_before_init_reader(ReaderInitContext* ctx) {
    _column_descs = ctx->column_descs;
    _fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
    RETURN_IF_ERROR(
            _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values));
    for (auto& desc : *ctx->column_descs) {
        if (desc.category == ColumnCategory::REGULAR ||
            desc.category == ColumnCategory::GENERATED) {
            ctx->column_names.push_back(desc.name);
        } else if (desc.category == ColumnCategory::SYNTHESIZED &&
                   desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) {
            auto topn_row_id_column_iter = _create_topn_row_id_column_iterator();
            // `desc` is captured by reference: it aliases an element of *ctx->column_descs.
            // NOTE(review): that container must outlive the registered handler -- confirm.
            this->register_synthesized_column_handler(
                    desc.name,
                    [iter = std::move(topn_row_id_column_iter), this, &desc](
                            Block* block, size_t rows) -> Status {
                        return fill_topn_row_id(iter, desc.name, block, rows);
                    });
        }
    }

    // Get file metadata schema (available because _open_file() runs before this hook)
    const FieldDescriptor* field_desc = nullptr;
    RETURN_IF_ERROR(get_file_metadata_schema(&field_desc));
    DCHECK(field_desc != nullptr);

    // Evaluated once so that both decisions below are guaranteed to agree.
    const bool match_by_name = get_state()->query_options().hive_parquet_use_column_names;

    // Build table_info_node based on config
    if (match_by_name) {
        RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(ctx->tuple_descriptor, *field_desc,
                                                            ctx->table_info_node, _is_file_slot));
    } else {
        ctx->table_info_node = std::make_shared<StructNode>();
        std::map<std::string, const SlotDescriptor*> slot_map;
        for (const auto& slot : ctx->tuple_descriptor->slots()) {
            slot_map.emplace(slot->col_name_lower_case(), slot);
        }

        // Bind by const reference: the schema is only read here, so copying it is wasted work.
        const auto& parquet_fields_schema = field_desc->get_fields_schema();
        const auto& column_idxs = get_scan_params().column_idxs;
        // column_idxs[idx] is paired with ctx->column_names[idx]; refuse to index past the end.
        if (column_idxs.size() > ctx->column_names.size()) {
            return Status::InternalError(
                    "Hive Parquet reader: {} column indexes but only {} column names",
                    column_idxs.size(), ctx->column_names.size());
        }

        for (size_t idx = 0; idx < column_idxs.size(); idx++) {
            const auto& table_column_name = ctx->column_names[idx];
            auto file_index = column_idxs[idx];

            if (file_index >= parquet_fields_schema.size()) {
                // The position is beyond the fields present in this file.
                ctx->table_info_node->add_not_exist_children(table_column_name);
            } else {
                // Look the slot up with find(): operator[] would insert a null entry for an
                // unknown name and the dereference below would crash.
                const auto slot_it = slot_map.find(table_column_name);
                if (slot_it == slot_map.end()) {
                    return Status::InternalError(
                            "Hive Parquet reader: column '{}' has no matching slot in the tuple "
                            "descriptor",
                            table_column_name);
                }
                auto field_node = std::make_shared<Node>();
                RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
                        slot_it->second->type(), parquet_fields_schema[file_index], field_node));
                ctx->table_info_node->add_children(
                        table_column_name, parquet_fields_schema[file_index].name, field_node);
            }
            slot_map.erase(table_column_name);
        }
        // Slots that were not paired with any column index are registered as not existing in
        // the file (the loop variable name suggests these are partition columns).
        for (const auto& [partition_col_name, _] : slot_map) {
            ctx->table_info_node->add_not_exist_children(partition_col_name);
        }
    }

    // Compute column_ids for lazy materialization
    ColumnIdResult column_id_result =
            match_by_name
                    ? _create_column_ids(field_desc, ctx->tuple_descriptor)
                    : _create_column_ids_by_top_level_col_index(field_desc,
                                                                ctx->tuple_descriptor);
    ctx->column_ids = std::move(column_id_result.column_ids);
    ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);

    _filter_groups = true;
    return Status::OK();
}

// Resolves the Parquet column ids to read by matching slots to top-level Parquet fields by
// lower-cased name.
//
// `field_desc` describes the file schema; `tuple_descriptor` supplies the slots. The result
// holds the ids collected for every matched slot and, separately, the ids collected for
// predicates. Slots with no same-named top-level field contribute nothing.
//
// Side effect: calls assign_ids() on `field_desc` through a const_cast before reading ids.
ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
                                                     const TupleDescriptor* tuple_descriptor) {
    // Ids are read from the schema below, so they are assigned first.
    const_cast<FieldDescriptor*>(field_desc)->assign_ids();

    // Index the file's top-level fields by their lower-cased name.
    std::unordered_map<std::string, const FieldSchema*> parquet_fields_by_name;
    for (int pos = 0; pos < field_desc->size(); ++pos) {
        auto top_level_field = field_desc->get_column(pos);
        if (top_level_field) {
            parquet_fields_by_name[top_level_field->lower_case_name] = top_level_field;
        }
    }

    // Forwards one top-level field plus a set of access paths to the shared helper, which
    // adds the resulting ids to `sink`.
    auto collect_nested_ids = [](const FieldSchema* field,
                                 const std::vector<TColumnAccessPath>& paths,
                                 std::set<uint64_t>& sink) {
        process_nested_access_paths(
                field, paths, sink, [](const FieldSchema* f) { return f->get_column_id(); },
                [](const FieldSchema* f) { return f->get_max_column_id(); },
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
    };

    std::set<uint64_t> read_ids;
    std::set<uint64_t> predicate_ids;

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = parquet_fields_by_name.find(slot->col_name_lower_case());
        if (found == parquet_fields_by_name.end()) {
            // No top-level field with this name in the file.
            continue;
        }
        auto file_field = found->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Struct / array / map: ids come from the slot's access paths.
            collect_nested_ids(file_field, slot->all_access_paths(), read_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, predicate_ids);
            }
        } else {
            // Any other type: the field's own column id is all that is recorded.
            read_ids.insert(file_field->column_id);
            if (slot->is_predicate()) {
                predicate_ids.insert(file_field->column_id);
            }
        }
    }

    return ColumnIdResult(std::move(read_ids), std::move(predicate_ids));
}

// Resolves the Parquet column ids to read by matching each slot's col_pos() to the position
// of a top-level Parquet field.
//
// `field_desc` describes the file schema; `tuple_descriptor` supplies the slots. The result
// holds the ids collected for every matched slot and, separately, the ids collected for
// predicates. Slots whose position has no top-level field contribute nothing.
//
// Side effect: calls assign_ids() on `field_desc` through a const_cast before reading ids.
ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
        const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
    // Ids are read from the schema below, so they are assigned first.
    const_cast<FieldDescriptor*>(field_desc)->assign_ids();

    // Index the file's top-level fields by their position in the schema.
    std::unordered_map<uint64_t, const FieldSchema*> parquet_fields_by_pos;
    for (int pos = 0; pos < field_desc->size(); ++pos) {
        auto top_level_field = field_desc->get_column(pos);
        if (top_level_field) {
            parquet_fields_by_pos[pos] = top_level_field;
        }
    }

    // Forwards one top-level field plus a set of access paths to the shared helper, which
    // adds the resulting ids to `sink`.
    auto collect_nested_ids = [](const FieldSchema* field,
                                 const std::vector<TColumnAccessPath>& paths,
                                 std::set<uint64_t>& sink) {
        process_nested_access_paths(
                field, paths, sink, [](const FieldSchema* f) { return f->get_column_id(); },
                [](const FieldSchema* f) { return f->get_max_column_id(); },
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
    };

    std::set<uint64_t> read_ids;
    std::set<uint64_t> predicate_ids;

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = parquet_fields_by_pos.find(slot->col_pos());
        if (found == parquet_fields_by_pos.end()) {
            // No top-level field at this position in the file.
            continue;
        }
        auto file_field = found->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Struct / array / map: ids come from the slot's access paths.
            collect_nested_ids(file_field, slot->all_access_paths(), read_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, predicate_ids);
            }
        } else {
            // Any other type: the field's own column id is all that is recorded.
            read_ids.insert(file_field->column_id);
            if (slot->is_predicate()) {
                predicate_ids.insert(file_field->column_id);
            }
        }
    }

    return ColumnIdResult(std::move(read_ids), std::move(predicate_ids));
}

} // namespace doris

Coverage Report

Created: 2026-04-20 07:27

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17
18		#include "format/table/hive_reader.h"
19
20		#include <vector>
21
22		#include "common/status.h"
23		#include "format/table/hive/hive_orc_nested_column_utils.h"
24		#include "format/table/hive/hive_parquet_nested_column_utils.h"
25		#include "format/table/nested_column_access_helper.h"
26		#include "runtime/runtime_state.h"
27
28		namespace doris {
29
30	15.3k	Status HiveOrcReader::on_before_init_reader(ReaderInitContext* ctx) {
31	15.3k	_column_descs = ctx->column_descs;
32	15.3k	_fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
33	15.3k	RETURN_IF_ERROR(
34	15.3k	_extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values));
35	63.2k	for (auto& desc : *ctx->column_descs) {
36	63.2k	if (desc.category == ColumnCategory::REGULAR \|\|
37	63.2k	desc.category == ColumnCategory::GENERATED) {
38	53.3k	ctx->column_names.push_back(desc.name);
39	53.3k	} else if (desc.category == ColumnCategory::SYNTHESIZED &&
40	9.85k	desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) {
41	4.99k	auto topn_row_id_column_iter = _create_topn_row_id_column_iterator();
42	4.99k	this->register_synthesized_column_handler(
43	4.99k	desc.name,
44	4.99k	[iter = std::move(topn_row_id_column_iter), this, &desc](
45	4.99k	Block* block, size_t rows) -> Status {
46	4.23k	return fill_topn_row_id(iter, desc.name, block, rows);
47	4.23k	});
48	4.99k	continue;
49	4.99k	}
50	63.2k	}
51
52		// Get file type (available because _create_file_reader() runs before this hook)
53	15.3k	const orc::Type* orc_type_ptr = nullptr;
54	15.3k	RETURN_IF_ERROR(get_file_type(&orc_type_ptr));
55	15.3k	bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);
56
57		// Build table_info_node based on config
58	15.3k	if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) {
59	15.0k	RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(ctx->tuple_descriptor, orc_type_ptr,
60	15.0k	ctx->table_info_node, _is_file_slot));
61	15.0k	} else {
62	320	ctx->table_info_node = std::make_shared<StructNode>();
63	320	std::map<std::string, const SlotDescriptor*> slot_map;
64	966	for (const auto& slot : ctx->tuple_descriptor->slots()) {
65	966	slot_map.emplace(slot->col_name_lower_case(), slot);
66	966	}
67
68	1.17k	for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) {
69	850	auto table_column_name = ctx->column_names[idx];
70	850	auto file_index = get_scan_params().column_idxs[idx];
71
72	850	if (file_index >= orc_type_ptr->getSubtypeCount()) {
73	112	ctx->table_info_node->add_not_exist_children(table_column_name);
74	738	} else {
75	738	auto field_node = std::make_shared<Node>();
76	738	RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
77	738	slot_map[table_column_name]->type(), orc_type_ptr->getSubtype(file_index),
78	738	field_node));
79	738	ctx->table_info_node->add_children(
80	738	table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
81	738	}
82	850	slot_map.erase(table_column_name);
83	850	}
84	320	for (const auto& [partition_col_name, _] : slot_map) {
85	112	ctx->table_info_node->add_not_exist_children(partition_col_name);
86	112	}
87	320	}
88
89		// Compute column_ids
90	15.3k	auto column_id_result = ColumnIdResult();
91	15.3k	if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) {
92	15.0k	column_id_result = _create_column_ids(orc_type_ptr, ctx->tuple_descriptor);
93	15.0k	} else {
94	362	column_id_result =
95	362	_create_column_ids_by_top_level_col_index(orc_type_ptr, ctx->tuple_descriptor);
96	362	}
97	15.3k	ctx->column_ids = std::move(column_id_result.column_ids);
98	15.3k	ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);
99
100		// _is_acid is false by default, no need to set explicitly
101	15.3k	return Status::OK();
102	15.3k	}
103
104		ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
105	14.9k	const TupleDescriptor* tuple_descriptor) {
106		// map top-level table column name (lower-cased) -> orc::Type*
107	14.9k	std::unordered_map<std::string, const orc::Type*> table_col_name_to_orc_type_map;
108	1.27M	for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
109	1.25M	auto orc_sub_type = orc_type->getSubtype(i);
110	1.25M	if (!orc_sub_type) continue;
111
112	1.25M	std::string table_col_name = to_lower(orc_type->getFieldName(i));
113	1.25M	table_col_name_to_orc_type_map[table_col_name] = orc_sub_type;
114	1.25M	}
115
116	14.9k	std::set<uint64_t> column_ids;
117	14.9k	std::set<uint64_t> filter_column_ids;
118
119		// helper to process access paths for a given top-level orc field
120	14.9k	auto process_access_paths = [](const orc::Type* orc_field,
121	14.9k	const std::vector<TColumnAccessPath>& access_paths,
122	14.9k	std::set<uint64_t>& out_ids) {
123	10.1k	process_nested_access_paths(
124	10.1k	orc_field, access_paths, out_ids,
125	10.1k	[](const orc::Type* type) { return type->getColumnId(); },
126	10.1k	[](const orc::Type* type) { return type->getMaximumColumnId(); },
127	10.1k	HiveOrcNestedColumnUtils::extract_nested_column_ids);
128	10.1k	};
129
130	62.2k	for (const auto* slot : tuple_descriptor->slots()) {
131	62.2k	auto it = table_col_name_to_orc_type_map.find(slot->col_name_lower_case());
132	62.2k	if (it == table_col_name_to_orc_type_map.end()) {
133		// Column not found in file
134	10.4k	continue;
135	10.4k	}
136	51.8k	const orc::Type* orc_field = it->second;
137
138		// primitive (non-nested) types
139	51.8k	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
140	51.8k	slot->col_type() != TYPE_MAP)) {
141	41.8k	column_ids.insert(orc_field->getColumnId());
142	41.8k	if (slot->is_predicate()) {
143	10.3k	filter_column_ids.insert(orc_field->getColumnId());
144	10.3k	}
145	41.8k	continue;
146	41.8k	}
147
148		// complex types
149	10.0k	const auto& all_access_paths = slot->all_access_paths();
150	10.0k	process_access_paths(orc_field, all_access_paths, column_ids);
151
152	10.0k	const auto& predicate_access_paths = slot->predicate_access_paths();
153	10.0k	if (!predicate_access_paths.empty()) {
154	150	process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
155	150	}
156	10.0k	}
157
158	14.9k	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
159	14.9k	}
160
161		ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
162	314	const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
163		// map top-level table column position -> orc::Type*
164	314	std::unordered_map<uint64_t, const orc::Type*> table_col_pos_to_orc_type_map;
165	1.50k	for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
166	1.18k	auto orc_sub_type = orc_type->getSubtype(i);
167	1.18k	if (!orc_sub_type) continue;
168
169	1.18k	table_col_pos_to_orc_type_map[i] = orc_sub_type;
170	1.18k	}
171
172	314	std::set<uint64_t> column_ids;
173	314	std::set<uint64_t> filter_column_ids;
174
175		// helper to process access paths for a given top-level orc field
176	314	auto process_access_paths = [](const orc::Type* orc_field,
177	314	const std::vector<TColumnAccessPath>& access_paths,
178	314	std::set<uint64_t>& out_ids) {
179	13	process_nested_access_paths(
180	13	orc_field, access_paths, out_ids,
181	13	[](const orc::Type* type) { return type->getColumnId(); },
182	13	[](const orc::Type* type) { return type->getMaximumColumnId(); },
183	13	HiveOrcNestedColumnUtils::extract_nested_column_ids);
184	13	};
185
186	973	for (const auto* slot : tuple_descriptor->slots()) {
187	973	auto it = table_col_pos_to_orc_type_map.find(slot->col_pos());
188	973	if (it == table_col_pos_to_orc_type_map.end()) {
189		// Column not found in file
190	960	continue;
191	960	}
192	13	const orc::Type* orc_field = it->second;
193
194		// primitive (non-nested) types
195	13	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
196	13	slot->col_type() != TYPE_MAP)) {
197	6	column_ids.insert(orc_field->getColumnId());
198	6	if (slot->is_predicate()) {
199	0	filter_column_ids.insert(orc_field->getColumnId());
200	0	}
201	6	continue;
202	6	}
203
204	7	const auto& all_access_paths = slot->all_access_paths();
205		// complex types
206	7	process_access_paths(orc_field, all_access_paths, column_ids);
207
208	7	const auto& predicate_access_paths = slot->predicate_access_paths();
209	7	if (!predicate_access_paths.empty()) {
210	6	process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
211	6	}
212	7	}
213
214	314	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
215	314	}
216
217	10.4k	Status HiveParquetReader::on_before_init_reader(ReaderInitContext* ctx) {
218	10.4k	_column_descs = ctx->column_descs;
219	10.4k	_fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
220	10.4k	RETURN_IF_ERROR(
221	10.4k	_extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values));
222	56.0k	for (auto& desc : *ctx->column_descs) {
223	56.0k	if (desc.category == ColumnCategory::REGULAR \|\|
224	56.0k	desc.category == ColumnCategory::GENERATED) {
225	49.9k	ctx->column_names.push_back(desc.name);
226	49.9k	} else if (desc.category == ColumnCategory::SYNTHESIZED &&
227	6.14k	desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) {
228	1.84k	auto topn_row_id_column_iter = _create_topn_row_id_column_iterator();
229	1.84k	this->register_synthesized_column_handler(
230	1.84k	desc.name,
231	1.84k	[iter = std::move(topn_row_id_column_iter), this, &desc](
232	2.33k	Block* block, size_t rows) -> Status {
233	2.33k	return fill_topn_row_id(iter, desc.name, block, rows);
234	2.33k	});
235	1.84k	continue;
236	1.84k	}
237	56.0k	}
238
239		// Get file metadata schema (available because _open_file() runs before this hook)
240	10.4k	const FieldDescriptor* field_desc = nullptr;
241	10.4k	RETURN_IF_ERROR(get_file_metadata_schema(&field_desc));
242	10.4k	DCHECK(field_desc != nullptr);
243
244		// Build table_info_node based on config
245	10.4k	if (get_state()->query_options().hive_parquet_use_column_names) {
246	10.2k	RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(ctx->tuple_descriptor, *field_desc,
247	10.2k	ctx->table_info_node, _is_file_slot));
248	10.2k	} else {
249	266	ctx->table_info_node = std::make_shared<StructNode>();
250	266	std::map<std::string, const SlotDescriptor*> slot_map;
251	880	for (const auto& slot : ctx->tuple_descriptor->slots()) {
252	880	slot_map.emplace(slot->col_name_lower_case(), slot);
253	880	}
254
255	266	auto parquet_fields_schema = field_desc->get_fields_schema();
256	1.03k	for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) {
257	764	auto table_column_name = ctx->column_names[idx];
258	764	auto file_index = get_scan_params().column_idxs[idx];
259
260	764	if (file_index >= parquet_fields_schema.size()) {
261	112	ctx->table_info_node->add_not_exist_children(table_column_name);
262	652	} else {
263	652	auto field_node = std::make_shared<Node>();
264	652	RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
265	652	slot_map[table_column_name]->type(), parquet_fields_schema[file_index],
266	652	field_node));
267	652	ctx->table_info_node->add_children(
268	652	table_column_name, parquet_fields_schema[file_index].name, field_node);
269	652	}
270	764	slot_map.erase(table_column_name);
271	764	}
272	266	for (const auto& [partition_col_name, _] : slot_map) {
273	116	ctx->table_info_node->add_not_exist_children(partition_col_name);
274	116	}
275	266	}
276
277		// Compute column_ids for lazy materialization
278	10.4k	auto column_id_result = ColumnIdResult();
279	10.4k	if (get_state()->query_options().hive_parquet_use_column_names) {
280	10.1k	column_id_result = _create_column_ids(field_desc, ctx->tuple_descriptor);
281	10.1k	} else {
282	304	column_id_result =
283	304	_create_column_ids_by_top_level_col_index(field_desc, ctx->tuple_descriptor);
284	304	}
285	10.4k	ctx->column_ids = std::move(column_id_result.column_ids);
286	10.4k	ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);
287
288	10.4k	_filter_groups = true;
289	10.4k	return Status::OK();
290	10.4k	}
291
292		ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
293	10.2k	const TupleDescriptor* tuple_descriptor) {
294		// First, assign column IDs to the field descriptor
295	10.2k	auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
296	10.2k	mutable_field_desc->assign_ids();
297
298		// map top-level table column name (lower-cased) -> FieldSchema*
299	10.2k	std::unordered_map<std::string, const FieldSchema*> table_col_name_to_field_schema_map;
300	87.7k	for (int i = 0; i < field_desc->size(); ++i) {
301	77.5k	auto field_schema = field_desc->get_column(i);
302	77.5k	if (!field_schema) continue;
303
304	77.5k	table_col_name_to_field_schema_map[field_schema->lower_case_name] = field_schema;
305	77.5k	}
306
307	10.2k	std::set<uint64_t> column_ids;
308	10.2k	std::set<uint64_t> filter_column_ids;
309
310		// helper to process access paths for a given top-level parquet field
311	10.2k	auto process_access_paths = [](const FieldSchema* parquet_field,
312	10.2k	const std::vector<TColumnAccessPath>& access_paths,
313	15.8k	std::set<uint64_t>& out_ids) {
314	15.8k	process_nested_access_paths(
315	15.8k	parquet_field, access_paths, out_ids,
316	15.8k	[](const FieldSchema* field) { return field->get_column_id(); },
317	15.8k	[](const FieldSchema* field) { return field->get_max_column_id(); },
318	15.8k	HiveParquetNestedColumnUtils::extract_nested_column_ids);
319	15.8k	};
320
321	55.1k	for (const auto* slot : tuple_descriptor->slots()) {
322	55.1k	auto it = table_col_name_to_field_schema_map.find(slot->col_name_lower_case());
323	55.1k	if (it == table_col_name_to_field_schema_map.end()) {
324		// Column not found in file
325	6.62k	continue;
326	6.62k	}
327	48.5k	auto field_schema = it->second;
328
329		// primitive (non-nested) types
330	48.5k	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
331	48.5k	slot->col_type() != TYPE_MAP)) {
332	32.8k	column_ids.insert(field_schema->column_id);
333
334	32.8k	if (slot->is_predicate()) {
335	4.97k	filter_column_ids.insert(field_schema->column_id);
336	4.97k	}
337	32.8k	continue;
338	32.8k	}
339
340		// complex types
341	15.7k	const auto& all_access_paths = slot->all_access_paths();
342	15.7k	process_access_paths(field_schema, all_access_paths, column_ids);
343
344	15.7k	const auto& predicate_access_paths = slot->predicate_access_paths();
345	15.7k	if (!predicate_access_paths.empty()) {
346	188	process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
347	188	}
348	15.7k	}
349
350	10.2k	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
351	10.2k	}
352
353		ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
354	270	const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
355		// First, assign column IDs to the field descriptor
356	270	auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
357	270	mutable_field_desc->assign_ids();
358
359		// map top-level table column position -> FieldSchema*
360	270	std::unordered_map<uint64_t, const FieldSchema*> table_col_pos_to_field_schema_map;
361	1.32k	for (int i = 0; i < field_desc->size(); ++i) {
362	1.05k	auto field_schema = field_desc->get_column(i);
363	1.05k	if (!field_schema) continue;
364
365	1.05k	table_col_pos_to_field_schema_map[i] = field_schema;
366	1.05k	}
367
368	270	std::set<uint64_t> column_ids;
369	270	std::set<uint64_t> filter_column_ids;
370
371		// helper to process access paths for a given top-level parquet field
372	270	auto process_access_paths = [](const FieldSchema* parquet_field,
373	270	const std::vector<TColumnAccessPath>& access_paths,
374	270	std::set<uint64_t>& out_ids) {
375	13	process_nested_access_paths(
376	13	parquet_field, access_paths, out_ids,
377	13	[](const FieldSchema* field) { return field->get_column_id(); },
378	13	[](const FieldSchema* field) { return field->get_max_column_id(); },
379	13	HiveParquetNestedColumnUtils::extract_nested_column_ids);
380	13	};
381
382	893	for (const auto* slot : tuple_descriptor->slots()) {
383	893	auto it = table_col_pos_to_field_schema_map.find(slot->col_pos());
384	893	if (it == table_col_pos_to_field_schema_map.end()) {
385		// Column not found in file
386	880	continue;
387	880	}
388	13	auto field_schema = it->second;
389
390		// primitive (non-nested) types
391	13	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
392	13	slot->col_type() != TYPE_MAP)) {
393	6	column_ids.insert(field_schema->column_id);
394
395	6	if (slot->is_predicate()) {
396	0	filter_column_ids.insert(field_schema->column_id);
397	0	}
398	6	continue;
399	6	}
400
401		// complex types
402	7	const auto& all_access_paths = slot->all_access_paths();
403	7	process_access_paths(field_schema, all_access_paths, column_ids);
404
405	7	const auto& predicate_access_paths = slot->predicate_access_paths();
406	7	if (!predicate_access_paths.empty()) {
407	6	process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
408	6	}
409	7	}
410
411	270	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
412	270	}
413
414		} // namespace doris