be/src/format/table/hive_reader.cpp

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "format/table/hive_reader.h"

#include <vector>

#include "common/status.h"
#include "format/table/hive/hive_orc_nested_column_utils.h"
#include "format/table/hive/hive_parquet_nested_column_utils.h"
#include "format/table/nested_column_access_helper.h"
#include "runtime/runtime_state.h"

namespace doris {
#include "common/compile_check_begin.h"

// Hook invoked before the ORC reader is initialized; fills in the parts of `ctx` that are
// specific to Hive tables.
//
// Steps, in order:
//   1. Remember the column descriptors and the name -> block index map from `ctx`, and extract
//      the partition values of the current range into `_fill_partition_values`.
//   2. Append the names of all REGULAR and GENERATED columns to `ctx->column_names`.
//   3. Build `ctx->table_info_node`, either by column name or by top-level file index.
//   4. Compute `ctx->column_ids` and `ctx->filter_column_ids` with the same strategy.
//
// Name-based matching is used only when the `hive_orc_use_column_names` query option is set and
// OrcReader::is_hive1_col_name() returns false for the file type; otherwise columns are matched
// through `get_scan_params().column_idxs`.
//
// Returns the first non-OK status produced by a helper. In the index-based path it also returns
// InternalError when `column_idxs` has more entries than `ctx->column_names`, or when a column
// that exists in the file has no slot in the tuple descriptor (both cases previously led to an
// out-of-range read / null pointer dereference).
Status HiveOrcReader::on_before_init_reader(ReaderInitContext* ctx) {
    _column_descs = ctx->column_descs;
    _fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
    RETURN_IF_ERROR(
            _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values));
    for (const auto& desc : *ctx->column_descs) {
        if (desc.category == ColumnCategory::REGULAR ||
            desc.category == ColumnCategory::GENERATED) {
            ctx->column_names.push_back(desc.name);
        }
    }

    // Get file type (available because _create_file_reader() runs before this hook)
    const orc::Type* orc_type_ptr = nullptr;
    RETURN_IF_ERROR(get_file_type(&orc_type_ptr));
    const bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);

    // Decide the matching strategy once so that table_info_node and column_ids always agree.
    const bool match_by_name =
            get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name;

    // Build table_info_node based on config
    if (match_by_name) {
        RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(ctx->tuple_descriptor, orc_type_ptr,
                                                        ctx->table_info_node, _is_file_slot));
    } else {
        ctx->table_info_node = std::make_shared<StructNode>();
        std::map<std::string, const SlotDescriptor*> slot_map;
        for (const auto& slot : ctx->tuple_descriptor->slots()) {
            slot_map.emplace(slot->col_name_lower_case(), slot);
        }

        const auto& column_idxs = get_scan_params().column_idxs;
        // column_names is indexed in lock-step with column_idxs below; refuse to read past its end.
        if (column_idxs.size() > ctx->column_names.size()) {
            return Status::InternalError(
                    "Hive ORC reader got {} column indexes but only {} file column names",
                    column_idxs.size(), ctx->column_names.size());
        }

        for (size_t idx = 0; idx < column_idxs.size(); idx++) {
            const auto& table_column_name = ctx->column_names[idx];
            const auto file_index = column_idxs[idx];

            if (file_index >= orc_type_ptr->getSubtypeCount()) {
                // The file has fewer top-level fields than the table: column is missing.
                ctx->table_info_node->add_not_exist_children(table_column_name);
            } else {
                // Use find() instead of operator[]: operator[] would silently insert a null
                // slot pointer for an unknown name, which is then dereferenced.
                const auto slot_it = slot_map.find(table_column_name);
                if (slot_it == slot_map.end()) {
                    return Status::InternalError(
                            "Hive ORC column '{}' (file index {}) has no matching slot in the "
                            "tuple descriptor",
                            table_column_name, file_index);
                }
                auto field_node = std::make_shared<Node>();
                RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
                        slot_it->second->type(), orc_type_ptr->getSubtype(file_index),
                        field_node));
                ctx->table_info_node->add_children(
                        table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
            }
            slot_map.erase(table_column_name);
        }
        // Slots that were not consumed above are not backed by a file column and are
        // registered as non-existent children.
        for (const auto& [partition_col_name, _] : slot_map) {
            ctx->table_info_node->add_not_exist_children(partition_col_name);
        }
    }

    // Compute column_ids
    auto column_id_result =
            match_by_name
                    ? _create_column_ids(orc_type_ptr, ctx->tuple_descriptor)
                    : _create_column_ids_by_top_level_col_index(orc_type_ptr,
                                                                ctx->tuple_descriptor);
    ctx->column_ids = std::move(column_id_result.column_ids);
    ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);

    // _is_acid is false by default, no need to set explicitly
    return Status::OK();
}

// Collects the ORC column ids needed to read `tuple_descriptor`, matching each slot to a
// top-level field of `orc_type` by lower-cased column name.
//
// Slots whose name is not present in the file are skipped. For non-nested slots the id of the
// matched field is recorded (and also recorded as a filter id when the slot is a predicate).
// For STRUCT / ARRAY / MAP slots the ids are derived from the slot's access paths.
//
// Returns a ColumnIdResult built from (all column ids, filter column ids).
ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
                                                 const TupleDescriptor* tuple_descriptor) {
    // Index the file's top-level fields by lower-cased field name. When two fields share the
    // same lower-cased name the later one wins.
    std::unordered_map<std::string, const orc::Type*> fields_by_name;
    for (uint64_t field_idx = 0; field_idx < orc_type->getSubtypeCount(); ++field_idx) {
        const orc::Type* child_type = orc_type->getSubtype(field_idx);
        if (child_type != nullptr) {
            fields_by_name[to_lower(orc_type->getFieldName(field_idx))] = child_type;
        }
    }

    std::set<uint64_t> read_ids;
    std::set<uint64_t> predicate_ids;

    // Resolves a list of access paths under one top-level ORC field into column ids.
    const auto collect_nested_ids = [](const orc::Type* root_field,
                                       const std::vector<TColumnAccessPath>& paths,
                                       std::set<uint64_t>& sink) {
        process_nested_access_paths(
                root_field, paths, sink,
                [](const orc::Type* node) { return node->getColumnId(); },
                [](const orc::Type* node) { return node->getMaximumColumnId(); },
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto match = fields_by_name.find(slot->col_name_lower_case());
        if (match == fields_by_name.end()) {
            // The file has no field with this name.
            continue;
        }
        const orc::Type* file_field = match->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Complex types: only the sub-columns reachable through the access paths are kept.
            collect_nested_ids(file_field, slot->all_access_paths(), read_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, predicate_ids);
            }
        } else {
            // Primitive types map to exactly one ORC column id.
            const auto field_id = file_field->getColumnId();
            read_ids.insert(field_id);
            if (slot->is_predicate()) {
                predicate_ids.insert(field_id);
            }
        }
    }

    return ColumnIdResult(std::move(read_ids), std::move(predicate_ids));
}

// Collects the ORC column ids needed to read `tuple_descriptor`, matching each slot to the
// top-level field of `orc_type` whose index equals the slot's col_pos().
//
// Slots whose position has no field in the file are skipped. For non-nested slots the id of the
// matched field is recorded (and also recorded as a filter id when the slot is a predicate).
// For STRUCT / ARRAY / MAP slots the ids are derived from the slot's access paths.
//
// Returns a ColumnIdResult built from (all column ids, filter column ids).
ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
        const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
    // Index the file's top-level fields by their position in the file schema.
    std::unordered_map<uint64_t, const orc::Type*> fields_by_position;
    for (uint64_t field_idx = 0; field_idx < orc_type->getSubtypeCount(); ++field_idx) {
        const orc::Type* child_type = orc_type->getSubtype(field_idx);
        if (child_type != nullptr) {
            fields_by_position[field_idx] = child_type;
        }
    }

    std::set<uint64_t> read_ids;
    std::set<uint64_t> predicate_ids;

    // Resolves a list of access paths under one top-level ORC field into column ids.
    const auto collect_nested_ids = [](const orc::Type* root_field,
                                       const std::vector<TColumnAccessPath>& paths,
                                       std::set<uint64_t>& sink) {
        process_nested_access_paths(
                root_field, paths, sink,
                [](const orc::Type* node) { return node->getColumnId(); },
                [](const orc::Type* node) { return node->getMaximumColumnId(); },
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto match = fields_by_position.find(slot->col_pos());
        if (match == fields_by_position.end()) {
            // The file has no field at this position.
            continue;
        }
        const orc::Type* file_field = match->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Complex types: only the sub-columns reachable through the access paths are kept.
            collect_nested_ids(file_field, slot->all_access_paths(), read_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, predicate_ids);
            }
        } else {
            // Primitive types map to exactly one ORC column id.
            const auto field_id = file_field->getColumnId();
            read_ids.insert(field_id);
            if (slot->is_predicate()) {
                predicate_ids.insert(field_id);
            }
        }
    }

    return ColumnIdResult(std::move(read_ids), std::move(predicate_ids));
}

// Hook invoked before the Parquet reader is initialized; fills in the parts of `ctx` that are
// specific to Hive tables.
//
// Steps, in order:
//   1. Remember the column descriptors and the name -> block index map from `ctx`, and extract
//      the partition values of the current range into `_fill_partition_values`.
//   2. Append the names of all REGULAR and GENERATED columns to `ctx->column_names`.
//   3. Build `ctx->table_info_node`, either by column name (when the
//      `hive_parquet_use_column_names` query option is set) or by top-level file index through
//      `get_scan_params().column_idxs`.
//   4. Compute `ctx->column_ids` and `ctx->filter_column_ids` with the same strategy.
//   5. Set `_filter_groups` to true.
//
// Returns the first non-OK status produced by a helper. In the index-based path it also returns
// InternalError when `column_idxs` has more entries than `ctx->column_names`, or when a column
// that exists in the file has no slot in the tuple descriptor (both cases previously led to an
// out-of-range read / null pointer dereference).
Status HiveParquetReader::on_before_init_reader(ReaderInitContext* ctx) {
    _column_descs = ctx->column_descs;
    _fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
    RETURN_IF_ERROR(
            _extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values));
    for (const auto& desc : *ctx->column_descs) {
        if (desc.category == ColumnCategory::REGULAR ||
            desc.category == ColumnCategory::GENERATED) {
            ctx->column_names.push_back(desc.name);
        }
    }

    // Get file metadata schema (available because _open_file() runs before this hook)
    const FieldDescriptor* field_desc = nullptr;
    RETURN_IF_ERROR(get_file_metadata_schema(&field_desc));
    DCHECK(field_desc != nullptr);

    // Decide the matching strategy once so that table_info_node and column_ids always agree.
    const bool match_by_name = get_state()->query_options().hive_parquet_use_column_names;

    // Build table_info_node based on config
    if (match_by_name) {
        RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(ctx->tuple_descriptor, *field_desc,
                                                            ctx->table_info_node, _is_file_slot));
    } else {
        ctx->table_info_node = std::make_shared<StructNode>();
        std::map<std::string, const SlotDescriptor*> slot_map;
        for (const auto& slot : ctx->tuple_descriptor->slots()) {
            slot_map.emplace(slot->col_name_lower_case(), slot);
        }

        // Bind by const reference: the schema is only read here, so there is no reason to
        // copy the whole vector of field schemas.
        const auto& parquet_fields_schema = field_desc->get_fields_schema();
        const auto& column_idxs = get_scan_params().column_idxs;
        // column_names is indexed in lock-step with column_idxs below; refuse to read past its end.
        if (column_idxs.size() > ctx->column_names.size()) {
            return Status::InternalError(
                    "Hive Parquet reader got {} column indexes but only {} file column names",
                    column_idxs.size(), ctx->column_names.size());
        }

        for (size_t idx = 0; idx < column_idxs.size(); idx++) {
            const auto& table_column_name = ctx->column_names[idx];
            const auto file_index = column_idxs[idx];

            if (file_index >= parquet_fields_schema.size()) {
                // The file has fewer top-level fields than the table: column is missing.
                ctx->table_info_node->add_not_exist_children(table_column_name);
            } else {
                // Use find() instead of operator[]: operator[] would silently insert a null
                // slot pointer for an unknown name, which is then dereferenced.
                const auto slot_it = slot_map.find(table_column_name);
                if (slot_it == slot_map.end()) {
                    return Status::InternalError(
                            "Hive Parquet column '{}' (file index {}) has no matching slot in "
                            "the tuple descriptor",
                            table_column_name, file_index);
                }
                auto field_node = std::make_shared<Node>();
                RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
                        slot_it->second->type(), parquet_fields_schema[file_index], field_node));
                ctx->table_info_node->add_children(
                        table_column_name, parquet_fields_schema[file_index].name, field_node);
            }
            slot_map.erase(table_column_name);
        }
        // Slots that were not consumed above are not backed by a file column and are
        // registered as non-existent children.
        for (const auto& [partition_col_name, _] : slot_map) {
            ctx->table_info_node->add_not_exist_children(partition_col_name);
        }
    }

    // Compute column_ids for lazy materialization
    auto column_id_result =
            match_by_name
                    ? _create_column_ids(field_desc, ctx->tuple_descriptor)
                    : _create_column_ids_by_top_level_col_index(field_desc,
                                                                ctx->tuple_descriptor);
    ctx->column_ids = std::move(column_id_result.column_ids);
    ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);

    _filter_groups = true;
    return Status::OK();
}

// Collects the Parquet column ids needed to read `tuple_descriptor`, matching each slot to a
// top-level field of `field_desc` by lower-cased column name.
//
// NOTE: although `field_desc` is passed as const, this function calls assign_ids() on it
// through a const_cast, so the descriptor is modified as a side effect.
//
// Slots whose name is not present in the file are skipped. For non-nested slots the matched
// field's column_id is recorded (and also recorded as a filter id when the slot is a
// predicate). For STRUCT / ARRAY / MAP slots the ids are derived from the slot's access paths.
//
// Returns a ColumnIdResult built from (all column ids, filter column ids).
ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
                                                     const TupleDescriptor* tuple_descriptor) {
    // Column ids must be assigned before any column_id is read below.
    const_cast<FieldDescriptor*>(field_desc)->assign_ids();

    // Index the file's top-level fields by lower-cased name. When two fields share the same
    // lower-cased name the later one wins.
    std::unordered_map<std::string, const FieldSchema*> fields_by_name;
    for (int column_idx = 0; column_idx < field_desc->size(); ++column_idx) {
        const auto* column_schema = field_desc->get_column(column_idx);
        if (column_schema != nullptr) {
            fields_by_name[column_schema->lower_case_name] = column_schema;
        }
    }

    std::set<uint64_t> read_ids;
    std::set<uint64_t> predicate_ids;

    // Resolves a list of access paths under one top-level Parquet field into column ids.
    const auto collect_nested_ids = [](const FieldSchema* root_field,
                                       const std::vector<TColumnAccessPath>& paths,
                                       std::set<uint64_t>& sink) {
        process_nested_access_paths(
                root_field, paths, sink,
                [](const FieldSchema* node) { return node->get_column_id(); },
                [](const FieldSchema* node) { return node->get_max_column_id(); },
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto match = fields_by_name.find(slot->col_name_lower_case());
        if (match == fields_by_name.end()) {
            // The file has no field with this name.
            continue;
        }
        const auto file_field = match->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Complex types: only the sub-columns reachable through the access paths are kept.
            collect_nested_ids(file_field, slot->all_access_paths(), read_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, predicate_ids);
            }
        } else {
            // Primitive types map to exactly one Parquet column id.
            read_ids.insert(file_field->column_id);
            if (slot->is_predicate()) {
                predicate_ids.insert(file_field->column_id);
            }
        }
    }

    return ColumnIdResult(std::move(read_ids), std::move(predicate_ids));
}

// Collects the Parquet column ids needed to read `tuple_descriptor`, matching each slot to the
// top-level field of `field_desc` whose index equals the slot's col_pos().
//
// NOTE: although `field_desc` is passed as const, this function calls assign_ids() on it
// through a const_cast, so the descriptor is modified as a side effect.
//
// Slots whose position has no field in the file are skipped. For non-nested slots the matched
// field's column_id is recorded (and also recorded as a filter id when the slot is a
// predicate). For STRUCT / ARRAY / MAP slots the ids are derived from the slot's access paths.
//
// Returns a ColumnIdResult built from (all column ids, filter column ids).
ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
        const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
    // Column ids must be assigned before any column_id is read below.
    const_cast<FieldDescriptor*>(field_desc)->assign_ids();

    // Index the file's top-level fields by their position in the file schema.
    std::unordered_map<uint64_t, const FieldSchema*> fields_by_position;
    for (int column_idx = 0; column_idx < field_desc->size(); ++column_idx) {
        const auto* column_schema = field_desc->get_column(column_idx);
        if (column_schema != nullptr) {
            fields_by_position[column_idx] = column_schema;
        }
    }

    std::set<uint64_t> read_ids;
    std::set<uint64_t> predicate_ids;

    // Resolves a list of access paths under one top-level Parquet field into column ids.
    const auto collect_nested_ids = [](const FieldSchema* root_field,
                                       const std::vector<TColumnAccessPath>& paths,
                                       std::set<uint64_t>& sink) {
        process_nested_access_paths(
                root_field, paths, sink,
                [](const FieldSchema* node) { return node->get_column_id(); },
                [](const FieldSchema* node) { return node->get_max_column_id(); },
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto match = fields_by_position.find(slot->col_pos());
        if (match == fields_by_position.end()) {
            // The file has no field at this position.
            continue;
        }
        const auto file_field = match->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Complex types: only the sub-columns reachable through the access paths are kept.
            collect_nested_ids(file_field, slot->all_access_paths(), read_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, predicate_ids);
            }
        } else {
            // Primitive types map to exactly one Parquet column id.
            read_ids.insert(file_field->column_id);
            if (slot->is_predicate()) {
                predicate_ids.insert(file_field->column_id);
            }
        }
    }

    return ColumnIdResult(std::move(read_ids), std::move(predicate_ids));
}

#include "common/compile_check_end.h"
} // namespace doris

Coverage Report

Created: 2026-04-09 21:24

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17
18		#include "format/table/hive_reader.h"
19
20		#include <vector>
21
22		#include "common/status.h"
23		#include "format/table/hive/hive_orc_nested_column_utils.h"
24		#include "format/table/hive/hive_parquet_nested_column_utils.h"
25		#include "format/table/nested_column_access_helper.h"
26		#include "runtime/runtime_state.h"
27
28		namespace doris {
29		#include "common/compile_check_begin.h"
30
31	11	Status HiveOrcReader::on_before_init_reader(ReaderInitContext* ctx) {
32	11	_column_descs = ctx->column_descs;
33	11	_fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
34	11	RETURN_IF_ERROR(
35	11	_extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values));
36	99	for (auto& desc : *ctx->column_descs) {
37	99	if (desc.category == ColumnCategory::REGULAR \|\|
38	99	desc.category == ColumnCategory::GENERATED) {
39	99	ctx->column_names.push_back(desc.name);
40	99	}
41	99	}
42
43		// Get file type (available because _create_file_reader() runs before this hook)
44	11	const orc::Type* orc_type_ptr = nullptr;
45	11	RETURN_IF_ERROR(get_file_type(&orc_type_ptr));
46	11	bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);
47
48		// Build table_info_node based on config
49	11	if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) {
50	11	RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(ctx->tuple_descriptor, orc_type_ptr,
51	11	ctx->table_info_node, _is_file_slot));
52	11	} else {
53	0	ctx->table_info_node = std::make_shared<StructNode>();
54	0	std::map<std::string, const SlotDescriptor*> slot_map;
55	0	for (const auto& slot : ctx->tuple_descriptor->slots()) {
56	0	slot_map.emplace(slot->col_name_lower_case(), slot);
57	0	}
58
59	0	for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) {
60	0	auto table_column_name = ctx->column_names[idx];
61	0	auto file_index = get_scan_params().column_idxs[idx];
62
63	0	if (file_index >= orc_type_ptr->getSubtypeCount()) {
64	0	ctx->table_info_node->add_not_exist_children(table_column_name);
65	0	} else {
66	0	auto field_node = std::make_shared<Node>();
67	0	RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
68	0	slot_map[table_column_name]->type(), orc_type_ptr->getSubtype(file_index),
69	0	field_node));
70	0	ctx->table_info_node->add_children(
71	0	table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
72	0	}
73	0	slot_map.erase(table_column_name);
74	0	}
75	0	for (const auto& [partition_col_name, _] : slot_map) {
76	0	ctx->table_info_node->add_not_exist_children(partition_col_name);
77	0	}
78	0	}
79
80		// Compute column_ids
81	11	auto column_id_result = ColumnIdResult();
82	11	if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) {
83	11	column_id_result = _create_column_ids(orc_type_ptr, ctx->tuple_descriptor);
84	11	} else {
85	0	column_id_result =
86	0	_create_column_ids_by_top_level_col_index(orc_type_ptr, ctx->tuple_descriptor);
87	0	}
88	11	ctx->column_ids = std::move(column_id_result.column_ids);
89	11	ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);
90
91		// _is_acid is false by default, no need to set explicitly
92	11	return Status::OK();
93	11	}
94
95		ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
96	17	const TupleDescriptor* tuple_descriptor) {
97		// map top-level table column name (lower-cased) -> orc::Type*
98	17	std::unordered_map<std::string, const orc::Type*> table_col_name_to_orc_type_map;
99	164	for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
100	147	auto orc_sub_type = orc_type->getSubtype(i);
101	147	if (!orc_sub_type) continue;
102
103	147	std::string table_col_name = to_lower(orc_type->getFieldName(i));
104	147	table_col_name_to_orc_type_map[table_col_name] = orc_sub_type;
105	147	}
106
107	17	std::set<uint64_t> column_ids;
108	17	std::set<uint64_t> filter_column_ids;
109
110		// helper to process access paths for a given top-level orc field
111	17	auto process_access_paths = [](const orc::Type* orc_field,
112	17	const std::vector<TColumnAccessPath>& access_paths,
113	17	std::set<uint64_t>& out_ids) {
114	13	process_nested_access_paths(
115	13	orc_field, access_paths, out_ids,
116	13	[](const orc::Type* type) { return type->getColumnId(); },
117	13	[](const orc::Type* type) { return type->getMaximumColumnId(); },
118	13	HiveOrcNestedColumnUtils::extract_nested_column_ids);
119	13	};
120
121	112	for (const auto* slot : tuple_descriptor->slots()) {
122	112	auto it = table_col_name_to_orc_type_map.find(slot->col_name_lower_case());
123	112	if (it == table_col_name_to_orc_type_map.end()) {
124		// Column not found in file
125	0	continue;
126	0	}
127	112	const orc::Type* orc_field = it->second;
128
129		// primitive (non-nested) types
130	112	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
131	112	slot->col_type() != TYPE_MAP)) {
132	105	column_ids.insert(orc_field->getColumnId());
133	105	if (slot->is_predicate()) {
134	0	filter_column_ids.insert(orc_field->getColumnId());
135	0	}
136	105	continue;
137	105	}
138
139		// complex types
140	7	const auto& all_access_paths = slot->all_access_paths();
141	7	process_access_paths(orc_field, all_access_paths, column_ids);
142
143	7	const auto& predicate_access_paths = slot->predicate_access_paths();
144	7	if (!predicate_access_paths.empty()) {
145	6	process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
146	6	}
147	7	}
148
149	17	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
150	17	}
151
152		ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
153	6	const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
154		// map top-level table column position -> orc::Type*
155	6	std::unordered_map<uint64_t, const orc::Type*> table_col_pos_to_orc_type_map;
156	54	for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
157	48	auto orc_sub_type = orc_type->getSubtype(i);
158	48	if (!orc_sub_type) continue;
159
160	48	table_col_pos_to_orc_type_map[i] = orc_sub_type;
161	48	}
162
163	6	std::set<uint64_t> column_ids;
164	6	std::set<uint64_t> filter_column_ids;
165
166		// helper to process access paths for a given top-level orc field
167	6	auto process_access_paths = [](const orc::Type* orc_field,
168	6	const std::vector<TColumnAccessPath>& access_paths,
169	13	std::set<uint64_t>& out_ids) {
170	13	process_nested_access_paths(
171	13	orc_field, access_paths, out_ids,
172	13	[](const orc::Type* type) { return type->getColumnId(); },
173	13	[](const orc::Type* type) { return type->getMaximumColumnId(); },
174	13	HiveOrcNestedColumnUtils::extract_nested_column_ids);
175	13	};
176
177	13	for (const auto* slot : tuple_descriptor->slots()) {
178	13	auto it = table_col_pos_to_orc_type_map.find(slot->col_pos());
179	13	if (it == table_col_pos_to_orc_type_map.end()) {
180		// Column not found in file
181	0	continue;
182	0	}
183	13	const orc::Type* orc_field = it->second;
184
185		// primitive (non-nested) types
186	13	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
187	13	slot->col_type() != TYPE_MAP)) {
188	6	column_ids.insert(orc_field->getColumnId());
189	6	if (slot->is_predicate()) {
190	0	filter_column_ids.insert(orc_field->getColumnId());
191	0	}
192	6	continue;
193	6	}
194
195	7	const auto& all_access_paths = slot->all_access_paths();
196		// complex types
197	7	process_access_paths(orc_field, all_access_paths, column_ids);
198
199	7	const auto& predicate_access_paths = slot->predicate_access_paths();
200	7	if (!predicate_access_paths.empty()) {
201	6	process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
202	6	}
203	7	}
204
205	6	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
206	6	}
207
208	14	Status HiveParquetReader::on_before_init_reader(ReaderInitContext* ctx) {
209	14	_column_descs = ctx->column_descs;
210	14	_fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
211	14	RETURN_IF_ERROR(
212	14	_extract_partition_values(*ctx->range, ctx->tuple_descriptor, _fill_partition_values));
213	51	for (auto& desc : *ctx->column_descs) {
214	51	if (desc.category == ColumnCategory::REGULAR \|\|
215	51	desc.category == ColumnCategory::GENERATED) {
216	51	ctx->column_names.push_back(desc.name);
217	51	}
218	51	}
219
220		// Get file metadata schema (available because _open_file() runs before this hook)
221	14	const FieldDescriptor* field_desc = nullptr;
222	14	RETURN_IF_ERROR(get_file_metadata_schema(&field_desc));
223	14	DCHECK(field_desc != nullptr);
224
225		// Build table_info_node based on config
226	14	if (get_state()->query_options().hive_parquet_use_column_names) {
227	14	RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(ctx->tuple_descriptor, *field_desc,
228	14	ctx->table_info_node, _is_file_slot));
229	14	} else {
230	0	ctx->table_info_node = std::make_shared<StructNode>();
231	0	std::map<std::string, const SlotDescriptor*> slot_map;
232	0	for (const auto& slot : ctx->tuple_descriptor->slots()) {
233	0	slot_map.emplace(slot->col_name_lower_case(), slot);
234	0	}
235
236	0	auto parquet_fields_schema = field_desc->get_fields_schema();
237	0	for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) {
238	0	auto table_column_name = ctx->column_names[idx];
239	0	auto file_index = get_scan_params().column_idxs[idx];
240
241	0	if (file_index >= parquet_fields_schema.size()) {
242	0	ctx->table_info_node->add_not_exist_children(table_column_name);
243	0	} else {
244	0	auto field_node = std::make_shared<Node>();
245	0	RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
246	0	slot_map[table_column_name]->type(), parquet_fields_schema[file_index],
247	0	field_node));
248	0	ctx->table_info_node->add_children(
249	0	table_column_name, parquet_fields_schema[file_index].name, field_node);
250	0	}
251	0	slot_map.erase(table_column_name);
252	0	}
253	0	for (const auto& [partition_col_name, _] : slot_map) {
254	0	ctx->table_info_node->add_not_exist_children(partition_col_name);
255	0	}
256	0	}
257
258		// Compute column_ids for lazy materialization
259	14	auto column_id_result = ColumnIdResult();
260	14	if (get_state()->query_options().hive_parquet_use_column_names) {
261	14	column_id_result = _create_column_ids(field_desc, ctx->tuple_descriptor);
262	14	} else {
263	0	column_id_result =
264	0	_create_column_ids_by_top_level_col_index(field_desc, ctx->tuple_descriptor);
265	0	}
266	14	ctx->column_ids = std::move(column_id_result.column_ids);
267	14	ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);
268
269	14	_filter_groups = true;
270	14	return Status::OK();
271	14	}
272
273		ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
274	20	const TupleDescriptor* tuple_descriptor) {
275		// First, assign column IDs to the field descriptor
276	20	auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
277	20	mutable_field_desc->assign_ids();
278
279		// map top-level table column name (lower-cased) -> FieldSchema*
280	20	std::unordered_map<std::string, const FieldSchema*> table_col_name_to_field_schema_map;
281	278	for (int i = 0; i < field_desc->size(); ++i) {
282	258	auto field_schema = field_desc->get_column(i);
283	258	if (!field_schema) continue;
284
285	258	table_col_name_to_field_schema_map[field_schema->lower_case_name] = field_schema;
286	258	}
287
288	20	std::set<uint64_t> column_ids;
289	20	std::set<uint64_t> filter_column_ids;
290
291		// helper to process access paths for a given top-level parquet field
292	20	auto process_access_paths = [](const FieldSchema* parquet_field,
293	20	const std::vector<TColumnAccessPath>& access_paths,
294	20	std::set<uint64_t>& out_ids) {
295	13	process_nested_access_paths(
296	13	parquet_field, access_paths, out_ids,
297	13	[](const FieldSchema* field) { return field->get_column_id(); },
298	13	[](const FieldSchema* field) { return field->get_max_column_id(); },
299	13	HiveParquetNestedColumnUtils::extract_nested_column_ids);
300	13	};
301
302	64	for (const auto* slot : tuple_descriptor->slots()) {
303	64	auto it = table_col_name_to_field_schema_map.find(slot->col_name_lower_case());
304	64	if (it == table_col_name_to_field_schema_map.end()) {
305		// Column not found in file
306	0	continue;
307	0	}
308	64	auto field_schema = it->second;
309
310		// primitive (non-nested) types
311	64	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
312	64	slot->col_type() != TYPE_MAP)) {
313	57	column_ids.insert(field_schema->column_id);
314
315	57	if (slot->is_predicate()) {
316	0	filter_column_ids.insert(field_schema->column_id);
317	0	}
318	57	continue;
319	57	}
320
321		// complex types
322	7	const auto& all_access_paths = slot->all_access_paths();
323	7	process_access_paths(field_schema, all_access_paths, column_ids);
324
325	7	const auto& predicate_access_paths = slot->predicate_access_paths();
326	7	if (!predicate_access_paths.empty()) {
327	6	process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
328	6	}
329	7	}
330
331	20	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
332	20	}
333
334		ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
335	6	const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
336		// First, assign column IDs to the field descriptor
337	6	auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
338	6	mutable_field_desc->assign_ids();
339
340		// map top-level table column position -> FieldSchema*
341	6	std::unordered_map<uint64_t, const FieldSchema*> table_col_pos_to_field_schema_map;
342	54	for (int i = 0; i < field_desc->size(); ++i) {
343	48	auto field_schema = field_desc->get_column(i);
344	48	if (!field_schema) continue;
345
346	48	table_col_pos_to_field_schema_map[i] = field_schema;
347	48	}
348
349	6	std::set<uint64_t> column_ids;
350	6	std::set<uint64_t> filter_column_ids;
351
352		// helper to process access paths for a given top-level parquet field
353	6	auto process_access_paths = [](const FieldSchema* parquet_field,
354	6	const std::vector<TColumnAccessPath>& access_paths,
355	13	std::set<uint64_t>& out_ids) {
356	13	process_nested_access_paths(
357	13	parquet_field, access_paths, out_ids,
358	13	[](const FieldSchema* field) { return field->get_column_id(); },
359	13	[](const FieldSchema* field) { return field->get_max_column_id(); },
360	13	HiveParquetNestedColumnUtils::extract_nested_column_ids);
361	13	};
362
363	13	for (const auto* slot : tuple_descriptor->slots()) {
364	13	auto it = table_col_pos_to_field_schema_map.find(slot->col_pos());
365	13	if (it == table_col_pos_to_field_schema_map.end()) {
366		// Column not found in file
367	0	continue;
368	0	}
369	13	auto field_schema = it->second;
370
371		// primitive (non-nested) types
372	13	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
373	13	slot->col_type() != TYPE_MAP)) {
374	6	column_ids.insert(field_schema->column_id);
375
376	6	if (slot->is_predicate()) {
377	0	filter_column_ids.insert(field_schema->column_id);
378	0	}
379	6	continue;
380	6	}
381
382		// complex types
383	7	const auto& all_access_paths = slot->all_access_paths();
384	7	process_access_paths(field_schema, all_access_paths, column_ids);
385
386	7	const auto& predicate_access_paths = slot->predicate_access_paths();
387	7	if (!predicate_access_paths.empty()) {
388	6	process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
389	6	}
390	7	}
391
392	6	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
393	6	}
394
395		#include "common/compile_check_end.h"
396		} // namespace doris