be/src/format/table/hive_reader.cpp

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "format/table/hive_reader.h"

#include <vector>

#include "common/status.h"
#include "format/table/hive/hive_orc_nested_column_utils.h"
#include "format/table/hive/hive_parquet_nested_column_utils.h"
#include "format/table/nested_column_access_helper.h"
#include "runtime/runtime_state.h"

namespace doris {

// Hook that prepares `ctx` before the underlying ORC reader is initialized.
//
// Steps, in order:
//   1. Store the column descriptors and the name -> block index map from `ctx`, and extract the
//      partition values for the scan range.
//   2. Walk the column descriptors: REGULAR and GENERATED columns are appended to
//      `ctx->column_names`; a SYNTHESIZED column whose name starts with
//      BeConsts::GLOBAL_ROWID_COL gets a handler that calls fill_topn_row_id().
//   3. Build `ctx->table_info_node`. ORC field names are used when the query option
//      `hive_orc_use_column_names` is set and OrcReader::is_hive1_col_name() is false for the
//      file type; otherwise the top-level file indexes in the scan params are used.
//   4. Fill `ctx->column_ids` / `ctx->filter_column_ids` using the same strategy.
//
// Returns the first non-OK status of any helper, or InternalError when the scan params do not
// line up with the collected column names / tuple slots.
Status HiveOrcReader::on_before_init_reader(ReaderInitContext* ctx) {
    _column_descs = ctx->column_descs;
    _fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
    RETURN_IF_ERROR(_extract_partition_values(*ctx->range, ctx->tuple_descriptor,
                                              _fill_partition_values,
                                              &_fill_partition_value_is_null));
    for (const auto& desc : *ctx->column_descs) {
        if (desc.category == ColumnCategory::REGULAR ||
            desc.category == ColumnCategory::GENERATED) {
            ctx->column_names.push_back(desc.name);
        } else if (desc.category == ColumnCategory::SYNTHESIZED &&
                   desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) {
            auto topn_row_id_column_iter = _create_topn_row_id_column_iterator();
            // The handler is registered now and invoked later, so it keeps its own copy of the
            // column name rather than a reference into the descriptor vector.
            this->register_synthesized_column_handler(
                    desc.name,
                    [iter = std::move(topn_row_id_column_iter), this, col_name = desc.name](
                            Block* block, size_t rows) -> Status {
                        return fill_topn_row_id(iter, col_name, block, rows);
                    });
        }
    }

    // Get file type (available because _create_file_reader() runs before this hook)
    const orc::Type* orc_type_ptr = nullptr;
    RETURN_IF_ERROR(get_file_type(&orc_type_ptr));
    const bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);

    // Single source of truth for both the table_info_node and the column id strategy.
    const bool match_by_name =
            get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name;

    // Build table_info_node based on config
    if (match_by_name) {
        RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(ctx->tuple_descriptor, orc_type_ptr,
                                                        ctx->table_info_node, _is_file_slot));
    } else {
        ctx->table_info_node = std::make_shared<StructNode>();
        // Lower-cased slot name -> slot. Entries left over after the index loop are reported as
        // not existing in the file.
        std::map<std::string, const SlotDescriptor*> slot_map;
        for (const auto& slot : ctx->tuple_descriptor->slots()) {
            slot_map.emplace(slot->col_name_lower_case(), slot);
        }

        const auto& column_idxs = get_scan_params().column_idxs;
        // column_names is indexed with positions of column_idxs below; refuse to read past it.
        if (column_idxs.size() > ctx->column_names.size()) {
            return Status::InternalError(
                    "hive orc reader: {} column indexes but only {} column names",
                    column_idxs.size(), ctx->column_names.size());
        }

        for (size_t idx = 0; idx < column_idxs.size(); idx++) {
            const auto& table_column_name = ctx->column_names[idx];
            const auto file_index = column_idxs[idx];

            if (file_index >= orc_type_ptr->getSubtypeCount()) {
                ctx->table_info_node->add_not_exist_children(table_column_name);
            } else {
                // Look the slot up without inserting: operator[] would add a null entry for an
                // unknown name and the dereference below would crash.
                const auto slot_it = slot_map.find(table_column_name);
                if (slot_it == slot_map.end() || slot_it->second == nullptr) {
                    return Status::InternalError(
                            "hive orc reader: no slot descriptor for column '{}'",
                            table_column_name);
                }
                auto field_node = std::make_shared<Node>();
                RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
                        slot_it->second->type(), orc_type_ptr->getSubtype(file_index),
                        field_node));
                ctx->table_info_node->add_children(
                        table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
            }
            slot_map.erase(table_column_name);
        }
        for (const auto& [partition_col_name, _] : slot_map) {
            ctx->table_info_node->add_not_exist_children(partition_col_name);
        }
    }

    // Compute column_ids
    auto column_id_result =
            match_by_name ? _create_column_ids(orc_type_ptr, ctx->tuple_descriptor)
                          : _create_column_ids_by_top_level_col_index(orc_type_ptr,
                                                                      ctx->tuple_descriptor);
    ctx->column_ids = std::move(column_id_result.column_ids);
    ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);

    // _is_acid is false by default, no need to set explicitly
    return Status::OK();
}

// Collects the ORC column ids needed for the slots of `tuple_descriptor`, pairing each slot with
// a top-level ORC field by lower-cased name.
//
// Returns a ColumnIdResult built from two sets: the ids to read, and the ids that take part in
// predicates. Slots with no matching top-level field contribute nothing.
ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
                                                 const TupleDescriptor* tuple_descriptor) {
    // Lower-cased top-level field name -> ORC type. When two fields share a lower-cased name the
    // later one replaces the earlier one.
    std::unordered_map<std::string, const orc::Type*> orc_fields_by_name;
    const uint64_t top_level_count = orc_type->getSubtypeCount();
    for (uint64_t field_idx = 0; field_idx < top_level_count; ++field_idx) {
        const orc::Type* sub_type = orc_type->getSubtype(field_idx);
        if (sub_type != nullptr) {
            std::string lowered_name = to_lower(orc_type->getFieldName(field_idx));
            orc_fields_by_name.insert_or_assign(std::move(lowered_name), sub_type);
        }
    }

    std::set<uint64_t> column_ids;
    std::set<uint64_t> filter_column_ids;

    // Expands the access paths of one top-level ORC field into `out_ids`.
    auto collect_nested_ids = [](const orc::Type* orc_field,
                                 const std::vector<TColumnAccessPath>& access_paths,
                                 std::set<uint64_t>& out_ids) {
        process_nested_access_paths(
                orc_field, access_paths, out_ids,
                [](const orc::Type* type) { return type->getColumnId(); },
                [](const orc::Type* type) { return type->getMaximumColumnId(); },
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = orc_fields_by_name.find(slot->col_name_lower_case());
        if (found == orc_fields_by_name.end()) {
            // The file has no top-level field with this name.
            continue;
        }
        const orc::Type* orc_field = found->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Complex types: resolve ids through the slot's access paths.
            collect_nested_ids(orc_field, slot->all_access_paths(), column_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(orc_field, predicate_paths, filter_column_ids);
            }
        } else {
            // Primitive types: the field's own column id is all that is needed.
            column_ids.insert(orc_field->getColumnId());
            if (slot->is_predicate()) {
                filter_column_ids.insert(orc_field->getColumnId());
            }
        }
    }

    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
}

// Collects the ORC column ids needed for the slots of `tuple_descriptor`, pairing each slot with
// the top-level ORC field located at index `slot->col_pos()`.
//
// Returns a ColumnIdResult built from two sets: the ids to read, and the ids that take part in
// predicates. Slots whose position is outside the file's top-level fields contribute nothing.
//
// NOTE(review): on_before_init_reader() maps table columns to file columns through
// get_scan_params().column_idxs, while this function relies on slot->col_pos(). Confirm both
// describe the same file position.
ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
        const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
    // Top-level fields are addressed by dense indexes [0, count), so a bounds check replaces a
    // per-call position -> type hash map.
    const uint64_t top_level_count = orc_type->getSubtypeCount();

    std::set<uint64_t> column_ids;
    std::set<uint64_t> filter_column_ids;

    // helper to process access paths for a given top-level orc field
    auto process_access_paths = [](const orc::Type* orc_field,
                                   const std::vector<TColumnAccessPath>& access_paths,
                                   std::set<uint64_t>& out_ids) {
        process_nested_access_paths(
                orc_field, access_paths, out_ids,
                [](const orc::Type* type) { return type->getColumnId(); },
                [](const orc::Type* type) { return type->getMaximumColumnId(); },
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        // Converting to uint64_t makes a negative position (if col_pos() is signed) compare as
        // out of range, so such slots are skipped.
        const auto col_pos = static_cast<uint64_t>(slot->col_pos());
        if (col_pos >= top_level_count) {
            // Column not found in file
            continue;
        }
        const orc::Type* orc_field = orc_type->getSubtype(col_pos);
        if (orc_field == nullptr) {
            continue;
        }

        // primitive (non-nested) types
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
             slot->col_type() != TYPE_MAP)) {
            column_ids.insert(orc_field->getColumnId());
            if (slot->is_predicate()) {
                filter_column_ids.insert(orc_field->getColumnId());
            }
            continue;
        }

        // complex types
        const auto& all_access_paths = slot->all_access_paths();
        process_access_paths(orc_field, all_access_paths, column_ids);

        const auto& predicate_access_paths = slot->predicate_access_paths();
        if (!predicate_access_paths.empty()) {
            process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
        }
    }

    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
}

// Hook that prepares `ctx` before the underlying Parquet reader is initialized.
//
// Steps, in order:
//   1. Store the column descriptors and the name -> block index map from `ctx`, and extract the
//      partition values for the scan range.
//   2. Walk the column descriptors: REGULAR and GENERATED columns are appended to
//      `ctx->column_names`; a SYNTHESIZED column whose name starts with
//      BeConsts::GLOBAL_ROWID_COL gets a handler that calls fill_topn_row_id().
//   3. Build `ctx->table_info_node`, by Parquet field name when the query option
//      `hive_parquet_use_column_names` is set, otherwise by the top-level file indexes in the
//      scan params.
//   4. Fill `ctx->column_ids` / `ctx->filter_column_ids` using the same strategy and set
//      `_filter_groups`.
//
// Returns the first non-OK status of any helper, or InternalError when the scan params do not
// line up with the collected column names / tuple slots.
Status HiveParquetReader::on_before_init_reader(ReaderInitContext* ctx) {
    _column_descs = ctx->column_descs;
    _fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
    RETURN_IF_ERROR(_extract_partition_values(*ctx->range, ctx->tuple_descriptor,
                                              _fill_partition_values,
                                              &_fill_partition_value_is_null));
    for (const auto& desc : *ctx->column_descs) {
        if (desc.category == ColumnCategory::REGULAR ||
            desc.category == ColumnCategory::GENERATED) {
            ctx->column_names.push_back(desc.name);
        } else if (desc.category == ColumnCategory::SYNTHESIZED &&
                   desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) {
            auto topn_row_id_column_iter = _create_topn_row_id_column_iterator();
            // The handler is registered now and invoked later, so it keeps its own copy of the
            // column name rather than a reference into the descriptor vector.
            this->register_synthesized_column_handler(
                    desc.name,
                    [iter = std::move(topn_row_id_column_iter), this, col_name = desc.name](
                            Block* block, size_t rows) -> Status {
                        return fill_topn_row_id(iter, col_name, block, rows);
                    });
        }
    }

    // Get file metadata schema (available because _open_file() runs before this hook)
    const FieldDescriptor* field_desc = nullptr;
    RETURN_IF_ERROR(get_file_metadata_schema(&field_desc));
    DCHECK(field_desc != nullptr);

    // Single source of truth for both the table_info_node and the column id strategy.
    const bool match_by_name = get_state()->query_options().hive_parquet_use_column_names;

    // Build table_info_node based on config
    if (match_by_name) {
        RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(ctx->tuple_descriptor, *field_desc,
                                                            ctx->table_info_node, _is_file_slot));
    } else {
        ctx->table_info_node = std::make_shared<StructNode>();
        // Lower-cased slot name -> slot. Entries left over after the index loop are reported as
        // not existing in the file.
        std::map<std::string, const SlotDescriptor*> slot_map;
        for (const auto& slot : ctx->tuple_descriptor->slots()) {
            slot_map.emplace(slot->col_name_lower_case(), slot);
        }

        // Bind by reference so the top-level schema container is not copied when the accessor
        // returns a reference.
        const auto& parquet_fields_schema = field_desc->get_fields_schema();
        const auto& column_idxs = get_scan_params().column_idxs;
        // column_names is indexed with positions of column_idxs below; refuse to read past it.
        if (column_idxs.size() > ctx->column_names.size()) {
            return Status::InternalError(
                    "hive parquet reader: {} column indexes but only {} column names",
                    column_idxs.size(), ctx->column_names.size());
        }

        for (size_t idx = 0; idx < column_idxs.size(); idx++) {
            const auto& table_column_name = ctx->column_names[idx];
            const auto file_index = column_idxs[idx];

            if (file_index >= parquet_fields_schema.size()) {
                ctx->table_info_node->add_not_exist_children(table_column_name);
            } else {
                // Look the slot up without inserting: operator[] would add a null entry for an
                // unknown name and the dereference below would crash.
                const auto slot_it = slot_map.find(table_column_name);
                if (slot_it == slot_map.end() || slot_it->second == nullptr) {
                    return Status::InternalError(
                            "hive parquet reader: no slot descriptor for column '{}'",
                            table_column_name);
                }
                auto field_node = std::make_shared<Node>();
                RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
                        slot_it->second->type(), parquet_fields_schema[file_index], field_node));
                ctx->table_info_node->add_children(
                        table_column_name, parquet_fields_schema[file_index].name, field_node);
            }
            slot_map.erase(table_column_name);
        }
        for (const auto& [partition_col_name, _] : slot_map) {
            ctx->table_info_node->add_not_exist_children(partition_col_name);
        }
    }

    // Compute column_ids for lazy materialization
    auto column_id_result =
            match_by_name ? _create_column_ids(field_desc, ctx->tuple_descriptor)
                          : _create_column_ids_by_top_level_col_index(field_desc,
                                                                      ctx->tuple_descriptor);
    ctx->column_ids = std::move(column_id_result.column_ids);
    ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);

    _filter_groups = true;
    return Status::OK();
}

// Collects the Parquet column ids needed for the slots of `tuple_descriptor`, pairing each slot
// with a top-level Parquet field by lower-cased name.
//
// Calls assign_ids() on `field_desc` (through a const_cast) before reading any column id.
// Returns a ColumnIdResult built from two sets: the ids to read, and the ids that take part in
// predicates. Slots with no matching top-level field contribute nothing.
ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
                                                     const TupleDescriptor* tuple_descriptor) {
    // Column ids must be assigned on the descriptor before they are read below.
    const_cast<FieldDescriptor*>(field_desc)->assign_ids();

    // Lower-cased top-level field name -> schema. When two fields share a lower-cased name the
    // later one replaces the earlier one.
    std::unordered_map<std::string, const FieldSchema*> fields_by_name;
    for (int field_idx = 0; field_idx < field_desc->size(); ++field_idx) {
        const FieldSchema* schema = field_desc->get_column(field_idx);
        if (schema != nullptr) {
            fields_by_name.insert_or_assign(schema->lower_case_name, schema);
        }
    }

    std::set<uint64_t> column_ids;
    std::set<uint64_t> filter_column_ids;

    // Expands the access paths of one top-level Parquet field into `out_ids`.
    auto collect_nested_ids = [](const FieldSchema* parquet_field,
                                 const std::vector<TColumnAccessPath>& access_paths,
                                 std::set<uint64_t>& out_ids) {
        process_nested_access_paths(
                parquet_field, access_paths, out_ids,
                [](const FieldSchema* field) { return field->get_column_id(); },
                [](const FieldSchema* field) { return field->get_max_column_id(); },
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = fields_by_name.find(slot->col_name_lower_case());
        if (found == fields_by_name.end()) {
            // The file has no top-level field with this name.
            continue;
        }
        const FieldSchema* field_schema = found->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Complex types: resolve ids through the slot's access paths.
            collect_nested_ids(field_schema, slot->all_access_paths(), column_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(field_schema, predicate_paths, filter_column_ids);
            }
        } else {
            // Primitive types: the field's own column id is all that is needed.
            column_ids.insert(field_schema->column_id);
            if (slot->is_predicate()) {
                filter_column_ids.insert(field_schema->column_id);
            }
        }
    }

    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
}

// Collects the Parquet column ids needed for the slots of `tuple_descriptor`, pairing each slot
// with the top-level Parquet field located at index `slot->col_pos()`.
//
// Calls assign_ids() on `field_desc` (through a const_cast) before reading any column id.
// Returns a ColumnIdResult built from two sets: the ids to read, and the ids that take part in
// predicates. Slots whose position is outside the file's top-level fields contribute nothing.
//
// NOTE(review): on_before_init_reader() maps table columns to file columns through
// get_scan_params().column_idxs, while this function relies on slot->col_pos(). Confirm both
// describe the same file position.
ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
        const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
    // First, assign column IDs to the field descriptor
    auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
    mutable_field_desc->assign_ids();

    // Top-level fields are addressed by dense indexes [0, count), so a bounds check replaces a
    // per-call position -> schema hash map.
    const auto file_col_count = static_cast<uint64_t>(field_desc->size());

    std::set<uint64_t> column_ids;
    std::set<uint64_t> filter_column_ids;

    // helper to process access paths for a given top-level parquet field
    auto process_access_paths = [](const FieldSchema* parquet_field,
                                   const std::vector<TColumnAccessPath>& access_paths,
                                   std::set<uint64_t>& out_ids) {
        process_nested_access_paths(
                parquet_field, access_paths, out_ids,
                [](const FieldSchema* field) { return field->get_column_id(); },
                [](const FieldSchema* field) { return field->get_max_column_id(); },
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        // Converting to uint64_t makes a negative position (if col_pos() is signed) compare as
        // out of range, so such slots are skipped.
        const auto col_pos = static_cast<uint64_t>(slot->col_pos());
        if (col_pos >= file_col_count) {
            // Column not found in file
            continue;
        }
        // col_pos is below size() here, so narrowing back to int is lossless.
        const FieldSchema* field_schema = field_desc->get_column(static_cast<int>(col_pos));
        if (field_schema == nullptr) {
            continue;
        }

        // primitive (non-nested) types
        if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
             slot->col_type() != TYPE_MAP)) {
            column_ids.insert(field_schema->column_id);

            if (slot->is_predicate()) {
                filter_column_ids.insert(field_schema->column_id);
            }
            continue;
        }

        // complex types
        const auto& all_access_paths = slot->all_access_paths();
        process_access_paths(field_schema, all_access_paths, column_ids);

        const auto& predicate_access_paths = slot->predicate_access_paths();
        if (!predicate_access_paths.empty()) {
            process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
        }
    }

    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
}

} // namespace doris

Coverage Report

Created: 2026-06-02 16:01

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17
18		#include "format/table/hive_reader.h"
19
20		#include <vector>
21
22		#include "common/status.h"
23		#include "format/table/hive/hive_orc_nested_column_utils.h"
24		#include "format/table/hive/hive_parquet_nested_column_utils.h"
25		#include "format/table/nested_column_access_helper.h"
26		#include "runtime/runtime_state.h"
27
28		namespace doris {
29
30	15.3k	Status HiveOrcReader::on_before_init_reader(ReaderInitContext* ctx) {
31	15.3k	_column_descs = ctx->column_descs;
32	15.3k	_fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
33	15.3k	RETURN_IF_ERROR(_extract_partition_values(*ctx->range, ctx->tuple_descriptor,
34	15.3k	_fill_partition_values,
35	15.3k	&_fill_partition_value_is_null));
36	63.5k	for (const auto& desc : *ctx->column_descs) {
37	63.5k	if (desc.category == ColumnCategory::REGULAR \|\|
38	63.5k	desc.category == ColumnCategory::GENERATED) {
39	53.7k	ctx->column_names.push_back(desc.name);
40	53.7k	} else if (desc.category == ColumnCategory::SYNTHESIZED &&
41	9.82k	desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) {
42	4.98k	auto topn_row_id_column_iter = _create_topn_row_id_column_iterator();
43	4.98k	this->register_synthesized_column_handler(
44	4.98k	desc.name,
45	4.98k	[iter = std::move(topn_row_id_column_iter), this, &desc](
46	27.2k	Block* block, size_t rows) -> Status {
47	27.2k	return fill_topn_row_id(iter, desc.name, block, rows);
48	27.2k	});
49	4.98k	continue;
50	4.98k	}
51	63.5k	}
52
53		// Get file type (available because _create_file_reader() runs before this hook)
54	15.3k	const orc::Type* orc_type_ptr = nullptr;
55	15.3k	RETURN_IF_ERROR(get_file_type(&orc_type_ptr));
56	15.3k	bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);
57
58		// Build table_info_node based on config
59	15.3k	if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) {
60	15.0k	RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(ctx->tuple_descriptor, orc_type_ptr,
61	15.0k	ctx->table_info_node, _is_file_slot));
62	15.0k	} else {
63	348	ctx->table_info_node = std::make_shared<StructNode>();
64	348	std::map<std::string, const SlotDescriptor*> slot_map;
65	974	for (const auto& slot : ctx->tuple_descriptor->slots()) {
66	974	slot_map.emplace(slot->col_name_lower_case(), slot);
67	974	}
68
69	1.20k	for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) {
70	852	auto table_column_name = ctx->column_names[idx];
71	852	auto file_index = get_scan_params().column_idxs[idx];
72
73	852	if (file_index >= orc_type_ptr->getSubtypeCount()) {
74	112	ctx->table_info_node->add_not_exist_children(table_column_name);
75	740	} else {
76	740	auto field_node = std::make_shared<Node>();
77	740	RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
78	740	slot_map[table_column_name]->type(), orc_type_ptr->getSubtype(file_index),
79	740	field_node));
80	740	ctx->table_info_node->add_children(
81	740	table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
82	740	}
83	852	slot_map.erase(table_column_name);
84	852	}
85	348	for (const auto& [partition_col_name, _] : slot_map) {
86	116	ctx->table_info_node->add_not_exist_children(partition_col_name);
87	116	}
88	348	}
89
90		// Compute column_ids
91	15.3k	auto column_id_result = ColumnIdResult();
92	15.3k	if (get_state()->query_options().hive_orc_use_column_names && !is_hive_col_name) {
93	14.9k	column_id_result = _create_column_ids(orc_type_ptr, ctx->tuple_descriptor);
94	14.9k	} else {
95	390	column_id_result =
96	390	_create_column_ids_by_top_level_col_index(orc_type_ptr, ctx->tuple_descriptor);
97	390	}
98	15.3k	ctx->column_ids = std::move(column_id_result.column_ids);
99	15.3k	ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);
100
101		// _is_acid is false by default, no need to set explicitly
102	15.3k	return Status::OK();
103	15.3k	}
104
105		ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
106	14.9k	const TupleDescriptor* tuple_descriptor) {
107		// map top-level table column name (lower-cased) -> orc::Type*
108	14.9k	std::unordered_map<std::string, const orc::Type*> table_col_name_to_orc_type_map;
109	1.26M	for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
110	1.25M	auto orc_sub_type = orc_type->getSubtype(i);
111	1.25M	if (!orc_sub_type) continue;
112
113	1.25M	std::string table_col_name = to_lower(orc_type->getFieldName(i));
114	1.25M	table_col_name_to_orc_type_map[table_col_name] = orc_sub_type;
115	1.25M	}
116
117	14.9k	std::set<uint64_t> column_ids;
118	14.9k	std::set<uint64_t> filter_column_ids;
119
120		// helper to process access paths for a given top-level orc field
121	14.9k	auto process_access_paths = [](const orc::Type* orc_field,
122	14.9k	const std::vector<TColumnAccessPath>& access_paths,
123	14.9k	std::set<uint64_t>& out_ids) {
124	10.1k	process_nested_access_paths(
125	10.1k	orc_field, access_paths, out_ids,
126	10.1k	[](const orc::Type* type) { return type->getColumnId(); },
127	10.1k	[](const orc::Type* type) { return type->getMaximumColumnId(); },
128	10.1k	HiveOrcNestedColumnUtils::extract_nested_column_ids);
129	10.1k	};
130
131	62.5k	for (const auto* slot : tuple_descriptor->slots()) {
132	62.5k	auto it = table_col_name_to_orc_type_map.find(slot->col_name_lower_case());
133	62.5k	if (it == table_col_name_to_orc_type_map.end()) {
134		// Column not found in file
135	10.4k	continue;
136	10.4k	}
137	52.1k	const orc::Type* orc_field = it->second;
138
139		// primitive (non-nested) types
140	52.1k	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
141	52.1k	slot->col_type() != TYPE_MAP)) {
142	41.9k	column_ids.insert(orc_field->getColumnId());
143	41.9k	if (slot->is_predicate()) {
144	10.7k	filter_column_ids.insert(orc_field->getColumnId());
145	10.7k	}
146	41.9k	continue;
147	41.9k	}
148
149		// complex types
150	10.1k	const auto& all_access_paths = slot->all_access_paths();
151	10.1k	process_access_paths(orc_field, all_access_paths, column_ids);
152
153	10.1k	const auto& predicate_access_paths = slot->predicate_access_paths();
154	10.1k	if (!predicate_access_paths.empty()) {
155	94	process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
156	94	}
157	10.1k	}
158
159	14.9k	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
160	14.9k	}
161
162		ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
163	312	const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
164		// map top-level table column position -> orc::Type*
165	312	std::unordered_map<uint64_t, const orc::Type*> table_col_pos_to_orc_type_map;
166	1.49k	for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
167	1.18k	auto orc_sub_type = orc_type->getSubtype(i);
168	1.18k	if (!orc_sub_type) continue;
169
170	1.18k	table_col_pos_to_orc_type_map[i] = orc_sub_type;
171	1.18k	}
172
173	312	std::set<uint64_t> column_ids;
174	312	std::set<uint64_t> filter_column_ids;
175
176		// helper to process access paths for a given top-level orc field
177	312	auto process_access_paths = [](const orc::Type* orc_field,
178	312	const std::vector<TColumnAccessPath>& access_paths,
179	312	std::set<uint64_t>& out_ids) {
180	13	process_nested_access_paths(
181	13	orc_field, access_paths, out_ids,
182	13	[](const orc::Type* type) { return type->getColumnId(); },
183	13	[](const orc::Type* type) { return type->getMaximumColumnId(); },
184	13	HiveOrcNestedColumnUtils::extract_nested_column_ids);
185	13	};
186
187	989	for (const auto* slot : tuple_descriptor->slots()) {
188	989	auto it = table_col_pos_to_orc_type_map.find(slot->col_pos());
189	989	if (it == table_col_pos_to_orc_type_map.end()) {
190		// Column not found in file
191	972	continue;
192	972	}
193	17	const orc::Type* orc_field = it->second;
194
195		// primitive (non-nested) types
196	17	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
197	17	slot->col_type() != TYPE_MAP)) {
198	6	column_ids.insert(orc_field->getColumnId());
199	6	if (slot->is_predicate()) {
200	0	filter_column_ids.insert(orc_field->getColumnId());
201	0	}
202	6	continue;
203	6	}
204
205	11	const auto& all_access_paths = slot->all_access_paths();
206		// complex types
207	11	process_access_paths(orc_field, all_access_paths, column_ids);
208
209	11	const auto& predicate_access_paths = slot->predicate_access_paths();
210	11	if (!predicate_access_paths.empty()) {
211	6	process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
212	6	}
213	11	}
214
215	312	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
216	312	}
217
218	10.4k	Status HiveParquetReader::on_before_init_reader(ReaderInitContext* ctx) {
219	10.4k	_column_descs = ctx->column_descs;
220	10.4k	_fill_col_name_to_block_idx = ctx->col_name_to_block_idx;
221	10.4k	RETURN_IF_ERROR(_extract_partition_values(*ctx->range, ctx->tuple_descriptor,
222	10.4k	_fill_partition_values,
223	10.4k	&_fill_partition_value_is_null));
224	56.1k	for (const auto& desc : *ctx->column_descs) {
225	56.1k	if (desc.category == ColumnCategory::REGULAR \|\|
226	56.1k	desc.category == ColumnCategory::GENERATED) {
227	49.9k	ctx->column_names.push_back(desc.name);
228	49.9k	} else if (desc.category == ColumnCategory::SYNTHESIZED &&
229	6.17k	desc.name.starts_with(BeConsts::GLOBAL_ROWID_COL)) {
230	1.83k	auto topn_row_id_column_iter = _create_topn_row_id_column_iterator();
231	1.83k	this->register_synthesized_column_handler(
232	1.83k	desc.name,
233	1.83k	[iter = std::move(topn_row_id_column_iter), this, &desc](
234	7.71k	Block* block, size_t rows) -> Status {
235	7.71k	return fill_topn_row_id(iter, desc.name, block, rows);
236	7.71k	});
237	1.83k	continue;
238	1.83k	}
239	56.1k	}
240
241		// Get file metadata schema (available because _open_file() runs before this hook)
242	10.4k	const FieldDescriptor* field_desc = nullptr;
243	10.4k	RETURN_IF_ERROR(get_file_metadata_schema(&field_desc));
244	10.4k	DCHECK(field_desc != nullptr);
245
246		// Build table_info_node based on config
247	10.4k	if (get_state()->query_options().hive_parquet_use_column_names) {
248	10.2k	RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(ctx->tuple_descriptor, *field_desc,
249	10.2k	ctx->table_info_node, _is_file_slot));
250	10.2k	} else {
251	264	ctx->table_info_node = std::make_shared<StructNode>();
252	264	std::map<std::string, const SlotDescriptor*> slot_map;
253	880	for (const auto& slot : ctx->tuple_descriptor->slots()) {
254	880	slot_map.emplace(slot->col_name_lower_case(), slot);
255	880	}
256
257	264	auto parquet_fields_schema = field_desc->get_fields_schema();
258	1.02k	for (size_t idx = 0; idx < get_scan_params().column_idxs.size(); idx++) {
259	762	auto table_column_name = ctx->column_names[idx];
260	762	auto file_index = get_scan_params().column_idxs[idx];
261
262	762	if (file_index >= parquet_fields_schema.size()) {
263	112	ctx->table_info_node->add_not_exist_children(table_column_name);
264	650	} else {
265	650	auto field_node = std::make_shared<Node>();
266	650	RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
267	650	slot_map[table_column_name]->type(), parquet_fields_schema[file_index],
268	650	field_node));
269	650	ctx->table_info_node->add_children(
270	650	table_column_name, parquet_fields_schema[file_index].name, field_node);
271	650	}
272	762	slot_map.erase(table_column_name);
273	762	}
274	264	for (const auto& [partition_col_name, _] : slot_map) {
275	116	ctx->table_info_node->add_not_exist_children(partition_col_name);
276	116	}
277	264	}
278
279		// Compute column_ids for lazy materialization
280	10.4k	auto column_id_result = ColumnIdResult();
281	10.4k	if (get_state()->query_options().hive_parquet_use_column_names) {
282	10.2k	column_id_result = _create_column_ids(field_desc, ctx->tuple_descriptor);
283	10.2k	} else {
284	274	column_id_result =
285	274	_create_column_ids_by_top_level_col_index(field_desc, ctx->tuple_descriptor);
286	274	}
287	10.4k	ctx->column_ids = std::move(column_id_result.column_ids);
288	10.4k	ctx->filter_column_ids = std::move(column_id_result.filter_column_ids);
289
290	10.4k	_filter_groups = true;
291	10.4k	return Status::OK();
292	10.4k	}
293
294		ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
295	10.2k	const TupleDescriptor* tuple_descriptor) {
296		// First, assign column IDs to the field descriptor
297	10.2k	auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
298	10.2k	mutable_field_desc->assign_ids();
299
300		// map top-level table column name (lower-cased) -> FieldSchema*
301	10.2k	std::unordered_map<std::string, const FieldSchema*> table_col_name_to_field_schema_map;
302	87.7k	for (int i = 0; i < field_desc->size(); ++i) {
303	77.5k	auto field_schema = field_desc->get_column(i);
304	77.5k	if (!field_schema) continue;
305
306	77.5k	table_col_name_to_field_schema_map[field_schema->lower_case_name] = field_schema;
307	77.5k	}
308
309	10.2k	std::set<uint64_t> column_ids;
310	10.2k	std::set<uint64_t> filter_column_ids;
311
312		// helper to process access paths for a given top-level parquet field
313	10.2k	auto process_access_paths = [](const FieldSchema* parquet_field,
314	10.2k	const std::vector<TColumnAccessPath>& access_paths,
315	15.7k	std::set<uint64_t>& out_ids) {
316	15.7k	process_nested_access_paths(
317	15.7k	parquet_field, access_paths, out_ids,
318	15.8k	[](const FieldSchema* field) { return field->get_column_id(); },
319	15.7k	[](const FieldSchema* field) { return field->get_max_column_id(); },
320	15.7k	HiveParquetNestedColumnUtils::extract_nested_column_ids);
321	15.7k	};
322
323	55.1k	for (const auto* slot : tuple_descriptor->slots()) {
324	55.1k	auto it = table_col_name_to_field_schema_map.find(slot->col_name_lower_case());
325	55.1k	if (it == table_col_name_to_field_schema_map.end()) {
326		// Column not found in file
327	6.67k	continue;
328	6.67k	}
329	48.4k	auto field_schema = it->second;
330
331		// primitive (non-nested) types
332	48.4k	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
333	48.4k	slot->col_type() != TYPE_MAP)) {
334	32.7k	column_ids.insert(field_schema->column_id);
335
336	32.7k	if (slot->is_predicate()) {
337	5.14k	filter_column_ids.insert(field_schema->column_id);
338	5.14k	}
339	32.7k	continue;
340	32.7k	}
341
342		// complex types
343	15.7k	const auto& all_access_paths = slot->all_access_paths();
344	15.7k	process_access_paths(field_schema, all_access_paths, column_ids);
345
346	15.7k	const auto& predicate_access_paths = slot->predicate_access_paths();
347	15.7k	if (!predicate_access_paths.empty()) {
348	98	process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
349	98	}
350	15.7k	}
351
352	10.2k	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
353	10.2k	}
354
355		ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
356	268	const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
357		// First, assign column IDs to the field descriptor
358	268	auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
359	268	mutable_field_desc->assign_ids();
360
361		// map top-level table column position -> FieldSchema*
362	268	std::unordered_map<uint64_t, const FieldSchema*> table_col_pos_to_field_schema_map;
363	1.31k	for (int i = 0; i < field_desc->size(); ++i) {
364	1.05k	auto field_schema = field_desc->get_column(i);
365	1.05k	if (!field_schema) continue;
366
367	1.05k	table_col_pos_to_field_schema_map[i] = field_schema;
368	1.05k	}
369
370	268	std::set<uint64_t> column_ids;
371	268	std::set<uint64_t> filter_column_ids;
372
373		// helper to process access paths for a given top-level parquet field
374	268	auto process_access_paths = [](const FieldSchema* parquet_field,
375	268	const std::vector<TColumnAccessPath>& access_paths,
376	268	std::set<uint64_t>& out_ids) {
377	13	process_nested_access_paths(
378	13	parquet_field, access_paths, out_ids,
379	13	[](const FieldSchema* field) { return field->get_column_id(); },
380	13	[](const FieldSchema* field) { return field->get_max_column_id(); },
381	13	HiveParquetNestedColumnUtils::extract_nested_column_ids);
382	13	};
383
384	885	for (const auto* slot : tuple_descriptor->slots()) {
385	885	auto it = table_col_pos_to_field_schema_map.find(slot->col_pos());
386	885	if (it == table_col_pos_to_field_schema_map.end()) {
387		// Column not found in file
388	872	continue;
389	872	}
390	13	auto field_schema = it->second;
391
392		// primitive (non-nested) types
393	13	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
394	13	slot->col_type() != TYPE_MAP)) {
395	6	column_ids.insert(field_schema->column_id);
396
397	6	if (slot->is_predicate()) {
398	0	filter_column_ids.insert(field_schema->column_id);
399	0	}
400	6	continue;
401	6	}
402
403		// complex types
404	7	const auto& all_access_paths = slot->all_access_paths();
405	7	process_access_paths(field_schema, all_access_paths, column_ids);
406
407	7	const auto& predicate_access_paths = slot->predicate_access_paths();
408	7	if (!predicate_access_paths.empty()) {
409	6	process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
410	6	}
411	7	}
412
413	268	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
414	268	}
415
416		} // namespace doris