be/src/format/table/hive_reader.cpp

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "format/table/hive_reader.h"

#include <vector>

#include "common/status.h"
#include "format/table/hive/hive_orc_nested_column_utils.h"
#include "format/table/hive/hive_parquet_nested_column_utils.h"
#include "format/table/nested_column_access_helper.h"
#include "runtime/runtime_state.h"

namespace doris {

// Fetches the next block by delegating to the wrapped file-format reader.
// The status produced by that reader (OK or error) is returned to the caller as-is.
Status HiveReader::get_next_block_inner(Block* block, size_t* read_rows, bool* eof) {
    return _file_format_reader->get_next_block(block, read_rows, eof);
}

// Initializes the wrapped ORC reader for a Hive table scan.
//
// Steps performed:
//   1. Fetch the ORC file schema and pick the column matching mode. Columns are matched by
//      name only when the query option `hive_orc_use_column_names` is set and
//      OrcReader::is_hive1_col_name() returns false for the file schema; otherwise top-level
//      columns are matched by index.
//   2. Fill `table_info_node_ptr` with the table-column -> file-column mapping.
//   3. Compute the ORC column ids to read and the subset used by filters.
//   4. Forward everything to OrcReader::init_reader().
//
// Returns the first non-OK status produced by any step. In index mode an InternalError is
// returned when `read_table_col_names` has fewer entries than `_params.column_idxs`, or when a
// column that exists in the file has no matching slot in `tuple_descriptor`.
Status HiveOrcReader::init_reader(
        const std::vector<std::string>& read_table_col_names,
        std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
        const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor,
        const RowDescriptor* row_descriptor,
        const VExprContextSPtrs* not_single_slot_filter_conjuncts,
        const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
    auto* orc_reader = static_cast<OrcReader*>(_file_format_reader.get());

    const orc::Type* orc_type_ptr = nullptr;
    RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr));
    const bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);

    // Evaluated once so that the schema mapping and the column-id computation below always
    // use the same matching mode.
    const bool match_by_name =
            _state->query_options().hive_orc_use_column_names && !is_hive_col_name;

    if (match_by_name) {
        // Directly use the table column name to match the file column name, but pay attention to the case issue.
        RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(tuple_descriptor, orc_type_ptr,
                                                        table_info_node_ptr, _is_file_slot));
    } else {
        // hive1 / use index
        // `read_table_col_names` is indexed in lock-step with `_params.column_idxs` below, so
        // it must provide at least as many entries.
        if (read_table_col_names.size() < _params.column_idxs.size()) {
            return Status::InternalError(
                    "Hive ORC reader got {} column names to read but {} column indexes",
                    read_table_col_names.size(), _params.column_idxs.size());
        }

        std::map<std::string, const SlotDescriptor*> slot_map; // table_name to slot
        for (const auto& slot : tuple_descriptor->slots()) {
            slot_map.emplace(slot->col_name_lower_case(), slot);
        }

        // For top-level columns, use indexes to match, and for sub-columns, still use name to match columns.
        for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) {
            const auto& table_column_name = read_table_col_names[idx];
            const auto file_index = _params.column_idxs[idx];

            if (file_index >= orc_type_ptr->getSubtypeCount()) {
                // The index points past the file's top-level columns: the column is not
                // present in this file.
                table_info_node_ptr->add_not_exist_children(table_column_name);
            } else {
                // Look the slot up with find(): operator[] would insert a null pointer for an
                // unknown name and the dereference below would then crash.
                const auto slot_it = slot_map.find(table_column_name);
                if (slot_it == slot_map.end()) {
                    return Status::InternalError(
                            "Hive ORC reader cannot find a slot for table column '{}'",
                            table_column_name);
                }

                auto field_node = std::make_shared<Node>();
                // For sub-columns, still use name to match columns.
                RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
                        slot_it->second->type(), orc_type_ptr->getSubtype(file_index),
                        field_node));
                table_info_node_ptr->add_children(
                        table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
            }
            slot_map.erase(table_column_name);
        }
        // Slots that were not listed in `read_table_col_names` are registered as not existing
        // in the file.
        for (const auto& [partition_col_name, _] : slot_map) {
            table_info_node_ptr->add_not_exist_children(partition_col_name);
        }
    }

    const ColumnIdResult column_id_result =
            match_by_name
                    ? _create_column_ids(orc_type_ptr, tuple_descriptor)
                    : _create_column_ids_by_top_level_col_index(orc_type_ptr, tuple_descriptor);

    const auto& column_ids = column_id_result.column_ids;
    const auto& filter_column_ids = column_id_result.filter_column_ids;

    // The literal `false` is forwarded unchanged; its meaning is defined by
    // OrcReader::init_reader().
    return orc_reader->init_reader(&read_table_col_names, col_name_to_block_idx, conjuncts, false,
                                   tuple_descriptor, row_descriptor,
                                   not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts,
                                   table_info_node_ptr, column_ids, filter_column_ids);
}

// Computes the ORC column ids to read and the subset needed by predicates, pairing each tuple
// slot with the top-level ORC field that has the same lower-cased name.
//
// Slots with no same-named top-level field in the file are skipped. Scalar slots contribute
// the field's own column id; STRUCT/ARRAY/MAP slots contribute the ids derived from the slot's
// access paths by process_nested_access_paths().
ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
                                                 const TupleDescriptor* tuple_descriptor) {
    // Lower-cased top-level file field name -> ORC type of that field.
    std::unordered_map<std::string, const orc::Type*> orc_field_by_name;
    for (uint64_t pos = 0; pos < orc_type->getSubtypeCount(); ++pos) {
        const orc::Type* sub_type = orc_type->getSubtype(pos);
        if (sub_type != nullptr) {
            // A later field with the same lower-cased name overwrites an earlier one.
            orc_field_by_name[to_lower(orc_type->getFieldName(pos))] = sub_type;
        }
    }

    std::set<uint64_t> column_ids;
    std::set<uint64_t> filter_column_ids;

    // Resolves the given access paths against one top-level ORC field and adds the resulting
    // ids to `target_ids`.
    auto collect_nested_ids = [](const orc::Type* top_level_field,
                                 const std::vector<TColumnAccessPath>& paths,
                                 std::set<uint64_t>& target_ids) {
        process_nested_access_paths(
                top_level_field, paths, target_ids,
                [](const orc::Type* t) { return t->getColumnId(); },
                [](const orc::Type* t) { return t->getMaximumColumnId(); },
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = orc_field_by_name.find(slot->col_name_lower_case());
        if (found == orc_field_by_name.end()) {
            // The file has no top-level column with this name.
            continue;
        }
        const orc::Type* file_field = found->second;

        const bool is_nested_slot = slot->col_type() == TYPE_STRUCT ||
                                    slot->col_type() == TYPE_ARRAY ||
                                    slot->col_type() == TYPE_MAP;
        if (is_nested_slot) {
            // Complex type: ids come from the access paths recorded on the slot.
            const auto& read_paths = slot->all_access_paths();
            collect_nested_ids(file_field, read_paths, column_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, filter_column_ids);
            }
        } else {
            // Scalar type: the field maps to exactly one column id.
            column_ids.insert(file_field->getColumnId());
            if (slot->is_predicate()) {
                filter_column_ids.insert(file_field->getColumnId());
            }
        }
    }

    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
}

// Computes the ORC column ids to read and the subset needed by predicates, pairing each tuple
// slot with the top-level ORC field whose position equals `slot->col_pos()`.
//
// Slots whose position has no top-level field in the file are skipped. Scalar slots contribute
// the field's own column id; STRUCT/ARRAY/MAP slots contribute the ids derived from the slot's
// access paths by process_nested_access_paths().
ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
        const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
    // Top-level position in the file -> ORC type of that field.
    std::unordered_map<uint64_t, const orc::Type*> orc_field_by_pos;
    for (uint64_t pos = 0; pos < orc_type->getSubtypeCount(); ++pos) {
        const orc::Type* sub_type = orc_type->getSubtype(pos);
        if (sub_type != nullptr) {
            orc_field_by_pos[pos] = sub_type;
        }
    }

    std::set<uint64_t> column_ids;
    std::set<uint64_t> filter_column_ids;

    // Resolves the given access paths against one top-level ORC field and adds the resulting
    // ids to `target_ids`.
    auto collect_nested_ids = [](const orc::Type* top_level_field,
                                 const std::vector<TColumnAccessPath>& paths,
                                 std::set<uint64_t>& target_ids) {
        process_nested_access_paths(
                top_level_field, paths, target_ids,
                [](const orc::Type* t) { return t->getColumnId(); },
                [](const orc::Type* t) { return t->getMaximumColumnId(); },
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = orc_field_by_pos.find(slot->col_pos());
        if (found == orc_field_by_pos.end()) {
            // The file has no top-level column at this position.
            continue;
        }
        const orc::Type* file_field = found->second;

        const bool is_nested_slot = slot->col_type() == TYPE_STRUCT ||
                                    slot->col_type() == TYPE_ARRAY ||
                                    slot->col_type() == TYPE_MAP;
        if (is_nested_slot) {
            // Complex type: ids come from the access paths recorded on the slot.
            const auto& read_paths = slot->all_access_paths();
            collect_nested_ids(file_field, read_paths, column_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, filter_column_ids);
            }
        } else {
            // Scalar type: the field maps to exactly one column id.
            column_ids.insert(file_field->getColumnId());
            if (slot->is_predicate()) {
                filter_column_ids.insert(file_field->getColumnId());
            }
        }
    }

    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
}

// Initializes the wrapped Parquet reader for a Hive table scan.
//
// Steps performed:
//   1. Fetch the Parquet file schema.
//   2. Fill `table_info_node_ptr` with the table-column -> file-column mapping, matching
//      top-level columns by name when the query option `hive_parquet_use_column_names` is set
//      and by index otherwise.
//   3. Compute the Parquet column ids to read and the subset used by filters.
//   4. Call init_row_filters() and forward everything to ParquetReader::init_reader().
//
// Returns the first non-OK status produced by any step. In index mode an InternalError is
// returned when `read_table_col_names` has fewer entries than `_params.column_idxs`, or when a
// column that exists in the file has no matching slot in `tuple_descriptor`.
Status HiveParquetReader::init_reader(
        const std::vector<std::string>& read_table_col_names,
        std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
        const VExprContextSPtrs& conjuncts,
        phmap::flat_hash_map<int, std::vector<std::shared_ptr<ColumnPredicate>>>&
                slot_id_to_predicates,
        const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor,
        const std::unordered_map<std::string, int>* colname_to_slot_id,
        const VExprContextSPtrs* not_single_slot_filter_conjuncts,
        const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
    auto* parquet_reader = static_cast<ParquetReader*>(_file_format_reader.get());
    const FieldDescriptor* field_desc = nullptr;
    RETURN_IF_ERROR(parquet_reader->get_file_metadata_schema(&field_desc));
    DCHECK(field_desc != nullptr);

    // Read once so that the schema mapping and the column-id computation below always use the
    // same matching mode.
    const bool match_by_name = _state->query_options().hive_parquet_use_column_names;

    if (match_by_name) {
        // Directly use the table column name to match the file column name, but pay attention to the case issue.
        RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(tuple_descriptor, *field_desc,
                                                            table_info_node_ptr, _is_file_slot));
    } else { // use idx
        // `read_table_col_names` is indexed in lock-step with `_params.column_idxs` below, so
        // it must provide at least as many entries.
        if (read_table_col_names.size() < _params.column_idxs.size()) {
            return Status::InternalError(
                    "Hive Parquet reader got {} column names to read but {} column indexes",
                    read_table_col_names.size(), _params.column_idxs.size());
        }

        std::map<std::string, const SlotDescriptor*> slot_map; // table_name to slot
        for (const auto& slot : tuple_descriptor->slots()) {
            slot_map.emplace(slot->col_name_lower_case(), slot);
        }

        // For top-level columns, use indexes to match, and for sub-columns, still use name to match columns.
        // Bound by reference: the schema is only read here, so copying it is unnecessary.
        const auto& parquet_fields_schema = field_desc->get_fields_schema();
        for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) {
            const auto& table_column_name = read_table_col_names[idx];
            const auto file_index = _params.column_idxs[idx];

            if (file_index >= parquet_fields_schema.size()) {
                // Non-partitioning columns, which may be columns added later.
                table_info_node_ptr->add_not_exist_children(table_column_name);
            } else {
                // Non-partitioning columns, columns that exist in both the table and the file.
                // Look the slot up with find(): operator[] would insert a null pointer for an
                // unknown name and the dereference below would then crash.
                const auto slot_it = slot_map.find(table_column_name);
                if (slot_it == slot_map.end()) {
                    return Status::InternalError(
                            "Hive Parquet reader cannot find a slot for table column '{}'",
                            table_column_name);
                }

                auto field_node = std::make_shared<Node>();
                // for sub-columns, still use name to match columns.
                RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
                        slot_it->second->type(), parquet_fields_schema[file_index], field_node));
                table_info_node_ptr->add_children(
                        table_column_name, parquet_fields_schema[file_index].name, field_node);
            }

            slot_map.erase(table_column_name);
        }
        /*
         * `_params.column_idxs` only have `isIsFileSlot()`, so we need add `partition slot`.
         * eg:
         * Table : A, B, C, D     (D: partition column)
         * Parquet file : A, B
         * Column C is obtained by add column.
         *
         * sql : select * from table;
         * slot : A, B, C, D
         * _params.column_idxs: 0, 1, 2 (There is no 3, because column D is the partition column)
         *
         */
        for (const auto& [partition_col_name, _] : slot_map) {
            table_info_node_ptr->add_not_exist_children(partition_col_name);
        }
    }

    const ColumnIdResult column_id_result =
            match_by_name
                    ? _create_column_ids(field_desc, tuple_descriptor)
                    : _create_column_ids_by_top_level_col_index(field_desc, tuple_descriptor);

    const auto& column_ids = column_id_result.column_ids;
    const auto& filter_column_ids = column_id_result.filter_column_ids;

    RETURN_IF_ERROR(init_row_filters());

    // The literal `true` is forwarded unchanged; its meaning is defined by
    // ParquetReader::init_reader().
    return parquet_reader->init_reader(
            read_table_col_names, col_name_to_block_idx, conjuncts, slot_id_to_predicates,
            tuple_descriptor, row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts,
            slot_id_to_filter_conjuncts, table_info_node_ptr, true, column_ids, filter_column_ids);
}

// Computes the Parquet column ids to read and the subset needed by predicates, pairing each
// tuple slot with the top-level Parquet field that has the same lower-cased name.
//
// Note: although `field_desc` is passed as const, it is mutated here through a const_cast so
// that assign_ids() can be called on it before any column id is read.
//
// Slots with no same-named top-level field in the file are skipped. Scalar slots contribute
// the field's `column_id`; STRUCT/ARRAY/MAP slots contribute the ids derived from the slot's
// access paths by process_nested_access_paths().
ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
                                                     const TupleDescriptor* tuple_descriptor) {
    // Column ids must be assigned on the descriptor before they are looked up below.
    const_cast<FieldDescriptor*>(field_desc)->assign_ids();

    // Lower-cased top-level file field name -> schema of that field.
    std::unordered_map<std::string, const FieldSchema*> parquet_field_by_name;
    for (int pos = 0; pos < field_desc->size(); ++pos) {
        const FieldSchema* top_level_field = field_desc->get_column(pos);
        if (top_level_field != nullptr) {
            // A later field with the same lower-cased name overwrites an earlier one.
            parquet_field_by_name[top_level_field->lower_case_name] = top_level_field;
        }
    }

    std::set<uint64_t> column_ids;
    std::set<uint64_t> filter_column_ids;

    // Resolves the given access paths against one top-level Parquet field and adds the
    // resulting ids to `target_ids`.
    auto collect_nested_ids = [](const FieldSchema* top_level_field,
                                 const std::vector<TColumnAccessPath>& paths,
                                 std::set<uint64_t>& target_ids) {
        process_nested_access_paths(
                top_level_field, paths, target_ids,
                [](const FieldSchema* f) { return f->get_column_id(); },
                [](const FieldSchema* f) { return f->get_max_column_id(); },
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = parquet_field_by_name.find(slot->col_name_lower_case());
        if (found == parquet_field_by_name.end()) {
            // The file has no top-level column with this name.
            continue;
        }
        const FieldSchema* file_field = found->second;

        const bool is_nested_slot = slot->col_type() == TYPE_STRUCT ||
                                    slot->col_type() == TYPE_ARRAY ||
                                    slot->col_type() == TYPE_MAP;
        if (is_nested_slot) {
            // Complex type: ids come from the access paths recorded on the slot.
            const auto& read_paths = slot->all_access_paths();
            collect_nested_ids(file_field, read_paths, column_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, filter_column_ids);
            }
        } else {
            // Scalar type: the field maps to exactly one column id.
            column_ids.insert(file_field->column_id);
            if (slot->is_predicate()) {
                filter_column_ids.insert(file_field->column_id);
            }
        }
    }

    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
}

// Computes the Parquet column ids to read and the subset needed by predicates, pairing each
// tuple slot with the top-level Parquet field whose position equals `slot->col_pos()`.
//
// Note: although `field_desc` is passed as const, it is mutated here through a const_cast so
// that assign_ids() can be called on it before any column id is read.
//
// Slots whose position has no top-level field in the file are skipped. Scalar slots contribute
// the field's `column_id`; STRUCT/ARRAY/MAP slots contribute the ids derived from the slot's
// access paths by process_nested_access_paths().
ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
        const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
    // Column ids must be assigned on the descriptor before they are looked up below.
    const_cast<FieldDescriptor*>(field_desc)->assign_ids();

    // Top-level position in the file -> schema of that field.
    std::unordered_map<uint64_t, const FieldSchema*> parquet_field_by_pos;
    for (int pos = 0; pos < field_desc->size(); ++pos) {
        const FieldSchema* top_level_field = field_desc->get_column(pos);
        if (top_level_field != nullptr) {
            parquet_field_by_pos[pos] = top_level_field;
        }
    }

    std::set<uint64_t> column_ids;
    std::set<uint64_t> filter_column_ids;

    // Resolves the given access paths against one top-level Parquet field and adds the
    // resulting ids to `target_ids`.
    auto collect_nested_ids = [](const FieldSchema* top_level_field,
                                 const std::vector<TColumnAccessPath>& paths,
                                 std::set<uint64_t>& target_ids) {
        process_nested_access_paths(
                top_level_field, paths, target_ids,
                [](const FieldSchema* f) { return f->get_column_id(); },
                [](const FieldSchema* f) { return f->get_max_column_id(); },
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = parquet_field_by_pos.find(slot->col_pos());
        if (found == parquet_field_by_pos.end()) {
            // The file has no top-level column at this position.
            continue;
        }
        const FieldSchema* file_field = found->second;

        const bool is_nested_slot = slot->col_type() == TYPE_STRUCT ||
                                    slot->col_type() == TYPE_ARRAY ||
                                    slot->col_type() == TYPE_MAP;
        if (is_nested_slot) {
            // Complex type: ids come from the access paths recorded on the slot.
            const auto& read_paths = slot->all_access_paths();
            collect_nested_ids(file_field, read_paths, column_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, filter_column_ids);
            }
        } else {
            // Scalar type: the field maps to exactly one column id.
            column_ids.insert(file_field->column_id);
            if (slot->is_predicate()) {
                filter_column_ids.insert(file_field->column_id);
            }
        }
    }

    return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
}

} // namespace doris

Coverage Report

Created: 2026-04-10 04:05

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17
18		#include "format/table/hive_reader.h"
19
20		#include <vector>
21
22		#include "common/status.h"
23		#include "format/table/hive/hive_orc_nested_column_utils.h"
24		#include "format/table/hive/hive_parquet_nested_column_utils.h"
25		#include "format/table/nested_column_access_helper.h"
26		#include "runtime/runtime_state.h"
27
28		namespace doris {
29
30	38	Status HiveReader::get_next_block_inner(Block* block, size_t* read_rows, bool* eof) {
31	38	RETURN_IF_ERROR(_file_format_reader->get_next_block(block, read_rows, eof));
32	38	return Status::OK();
33	38	};
34
35		Status HiveOrcReader::init_reader(
36		const std::vector<std::string>& read_table_col_names,
37		std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
38		const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor,
39		const RowDescriptor* row_descriptor,
40		const VExprContextSPtrs* not_single_slot_filter_conjuncts,
41	12	const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
42	12	auto* orc_reader = static_cast<OrcReader*>(_file_format_reader.get());
43
44	12	const orc::Type* orc_type_ptr = nullptr;
45	12	RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr));
46	12	bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);
47
48	12	if (_state->query_options().hive_orc_use_column_names && !is_hive_col_name) {
49		// Directly use the table column name to match the file column name, but pay attention to the case issue.
50	12	RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(tuple_descriptor, orc_type_ptr,
51	12	table_info_node_ptr, _is_file_slot));
52	12	} else {
53		// hive1 / use index
54	0	std::map<std::string, const SlotDescriptor*> slot_map; // table_name to slot
55	0	for (const auto& slot : tuple_descriptor->slots()) {
56	0	slot_map.emplace(slot->col_name_lower_case(), slot);
57	0	}
58
59		// For top-level columns, use indexes to match, and for sub-columns, still use name to match columns.
60	0	for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) {
61	0	auto table_column_name = read_table_col_names[idx];
62	0	auto file_index = _params.column_idxs[idx];
63
64	0	if (file_index >= orc_type_ptr->getSubtypeCount()) {
65	0	table_info_node_ptr->add_not_exist_children(table_column_name);
66	0	} else {
67	0	auto field_node = std::make_shared<Node>();
68		// For sub-columns, still use name to match columns.
69	0	RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
70	0	slot_map[table_column_name]->type(), orc_type_ptr->getSubtype(file_index),
71	0	field_node));
72	0	table_info_node_ptr->add_children(
73	0	table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
74	0	}
75	0	slot_map.erase(table_column_name);
76	0	}
77	0	for (const auto& [partition_col_name, _] : slot_map) {
78	0	table_info_node_ptr->add_not_exist_children(partition_col_name);
79	0	}
80	0	}
81
82	12	auto column_id_result = ColumnIdResult();
83	12	if (_state->query_options().hive_orc_use_column_names && !is_hive_col_name) {
84	12	column_id_result = _create_column_ids(orc_type_ptr, tuple_descriptor);
85	12	} else {
86	0	column_id_result =
87	0	_create_column_ids_by_top_level_col_index(orc_type_ptr, tuple_descriptor);
88	0	}
89
90	12	const auto& column_ids = column_id_result.column_ids;
91	12	const auto& filter_column_ids = column_id_result.filter_column_ids;
92
93	12	return orc_reader->init_reader(&read_table_col_names, col_name_to_block_idx, conjuncts, false,
94	12	tuple_descriptor, row_descriptor,
95	12	not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts,
96	12	table_info_node_ptr, column_ids, filter_column_ids);
97	12	}
98
99		ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
100	18	const TupleDescriptor* tuple_descriptor) {
101		// map top-level table column name (lower-cased) -> orc::Type*
102	18	std::unordered_map<std::string, const orc::Type*> table_col_name_to_orc_type_map;
103	168	for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
104	150	auto orc_sub_type = orc_type->getSubtype(i);
105	150	if (!orc_sub_type) continue;
106
107	150	std::string table_col_name = to_lower(orc_type->getFieldName(i));
108	150	table_col_name_to_orc_type_map[table_col_name] = orc_sub_type;
109	150	}
110
111	18	std::set<uint64_t> column_ids;
112	18	std::set<uint64_t> filter_column_ids;
113
114		// helper to process access paths for a given top-level orc field
115	18	auto process_access_paths = [](const orc::Type* orc_field,
116	18	const std::vector<TColumnAccessPath>& access_paths,
117	18	std::set<uint64_t>& out_ids) {
118	14	process_nested_access_paths(
119	14	orc_field, access_paths, out_ids,
120	14	[](const orc::Type* type) { return type->getColumnId(); },
121	14	[](const orc::Type* type) { return type->getMaximumColumnId(); },
122	14	HiveOrcNestedColumnUtils::extract_nested_column_ids);
123	14	};
124
125	114	for (const auto* slot : tuple_descriptor->slots()) {
126	114	auto it = table_col_name_to_orc_type_map.find(slot->col_name_lower_case());
127	114	if (it == table_col_name_to_orc_type_map.end()) {
128		// Column not found in file
129	0	continue;
130	0	}
131	114	const orc::Type* orc_field = it->second;
132
133		// primitive (non-nested) types
134	114	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
135	114	slot->col_type() != TYPE_MAP)) {
136	106	column_ids.insert(orc_field->getColumnId());
137	106	if (slot->is_predicate()) {
138	0	filter_column_ids.insert(orc_field->getColumnId());
139	0	}
140	106	continue;
141	106	}
142
143		// complex types
144	8	const auto& all_access_paths = slot->all_access_paths();
145	8	process_access_paths(orc_field, all_access_paths, column_ids);
146
147	8	const auto& predicate_access_paths = slot->predicate_access_paths();
148	8	if (!predicate_access_paths.empty()) {
149	6	process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
150	6	}
151	8	}
152
153	18	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
154	18	}
155
156		ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
157	6	const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
158		// map top-level table column position -> orc::Type*
159	6	std::unordered_map<uint64_t, const orc::Type*> table_col_pos_to_orc_type_map;
160	54	for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
161	48	auto orc_sub_type = orc_type->getSubtype(i);
162	48	if (!orc_sub_type) continue;
163
164	48	table_col_pos_to_orc_type_map[i] = orc_sub_type;
165	48	}
166
167	6	std::set<uint64_t> column_ids;
168	6	std::set<uint64_t> filter_column_ids;
169
170		// helper to process access paths for a given top-level orc field
171	6	auto process_access_paths = [](const orc::Type* orc_field,
172	6	const std::vector<TColumnAccessPath>& access_paths,
173	13	std::set<uint64_t>& out_ids) {
174	13	process_nested_access_paths(
175	13	orc_field, access_paths, out_ids,
176	13	[](const orc::Type* type) { return type->getColumnId(); },
177	13	[](const orc::Type* type) { return type->getMaximumColumnId(); },
178	13	HiveOrcNestedColumnUtils::extract_nested_column_ids);
179	13	};
180
181	13	for (const auto* slot : tuple_descriptor->slots()) {
182	13	auto it = table_col_pos_to_orc_type_map.find(slot->col_pos());
183	13	if (it == table_col_pos_to_orc_type_map.end()) {
184		// Column not found in file
185	0	continue;
186	0	}
187	13	const orc::Type* orc_field = it->second;
188
189		// primitive (non-nested) types
190	13	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
191	13	slot->col_type() != TYPE_MAP)) {
192	6	column_ids.insert(orc_field->getColumnId());
193	6	if (slot->is_predicate()) {
194	0	filter_column_ids.insert(orc_field->getColumnId());
195	0	}
196	6	continue;
197	6	}
198
199	7	const auto& all_access_paths = slot->all_access_paths();
200		// complex types
201	7	process_access_paths(orc_field, all_access_paths, column_ids);
202
203	7	const auto& predicate_access_paths = slot->predicate_access_paths();
204	7	if (!predicate_access_paths.empty()) {
205	6	process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
206	6	}
207	7	}
208
209	6	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
210	6	}
211
212		Status HiveParquetReader::init_reader(
213		const std::vector<std::string>& read_table_col_names,
214		std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
215		const VExprContextSPtrs& conjuncts,
216		phmap::flat_hash_map<int, std::vector<std::shared_ptr<ColumnPredicate>>>&
217		slot_id_to_predicates,
218		const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor,
219		const std::unordered_map<std::string, int>* colname_to_slot_id,
220		const VExprContextSPtrs* not_single_slot_filter_conjuncts,
221	15	const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
222	15	auto* parquet_reader = static_cast<ParquetReader*>(_file_format_reader.get());
223	15	const FieldDescriptor* field_desc = nullptr;
224	15	RETURN_IF_ERROR(parquet_reader->get_file_metadata_schema(&field_desc));
225	15	DCHECK(field_desc != nullptr);
226
227	15	if (_state->query_options().hive_parquet_use_column_names) {
228		// Directly use the table column name to match the file column name, but pay attention to the case issue.
229	15	RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(tuple_descriptor, *field_desc,
230	15	table_info_node_ptr, _is_file_slot));
231	15	} else { // use idx
232	0	std::map<std::string, const SlotDescriptor*> slot_map; //table_name to slot
233	0	for (const auto& slot : tuple_descriptor->slots()) {
234	0	slot_map.emplace(slot->col_name_lower_case(), slot);
235	0	}
236
237		// For top-level columns, use indexes to match, and for sub-columns, still use name to match columns.
238	0	auto parquet_fields_schema = field_desc->get_fields_schema();
239	0	for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) {
240	0	auto table_column_name = read_table_col_names[idx];
241	0	auto file_index = _params.column_idxs[idx];
242
243	0	if (file_index >= parquet_fields_schema.size()) {
244		// Non-partitioning columns, which may be columns added later.
245	0	table_info_node_ptr->add_not_exist_children(table_column_name);
246	0	} else {
247		// Non-partitioning columns, columns that exist in both the table and the file.
248	0	auto field_node = std::make_shared<Node>();
249		// for sub-columns, still use name to match columns.
250	0	RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
251	0	slot_map[table_column_name]->type(), parquet_fields_schema[file_index],
252	0	field_node));
253	0	table_info_node_ptr->add_children(
254	0	table_column_name, parquet_fields_schema[file_index].name, field_node);
255	0	}
256
257	0	slot_map.erase(table_column_name);
258	0	}
259		/*
260		* `_params.column_idxs` only have `isIsFileSlot()`, so we need add `partition slot`.
261		* eg:
262		* Table : A, B, C, D (D: partition column)
263		* Parquet file : A, B
264		* Column C is obtained by add column.
265		*
266		* sql : select * from table;
267		* slot : A, B, C, D
268		* _params.column_idxs: 0, 1, 2 (There is no 3, because column D is the partition column)
269		*
270		*/
271	0	for (const auto& [partition_col_name, _] : slot_map) {
272	0	table_info_node_ptr->add_not_exist_children(partition_col_name);
273	0	}
274	0	}
275
276	15	auto column_id_result = ColumnIdResult();
277	15	if (_state->query_options().hive_parquet_use_column_names) {
278	15	column_id_result = _create_column_ids(field_desc, tuple_descriptor);
279	15	} else {
280	0	column_id_result = _create_column_ids_by_top_level_col_index(field_desc, tuple_descriptor);
281	0	}
282
283	15	const auto& column_ids = column_id_result.column_ids;
284	15	const auto& filter_column_ids = column_id_result.filter_column_ids;
285
286	15	RETURN_IF_ERROR(init_row_filters());
287
288	15	return parquet_reader->init_reader(
289	15	read_table_col_names, col_name_to_block_idx, conjuncts, slot_id_to_predicates,
290	15	tuple_descriptor, row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts,
291	15	slot_id_to_filter_conjuncts, table_info_node_ptr, true, column_ids, filter_column_ids);
292	15	}
293
294		ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
295	21	const TupleDescriptor* tuple_descriptor) {
296		// First, assign column IDs to the field descriptor
297	21	auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
298	21	mutable_field_desc->assign_ids();
299
300		// map top-level table column name (lower-cased) -> FieldSchema*
301	21	std::unordered_map<std::string, const FieldSchema*> table_col_name_to_field_schema_map;
302	282	for (int i = 0; i < field_desc->size(); ++i) {
303	261	auto field_schema = field_desc->get_column(i);
304	261	if (!field_schema) continue;
305
306	261	table_col_name_to_field_schema_map[field_schema->lower_case_name] = field_schema;
307	261	}
308
309	21	std::set<uint64_t> column_ids;
310	21	std::set<uint64_t> filter_column_ids;
311
312		// helper to process access paths for a given top-level parquet field
313	21	auto process_access_paths = [](const FieldSchema* parquet_field,
314	21	const std::vector<TColumnAccessPath>& access_paths,
315	21	std::set<uint64_t>& out_ids) {
316	14	process_nested_access_paths(
317	14	parquet_field, access_paths, out_ids,
318	14	[](const FieldSchema* field) { return field->get_column_id(); },
319	14	[](const FieldSchema* field) { return field->get_max_column_id(); },
320	14	HiveParquetNestedColumnUtils::extract_nested_column_ids);
321	14	};
322
323	66	for (const auto* slot : tuple_descriptor->slots()) {
324	66	auto it = table_col_name_to_field_schema_map.find(slot->col_name_lower_case());
325	66	if (it == table_col_name_to_field_schema_map.end()) {
326		// Column not found in file
327	0	continue;
328	0	}
329	66	auto field_schema = it->second;
330
331		// primitive (non-nested) types
332	66	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
333	66	slot->col_type() != TYPE_MAP)) {
334	58	column_ids.insert(field_schema->column_id);
335
336	58	if (slot->is_predicate()) {
337	0	filter_column_ids.insert(field_schema->column_id);
338	0	}
339	58	continue;
340	58	}
341
342		// complex types
343	8	const auto& all_access_paths = slot->all_access_paths();
344	8	process_access_paths(field_schema, all_access_paths, column_ids);
345
346	8	const auto& predicate_access_paths = slot->predicate_access_paths();
347	8	if (!predicate_access_paths.empty()) {
348	6	process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
349	6	}
350	8	}
351
352	21	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
353	21	}
354
355		ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
356	6	const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
357		// First, assign column IDs to the field descriptor
358	6	auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
359	6	mutable_field_desc->assign_ids();
360
361		// map top-level table column position -> FieldSchema*
362	6	std::unordered_map<uint64_t, const FieldSchema*> table_col_pos_to_field_schema_map;
363	54	for (int i = 0; i < field_desc->size(); ++i) {
364	48	auto field_schema = field_desc->get_column(i);
365	48	if (!field_schema) continue;
366
367	48	table_col_pos_to_field_schema_map[i] = field_schema;
368	48	}
369
370	6	std::set<uint64_t> column_ids;
371	6	std::set<uint64_t> filter_column_ids;
372
373		// helper to process access paths for a given top-level parquet field
374	6	auto process_access_paths = [](const FieldSchema* parquet_field,
375	6	const std::vector<TColumnAccessPath>& access_paths,
376	13	std::set<uint64_t>& out_ids) {
377	13	process_nested_access_paths(
378	13	parquet_field, access_paths, out_ids,
379	13	[](const FieldSchema* field) { return field->get_column_id(); },
380	13	[](const FieldSchema* field) { return field->get_max_column_id(); },
381	13	HiveParquetNestedColumnUtils::extract_nested_column_ids);
382	13	};
383
384	13	for (const auto* slot : tuple_descriptor->slots()) {
385	13	auto it = table_col_pos_to_field_schema_map.find(slot->col_pos());
386	13	if (it == table_col_pos_to_field_schema_map.end()) {
387		// Column not found in file
388	0	continue;
389	0	}
390	13	auto field_schema = it->second;
391
392		// primitive (non-nested) types
393	13	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
394	13	slot->col_type() != TYPE_MAP)) {
395	6	column_ids.insert(field_schema->column_id);
396
397	6	if (slot->is_predicate()) {
398	0	filter_column_ids.insert(field_schema->column_id);
399	0	}
400	6	continue;
401	6	}
402
403		// complex types
404	7	const auto& all_access_paths = slot->all_access_paths();
405	7	process_access_paths(field_schema, all_access_paths, column_ids);
406
407	7	const auto& predicate_access_paths = slot->predicate_access_paths();
408	7	if (!predicate_access_paths.empty()) {
409	6	process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
410	6	}
411	7	}
412
413	6	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
414	6	}
415
416		} // namespace doris