be/src/format/table/hive_reader.cpp

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "format/table/hive_reader.h"

#include <vector>

#include "common/status.h"
#include "format/table/hive/hive_orc_nested_column_utils.h"
#include "format/table/hive/hive_parquet_nested_column_utils.h"
#include "format/table/nested_column_access_helper.h"
#include "runtime/runtime_state.h"

namespace doris {
#include "common/compile_check_begin.h"

// Reads the next batch by delegating to the wrapped file-format reader.
// `block`, `read_rows` and `eof` are forwarded unchanged and the delegate's
// Status (OK or error) is returned as-is.
Status HiveReader::get_next_block_inner(Block* block, size_t* read_rows, bool* eof) {
    return _file_format_reader->get_next_block(block, read_rows, eof);
}

// Prepares the wrapped OrcReader for reading a Hive ORC file.
//
// Steps:
//   1. Fetch the ORC file schema.
//   2. Fill `table_info_node_ptr` with the table-column -> file-column mapping.
//      Matching is done by name when `hive_orc_use_column_names` is enabled and
//      OrcReader::is_hive1_col_name() reports false for the file schema; otherwise
//      top-level columns are matched by the file index in `_params.column_idxs`
//      (sub-columns are still matched by name).
//   3. Compute the set of ORC column ids to read / to filter on, using the same
//      matching mode as step 2.
//   4. Forward everything to OrcReader::init_reader.
//
// Returns the first non-OK Status produced by any step. An InternalError is returned
// (instead of dereferencing a null slot or reading past the end of
// `read_table_col_names`) when the index-based inputs are inconsistent.
Status HiveOrcReader::init_reader(
        const std::vector<std::string>& read_table_col_names,
        std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
        const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor,
        const RowDescriptor* row_descriptor,
        const VExprContextSPtrs* not_single_slot_filter_conjuncts,
        const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
    auto* orc_reader = static_cast<OrcReader*>(_file_format_reader.get());

    const orc::Type* orc_type_ptr = nullptr;
    RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr));
    const bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);

    // Decided once so the schema mapping and the column-id computation cannot diverge.
    const bool match_by_name =
            _state->query_options().hive_orc_use_column_names && !is_hive_col_name;

    if (match_by_name) {
        // Directly use the table column name to match the file column name, but pay
        // attention to the case issue.
        RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(tuple_descriptor, orc_type_ptr,
                                                        table_info_node_ptr, _is_file_slot));
    } else {
        // hive1 / use index
        if (_params.column_idxs.size() > read_table_col_names.size()) {
            return Status::InternalError(
                    "column_idxs has {} entries but only {} table column names were provided",
                    _params.column_idxs.size(), read_table_col_names.size());
        }

        // Lower-cased table column name -> slot.
        std::map<std::string, const SlotDescriptor*> slot_map;
        for (const auto& slot : tuple_descriptor->slots()) {
            slot_map.emplace(slot->col_name_lower_case(), slot);
        }

        // For top-level columns, use indexes to match, and for sub-columns, still use name
        // to match columns.
        for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) {
            const auto& table_column_name = read_table_col_names[idx];
            const auto file_index = _params.column_idxs[idx];

            if (file_index >= orc_type_ptr->getSubtypeCount()) {
                // The file has no top-level column at this index.
                table_info_node_ptr->add_not_exist_children(table_column_name);
            } else {
                // Use find() rather than operator[]: operator[] would insert a null slot
                // for an unknown (or already consumed) name and then dereference it.
                const auto slot_it = slot_map.find(table_column_name);
                if (slot_it == slot_map.end() || slot_it->second == nullptr) {
                    return Status::InternalError(
                            "table column '{}' has no matching slot in the tuple descriptor",
                            table_column_name);
                }

                auto field_node = std::make_shared<Node>();
                // For sub-columns, still use name to match columns.
                RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
                        slot_it->second->type(), orc_type_ptr->getSubtype(file_index),
                        field_node));
                table_info_node_ptr->add_children(
                        table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
            }
            slot_map.erase(table_column_name);
        }

        // Slots that were not covered by `_params.column_idxs` are registered as columns
        // that do not exist in the file.
        for (const auto& [partition_col_name, _] : slot_map) {
            table_info_node_ptr->add_not_exist_children(partition_col_name);
        }
    }

    const ColumnIdResult column_id_result =
            match_by_name
                    ? _create_column_ids(orc_type_ptr, tuple_descriptor)
                    : _create_column_ids_by_top_level_col_index(orc_type_ptr, tuple_descriptor);

    const auto& column_ids = column_id_result.column_ids;
    const auto& filter_column_ids = column_id_result.filter_column_ids;

    return orc_reader->init_reader(&read_table_col_names, col_name_to_block_idx, conjuncts, false,
                                   tuple_descriptor, row_descriptor,
                                   not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts,
                                   table_info_node_ptr, column_ids, filter_column_ids);
}

// Builds the ORC column-id sets for the "match by column name" mode.
//
// Every slot of `tuple_descriptor` is looked up, by lower-cased column name, among the
// top-level fields of `orc_type`. Slots without a matching file field are skipped.
//   * Non-nested slots contribute the field's own column id; the id is also added to the
//     filter set when the slot is a predicate slot.
//   * STRUCT / ARRAY / MAP slots contribute the ids derived from their access paths
//     (all paths -> column ids, predicate paths -> filter ids).
//
// Returns a ColumnIdResult holding (column ids, filter column ids).
ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
                                                 const TupleDescriptor* tuple_descriptor) {
    // Lower-cased top-level file field name -> ORC type of that field.
    std::unordered_map<std::string, const orc::Type*> orc_fields_by_name;
    for (uint64_t field_idx = 0; field_idx < orc_type->getSubtypeCount(); ++field_idx) {
        const auto* child_type = orc_type->getSubtype(field_idx);
        if (child_type != nullptr) {
            std::string lowered_name = to_lower(orc_type->getFieldName(field_idx));
            // When two fields share a lower-cased name, the later one wins.
            orc_fields_by_name.insert_or_assign(std::move(lowered_name), child_type);
        }
    }

    std::set<uint64_t> all_ids;
    std::set<uint64_t> predicate_ids;

    // Resolves the access paths of one top-level ORC field into column ids.
    const auto collect_nested_ids = [](const orc::Type* root_field,
                                       const std::vector<TColumnAccessPath>& paths,
                                       std::set<uint64_t>& sink) {
        process_nested_access_paths(
                root_field, paths, sink, [](const orc::Type* t) { return t->getColumnId(); },
                [](const orc::Type* t) { return t->getMaximumColumnId(); },
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = orc_fields_by_name.find(slot->col_name_lower_case());
        if (found == orc_fields_by_name.end()) {
            // The file has no column with this name.
            continue;
        }
        const orc::Type* file_field = found->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Complex types: only the sub-columns named by the access paths are selected.
            collect_nested_ids(file_field, slot->all_access_paths(), all_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, predicate_ids);
            }
        } else {
            // Primitive types: the field itself is the only column.
            all_ids.insert(file_field->getColumnId());
            if (slot->is_predicate()) {
                predicate_ids.insert(file_field->getColumnId());
            }
        }
    }

    return ColumnIdResult(std::move(all_ids), std::move(predicate_ids));
}

// Builds the ORC column-id sets for the "match by top-level position" mode.
//
// Every slot of `tuple_descriptor` is looked up, by `slot->col_pos()`, among the
// top-level fields of `orc_type` (keyed by their index in the file schema). Slots whose
// position has no file field are skipped.
//   * Non-nested slots contribute the field's own column id; the id is also added to the
//     filter set when the slot is a predicate slot.
//   * STRUCT / ARRAY / MAP slots contribute the ids derived from their access paths
//     (all paths -> column ids, predicate paths -> filter ids).
//
// NOTE(review): init_reader() matches top-level columns through `_params.column_idxs`,
// while this function keys on `slot->col_pos()` — confirm both always agree.
//
// Returns a ColumnIdResult holding (column ids, filter column ids).
ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
        const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
    // Top-level file field index -> ORC type of that field.
    std::unordered_map<uint64_t, const orc::Type*> orc_fields_by_pos;
    for (uint64_t field_idx = 0; field_idx < orc_type->getSubtypeCount(); ++field_idx) {
        const auto* child_type = orc_type->getSubtype(field_idx);
        if (child_type != nullptr) {
            orc_fields_by_pos.emplace(field_idx, child_type);
        }
    }

    std::set<uint64_t> all_ids;
    std::set<uint64_t> predicate_ids;

    // Resolves the access paths of one top-level ORC field into column ids.
    const auto collect_nested_ids = [](const orc::Type* root_field,
                                       const std::vector<TColumnAccessPath>& paths,
                                       std::set<uint64_t>& sink) {
        process_nested_access_paths(
                root_field, paths, sink, [](const orc::Type* t) { return t->getColumnId(); },
                [](const orc::Type* t) { return t->getMaximumColumnId(); },
                HiveOrcNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = orc_fields_by_pos.find(slot->col_pos());
        if (found == orc_fields_by_pos.end()) {
            // The file has no top-level column at this position.
            continue;
        }
        const orc::Type* file_field = found->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Complex types: only the sub-columns named by the access paths are selected.
            collect_nested_ids(file_field, slot->all_access_paths(), all_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, predicate_ids);
            }
        } else {
            // Primitive types: the field itself is the only column.
            all_ids.insert(file_field->getColumnId());
            if (slot->is_predicate()) {
                predicate_ids.insert(file_field->getColumnId());
            }
        }
    }

    return ColumnIdResult(std::move(all_ids), std::move(predicate_ids));
}

// Prepares the wrapped ParquetReader for reading a Hive Parquet file.
//
// Steps:
//   1. Fetch the Parquet file schema.
//   2. Fill `table_info_node_ptr` with the table-column -> file-column mapping.
//      Matching is done by name when `hive_parquet_use_column_names` is enabled;
//      otherwise top-level columns are matched by the file index in
//      `_params.column_idxs` (sub-columns are still matched by name).
//   3. Compute the set of Parquet column ids to read / to filter on, using the same
//      matching mode as step 2.
//   4. Initialize row filters and forward everything to ParquetReader::init_reader.
//
// Returns the first non-OK Status produced by any step. An InternalError is returned
// (instead of dereferencing a null slot or reading past the end of
// `read_table_col_names`) when the index-based inputs are inconsistent.
Status HiveParquetReader::init_reader(
        const std::vector<std::string>& read_table_col_names,
        std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
        const VExprContextSPtrs& conjuncts,
        phmap::flat_hash_map<int, std::vector<std::shared_ptr<ColumnPredicate>>>&
                slot_id_to_predicates,
        const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor,
        const std::unordered_map<std::string, int>* colname_to_slot_id,
        const VExprContextSPtrs* not_single_slot_filter_conjuncts,
        const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
    auto* parquet_reader = static_cast<ParquetReader*>(_file_format_reader.get());
    const FieldDescriptor* field_desc = nullptr;
    RETURN_IF_ERROR(parquet_reader->get_file_metadata_schema(&field_desc));
    DCHECK(field_desc != nullptr);

    // Read once so the schema mapping and the column-id computation cannot diverge.
    const bool match_by_name = _state->query_options().hive_parquet_use_column_names;

    if (match_by_name) {
        // Directly use the table column name to match the file column name, but pay
        // attention to the case issue.
        RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(tuple_descriptor, *field_desc,
                                                            table_info_node_ptr, _is_file_slot));
    } else {
        // use idx
        if (_params.column_idxs.size() > read_table_col_names.size()) {
            return Status::InternalError(
                    "column_idxs has {} entries but only {} table column names were provided",
                    _params.column_idxs.size(), read_table_col_names.size());
        }

        // Lower-cased table column name -> slot.
        std::map<std::string, const SlotDescriptor*> slot_map;
        for (const auto& slot : tuple_descriptor->slots()) {
            slot_map.emplace(slot->col_name_lower_case(), slot);
        }

        // For top-level columns, use indexes to match, and for sub-columns, still use name
        // to match columns.
        // Bound by reference so the schema is not copied when the getter returns a reference.
        const auto& parquet_fields_schema = field_desc->get_fields_schema();
        for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) {
            const auto& table_column_name = read_table_col_names[idx];
            const auto file_index = _params.column_idxs[idx];

            if (file_index >= parquet_fields_schema.size()) {
                // Non-partitioning columns, which may be columns added later.
                table_info_node_ptr->add_not_exist_children(table_column_name);
            } else {
                // Non-partitioning columns, columns that exist in both the table and the file.
                // Use find() rather than operator[]: operator[] would insert a null slot
                // for an unknown (or already consumed) name and then dereference it.
                const auto slot_it = slot_map.find(table_column_name);
                if (slot_it == slot_map.end() || slot_it->second == nullptr) {
                    return Status::InternalError(
                            "table column '{}' has no matching slot in the tuple descriptor",
                            table_column_name);
                }

                auto field_node = std::make_shared<Node>();
                // For sub-columns, still use name to match columns.
                RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
                        slot_it->second->type(), parquet_fields_schema[file_index], field_node));
                table_info_node_ptr->add_children(
                        table_column_name, parquet_fields_schema[file_index].name, field_node);
            }

            slot_map.erase(table_column_name);
        }
        /*
         * `_params.column_idxs` only covers file slots (`isIsFileSlot()`), so the remaining
         * slots (partition slots) still have to be added here.
         * eg:
         * Table : A, B, C, D     (D: partition column)
         * Parquet file : A, B
         * Column C was added later by `add column`.
         *
         * sql : select * from table;
         * slot : A, B, C, D
         * _params.column_idxs: 0, 1, 2 (There is no 3, because column D is the partition column)
         *
         */
        for (const auto& [partition_col_name, _] : slot_map) {
            table_info_node_ptr->add_not_exist_children(partition_col_name);
        }
    }

    const ColumnIdResult column_id_result =
            match_by_name ? _create_column_ids(field_desc, tuple_descriptor)
                          : _create_column_ids_by_top_level_col_index(field_desc, tuple_descriptor);

    const auto& column_ids = column_id_result.column_ids;
    const auto& filter_column_ids = column_id_result.filter_column_ids;

    RETURN_IF_ERROR(init_row_filters());

    return parquet_reader->init_reader(
            read_table_col_names, col_name_to_block_idx, conjuncts, slot_id_to_predicates,
            tuple_descriptor, row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts,
            slot_id_to_filter_conjuncts, table_info_node_ptr, true, column_ids, filter_column_ids);
}

// Builds the Parquet column-id sets for the "match by column name" mode.
//
// Side effect: calls assign_ids() on `field_desc` through a const_cast, i.e. the
// descriptor passed in as const is modified.
//
// Every slot of `tuple_descriptor` is looked up, by lower-cased column name, among the
// columns of `field_desc`. Slots without a matching file column are skipped.
//   * Non-nested slots contribute the column's `column_id`; the id is also added to the
//     filter set when the slot is a predicate slot.
//   * STRUCT / ARRAY / MAP slots contribute the ids derived from their access paths
//     (all paths -> column ids, predicate paths -> filter ids).
//
// Returns a ColumnIdResult holding (column ids, filter column ids).
ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
                                                     const TupleDescriptor* tuple_descriptor) {
    // Column ids must be assigned before any of them is read below.
    const_cast<FieldDescriptor*>(field_desc)->assign_ids();

    // Lower-cased file column name -> schema of that column.
    std::unordered_map<std::string, const FieldSchema*> parquet_fields_by_name;
    for (int col_idx = 0; col_idx < field_desc->size(); ++col_idx) {
        const FieldSchema* column_schema = field_desc->get_column(col_idx);
        if (column_schema != nullptr) {
            // When two columns share a lower-cased name, the later one wins.
            parquet_fields_by_name.insert_or_assign(column_schema->lower_case_name,
                                                    column_schema);
        }
    }

    std::set<uint64_t> all_ids;
    std::set<uint64_t> predicate_ids;

    // Resolves the access paths of one top-level Parquet column into column ids.
    const auto collect_nested_ids = [](const FieldSchema* root_field,
                                       const std::vector<TColumnAccessPath>& paths,
                                       std::set<uint64_t>& sink) {
        process_nested_access_paths(
                root_field, paths, sink, [](const FieldSchema* f) { return f->get_column_id(); },
                [](const FieldSchema* f) { return f->get_max_column_id(); },
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = parquet_fields_by_name.find(slot->col_name_lower_case());
        if (found == parquet_fields_by_name.end()) {
            // The file has no column with this name.
            continue;
        }
        const FieldSchema* file_field = found->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Complex types: only the sub-columns named by the access paths are selected.
            collect_nested_ids(file_field, slot->all_access_paths(), all_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, predicate_ids);
            }
        } else {
            // Primitive types: the column itself is the only column.
            all_ids.insert(file_field->column_id);
            if (slot->is_predicate()) {
                predicate_ids.insert(file_field->column_id);
            }
        }
    }

    return ColumnIdResult(std::move(all_ids), std::move(predicate_ids));
}

// Builds the Parquet column-id sets for the "match by top-level position" mode.
//
// Side effect: calls assign_ids() on `field_desc` through a const_cast, i.e. the
// descriptor passed in as const is modified.
//
// Every slot of `tuple_descriptor` is looked up, by `slot->col_pos()`, among the columns
// of `field_desc` (keyed by their index in the descriptor). Slots whose position has no
// file column are skipped.
//   * Non-nested slots contribute the column's `column_id`; the id is also added to the
//     filter set when the slot is a predicate slot.
//   * STRUCT / ARRAY / MAP slots contribute the ids derived from their access paths
//     (all paths -> column ids, predicate paths -> filter ids).
//
// NOTE(review): init_reader() matches top-level columns through `_params.column_idxs`,
// while this function keys on `slot->col_pos()` — confirm both always agree.
//
// Returns a ColumnIdResult holding (column ids, filter column ids).
ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
        const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
    // Column ids must be assigned before any of them is read below.
    const_cast<FieldDescriptor*>(field_desc)->assign_ids();

    // File column index -> schema of that column.
    std::unordered_map<uint64_t, const FieldSchema*> parquet_fields_by_pos;
    for (int col_idx = 0; col_idx < field_desc->size(); ++col_idx) {
        const FieldSchema* column_schema = field_desc->get_column(col_idx);
        if (column_schema != nullptr) {
            parquet_fields_by_pos.emplace(static_cast<uint64_t>(col_idx), column_schema);
        }
    }

    std::set<uint64_t> all_ids;
    std::set<uint64_t> predicate_ids;

    // Resolves the access paths of one top-level Parquet column into column ids.
    const auto collect_nested_ids = [](const FieldSchema* root_field,
                                       const std::vector<TColumnAccessPath>& paths,
                                       std::set<uint64_t>& sink) {
        process_nested_access_paths(
                root_field, paths, sink, [](const FieldSchema* f) { return f->get_column_id(); },
                [](const FieldSchema* f) { return f->get_max_column_id(); },
                HiveParquetNestedColumnUtils::extract_nested_column_ids);
    };

    for (const auto* slot : tuple_descriptor->slots()) {
        const auto found = parquet_fields_by_pos.find(slot->col_pos());
        if (found == parquet_fields_by_pos.end()) {
            // The file has no top-level column at this position.
            continue;
        }
        const FieldSchema* file_field = found->second;

        const auto slot_type = slot->col_type();
        const bool is_nested =
                slot_type == TYPE_STRUCT || slot_type == TYPE_ARRAY || slot_type == TYPE_MAP;

        if (is_nested) {
            // Complex types: only the sub-columns named by the access paths are selected.
            collect_nested_ids(file_field, slot->all_access_paths(), all_ids);

            const auto& predicate_paths = slot->predicate_access_paths();
            if (!predicate_paths.empty()) {
                collect_nested_ids(file_field, predicate_paths, predicate_ids);
            }
        } else {
            // Primitive types: the column itself is the only column.
            all_ids.insert(file_field->column_id);
            if (slot->is_predicate()) {
                predicate_ids.insert(file_field->column_id);
            }
        }
    }

    return ColumnIdResult(std::move(all_ids), std::move(predicate_ids));
}

#include "common/compile_check_end.h"
} // namespace doris

Coverage Report

Created: 2026-03-26 06:30

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17
18		#include "format/table/hive_reader.h"
19
20		#include <vector>
21
22		#include "common/status.h"
23		#include "format/table/hive/hive_orc_nested_column_utils.h"
24		#include "format/table/hive/hive_parquet_nested_column_utils.h"
25		#include "format/table/nested_column_access_helper.h"
26		#include "runtime/runtime_state.h"
27
28		namespace doris {
29		#include "common/compile_check_begin.h"
30
31	90.2k	Status HiveReader::get_next_block_inner(Block* block, size_t* read_rows, bool* eof) {
32	90.2k	RETURN_IF_ERROR(_file_format_reader->get_next_block(block, read_rows, eof));
33	90.2k	return Status::OK();
34	90.2k	};
35
36		Status HiveOrcReader::init_reader(
37		const std::vector<std::string>& read_table_col_names,
38		std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
39		const VExprContextSPtrs& conjuncts, const TupleDescriptor* tuple_descriptor,
40		const RowDescriptor* row_descriptor,
41		const VExprContextSPtrs* not_single_slot_filter_conjuncts,
42	15.2k	const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
43	15.2k	auto* orc_reader = static_cast<OrcReader*>(_file_format_reader.get());
44
45	15.2k	const orc::Type* orc_type_ptr = nullptr;
46	15.2k	RETURN_IF_ERROR(orc_reader->get_file_type(&orc_type_ptr));
47	15.2k	bool is_hive_col_name = OrcReader::is_hive1_col_name(orc_type_ptr);
48
49	15.2k	if (_state->query_options().hive_orc_use_column_names && !is_hive_col_name) {
50		// Directly use the table column name to match the file column name, but pay attention to the case issue.
51	15.0k	RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(tuple_descriptor, orc_type_ptr,
52	15.0k	table_info_node_ptr, _is_file_slot));
53	15.0k	} else {
54		// hive1 / use index
55	252	std::map<std::string, const SlotDescriptor*> slot_map; // table_name to slot
56	972	for (const auto& slot : tuple_descriptor->slots()) {
57	972	slot_map.emplace(slot->col_name_lower_case(), slot);
58	972	}
59
60		// For top-level columns, use indexes to match, and for sub-columns, still use name to match columns.
61	1.10k	for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) {
62	856	auto table_column_name = read_table_col_names[idx];
63	856	auto file_index = _params.column_idxs[idx];
64
65	856	if (file_index >= orc_type_ptr->getSubtypeCount()) {
66	112	table_info_node_ptr->add_not_exist_children(table_column_name);
67	744	} else {
68	744	auto field_node = std::make_shared<Node>();
69		// For sub-columns, still use name to match columns.
70	744	RETURN_IF_ERROR(BuildTableInfoUtil::by_orc_name(
71	744	slot_map[table_column_name]->type(), orc_type_ptr->getSubtype(file_index),
72	744	field_node));
73	744	table_info_node_ptr->add_children(
74	744	table_column_name, orc_type_ptr->getFieldName(file_index), field_node);
75	744	}
76	856	slot_map.erase(table_column_name);
77	856	}
78	252	for (const auto& [partition_col_name, _] : slot_map) {
79	116	table_info_node_ptr->add_not_exist_children(partition_col_name);
80	116	}
81	252	}
82
83	15.2k	auto column_id_result = ColumnIdResult();
84	15.2k	if (_state->query_options().hive_orc_use_column_names && !is_hive_col_name) {
85	14.8k	column_id_result = _create_column_ids(orc_type_ptr, tuple_descriptor);
86	14.8k	} else {
87	458	column_id_result =
88	458	_create_column_ids_by_top_level_col_index(orc_type_ptr, tuple_descriptor);
89	458	}
90
91	15.2k	const auto& column_ids = column_id_result.column_ids;
92	15.2k	const auto& filter_column_ids = column_id_result.filter_column_ids;
93
94	15.2k	return orc_reader->init_reader(&read_table_col_names, col_name_to_block_idx, conjuncts, false,
95	15.2k	tuple_descriptor, row_descriptor,
96	15.2k	not_single_slot_filter_conjuncts, slot_id_to_filter_conjuncts,
97	15.2k	table_info_node_ptr, column_ids, filter_column_ids);
98	15.2k	}
99
100		ColumnIdResult HiveOrcReader::_create_column_ids(const orc::Type* orc_type,
101	14.8k	const TupleDescriptor* tuple_descriptor) {
102		// map top-level table column name (lower-cased) -> orc::Type*
103	14.8k	std::unordered_map<std::string, const orc::Type*> table_col_name_to_orc_type_map;
104	1.25M	for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
105	1.24M	auto orc_sub_type = orc_type->getSubtype(i);
106	1.24M	if (!orc_sub_type) continue;
107
108	1.24M	std::string table_col_name = to_lower(orc_type->getFieldName(i));
109	1.24M	table_col_name_to_orc_type_map[table_col_name] = orc_sub_type;
110	1.24M	}
111
112	14.8k	std::set<uint64_t> column_ids;
113	14.8k	std::set<uint64_t> filter_column_ids;
114
115		// helper to process access paths for a given top-level orc field
116	14.8k	auto process_access_paths = [](const orc::Type* orc_field,
117	14.8k	const std::vector<TColumnAccessPath>& access_paths,
118	14.8k	std::set<uint64_t>& out_ids) {
119	10.0k	process_nested_access_paths(
120	10.0k	orc_field, access_paths, out_ids,
121	10.1k	[](const orc::Type* type) { return type->getColumnId(); },
122	10.0k	[](const orc::Type* type) { return type->getMaximumColumnId(); },
123	10.0k	HiveOrcNestedColumnUtils::extract_nested_column_ids);
124	10.0k	};
125
126	61.8k	for (const auto* slot : tuple_descriptor->slots()) {
127	61.8k	auto it = table_col_name_to_orc_type_map.find(slot->col_name_lower_case());
128	61.8k	if (it == table_col_name_to_orc_type_map.end()) {
129		// Column not found in file
130	10.4k	continue;
131	10.4k	}
132	51.3k	const orc::Type* orc_field = it->second;
133
134		// primitive (non-nested) types
135	51.3k	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
136	51.3k	slot->col_type() != TYPE_MAP)) {
137	41.1k	column_ids.insert(orc_field->getColumnId());
138	41.1k	if (slot->is_predicate()) {
139	10.3k	filter_column_ids.insert(orc_field->getColumnId());
140	10.3k	}
141	41.1k	continue;
142	41.1k	}
143
144		// complex types
145	10.2k	const auto& all_access_paths = slot->all_access_paths();
146	10.2k	process_access_paths(orc_field, all_access_paths, column_ids);
147
148	10.2k	const auto& predicate_access_paths = slot->predicate_access_paths();
149	10.2k	if (!predicate_access_paths.empty()) {
150	150	process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
151	150	}
152	10.2k	}
153
154	14.8k	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
155	14.8k	}
156
157		ColumnIdResult HiveOrcReader::_create_column_ids_by_top_level_col_index(
158	306	const orc::Type* orc_type, const TupleDescriptor* tuple_descriptor) {
159		// map top-level table column position -> orc::Type*
160	306	std::unordered_map<uint64_t, const orc::Type*> table_col_pos_to_orc_type_map;
161	1.49k	for (uint64_t i = 0; i < orc_type->getSubtypeCount(); ++i) {
162	1.18k	auto orc_sub_type = orc_type->getSubtype(i);
163	1.18k	if (!orc_sub_type) continue;
164
165	1.18k	table_col_pos_to_orc_type_map[i] = orc_sub_type;
166	1.18k	}
167
168	306	std::set<uint64_t> column_ids;
169	306	std::set<uint64_t> filter_column_ids;
170
171		// helper to process access paths for a given top-level orc field
172	306	auto process_access_paths = [](const orc::Type* orc_field,
173	306	const std::vector<TColumnAccessPath>& access_paths,
174	306	std::set<uint64_t>& out_ids) {
175	13	process_nested_access_paths(
176	13	orc_field, access_paths, out_ids,
177	13	[](const orc::Type* type) { return type->getColumnId(); },
178	13	[](const orc::Type* type) { return type->getMaximumColumnId(); },
179	13	HiveOrcNestedColumnUtils::extract_nested_column_ids);
180	13	};
181
182	967	for (const auto* slot : tuple_descriptor->slots()) {
183	967	auto it = table_col_pos_to_orc_type_map.find(slot->col_pos());
184	967	if (it == table_col_pos_to_orc_type_map.end()) {
185		// Column not found in file
186	952	continue;
187	952	}
188	15	const orc::Type* orc_field = it->second;
189
190		// primitive (non-nested) types
191	15	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
192	15	slot->col_type() != TYPE_MAP)) {
193	6	column_ids.insert(orc_field->getColumnId());
194	6	if (slot->is_predicate()) {
195	0	filter_column_ids.insert(orc_field->getColumnId());
196	0	}
197	6	continue;
198	6	}
199
200	9	const auto& all_access_paths = slot->all_access_paths();
201		// complex types
202	9	process_access_paths(orc_field, all_access_paths, column_ids);
203
204	9	const auto& predicate_access_paths = slot->predicate_access_paths();
205	9	if (!predicate_access_paths.empty()) {
206	6	process_access_paths(orc_field, predicate_access_paths, filter_column_ids);
207	6	}
208	9	}
209
210	306	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
211	306	}
212
213		Status HiveParquetReader::init_reader(
214		const std::vector<std::string>& read_table_col_names,
215		std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
216		const VExprContextSPtrs& conjuncts,
217		phmap::flat_hash_map<int, std::vector<std::shared_ptr<ColumnPredicate>>>&
218		slot_id_to_predicates,
219		const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor,
220		const std::unordered_map<std::string, int>* colname_to_slot_id,
221		const VExprContextSPtrs* not_single_slot_filter_conjuncts,
222	10.3k	const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts) {
223	10.3k	auto* parquet_reader = static_cast<ParquetReader*>(_file_format_reader.get());
224	10.3k	const FieldDescriptor* field_desc = nullptr;
225	10.3k	RETURN_IF_ERROR(parquet_reader->get_file_metadata_schema(&field_desc));
226	10.3k	DCHECK(field_desc != nullptr);
227
228	10.3k	if (_state->query_options().hive_parquet_use_column_names) {
229		// Directly use the table column name to match the file column name, but pay attention to the case issue.
230	10.0k	RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(tuple_descriptor, *field_desc,
231	10.0k	table_info_node_ptr, _is_file_slot));
232	10.0k	} else { // use idx
233	260	std::map<std::string, const SlotDescriptor*> slot_map; //table_name to slot
234	878	for (const auto& slot : tuple_descriptor->slots()) {
235	878	slot_map.emplace(slot->col_name_lower_case(), slot);
236	878	}
237
238		// For top-level columns, use indexes to match, and for sub-columns, still use name to match columns.
239	260	auto parquet_fields_schema = field_desc->get_fields_schema();
240	1.02k	for (size_t idx = 0; idx < _params.column_idxs.size(); idx++) {
241	764	auto table_column_name = read_table_col_names[idx];
242	764	auto file_index = _params.column_idxs[idx];
243
244	764	if (file_index >= parquet_fields_schema.size()) {
245		// Non-partitioning columns, which may be columns added later.
246	112	table_info_node_ptr->add_not_exist_children(table_column_name);
247	652	} else {
248		// Non-partitioning columns, columns that exist in both the table and the file.
249	652	auto field_node = std::make_shared<Node>();
250		// for sub-columns, still use name to match columns.
251	652	RETURN_IF_ERROR(BuildTableInfoUtil::by_parquet_name(
252	652	slot_map[table_column_name]->type(), parquet_fields_schema[file_index],
253	652	field_node));
254	652	table_info_node_ptr->add_children(
255	652	table_column_name, parquet_fields_schema[file_index].name, field_node);
256	652	}
257
258	764	slot_map.erase(table_column_name);
259	764	}
260		/*
261		* `_params.column_idxs` only have `isIsFileSlot()`, so we need add `partition slot`.
262		* eg:
263		* Table : A, B, C, D (D: partition column)
264		* Parquet file : A, B
265		* Column C is obtained by add column.
266		*
267		* sql : select * from table;
268		* slot : A, B, C, D
269		* _params.column_idxs: 0, 1, 2 (There is no 3, because column D is the partition column)
270		*
271		*/
272	260	for (const auto& [partition_col_name, _] : slot_map) {
273	116	table_info_node_ptr->add_not_exist_children(partition_col_name);
274	116	}
275	260	}
276
277	10.3k	auto column_id_result = ColumnIdResult();
278	10.3k	if (_state->query_options().hive_parquet_use_column_names) {
279	10.0k	column_id_result = _create_column_ids(field_desc, tuple_descriptor);
280	10.0k	} else {
281	286	column_id_result = _create_column_ids_by_top_level_col_index(field_desc, tuple_descriptor);
282	286	}
283
284	10.3k	const auto& column_ids = column_id_result.column_ids;
285	10.3k	const auto& filter_column_ids = column_id_result.filter_column_ids;
286
287	10.3k	RETURN_IF_ERROR(init_row_filters());
288
289	10.3k	return parquet_reader->init_reader(
290	10.3k	read_table_col_names, col_name_to_block_idx, conjuncts, slot_id_to_predicates,
291	10.3k	tuple_descriptor, row_descriptor, colname_to_slot_id, not_single_slot_filter_conjuncts,
292	10.3k	slot_id_to_filter_conjuncts, table_info_node_ptr, true, column_ids, filter_column_ids);
293	10.3k	}
294
295		ColumnIdResult HiveParquetReader::_create_column_ids(const FieldDescriptor* field_desc,
296	10.0k	const TupleDescriptor* tuple_descriptor) {
297		// First, assign column IDs to the field descriptor
298	10.0k	auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
299	10.0k	mutable_field_desc->assign_ids();
300
301		// map top-level table column name (lower-cased) -> FieldSchema*
302	10.0k	std::unordered_map<std::string, const FieldSchema*> table_col_name_to_field_schema_map;
303	87.0k	for (int i = 0; i < field_desc->size(); ++i) {
304	77.0k	auto field_schema = field_desc->get_column(i);
305	77.0k	if (!field_schema) continue;
306
307	77.0k	table_col_name_to_field_schema_map[field_schema->lower_case_name] = field_schema;
308	77.0k	}
309
310	10.0k	std::set<uint64_t> column_ids;
311	10.0k	std::set<uint64_t> filter_column_ids;
312
313		// helper to process access paths for a given top-level parquet field
314	10.0k	auto process_access_paths = [](const FieldSchema* parquet_field,
315	10.0k	const std::vector<TColumnAccessPath>& access_paths,
316	15.6k	std::set<uint64_t>& out_ids) {
317	15.6k	process_nested_access_paths(
318	15.6k	parquet_field, access_paths, out_ids,
319	15.6k	[](const FieldSchema* field) { return field->get_column_id(); },
320	15.6k	[](const FieldSchema* field) { return field->get_max_column_id(); },
321	15.6k	HiveParquetNestedColumnUtils::extract_nested_column_ids);
322	15.6k	};
323
324	54.7k	for (const auto* slot : tuple_descriptor->slots()) {
325	54.7k	auto it = table_col_name_to_field_schema_map.find(slot->col_name_lower_case());
326	54.7k	if (it == table_col_name_to_field_schema_map.end()) {
327		// Column not found in file
328	6.63k	continue;
329	6.63k	}
330	48.1k	auto field_schema = it->second;
331
332		// primitive (non-nested) types
333	48.1k	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
334	48.1k	slot->col_type() != TYPE_MAP)) {
335	32.6k	column_ids.insert(field_schema->column_id);
336
337	32.6k	if (slot->is_predicate()) {
338	4.98k	filter_column_ids.insert(field_schema->column_id);
339	4.98k	}
340	32.6k	continue;
341	32.6k	}
342
343		// complex types
344	15.4k	const auto& all_access_paths = slot->all_access_paths();
345	15.4k	process_access_paths(field_schema, all_access_paths, column_ids);
346
347	15.4k	const auto& predicate_access_paths = slot->predicate_access_paths();
348	15.4k	if (!predicate_access_paths.empty()) {
349	188	process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
350	188	}
351	15.4k	}
352
353	10.0k	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
354	10.0k	}
355
356		ColumnIdResult HiveParquetReader::_create_column_ids_by_top_level_col_index(
357	270	const FieldDescriptor* field_desc, const TupleDescriptor* tuple_descriptor) {
358		// First, assign column IDs to the field descriptor
359	270	auto* mutable_field_desc = const_cast<FieldDescriptor*>(field_desc);
360	270	mutable_field_desc->assign_ids();
361
362		// map top-level table column position -> FieldSchema*
363	270	std::unordered_map<uint64_t, const FieldSchema*> table_col_pos_to_field_schema_map;
364	1.32k	for (int i = 0; i < field_desc->size(); ++i) {
365	1.05k	auto field_schema = field_desc->get_column(i);
366	1.05k	if (!field_schema) continue;
367
368	1.05k	table_col_pos_to_field_schema_map[i] = field_schema;
369	1.05k	}
370
371	270	std::set<uint64_t> column_ids;
372	270	std::set<uint64_t> filter_column_ids;
373
374		// helper to process access paths for a given top-level parquet field
375	270	auto process_access_paths = [](const FieldSchema* parquet_field,
376	270	const std::vector<TColumnAccessPath>& access_paths,
377	270	std::set<uint64_t>& out_ids) {
378	13	process_nested_access_paths(
379	13	parquet_field, access_paths, out_ids,
380	13	[](const FieldSchema* field) { return field->get_column_id(); },
381	13	[](const FieldSchema* field) { return field->get_max_column_id(); },
382	13	HiveParquetNestedColumnUtils::extract_nested_column_ids);
383	13	};
384
385	891	for (const auto* slot : tuple_descriptor->slots()) {
386	891	auto it = table_col_pos_to_field_schema_map.find(slot->col_pos());
387	891	if (it == table_col_pos_to_field_schema_map.end()) {
388		// Column not found in file
389	878	continue;
390	878	}
391	13	auto field_schema = it->second;
392
393		// primitive (non-nested) types
394	13	if ((slot->col_type() != TYPE_STRUCT && slot->col_type() != TYPE_ARRAY &&
395	13	slot->col_type() != TYPE_MAP)) {
396	6	column_ids.insert(field_schema->column_id);
397
398	6	if (slot->is_predicate()) {
399	0	filter_column_ids.insert(field_schema->column_id);
400	0	}
401	6	continue;
402	6	}
403
404		// complex types
405	7	const auto& all_access_paths = slot->all_access_paths();
406	7	process_access_paths(field_schema, all_access_paths, column_ids);
407
408	7	const auto& predicate_access_paths = slot->predicate_access_paths();
409	7	if (!predicate_access_paths.empty()) {
410	6	process_access_paths(field_schema, predicate_access_paths, filter_column_ids);
411	6	}
412	7	}
413
414	270	return ColumnIdResult(std::move(column_ids), std::move(filter_column_ids));
415	270	}
416
417		#include "common/compile_check_end.h"
418		} // namespace doris