be/src/format/parquet/vparquet_reader.h

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <gen_cpp/parquet_types.h>
#include <stddef.h>
#include <stdint.h>

#include <list>
#include <memory>
#include <set>
#include <string>
#include <tuple>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "common/status.h"
#include "format/generic_reader.h"
#include "format/parquet/parquet_common.h"
#include "format/parquet/parquet_predicate.h"
#include "format/parquet/vparquet_column_reader.h"
#include "format/parquet/vparquet_group_reader.h"
#include "format/table/table_format_reader.h"
#include "io/file_factory.h"
#include "io/fs/file_meta_cache.h"
#include "io/fs/file_reader.h"
#include "io/fs/file_reader_writer_fwd.h"
#include "runtime/runtime_profile.h"
#include "storage/olap_scan_common.h"
#include "util/obj_lru_cache.h"

// Forward declarations of types that this header names without needing their
// full definitions.
// NOTE(review): PageIndex, ShardedKVCache, VExprContext and io::FileSystem are
// not referenced by name in the visible part of this header; they may be
// leftovers -- confirm before removing.
namespace cctz {
class time_zone;
} // namespace cctz
namespace doris {
class RowDescriptor;
class RuntimeState;
class SlotDescriptor;
class TFileRangeDesc;
class TFileScanRangeParams;
class TupleDescriptor;

namespace io {
class FileSystem;
struct IOContext;
} // namespace io
class Block;
class FileMetaData;
class PageIndex;
class ShardedKVCache;
class VExprContext;
} // namespace doris

namespace doris {
#include "common/compile_check_begin.h"
// GenericReader implementation for Parquet files. The reader is configured with
// a scan range (TFileRangeDesc) and produces data block by block through
// get_next_block().
//
// NOTE: the declaration order of the data members is significant (see the ATTN
// comment on the metadata fields below); do not reorder them.
class ParquetReader : public GenericReader {
    ENABLE_FACTORY_CREATOR(ParquetReader);

public:
    // Raw counters accumulated by the reader; accessible (and mutable) through
    // reader_statistics().
    // NOTE(review): the unit of the *_time fields is not visible in this header.
    struct ReaderStatistics {
        int32_t filtered_row_groups = 0;
        int32_t filtered_row_groups_by_min_max = 0;
        int32_t filtered_row_groups_by_bloom_filter = 0;
        int32_t read_row_groups = 0;
        int64_t filtered_group_rows = 0;
        int64_t filtered_page_rows = 0;
        int64_t lazy_read_filtered_rows = 0;
        int64_t read_rows = 0;
        int64_t filtered_bytes = 0;
        int64_t column_read_time = 0;
        int64_t parse_meta_time = 0;
        int64_t parse_footer_time = 0;
        int64_t file_footer_read_calls = 0;
        int64_t file_footer_hit_cache = 0;
        int64_t file_reader_create_time = 0;
        int64_t open_file_num = 0;
        int64_t row_group_filter_time = 0;
        int64_t page_index_filter_time = 0;
        int64_t read_page_index_time = 0;
        int64_t parse_page_index_time = 0;
        int64_t predicate_filter_time = 0;
        int64_t dict_filter_rewrite_time = 0;
        int64_t bloom_filter_read_time = 0;
    };

    // Four constructor overloads:
    //  - with or without (profile, batch_size, ctz);
    //  - taking the IO context either as a raw pointer or as a shared_ptr
    //    (io_ctx_holder; presumably retained in _io_ctx_holder -- the
    //    definitions are not visible in this header).
    ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams& params,
                  const TFileRangeDesc& range, size_t batch_size, const cctz::time_zone* ctz,
                  io::IOContext* io_ctx, RuntimeState* state, FileMetaCache* meta_cache = nullptr,
                  bool enable_lazy_mat = true);

    ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams& params,
                  const TFileRangeDesc& range, size_t batch_size, const cctz::time_zone* ctz,
                  std::shared_ptr<io::IOContext> io_ctx_holder, RuntimeState* state,
                  FileMetaCache* meta_cache = nullptr, bool enable_lazy_mat = true);

    ParquetReader(const TFileScanRangeParams& params, const TFileRangeDesc& range,
                  io::IOContext* io_ctx, RuntimeState* state, FileMetaCache* meta_cache = nullptr,
                  bool enable_lazy_mat = true);

    ParquetReader(const TFileScanRangeParams& params, const TFileRangeDesc& range,
                  std::shared_ptr<io::IOContext> io_ctx_holder, RuntimeState* state,
                  FileMetaCache* meta_cache = nullptr, bool enable_lazy_mat = true);

    ~ParquetReader() override;
#ifdef BE_TEST
    // For unit tests only: inject the file reader to use.
    void set_file_reader(io::FileReaderSPtr file_reader);
#endif

    // Configures the reader before the first get_next_block() call.
    // Several arguments are passed by pointer/reference and have matching
    // pointer members below (e.g. _tuple_descriptor, _colname_to_slot_id);
    // NOTE(review): presumably the pointees must outlive the reader -- confirm
    // against the implementation.
    Status init_reader(
            const std::vector<std::string>& all_column_names,
            std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
            const VExprContextSPtrs& conjuncts,
            phmap::flat_hash_map<int, std::vector<std::shared_ptr<ColumnPredicate>>>&
                    slot_id_to_predicates,
            const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor,
            const std::unordered_map<std::string, int>* colname_to_slot_id,
            const VExprContextSPtrs* not_single_slot_filter_conjuncts,
            const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts,
            std::shared_ptr<TableSchemaChangeHelper::Node> table_info_node_ptr =
                    TableSchemaChangeHelper::ConstNode::get_instance(),
            bool filter_groups = true, const std::set<uint64_t>& column_ids = {},
            const std::set<uint64_t>& filter_column_ids = {});

    Status get_next_block(Block* block, size_t* read_rows, bool* eof) override;

    Status close() override;

    // Set the deleted rows of the current parquet file. Only the pointer is
    // stored; the vector itself is not copied.
    void set_delete_rows(const std::vector<int64_t>* delete_rows) { _delete_rows = delete_rows; }

    // Size reported by the underlying file reader.
    // Precondition: _file_reader is non-null. It defaults to nullptr and is
    // dereferenced here without a check.
    int64_t size() const { return _file_reader->size(); }

    Status get_columns(std::unordered_map<std::string, DataTypePtr>* name_to_type,
                       std::unordered_set<std::string>* missing_cols) override;

    Status init_schema_reader() override;

    Status get_parsed_schema(std::vector<std::string>* col_names,
                             std::vector<DataTypePtr>* col_types) override;

    // Mutable access to the statistics collected so far.
    ReaderStatistics& reader_statistics() { return _reader_statistics; }

    // Thrift file metadata; nullptr until it has been assigned (defaults to nullptr).
    const tparquet::FileMetaData* get_meta_data() const { return _t_metadata; }

    // Partition columns will not be materialized in parquet files. So we should fill it with missing columns.
    Status set_fill_columns(
            const std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>>&
                    partition_columns,
            const std::unordered_map<std::string, VExprContextSPtr>& missing_columns) override;

    Status get_file_metadata_schema(const FieldDescriptor** ptr);

    // Stores the (row-id column iterator, int) pair. The argument is taken by
    // value, so it is moved into the member instead of being copied again
    // (avoids an extra shared_ptr reference-count round trip).
    void set_row_id_column_iterator(
            std::pair<std::shared_ptr<RowIdColumnIteratorV2>, int> iterator_pair) {
        _row_id_column_iterator_pair = std::move(iterator_pair);
    }

    bool count_read_rows() override { return true; }

protected:
    void _collect_profile_before_close() override;

private:
    // RuntimeProfile counters owned by the profile; all null until assigned
    // (presumably by _init_profile()).
    struct ParquetProfile {
        RuntimeProfile::Counter* filtered_row_groups = nullptr;
        RuntimeProfile::Counter* filtered_row_groups_by_min_max = nullptr;
        RuntimeProfile::Counter* filtered_row_groups_by_bloom_filter = nullptr;
        RuntimeProfile::Counter* to_read_row_groups = nullptr;
        RuntimeProfile::Counter* total_row_groups = nullptr;
        RuntimeProfile::Counter* filtered_group_rows = nullptr;
        RuntimeProfile::Counter* filtered_page_rows = nullptr;
        RuntimeProfile::Counter* lazy_read_filtered_rows = nullptr;
        RuntimeProfile::Counter* filtered_bytes = nullptr;
        RuntimeProfile::Counter* raw_rows_read = nullptr;
        RuntimeProfile::Counter* column_read_time = nullptr;
        RuntimeProfile::Counter* parse_meta_time = nullptr;
        RuntimeProfile::Counter* parse_footer_time = nullptr;
        RuntimeProfile::Counter* file_reader_create_time = nullptr;
        RuntimeProfile::Counter* open_file_num = nullptr;
        RuntimeProfile::Counter* row_group_filter_time = nullptr;
        RuntimeProfile::Counter* page_index_read_calls = nullptr;
        RuntimeProfile::Counter* page_index_filter_time = nullptr;
        RuntimeProfile::Counter* read_page_index_time = nullptr;
        RuntimeProfile::Counter* parse_page_index_time = nullptr;
        RuntimeProfile::Counter* file_footer_read_calls = nullptr;
        RuntimeProfile::Counter* file_footer_hit_cache = nullptr;
        RuntimeProfile::Counter* decompress_time = nullptr;
        RuntimeProfile::Counter* decompress_cnt = nullptr;
        RuntimeProfile::Counter* page_read_counter = nullptr;
        RuntimeProfile::Counter* page_cache_write_counter = nullptr;
        RuntimeProfile::Counter* page_cache_compressed_write_counter = nullptr;
        RuntimeProfile::Counter* page_cache_decompressed_write_counter = nullptr;
        RuntimeProfile::Counter* page_cache_hit_counter = nullptr;
        RuntimeProfile::Counter* page_cache_missing_counter = nullptr;
        RuntimeProfile::Counter* page_cache_compressed_hit_counter = nullptr;
        RuntimeProfile::Counter* page_cache_decompressed_hit_counter = nullptr;
        RuntimeProfile::Counter* decode_header_time = nullptr;
        RuntimeProfile::Counter* read_page_header_time = nullptr;
        RuntimeProfile::Counter* decode_value_time = nullptr;
        RuntimeProfile::Counter* decode_dict_time = nullptr;
        RuntimeProfile::Counter* decode_level_time = nullptr;
        RuntimeProfile::Counter* decode_null_map_time = nullptr;
        RuntimeProfile::Counter* skip_page_header_num = nullptr;
        RuntimeProfile::Counter* parse_page_header_num = nullptr;
        RuntimeProfile::Counter* predicate_filter_time = nullptr;
        RuntimeProfile::Counter* dict_filter_rewrite_time = nullptr;
        RuntimeProfile::Counter* bloom_filter_read_time = nullptr;
    };

    Status _open_file();
    void _init_profile();
    void _close_internal();
    Status _next_row_group_reader();
    RowGroupReader::PositionDeleteContext _get_position_delete_ctx(
            const tparquet::RowGroup& row_group,
            const RowGroupReader::RowGroupIndex& row_group_index);
    void _init_system_properties();
    void _init_file_description();

    // At the beginning of reading next row group, index should be loaded and used to filter data efficiently.
    Status _process_page_index_filter(
            const tparquet::RowGroup& row_group,
            const RowGroupReader::RowGroupIndex& row_group_index,
            const std::vector<std::unique_ptr<MutilColumnBlockPredicate>>& push_down_pred,
            RowRanges* candidate_row_ranges);

    // Checks how the given row group relates to this reader's scan range.
    // NOTE(review): the original comment read "check this range contain this
    // row group"; the exact meaning of the returned bool is defined in the
    // implementation file -- confirm there.
    bool _is_misaligned_range_group(const tparquet::RowGroup& row_group);

    // Row-group level min-max filter; the three bool out-parameters report
    // whether (and by which mechanism) the group was filtered.
    Status _process_column_stat_filter(
            const tparquet::RowGroup& row_group,
            const std::vector<std::unique_ptr<MutilColumnBlockPredicate>>& push_down_pred,
            bool* filter_group, bool* filtered_by_min_max, bool* filtered_by_bloom_filter);

    /*
     * 1. row group min-max filter
     * 2. row group bloom filter
     * 3. page index min-max filter
     *
     * return Status && row_ranges (lines to be read)
     */
    Status _process_min_max_bloom_filter(
            const RowGroupReader::RowGroupIndex& row_group_index,
            const tparquet::RowGroup& row_group,
            const std::vector<std::unique_ptr<MutilColumnBlockPredicate>>& push_down_pred,
            RowRanges* row_ranges);

    int64_t _get_column_start_offset(const tparquet::ColumnMetaData& column_init_column_readers);
    // Builds the metadata cache key for a file path by prefixing it with "meta_".
    std::string _meta_cache_key(const std::string& path) { return "meta_" + path; }
    std::vector<io::PrefetchRange> _generate_random_access_ranges(
            const RowGroupReader::RowGroupIndex& group, size_t* avg_io_size);
    void _collect_profile();

    // Nothing to prepare for single-line reads; always succeeds.
    Status _set_read_one_line_impl() override { return Status::OK(); }

    bool _exists_in_file(const std::string& expr_name) const;
    bool _type_matches(const int cid) const;

    RuntimeProfile* _profile = nullptr;
    // Held by reference: the referenced objects must outlive this reader.
    const TFileScanRangeParams& _scan_params;
    const TFileRangeDesc& _scan_range;
    io::FileSystemProperties _system_properties;
    io::FileDescription _file_description;

    // The following fields are for the parquet metadata cache.
    // If _meta_cache is not null, _file_metadata is obtained from _meta_cache
    // and is owned by _meta_cache_handle.
    // If _meta_cache is null, _file_metadata is managed by _file_metadata_ptr,
    // which is released on destruction.
    // NOTE(review): _meta_cache is not declared in this class; presumably it
    // lives in a base class -- confirm.
    // ATTN: these fields must be declared before _file_reader, to make sure they are
    // released after _file_reader. Otherwise, there may be a heap-use-after-free bug.
    ObjLRUCache::CacheHandle _meta_cache_handle;
    std::unique_ptr<FileMetaData> _file_metadata_ptr;
    const FileMetaData* _file_metadata = nullptr;
    const tparquet::FileMetaData* _t_metadata = nullptr;

    // _tracing_file_reader wraps _file_reader.
    // _file_reader is the original file reader.
    // _tracing_file_reader is the tracing file reader with io context.
    // If io_ctx is null, _tracing_file_reader will be the same as _file_reader.
    io::FileReaderSPtr _file_reader = nullptr;
    io::FileReaderSPtr _tracing_file_reader = nullptr;
    std::unique_ptr<RowGroupReader> _current_group_reader;

    RowGroupReader::RowGroupIndex _current_row_group_index {-1, 0, 0};
    // read to the end of current reader
    bool _row_group_eof = true;
    // Number of groups (stripes) of a parquet (orc) file. Default-initialized so
    // that it never holds an indeterminate value before it is assigned.
    size_t _total_groups = 0;

    // Through this node, you can find the file column based on the table column.
    std::shared_ptr<TableSchemaChangeHelper::Node> _table_info_node_ptr =
            TableSchemaChangeHelper::ConstNode::get_instance();

    // Columns that need to be read, in file order.
    std::vector<std::string> _read_table_columns;
    std::vector<std::string> _read_file_columns;
    // The set of file columns to be read; only columns within this set will be filtered using the min-max predicate.
    // NOTE(review): the comment says "file columns" while the member is named
    // after table columns -- confirm which one is intended.
    std::set<std::string> _read_table_columns_set;
    // Deleted rows will be marked by Iceberg/Paimon. So we should filter deleted rows when reading it.
    const std::vector<int64_t>* _delete_rows = nullptr;
    int64_t _delete_rows_index = 0;

    // Used for column lazy read.
    RowGroupReader::LazyReadContext _lazy_read_ctx;

    // parquet file reader object
    // The three scalars below carry default initializers because two of the
    // constructor overloads take no batch_size; a constructor that does not set
    // them must still leave a determinate value. A constructor mem-initializer
    // overrides these defaults.
    size_t _batch_size = 0;
    int64_t _range_start_offset = 0;
    int64_t _range_size = 0;
    const cctz::time_zone* _ctz = nullptr;

    std::unordered_map<int, tparquet::OffsetIndex> _col_offsets;

    std::vector<std::string> _missing_cols;
    // _table_column_names = _missing_cols + _read_table_columns
    const std::vector<std::string>* _table_column_names = nullptr;

    ReaderStatistics _reader_statistics;
    ParquetColumnReader::ColumnStatistics _column_statistics;
    ParquetProfile _parquet_profile;
    bool _closed = false;
    io::IOContext* _io_ctx = nullptr;
    std::shared_ptr<io::IOContext> _io_ctx_holder;
    RuntimeState* _state = nullptr;
    bool _enable_lazy_mat = true;
    bool _enable_filter_by_min_max = true;
    bool _enable_filter_by_bloom_filter = true;
    const TupleDescriptor* _tuple_descriptor = nullptr;
    const RowDescriptor* _row_descriptor = nullptr;
    const std::unordered_map<std::string, int>* _colname_to_slot_id = nullptr;
    const VExprContextSPtrs* _not_single_slot_filter_conjuncts = nullptr;
    const std::unordered_map<int, VExprContextSPtrs>* _slot_id_to_filter_conjuncts = nullptr;
    std::unordered_map<tparquet::Type::type, bool> _ignored_stats;

    std::pair<std::shared_ptr<RowIdColumnIteratorV2>, int> _row_id_column_iterator_pair = {nullptr,
                                                                                           -1};
    bool _filter_groups = true;

    std::set<uint64_t> _column_ids;
    std::set<uint64_t> _filter_column_ids;

    std::unordered_map<std::string, uint32_t>* _col_name_to_block_idx = nullptr;

    std::vector<std::unique_ptr<MutilColumnBlockPredicate>> _push_down_predicates;
    Arena _arena;
};
#include "common/compile_check_end.h"

} // namespace doris

Coverage Report

Created: 2026-03-12 17:07

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17
18		#pragma once
19
20		#include <gen_cpp/parquet_types.h>
21		#include <stddef.h>
22		#include <stdint.h>
23
24		#include <list>
25		#include <memory>
26		#include <string>
27		#include <tuple>
28		#include <unordered_map>
29		#include <unordered_set>
30		#include <vector>
31
32		#include "common/status.h"
33		#include "format/generic_reader.h"
34		#include "format/parquet/parquet_common.h"
35		#include "format/parquet/parquet_predicate.h"
36		#include "format/parquet/vparquet_column_reader.h"
37		#include "format/parquet/vparquet_group_reader.h"
38		#include "format/table/table_format_reader.h"
39		#include "io/file_factory.h"
40		#include "io/fs/file_meta_cache.h"
41		#include "io/fs/file_reader.h"
42		#include "io/fs/file_reader_writer_fwd.h"
43		#include "runtime/runtime_profile.h"
44		#include "storage/olap_scan_common.h"
45		#include "util/obj_lru_cache.h"
46
47		namespace cctz {
48		class time_zone;
49		} // namespace cctz
50		namespace doris {
51		class RowDescriptor;
52		class RuntimeState;
53		class SlotDescriptor;
54		class TFileRangeDesc;
55		class TFileScanRangeParams;
56		class TupleDescriptor;
57
58		namespace io {
59		class FileSystem;
60		struct IOContext;
61		} // namespace io
62		class Block;
63		class FileMetaData;
64		class PageIndex;
65		class ShardedKVCache;
66		class VExprContext;
67		} // namespace doris
68
69		namespace doris {
70		#include "common/compile_check_begin.h"
71		class ParquetReader : public GenericReader {
72		ENABLE_FACTORY_CREATOR(ParquetReader);
73
74		public:
75		struct ReaderStatistics {
76		int32_t filtered_row_groups = 0;
77		int32_t filtered_row_groups_by_min_max = 0;
78		int32_t filtered_row_groups_by_bloom_filter = 0;
79		int32_t read_row_groups = 0;
80		int64_t filtered_group_rows = 0;
81		int64_t filtered_page_rows = 0;
82		int64_t lazy_read_filtered_rows = 0;
83		int64_t read_rows = 0;
84		int64_t filtered_bytes = 0;
85		int64_t column_read_time = 0;
86		int64_t parse_meta_time = 0;
87		int64_t parse_footer_time = 0;
88		int64_t file_footer_read_calls = 0;
89		int64_t file_footer_hit_cache = 0;
90		int64_t file_reader_create_time = 0;
91		int64_t open_file_num = 0;
92		int64_t row_group_filter_time = 0;
93		int64_t page_index_filter_time = 0;
94		int64_t read_page_index_time = 0;
95		int64_t parse_page_index_time = 0;
96		int64_t predicate_filter_time = 0;
97		int64_t dict_filter_rewrite_time = 0;
98		int64_t bloom_filter_read_time = 0;
99		};
100
101		ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams& params,
102		const TFileRangeDesc& range, size_t batch_size, const cctz::time_zone* ctz,
103		io::IOContext* io_ctx, RuntimeState* state, FileMetaCache* meta_cache = nullptr,
104		bool enable_lazy_mat = true);
105
106		ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams& params,
107		const TFileRangeDesc& range, size_t batch_size, const cctz::time_zone* ctz,
108		std::shared_ptr<io::IOContext> io_ctx_holder, RuntimeState* state,
109		FileMetaCache* meta_cache = nullptr, bool enable_lazy_mat = true);
110
111		ParquetReader(const TFileScanRangeParams& params, const TFileRangeDesc& range,
112		io::IOContext* io_ctx, RuntimeState* state, FileMetaCache* meta_cache = nullptr,
113		bool enable_lazy_mat = true);
114
115		ParquetReader(const TFileScanRangeParams& params, const TFileRangeDesc& range,
116		std::shared_ptr<io::IOContext> io_ctx_holder, RuntimeState* state,
117		FileMetaCache* meta_cache = nullptr, bool enable_lazy_mat = true);
118
119		~ParquetReader() override;
120		#ifdef BE_TEST
121		// for unit test
122		void set_file_reader(io::FileReaderSPtr file_reader);
123		#endif
124
125		Status init_reader(
126		const std::vector<std::string>& all_column_names,
127		std::unordered_map<std::string, uint32_t>* col_name_to_block_idx,
128		const VExprContextSPtrs& conjuncts,
129		phmap::flat_hash_map<int, std::vector<std::shared_ptr<ColumnPredicate>>>&
130		slot_id_to_predicates,
131		const TupleDescriptor* tuple_descriptor, const RowDescriptor* row_descriptor,
132		const std::unordered_map<std::string, int>* colname_to_slot_id,
133		const VExprContextSPtrs* not_single_slot_filter_conjuncts,
134		const std::unordered_map<int, VExprContextSPtrs>* slot_id_to_filter_conjuncts,
135		std::shared_ptr<TableSchemaChangeHelper::Node> table_info_node_ptr =
136		TableSchemaChangeHelper::ConstNode::get_instance(),
137		bool filter_groups = true, const std::set<uint64_t>& column_ids = {},
138		const std::set<uint64_t>& filter_column_ids = {});
139
140		Status get_next_block(Block* block, size_t* read_rows, bool* eof) override;
141
142		Status close() override;
143
144		// set the delete rows in current parquet file
145	0	void set_delete_rows(const std::vector<int64_t>* delete_rows) { _delete_rows = delete_rows; }
146
147	0	int64_t size() const { return _file_reader->size(); }
148
149		Status get_columns(std::unordered_map<std::string, DataTypePtr>* name_to_type,
150		std::unordered_set<std::string>* missing_cols) override;
151
152		Status init_schema_reader() override;
153
154		Status get_parsed_schema(std::vector<std::string>* col_names,
155		std::vector<DataTypePtr>* col_types) override;
156
157	0	ReaderStatistics& reader_statistics() { return _reader_statistics; }
158
159	0	const tparquet::FileMetaData* get_meta_data() const { return _t_metadata; }
160
161		// Partition columns will not be materialized in parquet files. So we should fill it with missing columns.
162		Status set_fill_columns(
163		const std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>>&
164		partition_columns,
165		const std::unordered_map<std::string, VExprContextSPtr>& missing_columns) override;
166
167		Status get_file_metadata_schema(const FieldDescriptor** ptr);
168
169		void set_row_id_column_iterator(
170	13	std::pair<std::shared_ptr<RowIdColumnIteratorV2>, int> iterator_pair) {
171	13	_row_id_column_iterator_pair = iterator_pair;
172	13	}
173
174	184	bool count_read_rows() override { return true; }
175
176		protected:
177		void _collect_profile_before_close() override;
178
179		private:
180		struct ParquetProfile {
181		RuntimeProfile::Counter* filtered_row_groups = nullptr;
182		RuntimeProfile::Counter* filtered_row_groups_by_min_max = nullptr;
183		RuntimeProfile::Counter* filtered_row_groups_by_bloom_filter = nullptr;
184		RuntimeProfile::Counter* to_read_row_groups = nullptr;
185		RuntimeProfile::Counter* total_row_groups = nullptr;
186		RuntimeProfile::Counter* filtered_group_rows = nullptr;
187		RuntimeProfile::Counter* filtered_page_rows = nullptr;
188		RuntimeProfile::Counter* lazy_read_filtered_rows = nullptr;
189		RuntimeProfile::Counter* filtered_bytes = nullptr;
190		RuntimeProfile::Counter* raw_rows_read = nullptr;
191		RuntimeProfile::Counter* column_read_time = nullptr;
192		RuntimeProfile::Counter* parse_meta_time = nullptr;
193		RuntimeProfile::Counter* parse_footer_time = nullptr;
194		RuntimeProfile::Counter* file_reader_create_time = nullptr;
195		RuntimeProfile::Counter* open_file_num = nullptr;
196		RuntimeProfile::Counter* row_group_filter_time = nullptr;
197		RuntimeProfile::Counter* page_index_read_calls = nullptr;
198		RuntimeProfile::Counter* page_index_filter_time = nullptr;
199		RuntimeProfile::Counter* read_page_index_time = nullptr;
200		RuntimeProfile::Counter* parse_page_index_time = nullptr;
201		RuntimeProfile::Counter* file_footer_read_calls = nullptr;
202		RuntimeProfile::Counter* file_footer_hit_cache = nullptr;
203		RuntimeProfile::Counter* decompress_time = nullptr;
204		RuntimeProfile::Counter* decompress_cnt = nullptr;
205		RuntimeProfile::Counter* page_read_counter = nullptr;
206		RuntimeProfile::Counter* page_cache_write_counter = nullptr;
207		RuntimeProfile::Counter* page_cache_compressed_write_counter = nullptr;
208		RuntimeProfile::Counter* page_cache_decompressed_write_counter = nullptr;
209		RuntimeProfile::Counter* page_cache_hit_counter = nullptr;
210		RuntimeProfile::Counter* page_cache_missing_counter = nullptr;
211		RuntimeProfile::Counter* page_cache_compressed_hit_counter = nullptr;
212		RuntimeProfile::Counter* page_cache_decompressed_hit_counter = nullptr;
213		RuntimeProfile::Counter* decode_header_time = nullptr;
214		RuntimeProfile::Counter* read_page_header_time = nullptr;
215		RuntimeProfile::Counter* decode_value_time = nullptr;
216		RuntimeProfile::Counter* decode_dict_time = nullptr;
217		RuntimeProfile::Counter* decode_level_time = nullptr;
218		RuntimeProfile::Counter* decode_null_map_time = nullptr;
219		RuntimeProfile::Counter* skip_page_header_num = nullptr;
220		RuntimeProfile::Counter* parse_page_header_num = nullptr;
221		RuntimeProfile::Counter* predicate_filter_time = nullptr;
222		RuntimeProfile::Counter* dict_filter_rewrite_time = nullptr;
223		RuntimeProfile::Counter* bloom_filter_read_time = nullptr;
224		};
225
226		Status _open_file();
227		void _init_profile();
228		void _close_internal();
229		Status _next_row_group_reader();
230		RowGroupReader::PositionDeleteContext _get_position_delete_ctx(
231		const tparquet::RowGroup& row_group,
232		const RowGroupReader::RowGroupIndex& row_group_index);
233		void _init_system_properties();
234		void _init_file_description();
235
236		// At the beginning of reading next row group, index should be loaded and used to filter data efficiently.
237		Status _process_page_index_filter(
238		const tparquet::RowGroup& row_group,
239		const RowGroupReader::RowGroupIndex& row_group_index,
240		const std::vector<std::unique_ptr<MutilColumnBlockPredicate>>& push_down_pred,
241		RowRanges* candidate_row_ranges);
242
243		// check this range contain this row group.
244		bool _is_misaligned_range_group(const tparquet::RowGroup& row_group);
245
246		// Row Group min-max Filter
247		Status _process_column_stat_filter(
248		const tparquet::RowGroup& row_group,
249		const std::vector<std::unique_ptr<MutilColumnBlockPredicate>>& push_down_pred,
250		bool* filter_group, bool* filtered_by_min_max, bool* filtered_by_bloom_filter);
251
252		/*
253		* 1. row group min-max filter
254		* 2. row group bloom filter
255		* 3. page index min-max filter
256		*
257		* return Status && row_ranges (lines to be read)
258		*/
259		Status _process_min_max_bloom_filter(
260		const RowGroupReader::RowGroupIndex& row_group_index,
261		const tparquet::RowGroup& row_group,
262		const std::vector<std::unique_ptr<MutilColumnBlockPredicate>>& push_down_pred,
263		RowRanges* row_ranges);
264
265		int64_t _get_column_start_offset(const tparquet::ColumnMetaData& column_init_column_readers);
266	0	std::string _meta_cache_key(const std::string& path) { return "meta_" + path; }
267		std::vector<io::PrefetchRange> _generate_random_access_ranges(
268		const RowGroupReader::RowGroupIndex& group, size_t* avg_io_size);
269		void _collect_profile();
270
271	27	Status _set_read_one_line_impl() override { return Status::OK(); }
272
273		bool _exists_in_file(const std::string& expr_name) const;
274		bool _type_matches(const int cid) const;
275
276		RuntimeProfile* _profile = nullptr;
277		const TFileScanRangeParams& _scan_params;
278		const TFileRangeDesc& _scan_range;
279		io::FileSystemProperties _system_properties;
280		io::FileDescription _file_description;
281
282		// the following fields are for parquet meta data cache.
283		// if _meta_cache is not null, the _file_metadata will be got from _meta_cache,
284		// and it is owned by _meta_cache_handle.
285		// if _meta_cache is null, _file_metadata will be managed by _file_metadata_ptr,
286		// which will be released when deconstructing.
287		// ATTN: these fields must be before _file_reader, to make sure they will be released
288		// after _file_reader. Otherwise, there may be heap-use-after-free bug.
289		ObjLRUCache::CacheHandle _meta_cache_handle;
290		std::unique_ptr<FileMetaData> _file_metadata_ptr;
291		const FileMetaData* _file_metadata = nullptr;
292		const tparquet::FileMetaData* _t_metadata = nullptr;
293
294		// _tracing_file_reader wraps _file_reader.
295		// _file_reader is original file reader.
296		// _tracing_file_reader is tracing file reader with io context.
297		// If io_ctx is null, _tracing_file_reader will be the same as file_reader.
298		io::FileReaderSPtr _file_reader = nullptr;
299		io::FileReaderSPtr _tracing_file_reader = nullptr;
300		std::unique_ptr<RowGroupReader> _current_group_reader;
301
302		RowGroupReader::RowGroupIndex _current_row_group_index {-1, 0, 0};
303		// read to the end of current reader
304		bool _row_group_eof = true;
305		size_t _total_groups; // num of groups(stripes) of a parquet(orc) file
306
307		// Through this node, you can find the file column based on the table column.
308		std::shared_ptr<TableSchemaChangeHelper::Node> _table_info_node_ptr =
309		TableSchemaChangeHelper::ConstNode::get_instance();
310
311		//sequence in file, need to read
312		std::vector<std::string> _read_table_columns;
313		std::vector<std::string> _read_file_columns;
314		// The set of file columns to be read; only columns within this set will be filtered using the min-max predicate.
315		std::set<std::string> _read_table_columns_set;
316		// Deleted rows will be marked by Iceberg/Paimon. So we should filter deleted rows when reading it.
317		const std::vector<int64_t>* _delete_rows = nullptr;
318		int64_t _delete_rows_index = 0;
319
320		// Used for column lazy read.
321		RowGroupReader::LazyReadContext _lazy_read_ctx;
322
323		// parquet file reader object
324		size_t _batch_size;
325		int64_t _range_start_offset;
326		int64_t _range_size;
327		const cctz::time_zone* _ctz = nullptr;
328
329		std::unordered_map<int, tparquet::OffsetIndex> _col_offsets;
330
331		std::vector<std::string> _missing_cols;
332		// _table_column_names = _missing_cols + _read_table_columns
333		const std::vector<std::string>* _table_column_names = nullptr;
334
335		ReaderStatistics _reader_statistics;
336		ParquetColumnReader::ColumnStatistics _column_statistics;
337		ParquetProfile _parquet_profile;
338		bool _closed = false;
339		io::IOContext* _io_ctx = nullptr;
340		std::shared_ptr<io::IOContext> _io_ctx_holder;
341		RuntimeState* _state = nullptr;
342		bool _enable_lazy_mat = true;
343		bool _enable_filter_by_min_max = true;
344		bool _enable_filter_by_bloom_filter = true;
345		const TupleDescriptor* _tuple_descriptor = nullptr;
346		const RowDescriptor* _row_descriptor = nullptr;
347		const std::unordered_map<std::string, int>* _colname_to_slot_id = nullptr;
348		const VExprContextSPtrs* _not_single_slot_filter_conjuncts = nullptr;
349		const std::unordered_map<int, VExprContextSPtrs>* _slot_id_to_filter_conjuncts = nullptr;
350		std::unordered_map<tparquet::Type::type, bool> _ignored_stats;
351
352		std::pair<std::shared_ptr<RowIdColumnIteratorV2>, int> _row_id_column_iterator_pair = {nullptr,
353		-1};
354		bool _filter_groups = true;
355
356		std::set<uint64_t> _column_ids;
357		std::set<uint64_t> _filter_column_ids;
358
359		std::unordered_map<std::string, uint32_t>* _col_name_to_block_idx = nullptr;
360
361		std::vector<std::unique_ptr<MutilColumnBlockPredicate>> _push_down_predicates;
362		Arena _arena;
363		};
364		#include "common/compile_check_end.h"
365
366		} // namespace doris