be/src/format/generic_reader.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <gen_cpp/PlanNodes_types.h> |
21 | | |
22 | | #include "common/status.h" |
23 | | #include "exprs/vexpr_fwd.h" |
24 | | #include "runtime/descriptors.h" |
25 | | #include "storage/predicate/block_column_predicate.h" |
26 | | #include "util/profile_collector.h" |
27 | | |
28 | | namespace doris { |
29 | | class ColumnPredicate; |
30 | | } // namespace doris |
31 | | |
32 | | namespace doris { |
33 | | #include "common/compile_check_begin.h" |
34 | | |
35 | | class Block; |
36 | | class VSlotRef; |
37 | | |
38 | | // Context passed from FileScanner to readers for condition cache integration. |
39 | | // On a cache MISS: readers populate filter_result per-granule during predicate evaluation. |
40 | | // On a cache HIT: readers skip granules where filter_result[granule] == false. |
41 | | struct ConditionCacheContext { |
42 | | bool is_hit = false; |
43 | | std::shared_ptr<std::vector<bool>> filter_result; // per-granule: true = has surviving rows; null until assigned |
44 | | int64_t base_granule = 0; // global granule index of the first granule in filter_result |
45 | | static constexpr int GRANULE_SIZE = 2048; |
46 | | }; |
47 | | |
48 | | // This is the reader interface for all file readers. |
49 | | // A GenericReader is responsible for reading a file and returning |
50 | | // a set of blocks with the specified schema. |
51 | | class GenericReader : public ProfileCollector { |
52 | | public: |
53 | 3.53k | GenericReader() : _push_down_agg_type(TPushAggOp::type::NONE) {} |
54 | 2.88k | void set_push_down_agg_type(TPushAggOp::type push_down_agg_type) { |
55 | 2.88k | _push_down_agg_type = push_down_agg_type; |
56 | 2.88k | } |
57 | | |
58 | | virtual Status get_next_block(Block* block, size_t* read_rows, bool* eof) = 0; |
59 | | |
60 | | // Type is always nullable to process illegal values. The base implementation returns NotSupported. |
61 | | virtual Status get_columns(std::unordered_map<std::string, DataTypePtr>* name_to_type, |
62 | 0 | std::unordered_set<std::string>* missing_cols) { |
63 | 0 | return Status::NotSupported("get_columns is not implemented"); |
64 | 0 | } |
65 | | |
66 | | // This method is responsible for initializing the resources for parsing the schema. |
67 | | // It will be called before `get_parsed_schema`. The base implementation returns NotSupported. |
68 | 0 | virtual Status init_schema_reader() { |
69 | 0 | return Status::NotSupported("init_schema_reader is not implemented for this reader."); |
70 | 0 | } |
71 | | // `col_types` is always nullable to process illegal values. The base implementation returns NotSupported. |
72 | | virtual Status get_parsed_schema(std::vector<std::string>* col_names, |
73 | 0 | std::vector<DataTypePtr>* col_types) { |
74 | 0 | return Status::NotSupported("get_parsed_schema is not implemented for this reader."); |
75 | 0 | } |
76 | 3.53k | ~GenericReader() override = default; |
77 | | |
78 | | /// If the underlying FileReader has filled the partition & missing columns, |
79 | | /// the FileScanner does not need to fill them. Returns _fill_all_columns by default. |
80 | 5.88k | virtual bool fill_all_columns() const { return _fill_all_columns; } |
81 | | |
82 | | /// Tell the underlying FileReader the partition & missing columns; |
83 | | /// the FileReader then determines whether to fill the columns or not. |
84 | | /// Implementations should set _fill_all_columns = true if they fill the columns. The default does nothing and returns OK. |
85 | | virtual Status set_fill_columns( |
86 | | const std::unordered_map<std::string, std::tuple<std::string, const SlotDescriptor*>>& |
87 | | partition_columns, |
88 | 2.47k | const std::unordered_map<std::string, VExprContextSPtr>& missing_columns) { |
89 | 2.47k | return Status::OK(); |
90 | 2.47k | } |
91 | | |
92 | 566 | virtual Status close() { return Status::OK(); } |
93 | | |
94 | 54 | Status read_by_rows(const std::list<int64_t>& row_ids) { |
95 | 54 | _read_by_rows = true; |
96 | 54 | _row_ids = row_ids; |
97 | 54 | return _set_read_one_line_impl(); |
98 | 54 | } |
99 | | |
100 | | /// The reader is responsible for counting the number of rows read, |
101 | | /// because some readers, such as parquet/orc, |
102 | | /// can skip some pages/rowgroups through indexes. Returns false by default. |
103 | 5.55k | virtual bool count_read_rows() { return false; } |
104 | | |
105 | | protected: |
106 | 0 | virtual Status _set_read_one_line_impl() { |
107 | 0 | return Status::NotSupported("read_by_rows is not implemented for this reader."); |
108 | 0 | } |
109 | | |
110 | | const size_t _MIN_BATCH_SIZE = 4064; // 4096 - 32(padding) |
111 | | |
112 | | /// Whether the underlying FileReader has filled the partition & missing columns |
113 | | bool _fill_all_columns = false; |
114 | | TPushAggOp::type _push_down_agg_type {}; |
115 | | |
116 | | public: |
117 | | // Pass condition cache context to the reader for HIT/MISS tracking. The default implementation is a no-op. |
118 | 0 | virtual void set_condition_cache_context(std::shared_ptr<ConditionCacheContext> ctx) {} |
119 | | |
120 | | // Returns the total number of rows the reader will produce (the default implementation returns 0). |
121 | | // Used to pre-allocate condition cache with the correct number of granules. |
122 | 2 | virtual int64_t get_total_rows() const { return 0; } |
123 | | |
124 | | // Returns true if this reader has delete operations (e.g. Iceberg position/equality deletes, |
125 | | // Hive ACID deletes). Used to disable condition cache when deletes are present, since cached |
126 | | // granule results may become stale if delete files change between queries. Returns false by default. |
127 | 2 | virtual bool has_delete_operations() const { return false; } |
128 | | |
129 | | protected: |
130 | | bool _read_by_rows = false; |
131 | | std::list<int64_t> _row_ids; |
132 | | |
133 | | // Cache for commonly reused parts such as the file footer. |
134 | | // May be null if not used. |
135 | | FileMetaCache* _meta_cache = nullptr; |
136 | | }; |
137 | | |
138 | | #include "common/compile_check_end.h" |
139 | | } // namespace doris |