Coverage Report

Created: 2026-07-04 02:05

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/format_v2/parquet/parquet_statistics.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//   http://www.apache.org/licenses/LICENSE-2.0
9
// Unless required by applicable law or agreed to in writing,
10
// software distributed under the License is distributed on an
11
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
12
// KIND, either express or implied.  See the License for the
13
// specific language governing permissions and limitations
14
// under the License.
15
16
#pragma once
17
18
#include <cstdint>
19
#include <map>
20
#include <memory>
21
#include <vector>
22
23
#include "common/status.h"
24
#include "core/field.h"
25
#include "format_v2/file_reader.h"
26
#include "format_v2/parquet/selection_vector.h"
27
28
namespace parquet {
29
class BloomFilter;
30
class FileMetaData;
31
class ParquetFileReader;
32
class Statistics;
33
} // namespace parquet
34
35
namespace cctz {
36
class time_zone;
37
} // namespace cctz
38
39
namespace doris {
40
class ColumnPredicate;
41
} // namespace doris
42
43
namespace doris::format::parquet {
44
45
struct ParquetColumnSchema;
46
47
// ============================================================================
48
// ============================================================================
49
50
// Counters and timers describing how much of a Parquet file was pruned.
// Every field is default-initialised to zero; the code that updates them lives
// outside this header. Time fields are in nanoseconds per the field comments.
struct ParquetPruningStats {
51
    int64_t total_row_groups = 0;                    // number of row groups in the file
52
    int64_t selected_row_groups = 0;                 // number of row groups kept after pruning
53
    int64_t filtered_row_groups_by_statistics = 0;   // row groups pruned by min/max statistics
54
    int64_t filtered_row_groups_by_dictionary = 0;   // row groups pruned by dictionary
55
    int64_t filtered_row_groups_by_bloom_filter = 0; // row groups pruned by bloom filter
56
    int64_t filtered_row_groups_by_page_index = 0;   // row groups fully pruned by page index
57
    int64_t filtered_group_rows = 0;                 // number of rows in pruned row groups
58
    int64_t filtered_page_rows = 0;                  // number of rows pruned by page index
59
    int64_t selected_row_ranges = 0;                 // number of selected row ranges
60
    int64_t page_index_read_calls = 0;               // number of page-index reads
61
    int64_t bloom_filter_read_time = 0;              // time spent reading bloom filters (ns)
62
    int64_t row_group_filter_time = 0;               // time spent on row-group pruning (ns)
63
    int64_t page_index_filter_time = 0;              // time spent on page-index pruning (ns)
64
    int64_t read_page_index_time = 0;                // time spent reading the page index (ns)
65
};
66
67
// Statistics of one Parquet column expressed with Doris types; this is the
// return type of ParquetStatisticsUtils::TransformColumnStatistics.
// NOTE(review): min_value/max_value are presumably only meaningful when
// has_min_max is true -- confirm against the implementation.
struct ParquetColumnStatistics {
68
    Field min_value;             // column minimum value converted to Doris type
69
    Field max_value;             // column maximum value (counterpart of min_value)
70
    bool has_null = false;       // whether NULL exists
71
    bool has_not_null = false;   // whether non-NULL values exist
72
    bool has_null_count = false; // whether null_count is valid
73
    bool has_min_max = false;    // whether min/max is valid after conversion
74
75
419
    // True when at least one kind of statistic (null count or min/max) is valid.
    bool has_any_statistics() const { return has_null_count || has_min_max; }
76
};
77
78
// ============================================================================
79
// ============================================================================
80
//     statistics(TransformColumnStatistics + check_statistics)
81
//     -> dictionary(read_dictionary_words + predicate::evaluate_and)
82
//     -> bloom filter(bloom_filter_prune_reason)
83
// ============================================================================
84
// Stateless holder for static Parquet statistics / bloom-filter helpers.
// Only declarations are visible here; the definitions live elsewhere.
struct ParquetStatisticsUtils {
85
    // Produces a ParquetColumnStatistics for the column described by
    // `column_schema` from the Parquet library's `statistics` object.
    // `timezone` is optional and defaults to nullptr.
    // NOTE(review): presumably the timezone is used when converting
    // time-related min/max values -- confirm in the implementation.
    static ParquetColumnStatistics TransformColumnStatistics(
86
            const ParquetColumnSchema& column_schema,
87
            const std::shared_ptr<::parquet::Statistics>& statistics,
88
            const cctz::time_zone* timezone = nullptr);
89
90
    // Checks `column_filter` against `bloom_filter` for the given column.
    // NOTE(review): judging by the name, `true` presumably means the bloom
    // filter proves that nothing can match, so the data may be skipped --
    // confirm in the implementation.
    static bool BloomFilterExcludes(const ParquetColumnSchema& column_schema,
91
                                    const format::FileColumnPredicateFilter& column_filter,
92
                                    const ::parquet::BloomFilter& bloom_filter);
93
};
94
95
// Row-group level pruning entry point (declaration only; defined elsewhere).
//
// Inputs:  `metadata`, `file_reader`, `file_schema` and `request` describe the
//          file and the scan; `candidate_row_groups` is a read-only list of
//          row-group indices; `enable_bloom_filter` is a boolean switch;
//          `timezone` is optional and defaults to nullptr.
// Outputs: `selected_row_groups` and `pruning_stats` are mutable pointers.
// Returns: a Status.
//
// NOTE(review): presumably `selected_row_groups` receives the indices that
// survive pruning and `pruning_stats` is updated with counters/timings; whether
// `candidate_row_groups` or `pruning_stats` may be null is not visible here --
// confirm against the definition.
Status select_row_groups_by_statistics(
96
        const ::parquet::FileMetaData& metadata, ::parquet::ParquetFileReader* file_reader,
97
        const std::vector<std::unique_ptr<ParquetColumnSchema>>& file_schema,
98
        const format::FileScanRequest& request, const std::vector<int>* candidate_row_groups,
99
        std::vector<int>* selected_row_groups, bool enable_bloom_filter,
100
        ParquetPruningStats* pruning_stats, const cctz::time_zone* timezone = nullptr);
101
102
// Page-index level pruning entry point for a single row group (declaration
// only; defined elsewhere).
//
// Inputs:  `file_reader`, `file_schema` and `request` describe the file and the
//          scan; `row_group_idx` identifies the row group and `row_group_rows`
//          is passed alongside it; `timezone` is optional and defaults to
//          nullptr.
// Outputs: `selected_ranges`, `page_skip_plans` (a map keyed by int) and
//          `pruning_stats` are mutable pointers.
// Returns: a Status.
//
// NOTE(review): presumably `row_group_rows` is the row count of that row
// group, `selected_ranges` receives the row ranges that survive pruning, and
// the `page_skip_plans` key is a column index -- confirm against the definition.
Status select_row_group_ranges_by_page_index(
103
        ::parquet::ParquetFileReader* file_reader,
104
        const std::vector<std::unique_ptr<ParquetColumnSchema>>& file_schema,
105
        const format::FileScanRequest& request, int row_group_idx, int64_t row_group_rows,
106
        std::vector<RowRange>* selected_ranges, std::map<int, ParquetPageSkipPlan>* page_skip_plans,
107
        ParquetPruningStats* pruning_stats, const cctz::time_zone* timezone = nullptr);
108
109
} // namespace doris::format::parquet