be/src/format_v2/parquet/parquet_scan.cpp

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//   http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "format_v2/parquet/parquet_scan.h"

#include <algorithm>
#include <limits>
#include <memory>
#include <ranges>
#include <unordered_set>
#include <utility>

#include "common/exception.h"
#include "common/status.h"
#include "core/assert_cast.h"
#include "core/block/block.h"
#include "core/column/column_vector.h"
#include "exprs/vexpr_context.h"
#include "format_v2/parquet/parquet_column_schema.h"
#include "format_v2/parquet/parquet_file_context.h"
#include "format_v2/parquet/parquet_statistics.h"

namespace doris::format::parquet {

namespace {

int64_t column_start_offset(const ::parquet::ColumnChunkMetaData& column_metadata) {
    return column_metadata.has_dictionary_page()
                   ? cast_set<int64_t>(column_metadata.dictionary_page_offset())
                   : cast_set<int64_t>(column_metadata.data_page_offset());
}

void collect_all_leaf_column_ids(const ParquetColumnSchema& column_schema,
                                 std::unordered_set<int>* leaf_column_ids) {
    DORIS_CHECK(leaf_column_ids != nullptr);
    if (column_schema.kind == ParquetColumnSchemaKind::PRIMITIVE) {
        if (column_schema.leaf_column_id >= 0) {
            leaf_column_ids->insert(column_schema.leaf_column_id);
        }
        return;
    }
    for (const auto& child : column_schema.children) {
        DORIS_CHECK(child != nullptr);
        collect_all_leaf_column_ids(*child, leaf_column_ids);
    }
}

void collect_projected_leaf_column_ids(const ParquetColumnSchema& column_schema,
                                       const format::LocalColumnIndex& projection,
                                       std::unordered_set<int>* leaf_column_ids) {
    DORIS_CHECK(leaf_column_ids != nullptr);
    if (projection.project_all_children || projection.children.empty()) {
        collect_all_leaf_column_ids(column_schema, leaf_column_ids);
        return;
    }
    for (const auto& child_projection : projection.children) {
        const auto child_it =
                std::ranges::find_if(column_schema.children, [&](const auto& child_schema) {
                    return child_schema->local_id == child_projection.local_id();
                });
        DORIS_CHECK(child_it != column_schema.children.end());
        collect_projected_leaf_column_ids(**child_it, child_projection, leaf_column_ids);
    }
}

bool is_row_group_outside_range(const ::parquet::FileMetaData& metadata,
                                const ParquetScanRange& scan_range, int row_group_idx) {
    if (scan_range.size < 0) {
        return false;
    }
    const int64_t range_start_offset = scan_range.start_offset;
    const int64_t range_end_offset = range_start_offset + scan_range.size;
    DORIS_CHECK(range_start_offset >= 0);
    DORIS_CHECK(range_end_offset >= range_start_offset);
    if (range_start_offset == 0 &&
        (scan_range.file_size < 0 || range_end_offset >= scan_range.file_size)) {
        return false;
    }

    auto row_group_metadata = metadata.RowGroup(row_group_idx);
    DORIS_CHECK(row_group_metadata != nullptr);
    DORIS_CHECK(row_group_metadata->num_columns() > 0);
    const auto first_column = row_group_metadata->ColumnChunk(0);
    const auto last_column = row_group_metadata->ColumnChunk(row_group_metadata->num_columns() - 1);
    DORIS_CHECK(first_column != nullptr);
    DORIS_CHECK(last_column != nullptr);
    const int64_t row_group_start_offset = column_start_offset(*first_column);
    const int64_t row_group_end_offset =
            column_start_offset(*last_column) + last_column->total_compressed_size();
    const int64_t row_group_mid_offset =
            row_group_start_offset + (row_group_end_offset - row_group_start_offset) / 2;
    return row_group_mid_offset < range_start_offset || row_group_mid_offset >= range_end_offset;
}

std::vector<format::LocalColumnIndex> request_scan_columns(const format::FileScanRequest& request) {
    std::vector<format::LocalColumnIndex> scan_columns;
    scan_columns.reserve(request.predicate_columns.size() + request.non_predicate_columns.size());
    scan_columns.insert(scan_columns.end(), request.predicate_columns.begin(),
                        request.predicate_columns.end());
    scan_columns.insert(scan_columns.end(), request.non_predicate_columns.begin(),
                        request.non_predicate_columns.end());
    return scan_columns;
}

std::vector<ParquetPageCacheRange> build_row_group_prefetch_ranges(
        const ::parquet::FileMetaData& metadata,
        const std::vector<std::unique_ptr<ParquetColumnSchema>>& file_schema,
        const std::vector<format::LocalColumnIndex>& scan_columns, int row_group_idx) {
    std::unordered_set<int> leaf_column_ids;
    for (const auto& projection : scan_columns) {
        const auto local_id = projection.local_id();
        if (local_id == format::ROW_POSITION_COLUMN_ID ||
            local_id == format::GLOBAL_ROWID_COLUMN_ID) {
            continue;
        }
        DORIS_CHECK(local_id >= 0 && local_id < static_cast<int32_t>(file_schema.size()));
        DORIS_CHECK(file_schema[local_id] != nullptr);
        // Prefetch and merge-reader ranges must be physical leaf chunks, not Doris logical slots.
        // Example: for a struct column s<a:int,b:string>, projecting only s.a should include only
        // the Parquet leaf chunk of a. Projecting the whole struct includes both a and b.
        collect_projected_leaf_column_ids(*file_schema[local_id], projection, &leaf_column_ids);
    }

    auto row_group_metadata = metadata.RowGroup(row_group_idx);
    DORIS_CHECK(row_group_metadata != nullptr);
    std::vector<int> ordered_leaf_column_ids(leaf_column_ids.begin(), leaf_column_ids.end());
    std::ranges::sort(ordered_leaf_column_ids);

    std::vector<ParquetPageCacheRange> ranges;
    ranges.reserve(ordered_leaf_column_ids.size());
    for (const auto leaf_column_id : ordered_leaf_column_ids) {
        DORIS_CHECK(leaf_column_id >= 0 && leaf_column_id < row_group_metadata->num_columns());
        auto column_metadata = row_group_metadata->ColumnChunk(leaf_column_id);
        DORIS_CHECK(column_metadata != nullptr);
        const int64_t offset = column_start_offset(*column_metadata);
        const int64_t size = column_metadata->total_compressed_size();
        DORIS_CHECK(offset >= 0);
        if (size > 0) {
            ranges.push_back(ParquetPageCacheRange {.offset = offset, .size = size});
        }
    }
    return ranges;
}

} // namespace

Status plan_parquet_row_groups(const ::parquet::FileMetaData& metadata,
                               ::parquet::ParquetFileReader* file_reader,
                               const std::vector<std::unique_ptr<ParquetColumnSchema>>& file_schema,
                               const format::FileScanRequest& request,
                               const ParquetScanRange& scan_range, bool enable_bloom_filter,
                               RowGroupScanPlan* plan, const cctz::time_zone* timezone) {
    DORIS_CHECK(plan != nullptr);
    plan->row_groups.clear();
    plan->pruning_stats = ParquetPruningStats {};

    std::vector<int64_t> row_group_first_rows(metadata.num_row_groups());
    std::vector<int> scan_range_selected_row_groups;
    scan_range_selected_row_groups.reserve(metadata.num_row_groups());
    int64_t next_row_group_first_row = 0;
    for (int row_group_idx = 0; row_group_idx < metadata.num_row_groups(); ++row_group_idx) {
        row_group_first_rows[row_group_idx] = next_row_group_first_row;
        auto row_group_metadata = metadata.RowGroup(row_group_idx);
        DORIS_CHECK(row_group_metadata != nullptr);
        const int64_t row_group_rows = row_group_metadata->num_rows();
        if (row_group_rows < 0) {
            return Status::Corruption("Invalid negative row count in parquet row group {}",
                                      row_group_idx);
        }
        next_row_group_first_row += row_group_rows;
        if (!is_row_group_outside_range(metadata, scan_range, row_group_idx)) {
            scan_range_selected_row_groups.push_back(row_group_idx);
        }
    }

    std::vector<int> statistics_selected_row_groups;
    RETURN_IF_ERROR(select_row_groups_by_statistics(
            metadata, file_reader, file_schema, request, &scan_range_selected_row_groups,
            &statistics_selected_row_groups, enable_bloom_filter, &plan->pruning_stats, timezone));

    plan->row_groups.reserve(statistics_selected_row_groups.size());
    for (const auto row_group_idx : statistics_selected_row_groups) {
        auto row_group_metadata = metadata.RowGroup(row_group_idx);
        DORIS_CHECK(row_group_metadata != nullptr);
        const int64_t row_group_rows = row_group_metadata->num_rows();
        if (row_group_rows == 0) {
            continue;
        }

        RowGroupReadPlan row_group_plan;
        row_group_plan.row_group_id = row_group_idx;
        row_group_plan.first_file_row = row_group_first_rows[row_group_idx];
        row_group_plan.row_group_rows = row_group_rows;
        RETURN_IF_ERROR(select_row_group_ranges_by_page_index(
                file_reader, file_schema, request, row_group_idx, row_group_rows,
                &row_group_plan.selected_ranges, &row_group_plan.page_skip_plans,
                &plan->pruning_stats, timezone));
        if (row_group_plan.selected_ranges.empty()) {
            continue;
        }
        plan->pruning_stats.selected_row_ranges += row_group_plan.selected_ranges.size();
        plan->row_groups.push_back(std::move(row_group_plan));
    }
    plan->pruning_stats.selected_row_groups = plan->row_groups.size();
    return Status::OK();
}

namespace {

uint16_t apply_filter_to_selection(const IColumn::Filter& filter, SelectionVector* selection,
                                   uint16_t selected_rows) {
    uint16_t new_selected_rows = 0;
    for (uint16_t selection_idx = 0; selection_idx < selected_rows; ++selection_idx) {
        const auto row_idx = selection->get_index(selection_idx);
        if (filter[row_idx] != 0) {
            selection->set_index(new_selected_rows++, static_cast<SelectionVector::Index>(row_idx));
        }
    }
    return new_selected_rows;
}

Status execute_filter_conjuncts(const format::FileScanRequest& request, int64_t batch_rows,
                                Block* file_block, SelectionVector* selection,
                                uint16_t* selected_rows) {
    for (const auto& conjunct : request.conjuncts) {
        if (*selected_rows == 0) {
            break;
        }
        DORIS_CHECK(conjunct != nullptr);
        IColumn::Filter filter(static_cast<size_t>(batch_rows), 1);
        bool can_filter_all = false;
        RETURN_IF_ERROR(conjunct->execute_filter(file_block, filter.data(),
                                                 static_cast<size_t>(batch_rows), false,
                                                 &can_filter_all));
        *selected_rows =
                can_filter_all ? 0 : apply_filter_to_selection(filter, selection, *selected_rows);
    }
    return Status::OK();
}

Status execute_delete_conjuncts(const format::FileScanRequest& request, int64_t batch_rows,
                                Block* file_block, SelectionVector* selection,
                                uint16_t* selected_rows) {
    for (const auto& delete_conjunct : request.delete_conjuncts) {
        if (*selected_rows == 0) {
            break;
        }
        DORIS_CHECK(delete_conjunct != nullptr);
        int result_column_id = -1;
        RETURN_IF_ERROR(delete_conjunct->root()->execute(delete_conjunct.get(), file_block,
                                                         &result_column_id));
        DORIS_CHECK(result_column_id >= 0 &&
                    result_column_id < static_cast<int>(file_block->columns()));
        const auto& delete_filter = assert_cast<const ColumnUInt8&>(
                                            *file_block->get_by_position(result_column_id).column)
                                            .get_data();
        DORIS_CHECK(delete_filter.size() == static_cast<size_t>(batch_rows));
        IColumn::Filter keep_filter(static_cast<size_t>(batch_rows), 1);
        bool has_kept_row = false;
        for (size_t row = 0; row < static_cast<size_t>(batch_rows); ++row) {
            keep_filter[row] = !delete_filter[row];
            has_kept_row |= keep_filter[row] != 0;
        }
        file_block->erase(result_column_id);
        *selected_rows =
                !has_kept_row ? 0
                              : apply_filter_to_selection(keep_filter, selection, *selected_rows);
    }
    return Status::OK();
}

} // namespace

IColumn::Filter selection_to_filter(const SelectionVector& selection, uint16_t selected_rows,
                                    int64_t batch_rows) {
    IColumn::Filter filter(static_cast<size_t>(batch_rows), 0);
    for (uint16_t selection_idx = 0; selection_idx < selected_rows; ++selection_idx) {
        filter[selection.get_index(selection_idx)] = 1;
    }
    return filter;
}

Status execute_batch_filters(const format::FileScanRequest& request, int64_t batch_rows,
                             Block* file_block, SelectionVector* selection, uint16_t* selected_rows,
                             int64_t* conjunct_filtered_rows) {
    if (request.conjuncts.empty() && request.delete_conjuncts.empty()) {
        return Status::OK();
    }
    const auto selected_rows_before_conjunct = *selected_rows;
    RETURN_IF_ERROR(
            execute_filter_conjuncts(request, batch_rows, file_block, selection, selected_rows));
    if (conjunct_filtered_rows != nullptr) {
        *conjunct_filtered_rows += static_cast<int64_t>(selected_rows_before_conjunct) -
                                   static_cast<int64_t>(*selected_rows);
    }
    if (*selected_rows == 0) {
        return Status::OK();
    }
    return execute_delete_conjuncts(request, batch_rows, file_block, selection, selected_rows);
}

namespace {
int64_t count_range_rows(const std::vector<RowRange>& ranges) {
    int64_t rows = 0;
    for (const auto& range : ranges) {
        rows += range.length;
    }
    return rows;
}

void append_intersection(const RowRange& left, const RowRange& right,
                         std::vector<RowRange>* result) {
    const int64_t start = std::max(left.start, right.start);
    const int64_t end = std::min(left.start + left.length, right.start + right.length);
    if (start < end) {
        result->push_back(RowRange {.start = start, .length = end - start});
    }
}

std::vector<RowRange> filter_ranges_by_condition_cache(const std::vector<RowRange>& ranges,
                                                       const std::vector<bool>& cache,
                                                       int64_t row_group_first_row,
                                                       int64_t base_granule) {
    std::vector<RowRange> result;
    if (cache.empty()) {
        return ranges;
    }

    // Cache coordinates are file-global granules; RowRange coordinates are row-group-relative.
    // Walk every selected range in order and split it by granule. Granules covered by the bitmap
    // are kept only when the bit is true. Granules outside the bitmap are kept conservatively, so
    // an undersized or old-format cache entry cannot skip valid rows.
    for (const auto& range : ranges) {
        const int64_t global_start = row_group_first_row + range.start;
        const int64_t global_end = global_start + range.length;
        for (int64_t granule = global_start / ConditionCacheContext::GRANULE_SIZE;
             granule <= (global_end - 1) / ConditionCacheContext::GRANULE_SIZE; ++granule) {
            const int64_t cache_idx = granule - base_granule;
            const bool keep = cache_idx < 0 || static_cast<size_t>(cache_idx) >= cache.size() ||
                              cache[static_cast<size_t>(cache_idx)];
            if (!keep) {
                continue;
            }
            const int64_t granule_start = granule * ConditionCacheContext::GRANULE_SIZE;
            const int64_t granule_end = granule_start + ConditionCacheContext::GRANULE_SIZE;
            const RowRange file_granule_range {.start = granule_start - row_group_first_row,
                                               .length = granule_end - granule_start};
            append_intersection(range, file_granule_range, &result);
        }
    }
    return result;
}

} // namespace

void ParquetScanScheduler::set_plan(RowGroupScanPlan plan) {
    _row_group_plans = std::move(plan.row_groups);
    _condition_cache_filtered_rows = 0;
    _predicate_filtered_rows = 0;
    reset();
}

void ParquetScanScheduler::set_condition_cache_context(std::shared_ptr<ConditionCacheContext> ctx) {
    _condition_cache_ctx = std::move(ctx);
    if (!_condition_cache_ctx || !_condition_cache_ctx->filter_result || _row_group_plans.empty()) {
        return;
    }

    _condition_cache_ctx->base_granule =
            _row_group_plans.front().first_file_row / ConditionCacheContext::GRANULE_SIZE;
    if (!_condition_cache_ctx->is_hit) {
        return;
    }

    std::vector<RowGroupReadPlan> filtered_plans;
    filtered_plans.reserve(_row_group_plans.size());
    for (auto& plan : _row_group_plans) {
        const int64_t old_rows = count_range_rows(plan.selected_ranges);
        plan.selected_ranges = filter_ranges_by_condition_cache(
                plan.selected_ranges, *_condition_cache_ctx->filter_result, plan.first_file_row,
                _condition_cache_ctx->base_granule);
        const int64_t new_rows = count_range_rows(plan.selected_ranges);
        _condition_cache_filtered_rows += old_rows - new_rows;
        if (!plan.selected_ranges.empty()) {
            filtered_plans.push_back(std::move(plan));
        }
    }
    _row_group_plans = std::move(filtered_plans);
    reset();
}

void ParquetScanScheduler::reset() {
    _next_row_group_plan_idx = 0;
    reset_current_row_group();
}

void ParquetScanScheduler::reset_current_row_group() {
    _current_row_group.reset();
    _current_predicate_columns.clear();
    _current_non_predicate_columns.clear();
    _current_row_group_rows = 0;
    _current_row_group_id = -1;
    _current_row_group_rows_read = 0;
    _current_row_group_first_row = 0;
    _current_selected_ranges.clear();
    _current_range_idx = 0;
    _current_range_rows_read = 0;
    _current_predicate_prefetched = false;
    _current_non_predicate_prefetched = false;
    _current_merge_range_active = false;
}

Status ParquetScanScheduler::open_next_row_group(
        ParquetFileContext& file_context,
        const std::vector<std::unique_ptr<ParquetColumnSchema>>& file_schema,
        const format::FileScanRequest& request, bool* has_row_group) {
    *has_row_group = false;
    if (_next_row_group_plan_idx >= _row_group_plans.size()) {
        return Status::OK();
    }
    const RowGroupReadPlan& row_group_plan = _row_group_plans[_next_row_group_plan_idx++];
    const int row_group_idx = row_group_plan.row_group_id;
    _current_merge_range_active =
            prepare_current_row_group_reader(file_context, file_schema, request, row_group_idx);
    try {
        _current_row_group = file_context.file_reader->RowGroup(row_group_idx);
    } catch (const ::parquet::ParquetException& e) {
        return Status::Corruption("Failed to open parquet row group {}: {}", row_group_idx,
                                  e.what());
    } catch (const std::exception& e) {
        return Status::InternalError("Failed to open parquet row group {}: {}", row_group_idx,
                                     e.what());
    }

    auto row_group_metadata = file_context.metadata->RowGroup(row_group_idx);
    DORIS_CHECK(row_group_metadata != nullptr);
    _current_row_group_rows = row_group_metadata->num_rows();
    DORIS_CHECK(_current_row_group_rows == row_group_plan.row_group_rows);
    DORIS_CHECK(_current_row_group_rows > 0);
    _current_row_group_id = row_group_idx;
    DORIS_CHECK(!row_group_plan.selected_ranges.empty());
    _current_row_group_first_row = row_group_plan.first_file_row;
    _current_row_group_rows_read = 0;
    _current_selected_ranges = row_group_plan.selected_ranges;
    _current_range_idx = 0;
    _current_range_rows_read = 0;
    _current_predicate_columns.clear();
    _current_non_predicate_columns.clear();

    ParquetColumnReaderFactory column_reader_factory(
            _current_row_group, file_context.schema->num_columns(), &row_group_plan.page_skip_plans,
            _page_skip_profile, _timezone, _enable_strict_mode,
            _scan_profile.column_reader_profile);
    for (const auto& col : request.predicate_columns) {
        const auto local_id = col.local_id();
        if (local_id == format::ROW_POSITION_COLUMN_ID) {
            _current_predicate_columns[local_id] =
                    column_reader_factory.create_row_position_column_reader(
                            _current_row_group_first_row);
            continue;
        }
        if (local_id == format::GLOBAL_ROWID_COLUMN_ID) {
            DORIS_CHECK(_global_rowid_context.has_value());
            _current_predicate_columns[local_id] =
                    column_reader_factory.create_global_rowid_column_reader(
                            *_global_rowid_context, _current_row_group_first_row);
            continue;
        }

        DORIS_CHECK(local_id >= 0 && local_id < static_cast<int32_t>(file_schema.size()));
        const auto& column_schema = file_schema[local_id];
        DORIS_CHECK(column_schema != nullptr);
        std::unique_ptr<ParquetColumnReader> column_reader;
        RETURN_IF_ERROR(column_reader_factory.create(*column_schema, &col, &column_reader));
        _current_predicate_columns[local_id] = std::move(column_reader);
    }
    // Start warming filter-column chunks as soon as their row group is selected. Parquet v2 still
    // reads through Arrow's random-access reader; this prefetch only warms Doris file cache blocks
    // in the background and never changes the row/column materialization order.
    if (!_current_merge_range_active) {
        prefetch_current_row_group_columns(file_context, file_schema, request.predicate_columns,
                                           &_current_predicate_prefetched);
    }
    for (const auto& col : request.non_predicate_columns) {
        const auto local_id = col.local_id();
        if (local_id == format::ROW_POSITION_COLUMN_ID) {
            _current_non_predicate_columns[local_id] =
                    column_reader_factory.create_row_position_column_reader(
                            _current_row_group_first_row);
            continue;
        }
        if (local_id == format::GLOBAL_ROWID_COLUMN_ID) {
            DORIS_CHECK(_global_rowid_context.has_value());
            _current_non_predicate_columns[local_id] =
                    column_reader_factory.create_global_rowid_column_reader(
                            *_global_rowid_context, _current_row_group_first_row);
            continue;
        }
        DORIS_CHECK(local_id >= 0 && local_id < static_cast<int32_t>(file_schema.size()));
        const auto& column_schema = file_schema[local_id];
        DORIS_CHECK(column_schema != nullptr);
        std::unique_ptr<ParquetColumnReader> column_reader;
        RETURN_IF_ERROR(column_reader_factory.create(*column_schema, &col, &column_reader));
        _current_non_predicate_columns[local_id] = std::move(column_reader);
    }
    if (!_current_merge_range_active && request.conjuncts.empty() &&
        request.delete_conjuncts.empty()) {
        // With no row-level filters there is no lazy-read decision to wait for, so start warming
        // output chunks immediately after their readers are created. Filtered scans still defer
        // this until at least one row survives the predicate phase.
        prefetch_current_row_group_columns(file_context, file_schema, request.non_predicate_columns,
                                           &_current_non_predicate_prefetched);
    }
    *has_row_group = true;
    return Status::OK();
}

Status ParquetScanScheduler::skip_current_row_group_rows(int64_t rows) {
    DORIS_CHECK(rows >= 0);
    if (rows == 0) {
        return Status::OK();
    }
    if (_scan_profile.range_gap_skipped_rows != nullptr) {
        COUNTER_UPDATE(_scan_profile.range_gap_skipped_rows, rows);
    }
    for (const auto& column_reader : _current_predicate_columns | std::views::values) {
        RETURN_IF_ERROR(column_reader->skip(rows));
    }
    for (const auto& column_reader : _current_non_predicate_columns | std::views::values) {
        RETURN_IF_ERROR(column_reader->skip(rows));
    }
    _current_row_group_rows_read += rows;
    return Status::OK();
}

Status ParquetScanScheduler::read_filter_columns(int64_t batch_rows,
                                                 const format::FileScanRequest& request,
                                                 Block* file_block, SelectionVector* selection,
                                                 uint16_t* selected_rows,
                                                 int64_t* conjunct_filtered_rows) {
    if (!request.conjuncts.empty() || !request.delete_conjuncts.empty()) {
        selection->resize(static_cast<size_t>(batch_rows));
    }
    for (const auto& [fid, column_reader] : _current_predicate_columns) {
        auto position_it = request.local_positions.find(format::LocalColumnId(fid));
        DORIS_CHECK(position_it != request.local_positions.end());
        const auto block_position = position_it->second.value();
        DCHECK(remove_nullable(column_reader->type())
                       ->equals(*remove_nullable(file_block->get_by_position(block_position).type)))
                << column_reader->type()->get_name() << " "
                << file_block->get_by_position(block_position).type->get_name() << " "
                << column_reader->name() << " " << file_block->get_by_position(block_position).name;
        auto column = file_block->get_by_position(block_position).column->assert_mutable();
        int64_t column_rows = 0;
        {
            SCOPED_TIMER(_scan_profile.column_read_time);
            RETURN_IF_ERROR(column_reader->read(batch_rows, column, &column_rows));
        }
        if (column_rows != batch_rows) {
            return Status::Corruption("Parquet filter column {} returned {} rows, expected {} rows",
                                      column_reader->name(), column_rows, batch_rows);
        }
        file_block->replace_by_position(block_position, std::move(column));
    }
    if (_scan_profile.predicate_filter_time == nullptr) {
        return execute_batch_filters(request, batch_rows, file_block, selection, selected_rows,
                                     conjunct_filtered_rows);
    }
    SCOPED_TIMER(_scan_profile.predicate_filter_time);
    return execute_batch_filters(request, batch_rows, file_block, selection, selected_rows,
                                 conjunct_filtered_rows);
}

bool ParquetScanScheduler::prepare_current_row_group_reader(
        ParquetFileContext& file_context,
        const std::vector<std::unique_ptr<ParquetColumnSchema>>& file_schema,
        const format::FileScanRequest& request, int row_group_idx) {
    if (file_context.metadata == nullptr) {
        return false;
    }
    const auto ranges = build_row_group_prefetch_ranges(
            *file_context.metadata, file_schema, request_scan_columns(request), row_group_idx);
    const size_t avg_io_size = detail::average_prefetch_range_size(ranges);
    return file_context.set_random_access_ranges(ranges, avg_io_size, _profile,
                                                 _merge_read_slice_size);
}

void ParquetScanScheduler::prefetch_current_row_group_columns(
        ParquetFileContext& file_context,
        const std::vector<std::unique_ptr<ParquetColumnSchema>>& file_schema,
        const std::vector<format::LocalColumnIndex>& scan_columns, bool* prefetched) {
    DORIS_CHECK(prefetched != nullptr);
    if (_current_merge_range_active || *prefetched || scan_columns.empty() ||
        _current_row_group_id < 0 || file_context.metadata == nullptr) {
        return;
    }
    *prefetched = true;
    // The scanner request separates predicate and non-predicate columns so Parquet can read
    // predicate columns first and lazily materialize the rest. Keep the same contract for
    // prefetch: callers decide which side to warm, and this helper only translates that selected
    // projection into physical column-chunk byte ranges for the current row group.
    file_context.prefetch_ranges(
            build_row_group_prefetch_ranges(*file_context.metadata, file_schema, scan_columns,
                                            _current_row_group_id),
            nullptr);
}

Status ParquetScanScheduler::read_current_row_group_batch(
        ParquetFileContext& file_context,
        const std::vector<std::unique_ptr<ParquetColumnSchema>>& file_schema, int64_t batch_rows,
        const format::FileScanRequest& request, int64_t batch_first_file_row, Block* file_block,
        size_t* rows) {
    if (_scan_profile.total_batches != nullptr) {
        COUNTER_UPDATE(_scan_profile.total_batches, 1);
    }
    if (_scan_profile.raw_rows_read != nullptr) {
        COUNTER_UPDATE(_scan_profile.raw_rows_read, batch_rows);
    }
    if (_current_predicate_columns.empty() && _current_non_predicate_columns.empty()) {
        *rows = static_cast<size_t>(batch_rows);
        if (_scan_profile.selected_rows != nullptr) {
            COUNTER_UPDATE(_scan_profile.selected_rows, batch_rows);
        }
        return Status::OK();
    }
    SelectionVector selection;
    DORIS_CHECK(batch_rows <= std::numeric_limits<uint16_t>::max());
    uint16_t selected_rows = static_cast<uint16_t>(batch_rows);
    int64_t conjunct_filtered_rows = 0;
    RETURN_IF_ERROR(read_filter_columns(batch_rows, request, file_block, &selection, &selected_rows,
                                        &conjunct_filtered_rows));
    _predicate_filtered_rows += conjunct_filtered_rows;
    mark_condition_cache_granules(selection, selected_rows, batch_first_file_row);

    const bool need_filter_output = selected_rows != batch_rows;
    if (_scan_profile.selected_rows != nullptr) {
        COUNTER_UPDATE(_scan_profile.selected_rows, selected_rows);
    }
    if (_scan_profile.rows_filtered_by_conjunct != nullptr) {
        COUNTER_UPDATE(_scan_profile.rows_filtered_by_conjunct, conjunct_filtered_rows);
    }
    if (!_current_non_predicate_columns.empty() &&
        _scan_profile.lazy_read_filtered_rows != nullptr) {
        COUNTER_UPDATE(_scan_profile.lazy_read_filtered_rows, batch_rows - selected_rows);
    }
    if (selected_rows == 0 && _scan_profile.empty_selection_batches != nullptr) {
        COUNTER_UPDATE(_scan_profile.empty_selection_batches, 1);
    }
    if (need_filter_output) {
        IColumn::Filter output_filter = selection_to_filter(selection, selected_rows, batch_rows);
        for (const auto& col : request.predicate_columns) {
            auto position_it = request.local_positions.find(col.column_id());
            DORIS_CHECK(position_it != request.local_positions.end());
            const auto block_position = position_it->second.value();
            RETURN_IF_CATCH_EXCEPTION(file_block->replace_by_position(
                    block_position, file_block->get_by_position(block_position)
                                            .column->filter(output_filter, selected_rows)));
        }
    }
    if (!_current_merge_range_active && selected_rows > 0 &&
        !_current_non_predicate_columns.empty()) {
        // Do not prefetch lazy output columns until at least one row survives filtering. This is
        // the same decision point where the v2 reader switches from predicate-only reads to
        // materializing non-predicate columns, so fully filtered batches avoid unnecessary IO.
        prefetch_current_row_group_columns(file_context, file_schema, request.non_predicate_columns,
                                           &_current_non_predicate_prefetched);
    }

    {
        SCOPED_TIMER(_scan_profile.column_read_time);
        for (const auto& [fid, column_reader] : _current_non_predicate_columns) {
            auto position_it = request.local_positions.find(format::LocalColumnId(fid));
            DORIS_CHECK(position_it != request.local_positions.end());
            const auto block_position = position_it->second.value();
            auto column = file_block->get_by_position(block_position).column->assert_mutable();
            DCHECK_EQ(file_block->get_by_position(block_position).type->get_primitive_type(),
                      column_reader->type()->get_primitive_type())
                    << type_to_string(file_block->get_by_position(block_position)
                                              .type->get_primitive_type())
                    << " " << type_to_string(column_reader->type()->get_primitive_type()) << " "
                    << column_reader->name() << " " << fid << " " << block_position;
            if (need_filter_output) {
                [[maybe_unused]] auto old_size = column->size();
                RETURN_IF_ERROR(
                        column_reader->select(selection, selected_rows, batch_rows, column));
                if (column->size() != old_size + selected_rows) {
                    return Status::Corruption(
                            "Parquet selected output column {} returned {} rows, expected {} rows",
                            column_reader->name(), column->size(), old_size + selected_rows);
                }
            } else {
                int64_t column_rows = 0;
                RETURN_IF_ERROR(column_reader->read(batch_rows, column, &column_rows));
                if (column_rows != batch_rows) {
                    return Status::Corruption(
                            "Parquet output column {} returned {} rows, expected {} rows",
                            column_reader->name(), column_rows, batch_rows);
                }
            }
            file_block->replace_by_position(block_position, std::move(column));
        }
    }
    *rows = static_cast<size_t>(selected_rows);
    return Status::OK();
}

void ParquetScanScheduler::mark_condition_cache_granules(const SelectionVector& selection,
                                                         uint16_t selected_rows,
                                                         int64_t batch_first_file_row) {
    if (!_condition_cache_ctx || _condition_cache_ctx->is_hit ||
        !_condition_cache_ctx->filter_result) {
        return;
    }
    auto& cache = *_condition_cache_ctx->filter_result;
    for (uint16_t selection_idx = 0; selection_idx < selected_rows; ++selection_idx) {
        const int64_t file_row = batch_first_file_row + selection.get_index(selection_idx);
        const int64_t granule = file_row / ConditionCacheContext::GRANULE_SIZE;
        const int64_t cache_idx = granule - _condition_cache_ctx->base_granule;
        if (cache_idx >= 0 && static_cast<size_t>(cache_idx) < cache.size()) {
            cache[static_cast<size_t>(cache_idx)] = true;
        }
    }
}

Status ParquetScanScheduler::read_next_batch(
        ParquetFileContext& file_context,
        const std::vector<std::unique_ptr<ParquetColumnSchema>>& file_schema,
        const format::FileScanRequest& request, Block* file_block, size_t* rows, bool* eof) {
    *rows = 0;
    while (true) {
        if (_current_row_group == nullptr) {
            bool has_row_group = false;
            RETURN_IF_ERROR(
                    open_next_row_group(file_context, file_schema, request, &has_row_group));
            if (!has_row_group) {
                *eof = true;
                return Status::OK();
            }
        }

        if (_current_range_idx >= _current_selected_ranges.size()) {
            // Current row group finished, try next row group.
            reset_current_row_group();
            continue;
        }

        const RowRange& current_range = _current_selected_ranges[_current_range_idx];
        DORIS_CHECK(current_range.start >= 0);
        DORIS_CHECK(current_range.length > 0);
        DORIS_CHECK(current_range.start + current_range.length <= _current_row_group_rows);

        if (_current_row_group_rows_read < current_range.start) {
            // Skip filtered rows according to row group level pruning.
            RETURN_IF_ERROR(skip_current_row_group_rows(current_range.start -
                                                        _current_row_group_rows_read));
        }
        DORIS_CHECK(_current_row_group_rows_read == current_range.start + _current_range_rows_read);
        const int64_t remaining_rows = current_range.length - _current_range_rows_read;
        if (remaining_rows <= 0) {
            // Current range finished, try next range in the same row group.
            ++_current_range_idx;
            _current_range_rows_read = 0;
            continue;
        }

        const int64_t batch_rows = std::min<int64_t>(_batch_size, remaining_rows);
        const int64_t physical_rows_read = batch_rows;
        const int64_t batch_first_file_row =
                _current_row_group_first_row + _current_row_group_rows_read;
        RETURN_IF_ERROR(read_current_row_group_batch(file_context, file_schema, batch_rows, request,
                                                     batch_first_file_row, file_block, rows));
        _current_row_group_rows_read += physical_rows_read;
        _current_range_rows_read += physical_rows_read;
        if (_current_range_rows_read >= current_range.length) {
            ++_current_range_idx;
            _current_range_rows_read = 0;
        }
        if (*rows == 0) {
            continue;
        }
        *eof = false;
        return Status::OK();
    }
}

} // namespace doris::format::parquet

Coverage Report

Created: 2026-07-04 04:47