DeriveStatsJob.java

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.nereids.jobs.cascades;

import org.apache.doris.nereids.jobs.Job;
import org.apache.doris.nereids.jobs.JobContext;
import org.apache.doris.nereids.jobs.JobType;
import org.apache.doris.nereids.memo.Group;
import org.apache.doris.nereids.memo.GroupExpression;
import org.apache.doris.nereids.metrics.EventChannel;
import org.apache.doris.nereids.metrics.EventProducer;
import org.apache.doris.nereids.metrics.consumer.LogConsumer;
import org.apache.doris.nereids.metrics.event.StatsStateEvent;
import org.apache.doris.nereids.minidump.MinidumpUtils;
import org.apache.doris.nereids.stats.HboStatsCalculator;
import org.apache.doris.nereids.stats.StatsCalculator;
import org.apache.doris.nereids.trees.expressions.CTEId;
import org.apache.doris.nereids.trees.plans.algebra.Project;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.qe.SessionVariable;
import org.apache.doris.statistics.Statistics;

import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Job to derive stats for {@link GroupExpression} in {@link org.apache.doris.nereids.memo.Memo}.
 *
 * <p>A job runs in one of two modes, selected by the {@code deriveChildren} flag:
 * <ul>
 *   <li>{@code false} (the mode used by the public constructors): if the group expression has
 *       children, the job re-pushes itself with the flag set to {@code true} and then pushes one
 *       job per child group expression whose stats are not derived yet;</li>
 *   <li>{@code true}, or the group expression has no children: the job estimates the stats of
 *       the group expression itself.</li>
 * </ul>
 */
public class DeriveStatsJob extends Job {
    private static final EventProducer STATS_STATE_TRACER = new EventProducer(
            StatsStateEvent.class,
            EventChannel.getDefaultChannel().addConsumers(new LogConsumer(StatsStateEvent.class, EventChannel.LOG)));
    private final GroupExpression groupExpression;
    // true once the jobs for the children have been pushed, i.e. this job should estimate its own stats
    private final boolean deriveChildren;

    /**
     * Constructor for DeriveStatsJob.
     *
     * @param groupExpression Derive stats on this {@link GroupExpression}
     * @param context context of current job
     */
    public DeriveStatsJob(GroupExpression groupExpression, JobContext context) {
        this(groupExpression, false, context, new HashMap<>());
    }

    /**
     * Constructor for DeriveStatsJob that reuses an existing CTE statistics map.
     *
     * @param groupExpression Derive stats on this {@link GroupExpression}
     * @param context context of current job
     * @param cteIdToStats map from {@link CTEId} to {@link Statistics}; it is handed to the stats
     *                     calculator and to every child job pushed by this job
     */
    public DeriveStatsJob(GroupExpression groupExpression, JobContext context, Map<CTEId, Statistics> cteIdToStats) {
        this(groupExpression, false, context, cteIdToStats);
    }

    /**
     * Full constructor, only used inside this class.
     *
     * @param groupExpression Derive stats on this {@link GroupExpression}
     * @param deriveChildren {@code true} if the child jobs have already been pushed and this job
     *                       should estimate the stats of {@code groupExpression} itself
     * @param context context of current job
     * @param cteIdToStats map from {@link CTEId} to {@link Statistics}, stored on the parent {@link Job}
     */
    private DeriveStatsJob(GroupExpression groupExpression, boolean deriveChildren, JobContext context,
            Map<CTEId, Statistics> cteIdToStats) {
        super(JobType.DERIVE_STATS, context);
        this.groupExpression = groupExpression;
        this.deriveChildren = deriveChildren;
        super.cteIdToStats = cteIdToStats;
    }

    /**
     * Either schedules the jobs for the children (first visit of a group expression with
     * children) or estimates the stats of the group expression. Does nothing when the stats are
     * already derived or the group expression is unused.
     */
    @Override
    public void execute() {
        if (groupExpression.isStatDerived() || groupExpression.isUnused()) {
            return;
        }
        countJobExecutionTimesOfGroupExpressions(groupExpression);
        if (!deriveChildren && groupExpression.arity() > 0) {
            pushSelfAndChildJobs();
        } else {
            estimateStats();
        }
    }

    /**
     * Re-pushes this group expression in "derive self" mode, then pushes jobs for the children.
     */
    private void pushSelfAndChildJobs() {
        // pushed before the child jobs, so that it is handled after them
        pushJob(new DeriveStatsJob(groupExpression, true, context, cteIdToStats));

        List<Group> children = groupExpression.children();
        // Derive stats for left child first, so push it to stack at last, CTE related logic requires this order
        // DO NOT CHANGE IT UNLESS YOU KNOW WHAT YOU ARE DOING.
        // rule maybe return new logical plans to wrap some new physical plans,
        // so we should check derive stats for it if no stats
        for (int i = children.size() - 1; i >= 0; i--) {
            Group childGroup = children.get(i);
            pushJobsForUnderived(childGroup.getLogicalExpressions());
            pushJobsForUnderived(childGroup.getPhysicalExpressions());
        }
    }

    /**
     * Pushes, from the last element to the first, a job for every expression without derived stats.
     */
    private void pushJobsForUnderived(List<GroupExpression> expressions) {
        for (int i = expressions.size() - 1; i >= 0; i--) {
            GroupExpression expression = expressions.get(i);
            if (!expression.isStatDerived()) {
                pushJob(new DeriveStatsJob(expression, context, cteIdToStats));
            }
        }
    }

    /**
     * Estimates the stats of the group expression, traces the result and runs the follow-up steps.
     */
    private void estimateStats() {
        ConnectContext connectContext = context.getCascadesContext().getConnectContext();
        SessionVariable sessionVariable = connectContext.getSessionVariable();

        StatsCalculator statsCalculator = createStatsCalculator(connectContext, sessionVariable);
        statsCalculator.estimate();

        STATS_STATE_TRACER.log(StatsStateEvent.of(groupExpression,
                groupExpression.getOwnerGroup().getStatistics()));

        // when dumping (and not replaying a dump), keep the statistics the calculator used
        // in the connect context
        if (MinidumpUtils.isDump() && !sessionVariable.isPlayNereidsDump()) {
            connectContext.getTotalColumnStatisticMap().putAll(statsCalculator.getTotalColumnStatisticMap());
            connectContext.getTotalHistogramMap().putAll(statsCalculator.getTotalHistogramMap());
        }

        if (groupExpression.getPlan() instanceof Project) {
            alignChildRowCountWithOwnerGroup();
        }
    }

    /**
     * Creates the HBO calculator when HBO optimization is enabled in the session, otherwise the
     * plain {@link StatsCalculator}. Both are built from the same arguments.
     */
    private StatsCalculator createStatsCalculator(ConnectContext connectContext, SessionVariable sessionVariable) {
        if (sessionVariable.isEnableHboOptimization()) {
            return new HboStatsCalculator(groupExpression,
                    sessionVariable.getForbidUnknownColStats(),
                    connectContext.getTotalColumnStatisticMap(),
                    sessionVariable.isPlayNereidsDump(),
                    cteIdToStats,
                    context.getCascadesContext());
        }
        return new StatsCalculator(groupExpression,
                sessionVariable.getForbidUnknownColStats(),
                connectContext.getTotalColumnStatisticMap(),
                sessionVariable.isPlayNereidsDump(),
                cteIdToStats,
                context.getCascadesContext());
    }

    /**
     * Overwrites the row count of every child group with the row count of the owner group.
     */
    private void alignChildRowCountWithOwnerGroup() {
        // In the context of reorder join, when a new plan is generated, it may include a project operation.
        // In this case, the newly generated join root and the original join root will no longer be in the
        // same group. To avoid inconsistencies in the statistics between these two groups, we keep the
        // child group's row count unchanged when the parent group expression is a project operation.
        double parentRowCount = groupExpression.getOwnerGroup().getStatistics().getRowCount();
        for (Group childGroup : groupExpression.children()) {
            childGroup.setStatistics(childGroup.getStatistics().withRowCountAndEnforceValid(parentRowCount));
        }
    }
}