HistogramTask.java

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.statistics;

import org.apache.doris.catalog.Env;
import org.apache.doris.common.DdlException;
import org.apache.doris.common.FeConstants;
import org.apache.doris.statistics.AnalysisInfo.AnalysisMethod;
import org.apache.doris.statistics.util.StatisticsUtil;

import org.apache.commons.text.StringSubstitutor;

import java.util.HashMap;
import java.util.Locale;
import java.util.Map;

/**
 * Analysis task that computes a histogram for a single column and stores it in the
 * internal histogram statistics table. Each task analyzes one column.
 */
public class HistogramTask extends BaseAnalysisTask {

    /**
     * INSERT ... SELECT statement that evaluates {@code HISTOGRAM(column, maxBucketNum)} over the
     * target table and writes one row into the histogram statistics table. The {@code ${...}}
     * placeholders are filled in by {@link #doExecute()}.
     */
    private static final String ANALYZE_HISTOGRAM_SQL_TEMPLATE_TABLE = "INSERT INTO "
            + "${internalDB}.${histogramStatTbl} "
            + "SELECT "
            + "    CONCAT(${tblId}, '-', ${idxId}, '-', '${colId}') AS id, "
            + "    ${catalogId} AS catalog_id, "
            + "    ${dbId} AS db_id, "
            + "    ${tblId} AS tbl_id, "
            + "    ${idxId} AS idx_id, "
            + "    '${colId}' AS col_id, "
            + "    ${sampleRate} AS sample_rate, "
            + "    HISTOGRAM(`${colName}`, ${maxBucketNum}) AS buckets, "
            + "    NOW() AS create_time "
            + "FROM "
            + "    `${dbName}`.`${tblName}`";

    /**
     * Creates a histogram task for the given analysis description.
     *
     * @param info analysis job/task description, forwarded to the base task
     */
    public HistogramTask(AnalysisInfo info) {
        super(info);
    }

    /**
     * Renders the histogram SQL template for this task's column, executes it, and then
     * synchronously refreshes the cached histogram for that column.
     *
     * @throws Exception if executing the statement or refreshing the cache fails
     */
    @Override
    public void doExecute() throws Exception {
        Map<String, String> params = new HashMap<>();
        params.put("internalDB", FeConstants.INTERNAL_DB_NAME);
        params.put("histogramStatTbl", StatisticConstants.HISTOGRAM_TBL_NAME);
        params.put("catalogId", String.valueOf(catalog.getId()));
        params.put("dbId", String.valueOf(db.getId()));
        params.put("tblId", String.valueOf(tbl.getId()));
        params.put("idxId", String.valueOf(info.indexId));
        // The column name doubles as the column id in the statistics row.
        params.put("colId", String.valueOf(info.colName));
        params.put("dbName", db.getFullName());
        params.put("tblName", tbl.getName());
        params.put("colName", String.valueOf(info.colName));
        params.put("sampleRate", getSampleRateFunction());
        params.put("maxBucketNum", String.valueOf(info.maxBucketNum));

        StringSubstitutor stringSubstitutor = new StringSubstitutor(params);
        StatisticsUtil.execUpdate(stringSubstitutor.replace(ANALYZE_HISTOGRAM_SQL_TEMPLATE_TABLE));
        // NOTE(review): the refresh uses a literal index id of -1 and col.getName(), whereas the
        // INSERT above uses info.indexId and info.colName -- confirm this asymmetry is intended.
        Env.getCurrentEnv().getStatisticsCache().refreshHistogramSync(
                tbl.getDatabase().getCatalog().getId(), tbl.getDatabase().getId(), tbl.getId(), -1, col.getName());
    }

    /**
     * Intentionally a no-op for histogram tasks.
     */
    @Override
    protected void doSample() {
    }

    /**
     * Intentionally a no-op for histogram tasks.
     */
    @Override
    protected void deleteNotExistPartitionStats(AnalysisInfo jobInfo) throws DdlException {
    }

    /**
     * Builds the SQL literal substituted for {@code ${sampleRate}} in the template.
     *
     * <ul>
     *   <li>{@code "0"} when the analysis method is {@link AnalysisMethod#FULL};</li>
     *   <li>{@code samplePercent / 100.0} when a positive sample percent is configured;</li>
     *   <li>otherwise {@code sampleRows / rowCount}, reported as {@code "1.0"} when the ratio
     *       is at least 1 and with four decimal places when it is below 1.</li>
     * </ul>
     *
     * @return the sample rate rendered as a numeric SQL literal
     */
    private String getSampleRateFunction() {
        if (info.analysisMethod == AnalysisMethod.FULL) {
            return "0";
        }
        if (info.samplePercent > 0) {
            return String.valueOf(info.samplePercent / 100.0);
        }
        // Read the row count once and fall back to 1 so the division below never divides by zero.
        long reportedRowCount = tbl.getRowCount();
        long rowCount = reportedRowCount > 0 ? reportedRowCount : 1;
        double sampRate = (double) info.sampleRows / rowCount;
        if (sampRate >= 1) {
            return "1.0";
        }
        // Locale.ROOT guarantees a '.' decimal separator; with the JVM default locale the value
        // could be rendered as e.g. "0,1234", which is not a valid numeric literal in the SQL.
        return String.format(Locale.ROOT, "%.4f", sampRate);
    }
}