ColStatsData.java

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

package org.apache.doris.statistics;

import org.apache.doris.catalog.Column;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.statistics.util.StatisticsUtil;

import com.google.common.annotations.VisibleForTesting;
import com.google.gson.annotations.SerializedName;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.StringJoiner;

/**
 * Immutable holder for the statistics of a single column, identified by a {@link StatsId}.
 *
 * <p>Instances can be built from a {@code ResultRow}, whose columns are laid out as follows:
 * <pre>
 *  0: id
 *  1: catalog_id
 *  2: db_id
 *  3: tbl_id
 *  4: idx_id
 *  5: col_id
 *  6: part_id
 *  7: count
 *  8: ndv
 *  9: null_count
 * 10: min
 * 11: max
 * 12: data_size_in_bytes
 * 13: update_time
 * </pre>
 * Columns 7-13 are read by this class; the whole row is also handed to {@code StatsId},
 * which presumably reads the identifier columns 0-6 (confirm against {@code StatsId}).
 */
public class ColStatsData {
    private static final Logger LOG = LogManager.getLogger(ColStatsData.class);

    // DateTimeFormatter is immutable and thread-safe, so one shared instance is enough.
    private static final DateTimeFormatter UPDATE_TIME_FORMATTER =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    @SerializedName("statsId")
    public final StatsId statsId;
    @SerializedName("count")
    public final long count;
    @SerializedName("ndv")
    public final long ndv;
    @SerializedName("nullCount")
    public final long nullCount;
    @SerializedName("minLit")
    public final String minLit;
    @SerializedName("maxLit")
    public final String maxLit;
    @SerializedName("dataSizeInBytes")
    public final long dataSizeInBytes;
    @SerializedName("updateTime")
    public final String updateTime;

    /**
     * Creates an empty instance: a default {@code StatsId}, all counters 0 and
     * min/max/updateTime set to {@code null}.
     */
    @VisibleForTesting
    public ColStatsData() {
        statsId = new StatsId();
        count = 0;
        ndv = 0;
        nullCount = 0;
        minLit = null;
        maxLit = null;
        dataSizeInBytes = 0;
        updateTime = null;
    }

    /**
     * Creates an instance with zeroed statistics for the given id. The update time is set to
     * the current local time formatted as {@code yyyy-MM-dd HH:mm:ss}.
     *
     * @param statsId identifier of the column these statistics belong to
     */
    public ColStatsData(StatsId statsId) {
        this.statsId = statsId;
        count = 0;
        ndv = 0;
        nullCount = 0;
        minLit = null;
        maxLit = null;
        dataSizeInBytes = 0;
        updateTime = LocalDateTime.now().format(UPDATE_TIME_FORMATTER);
    }

    /**
     * Creates an instance from a result row using the column layout described on the class.
     *
     * <p>Numeric columns are parsed with {@link Double#parseDouble(String)} and then narrowed
     * to {@code long}, so values written in floating point notation are accepted and any
     * fractional part is dropped. {@code "0"} is passed as the default for the numeric columns.
     *
     * @param row the row to read from
     * @throws NumberFormatException if a numeric column does not contain a parsable number
     */
    public ColStatsData(ResultRow row) {
        this.statsId = new StatsId(row);
        this.count = (long) Double.parseDouble(row.getWithDefault(7, "0"));
        this.ndv = (long) Double.parseDouble(row.getWithDefault(8, "0"));
        this.nullCount = (long) Double.parseDouble(row.getWithDefault(9, "0"));
        this.minLit = row.get(10);
        this.maxLit = row.get(11);
        this.dataSizeInBytes = (long) Double.parseDouble(row.getWithDefault(12, "0"));
        this.updateTime = row.get(13);
    }

    /**
     * Creates an instance from an in-memory {@code ColumnStatistic}. The floating point
     * statistics are rounded to the nearest {@code long}; min/max are taken from the string
     * value of the corresponding expressions, or {@code null} when the expression is absent.
     *
     * @param id              statistics row id
     * @param catalogId       catalog id
     * @param dbId            database id
     * @param tblId           table id
     * @param idxId           index id
     * @param colId           column identifier
     * @param partId          partition identifier
     * @param columnStatistic source of the statistic values; must not be {@code null}
     */
    public ColStatsData(String id, long catalogId, long dbId, long tblId, long idxId, String colId, String partId,
                        ColumnStatistic columnStatistic) {
        this.statsId = new StatsId(id, catalogId, dbId, tblId, idxId, colId, partId);
        this.count = Math.round(columnStatistic.count);
        this.ndv = Math.round(columnStatistic.ndv);
        this.nullCount = Math.round(columnStatistic.numNulls);
        this.minLit = columnStatistic.minExpr == null ? null : columnStatistic.minExpr.getStringValue();
        this.maxLit = columnStatistic.maxExpr == null ? null : columnStatistic.maxExpr.getStringValue();
        this.dataSizeInBytes = Math.round(columnStatistic.dataSize);
        this.updateTime = columnStatistic.updatedTime;
    }

    /**
     * Renders this entry as a comma separated list of SQL values: the id columns produced by
     * {@code statsId.toSQL()} followed by count, ndv, null count, min, max, data size and
     * update time. Min and max are emitted as escaped, single-quoted literals, or as
     * {@code NULL} when absent.
     *
     * @param roundByParentheses whether to wrap the value list in {@code (} and {@code )}
     * @return the SQL value list
     */
    public String toSQL(boolean roundByParentheses) {
        // Both forms need a separator between the id columns and the statistic values. The
        // unparenthesized form used to omit it, gluing the last id column to the count.
        // NOTE(review): assumes StatsId.toSQL() does not end with a comma, which is what the
        // parenthesized form has always relied on.
        final String idColumns = statsId.toSQL() + ",";
        final StringJoiner sj = roundByParentheses
                ? new StringJoiner(",", "(" + idColumns, ")")
                : new StringJoiner(",", idColumns, "");
        sj.add(String.valueOf(count));
        sj.add(String.valueOf(ndv));
        sj.add(String.valueOf(nullCount));
        sj.add(minLit == null ? "NULL" : "'" + StatisticsUtil.escapeSQL(minLit) + "'");
        sj.add(maxLit == null ? "NULL" : "'" + StatisticsUtil.escapeSQL(maxLit) + "'");
        sj.add(String.valueOf(dataSizeInBytes));
        sj.add(StatisticsUtil.quote(updateTime));
        return sj.toString();
    }

    /**
     * Converts this entry to a {@code ColumnStatistic}.
     *
     * <p>The column is looked up through {@code StatisticsUtil.findColumn}; its type is used to
     * convert the min/max literals. A missing literal (see {@link #isNull(String)}) or one that
     * fails to convert falls back to negative infinity for min and positive infinity for max.
     *
     * @return the converted statistic, or {@code ColumnStatistic.UNKNOWN} when {@code statsId}
     *         is {@code null}, the column cannot be found, or any unexpected exception occurs
     */
    public ColumnStatistic toColumnStatistic() {
        if (statsId == null) {
            return ColumnStatistic.UNKNOWN;
        }
        try {
            ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(count);
            columnStatisticBuilder.setNdv(ndv);
            columnStatisticBuilder.setNumNulls(nullCount);
            columnStatisticBuilder.setDataSize(dataSizeInBytes);
            // Guard against division by zero for empty columns.
            columnStatisticBuilder.setAvgSizeByte(count == 0 ? 0 : ((double) dataSizeInBytes) / count);

            Column col = StatisticsUtil.findColumn(statsId.catalogId, statsId.dbId, statsId.tblId,
                    statsId.idxId, statsId.colId);
            if (col == null) {
                return ColumnStatistic.UNKNOWN;
            }

            if (isNull(minLit)) {
                columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
            } else {
                try {
                    columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), minLit));
                    columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), minLit));
                } catch (AnalysisException e) {
                    LOG.warn("Failed to process column {} min value {}.", col, minLit, e);
                    columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
                }
            }

            if (isNull(maxLit)) {
                columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
            } else {
                try {
                    columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), maxLit));
                    columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), maxLit));
                } catch (AnalysisException e) {
                    LOG.warn("Failed to process column {} max value {}.", col, maxLit, e);
                    columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
                }
            }

            columnStatisticBuilder.setUpdatedTime(updateTime);
            return columnStatisticBuilder.build();
        } catch (Exception e) {
            // Deliberately broad: a failed conversion degrades to UNKNOWN instead of propagating.
            LOG.warn("Failed to convert column statistics.", e);
            return ColumnStatistic.UNKNOWN;
        }
    }

    /**
     * Tells whether a min/max literal should be treated as absent.
     *
     * @param value the literal to check
     * @return {@code true} if {@code value} is {@code null} or equals {@code "NULL"} ignoring case
     */
    public boolean isNull(String value) {
        // Treating the literal text "NULL" as null is a historical bug that is kept for now
        // and is planned to be fixed.
        return value == null || value.equalsIgnoreCase("NULL");
    }

    /**
     * Runs sanity checks on the collected values. The statistics are rejected when:
     * <ul>
     *   <li>ndv is more than 10 times the row count;</li>
     *   <li>ndv is 0 although a min or max value exists;</li>
     *   <li>the row count is positive, ndv is 0, min and max are both absent, and the null
     *       count is either 0 or less than a tenth of the row count.</li>
     * </ul>
     *
     * @return {@code true} if none of the checks above fails
     */
    public boolean isValid() {
        if (ndv > 10 * count) {
            LOG.debug("Ndv {} is much larger than count {}", ndv, count);
            return false;
        }
        if (ndv == 0 && (!isNull(minLit) || !isNull(maxLit))) {
            LOG.debug("Ndv is 0 but min or max exists");
            return false;
        }
        if (count > 0 && ndv == 0 && isNull(minLit) && isNull(maxLit) && (nullCount == 0 || count > nullCount * 10)) {
            // The second placeholder reports the null count (it used to repeat the row count).
            LOG.debug("count {} not 0, ndv is 0, min and max are all null, null count {} is too small",
                    count, nullCount);
            return false;
        }
        return true;
    }
}