ColumnStatistic.java
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.statistics;
import org.apache.doris.analysis.LiteralExpr;
import org.apache.doris.catalog.Column;
import org.apache.doris.catalog.Type;
import org.apache.doris.common.AnalysisException;
import org.apache.doris.datasource.InternalCatalog;
import org.apache.doris.nereids.types.DataType;
import org.apache.doris.nereids.types.coercion.CharacterType;
import org.apache.doris.persist.gson.GsonUtils;
import org.apache.doris.statistics.util.StatisticsUtil;
import com.google.common.collect.Sets;
import com.google.gson.annotations.SerializedName;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.json.JSONObject;
import java.util.List;
import java.util.Set;
public class ColumnStatistic {
// Error-tolerance constant for statistics.
// NOTE(review): not referenced inside this class; its exact meaning is presumably defined by callers.
public static final double STATS_ERROR = 0.1D;
// Fraction of the row count that ndv must exceed for a column to be treated as almost unique
// (see isAlmostUnique).
public static final double ALMOST_UNIQUE_FACTOR = 0.9;
// Shorthand aliases for the corresponding StatsType constants.
public static final StatsType NDV = StatsType.NDV;
public static final StatsType AVG_SIZE = StatsType.AVG_SIZE;
public static final StatsType MAX_SIZE = StatsType.MAX_SIZE;
public static final StatsType NUM_NULLS = StatsType.NUM_NULLS;
public static final StatsType MIN_VALUE = StatsType.MIN_VALUE;
public static final StatsType MAX_VALUE = StatsType.MAX_VALUE;
private static final Logger LOG = LogManager.getLogger(ColumnStatistic.class);
// Placeholder statistic flagged as unknown: built with 1 for the builder argument, avgSizeByte, ndv
// and numNulls, an unbounded value range (-Infinity, +Infinity) and an empty update time.
// NOTE(review): this field is not final, so any code can reassign it - confirm whether that is intended.
public static ColumnStatistic UNKNOWN = new ColumnStatisticBuilder(1).setAvgSizeByte(1).setNdv(1)
.setNumNulls(1).setMaxValue(Double.POSITIVE_INFINITY).setMinValue(Double.NEGATIVE_INFINITY)
.setIsUnknown(true).setUpdatedTime("")
.build();
// NOTE(review): presumably the column types for which statistics are not collected; the set is not
// consulted anywhere in this class - verify against callers.
public static final Set<Type> UNSUPPORTED_TYPE = Sets.newHashSet(
Type.HLL, Type.BITMAP, Type.ARRAY, Type.STRUCT, Type.MAP, Type.QUANTILE_STATE, Type.JSONB,
Type.VARIANT, Type.TIME, Type.TIMEV2, Type.LAMBDA_FUNCTION
);
// ATTENTION: Stats deriving WILL NOT use 'count' field any longer.
// Use 'rowCount' field in Statistics if needed.
@SerializedName("count")
public final double count;
// Number of distinct values.
@SerializedName("ndv")
public final double ndv;
// Number of null values.
@SerializedName("numNulls")
public final double numNulls;
// Total data size; fromResultRow divides it by count to obtain avgSizeByte.
@SerializedName("dataSize")
public final double dataSize;
// Average size of one value, in bytes.
@SerializedName("avgSizeByte")
public final double avgSizeByte;
// Numeric lower bound of the value range; Double.NEGATIVE_INFINITY when it is not available.
@SerializedName("minValue")
public final double minValue;
// Numeric upper bound of the value range; Double.POSITIVE_INFINITY when it is not available.
@SerializedName("maxValue")
public final double maxValue;
// True when this instance is a placeholder rather than a real statistic.
@SerializedName("isUnKnown")
public final boolean isUnKnown;
/*
original holds the column statistic from the stats of ScanNode. ndv may be changed after filter or join,
but original.ndv is not. It is used to trace the change of a column's ndv through a series
of sql operators (see getOriginalNdv).
*/
@SerializedName("original")
public final ColumnStatistic original;
// Literal form of the minimum value; may be null (fromResultRow leaves it unset when the bound is
// missing or cannot be parsed).
@SerializedName("minExpr")
public final LiteralExpr minExpr;
// Literal form of the maximum value; may be null for the same reasons as minExpr.
@SerializedName("maxExpr")
public final LiteralExpr maxExpr;
// Update time of the statistic as a string; the UNKNOWN placeholder uses an empty string.
@SerializedName("updatedTime")
public final String updatedTime;
/**
 * Creates an immutable column statistic. Every argument is stored as-is in the field of the same name;
 * no validation or normalization is performed.
 *
 * @param count       value for the {@code count} field
 * @param ndv         number of distinct values
 * @param original    statistic this one was derived from, may be null
 * @param avgSizeByte average value size in bytes
 * @param numNulls    number of null values
 * @param dataSize    total data size
 * @param minValue    numeric lower bound of the value range
 * @param maxValue    numeric upper bound of the value range
 * @param minExpr     literal form of the minimum, may be null
 * @param maxExpr     literal form of the maximum, may be null
 * @param isUnKnown   whether this instance is an "unknown" placeholder
 * @param updatedTime update time of the statistic
 */
public ColumnStatistic(double count, double ndv, ColumnStatistic original, double avgSizeByte,
        double numNulls, double dataSize, double minValue, double maxValue,
        LiteralExpr minExpr, LiteralExpr maxExpr, boolean isUnKnown,
        String updatedTime) {
    // Cardinality-related values.
    this.count = count;
    this.ndv = ndv;
    this.numNulls = numNulls;
    this.original = original;

    // Size-related values.
    this.dataSize = dataSize;
    this.avgSizeByte = avgSizeByte;

    // Value range, both numeric and literal.
    this.minValue = minValue;
    this.minExpr = minExpr;
    this.maxValue = maxValue;
    this.maxExpr = maxExpr;

    // Bookkeeping.
    this.isUnKnown = isUnKnown;
    this.updatedTime = updatedTime;
}
/**
 * Builds a column statistic from a list of result rows.
 *
 * <p>Only rows whose field 6 (the partition id) is null are converted; every other row is logged
 * and skipped. When several rows qualify, the last one wins.
 *
 * @param resultRows rows to inspect
 * @return the statistic of the last qualifying row, or {@link #UNKNOWN} when no row qualifies
 */
public static ColumnStatistic fromResultRow(List<ResultRow> resultRows) {
    ColumnStatistic result = ColumnStatistic.UNKNOWN;
    for (ResultRow row : resultRows) {
        if (row.get(6) != null) {
            LOG.warn("Column statistics table shouldn't contain partition stats. [{}]", row);
            continue;
        }
        result = fromResultRow(row);
    }
    return result;
}
// TODO: use thrift
/**
 * Builds a column statistic from a single result row.
 *
 * <p>Fields read from the row, by index: 1 catalog id, 2 database id, 3 table id, 4 index id,
 * 5 column name, 7 count, 8 ndv, 9 null count, 10 min, 11 max, 12 data size, 13 updated time.
 * ndv, null count and data size default to "0" when absent. The average size is data size divided
 * by count, or 0 when count is 0.
 *
 * <p>A min/max that is null, equals "NULL" (case-insensitive) or cannot be converted leaves the
 * corresponding bound at negative/positive infinity.
 *
 * @param row the row to convert
 * @return the parsed statistic, or {@link #UNKNOWN} when the column cannot be found
 */
public static ColumnStatistic fromResultRow(ResultRow row) {
    double count = Double.parseDouble(row.get(7));
    ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(count);
    columnStatisticBuilder.setNdv(Double.parseDouble(row.getWithDefault(8, "0")));
    columnStatisticBuilder.setNumNulls(Double.parseDouble(row.getWithDefault(9, "0")));
    columnStatisticBuilder.setDataSize(Double.parseDouble(row.getWithDefault(12, "0")));
    // Guard against division by zero for empty tables.
    columnStatisticBuilder.setAvgSizeByte(columnStatisticBuilder.getCount() == 0
            ? 0 : columnStatisticBuilder.getDataSize() / columnStatisticBuilder.getCount());

    long catalogId = Long.parseLong(row.get(1));
    long idxId = Long.parseLong(row.get(4));
    long dbID = Long.parseLong(row.get(2));
    long tblId = Long.parseLong(row.get(3));
    String colName = row.get(5);
    Column col = StatisticsUtil.findColumn(catalogId, dbID, tblId, idxId, colName);
    if (col == null) {
        if (LOG.isDebugEnabled()) {
            LOG.debug("Failed to deserialize column statistics, ctlId: {} dbId: {} "
                            + "tblId: {} column: {} not exists",
                    catalogId, dbID, tblId, colName);
        }
        return ColumnStatistic.UNKNOWN;
    }

    // The previous catalog-specific "NULL" check nested inside these branches was unreachable,
    // because the outer condition already excludes "NULL" for every catalog.
    String min = row.get(10);
    if (min != null && !min.equalsIgnoreCase("NULL")) {
        try {
            columnStatisticBuilder.setMinValue(StatisticsUtil.convertToDouble(col.getType(), min));
            columnStatisticBuilder.setMinExpr(StatisticsUtil.readableValue(col.getType(), min));
        } catch (AnalysisException e) {
            LOG.warn("Failed to deserialize column {} min value {}.", col, min, e);
            columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
        }
    } else {
        columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY);
    }

    String max = row.get(11);
    if (max != null && !max.equalsIgnoreCase("NULL")) {
        try {
            columnStatisticBuilder.setMaxValue(StatisticsUtil.convertToDouble(col.getType(), max));
            columnStatisticBuilder.setMaxExpr(StatisticsUtil.readableValue(col.getType(), max));
        } catch (AnalysisException e) {
            LOG.warn("Failed to deserialize column {} max value {}.", col, max, e);
            columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
        }
    } else {
        columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY);
    }

    columnStatisticBuilder.setUpdatedTime(row.get(13));
    return columnStatisticBuilder.build();
}
/**
 * Tells whether a column is almost unique.
 *
 * @param ndv      number of distinct values
 * @param rowCount number of rows
 * @return true when {@code ndv} is strictly greater than {@code rowCount * ALMOST_UNIQUE_FACTOR}
 */
public static boolean isAlmostUnique(double ndv, double rowCount) {
    double uniqueThreshold = rowCount * ALMOST_UNIQUE_FACTOR;
    return ndv > uniqueThreshold;
}
/**
 * Tells whether the [minValue, maxValue] range of this statistic overlaps the range of another.
 * Ranges that merely touch at one end point count as intersecting.
 *
 * @param other the statistic to compare with
 * @return true when the larger lower bound does not exceed the smaller upper bound
 */
public boolean hasIntersect(ColumnStatistic other) {
    double lowerBound = Math.max(this.minValue, other.minValue);
    double upperBound = Math.min(this.maxValue, other.maxValue);
    return lowerBound <= upperBound;
}
/**
 * Derives a new statistic after applying a filter with the given selectivity.
 *
 * <p>An unknown statistic is returned unchanged. Otherwise ndv is scaled by the selectivity when
 * the column is almost unique, and capped at the number of rows left after the filter when it is
 * not. numNulls is scaled by the selectivity and rounded up to a whole number.
 *
 * @param selectivity filter selectivity
 * @param rowCount    row count before the filter
 * @return this instance when unknown, otherwise a new statistic with adjusted ndv and numNulls
 */
public ColumnStatistic updateBySelectivity(double selectivity, double rowCount) {
    if (isUnKnown) {
        return this;
    }
    final double rowsAfterFilter = rowCount * selectivity;
    final double newNdv;
    if (isAlmostUnique(ndv, rowCount)) {
        newNdv = ndv * selectivity;
    } else {
        // ndv cannot exceed the number of rows that survive the filter.
        newNdv = ndv > rowsAfterFilter ? rowsAfterFilter : ndv;
    }
    return new ColumnStatisticBuilder(this)
            .setNdv(newNdv)
            .setNumNulls((long) Math.ceil(numNulls * selectivity))
            .build();
}
/**
 * Estimates how many distinct values of this column fall into the value range shared with another
 * column.
 *
 * <p>Returns 1 when this statistic is unknown or when any of the four bounds is infinite. When this
 * range is a single point, the result is 1 if the point lies inside the other range and 0 otherwise.
 * In the general case ndv is scaled by the overlapping fraction of this range and rounded up;
 * ranges touching in one point yield 1 and disjoint ranges yield 0.
 *
 * @param other the statistic to intersect with
 * @return the estimated number of distinct values in the overlap
 */
public double ndvIntersection(ColumnStatistic other) {
    if (isUnKnown) {
        return 1;
    }
    boolean anyBoundInfinite = Double.isInfinite(minValue) || Double.isInfinite(maxValue)
            || Double.isInfinite(other.minValue) || Double.isInfinite(other.maxValue);
    if (anyBoundInfinite) {
        return 1;
    }
    if (maxValue == minValue) {
        // Degenerate range: only test whether the single point lies within the other range.
        boolean insideOther = minValue <= other.maxValue && minValue >= other.minValue;
        return insideOther ? 1 : 0;
    }
    double overlapLow = Math.max(minValue, other.minValue);
    double overlapHigh = Math.min(maxValue, other.maxValue);
    if (overlapLow > overlapHigh) {
        return 0;
    }
    if (overlapLow < overlapHigh) {
        return Math.ceil(ndv * (overlapHigh - overlapLow) / (maxValue - minValue));
    }
    return 1;
}
/**
 * Negation of {@link #enclosed(ColumnStatistic)}.
 *
 * @param other the statistic to compare with
 * @return true when {@code enclosed(other)} is false
 */
public boolean notEnclosed(ColumnStatistic other) {
    boolean isEnclosed = enclosed(other);
    return !isEnclosed;
}
/**
 * Return true if range of this is enclosed by another, i.e.
 * {@code other.minValue <= this.minValue} and {@code this.maxValue <= other.maxValue}.
 *
 * @param other the statistic whose range is tested as the enclosing one
 * @return true when [minValue, maxValue] of this lies within [minValue, maxValue] of other
 */
public boolean enclosed(ColumnStatistic other) {
    // The lower bounds must be compared as well; comparing maxValue on both sides only tested
    // equality of the upper bounds.
    return this.minValue >= other.minValue && this.maxValue <= other.maxValue;
}
/**
 * Renders a short description: {@code unknown(count)} for an unknown statistic, otherwise ndv,
 * min/max (numeric value followed by the literal in parentheses), count, numNulls and avgSizeByte.
 */
@Override
public String toString() {
    if (isUnKnown) {
        return "unknown(" + count + ")";
    }
    return String.format("ndv=%.4f, min=%f(%s), max=%f(%s), count=%.4f, numNulls=%.4f, avgSizeByte=%f",
            ndv, minValue, minExpr, maxValue, maxExpr, count, numNulls, avgSizeByte);
}
/**
 * Serializes this statistic into a JSON object.
 *
 * <p>Each numeric bound is described by a "MinValueType"/"MaxValueType" entry ("Infinite",
 * "Invalid" for NaN, or "Normal"); the numeric "MinValue"/"MaxValue" entry is only written for
 * "Normal" bounds. The "MinExprValue"/"MinExprType" and "MaxExprValue"/"MaxExprType" entries are
 * written only when the corresponding literal is present, because minExpr and maxExpr may be null.
 *
 * @return the JSON representation
 */
public JSONObject toJson() {
    JSONObject statistic = new JSONObject();
    statistic.put("Ndv", ndv);
    if (Double.isInfinite(minValue)) {
        statistic.put("MinValueType", "Infinite");
    } else if (Double.isNaN(minValue)) {
        statistic.put("MinValueType", "Invalid");
    } else {
        statistic.put("MinValueType", "Normal");
        statistic.put("MinValue", minValue);
    }
    if (Double.isInfinite(maxValue)) {
        statistic.put("MaxValueType", "Infinite");
    } else if (Double.isNaN(maxValue)) {
        statistic.put("MaxValueType", "Invalid");
    } else {
        statistic.put("MaxValueType", "Normal");
        statistic.put("MaxValue", maxValue);
    }
    statistic.put("Count", count);
    statistic.put("AvgSizeByte", avgSizeByte);
    statistic.put("NumNulls", numNulls);
    statistic.put("DataSize", dataSize);
    // Dereferencing a missing literal would throw a NullPointerException, so skip it instead.
    if (minExpr != null) {
        statistic.put("MinExprValue", minExpr.getStringValue());
        statistic.put("MinExprType", minExpr.getType());
    }
    if (maxExpr != null) {
        statistic.put("MaxExprValue", maxExpr.getStringValue());
        statistic.put("MaxExprType", maxExpr.getType());
    }
    statistic.put("IsUnKnown", isUnKnown);
    statistic.put("Original", original);
    statistic.put("LastUpdatedTime", updatedTime);
    return statistic;
}
// MinExpr and MaxExpr serialize and deserialize is not complete
// Histogram is got by other place
/**
 * Rebuilds a statistic from its JSON text.
 *
 * <p>"MinValueType"/"MaxValueType" select the numeric bound: "Infinite" maps to negative/positive
 * infinity, "Invalid" to NaN and "Normal" to the accompanying "MinValue"/"MaxValue" entry.
 * A missing or unreadable "LastUpdatedTime" results in an empty update time. A literal whose
 * value or type entry is absent is restored as null. The {@code original} field is always null.
 *
 * @param statJson JSON text of the statistic
 * @return the deserialized statistic
 * @throws AnalysisException declared for failures while re-creating the min/max literals
 * @throws RuntimeException  when a value type entry holds an unrecognized value
 */
public static ColumnStatistic fromJson(String statJson) throws AnalysisException {
    JSONObject stat = new JSONObject(statJson);
    double minValue;
    switch (stat.getString("MinValueType")) {
        case "Infinite":
            minValue = Double.NEGATIVE_INFINITY;
            break;
        case "Invalid":
            minValue = Double.NaN;
            break;
        case "Normal":
            minValue = stat.getDouble("MinValue");
            break;
        default:
            throw new RuntimeException("Min value does not get anytype");
    }
    double maxValue;
    switch (stat.getString("MaxValueType")) {
        case "Infinite":
            maxValue = Double.POSITIVE_INFINITY;
            break;
        case "Invalid":
            maxValue = Double.NaN;
            break;
        case "Normal":
            maxValue = stat.getDouble("MaxValue");
            break;
        default:
            throw new RuntimeException("Max value does not get anytype");
    }
    String lastUpdatedTime = "";
    try {
        lastUpdatedTime = stat.getString("LastUpdatedTime");
    } catch (Exception e) {
        // Best effort: fall back to an empty update time.
        LOG.warn("lastUpdateTimeIsEmpty: {}", e.getMessage());
        if (LOG.isDebugEnabled()) {
            LOG.debug(e);
        }
    }
    return new ColumnStatistic(
            stat.getDouble("Count"),
            stat.getDouble("Ndv"),
            null,
            stat.getDouble("AvgSizeByte"),
            stat.getDouble("NumNulls"),
            stat.getDouble("DataSize"),
            minValue,
            maxValue,
            readLiteralExpr(stat, "MinExprValue", "MinExprType"),
            readLiteralExpr(stat, "MaxExprValue", "MaxExprType"),
            stat.getBoolean("IsUnKnown"),
            lastUpdatedTime
    );
}

// Re-creates a literal from its value/type entries; returns null when either entry is absent.
private static LiteralExpr readLiteralExpr(JSONObject stat, String valueKey, String typeKey)
        throws AnalysisException {
    if (!stat.has(valueKey) || !stat.has(typeKey)) {
        return null;
    }
    return LiteralExpr.create(stat.getString(valueKey),
            GsonUtils.GSON.fromJson(stat.getString(typeKey), Type.class));
}
/**
 * Tells whether the numeric value range is unusable.
 *
 * @return true when minValue or maxValue is infinite
 */
public boolean isMinMaxInvalid() {
    boolean unboundedMax = Double.isInfinite(maxValue);
    boolean unboundedMin = Double.isInfinite(minValue);
    return unboundedMax || unboundedMin;
}
/**
 * Returns the ndv of the statistic this one was derived from.
 *
 * @return {@code original.ndv} when {@code original} is set, otherwise this instance's own ndv
 */
public double getOriginalNdv() {
    return original == null ? ndv : original.ndv;
}
/**
 * Accessor for the {@code isUnKnown} flag.
 *
 * @return true when this statistic is an "unknown" placeholder
 */
public boolean isUnKnown() {
    return this.isUnKnown;
}
/**
 * Creates a copy of this statistic with a different average value size.
 *
 * @param avgSizeByte the average size in bytes to use in the copy
 * @return a new statistic built from this one with {@code avgSizeByte} replaced
 */
public ColumnStatistic withAvgSizeByte(double avgSizeByte) {
    ColumnStatisticBuilder copyBuilder = new ColumnStatisticBuilder(this);
    copyBuilder.setAvgSizeByte(avgSizeByte);
    return copyBuilder.build();
}
/**
 * Creates an "unknown" placeholder statistic whose average size is derived from the data type.
 *
 * <p>For character types the average size is the type width limited to
 * {@code CharacterType.DEFAULT_WIDTH} and raised to at least 1; for every other type it is the
 * type width. All remaining values match the generic unknown placeholder: 1 for the builder
 * argument, ndv and numNulls, an unbounded value range and an empty update time.
 *
 * @param dataType the data type of the column
 * @return a statistic flagged as unknown
 */
public static ColumnStatistic createUnknownByDataType(DataType dataType) {
    // Only the average size depends on the type; everything else is shared.
    final double avgSizeByte;
    if (dataType instanceof CharacterType) {
        avgSizeByte = Math.max(1, Math.min(dataType.width(), CharacterType.DEFAULT_WIDTH));
    } else {
        avgSizeByte = dataType.width();
    }
    return new ColumnStatisticBuilder(1)
            .setAvgSizeByte(avgSizeByte)
            .setNdv(1)
            .setNumNulls(1)
            .setMaxValue(Double.POSITIVE_INFINITY)
            .setMinValue(Double.NEGATIVE_INFINITY)
            .setIsUnknown(true)
            .setUpdatedTime("")
            .build();
}
}