ExpressionEstimation.java
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
package org.apache.doris.nereids.stats;
import org.apache.doris.analysis.ArithmeticExpr.Operator;
import org.apache.doris.analysis.NumericLiteralExpr;
import org.apache.doris.analysis.StringLiteral;
import org.apache.doris.nereids.exceptions.AnalysisException;
import org.apache.doris.nereids.trees.expressions.Add;
import org.apache.doris.nereids.trees.expressions.AggregateExpression;
import org.apache.doris.nereids.trees.expressions.Alias;
import org.apache.doris.nereids.trees.expressions.And;
import org.apache.doris.nereids.trees.expressions.BinaryArithmetic;
import org.apache.doris.nereids.trees.expressions.CaseWhen;
import org.apache.doris.nereids.trees.expressions.Cast;
import org.apache.doris.nereids.trees.expressions.ComparisonPredicate;
import org.apache.doris.nereids.trees.expressions.Divide;
import org.apache.doris.nereids.trees.expressions.Expression;
import org.apache.doris.nereids.trees.expressions.IntegralDivide;
import org.apache.doris.nereids.trees.expressions.MarkJoinSlotReference;
import org.apache.doris.nereids.trees.expressions.Mod;
import org.apache.doris.nereids.trees.expressions.Multiply;
import org.apache.doris.nereids.trees.expressions.Or;
import org.apache.doris.nereids.trees.expressions.SlotReference;
import org.apache.doris.nereids.trees.expressions.Subtract;
import org.apache.doris.nereids.trees.expressions.TimestampArithmetic;
import org.apache.doris.nereids.trees.expressions.VirtualSlotReference;
import org.apache.doris.nereids.trees.expressions.WhenClause;
import org.apache.doris.nereids.trees.expressions.functions.BoundFunction;
import org.apache.doris.nereids.trees.expressions.functions.agg.Avg;
import org.apache.doris.nereids.trees.expressions.functions.agg.Count;
import org.apache.doris.nereids.trees.expressions.functions.agg.Max;
import org.apache.doris.nereids.trees.expressions.functions.agg.Min;
import org.apache.doris.nereids.trees.expressions.functions.agg.Sum;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Abs;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Acos;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Ascii;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Asin;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Atan;
import org.apache.doris.nereids.trees.expressions.functions.scalar.DayOfMonth;
import org.apache.doris.nereids.trees.expressions.functions.scalar.DayOfWeek;
import org.apache.doris.nereids.trees.expressions.functions.scalar.DayOfYear;
import org.apache.doris.nereids.trees.expressions.functions.scalar.DaysAdd;
import org.apache.doris.nereids.trees.expressions.functions.scalar.DaysDiff;
import org.apache.doris.nereids.trees.expressions.functions.scalar.DaysSub;
import org.apache.doris.nereids.trees.expressions.functions.scalar.FromDays;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Hour;
import org.apache.doris.nereids.trees.expressions.functions.scalar.HoursDiff;
import org.apache.doris.nereids.trees.expressions.functions.scalar.HoursSub;
import org.apache.doris.nereids.trees.expressions.functions.scalar.If;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Least;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Minute;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MinutesAdd;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MinutesDiff;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MinutesSub;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MonthsAdd;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MonthsDiff;
import org.apache.doris.nereids.trees.expressions.functions.scalar.MonthsSub;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Negative;
import org.apache.doris.nereids.trees.expressions.functions.scalar.NullIf;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Quarter;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Radians;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Random;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Second;
import org.apache.doris.nereids.trees.expressions.functions.scalar.SecondsAdd;
import org.apache.doris.nereids.trees.expressions.functions.scalar.SecondsDiff;
import org.apache.doris.nereids.trees.expressions.functions.scalar.SecondsSub;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Sqrt;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Substring;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDate;
import org.apache.doris.nereids.trees.expressions.functions.scalar.ToDays;
import org.apache.doris.nereids.trees.expressions.functions.scalar.WeekOfYear;
import org.apache.doris.nereids.trees.expressions.functions.scalar.WeeksDiff;
import org.apache.doris.nereids.trees.expressions.functions.scalar.Year;
import org.apache.doris.nereids.trees.expressions.functions.scalar.YearOfWeek;
import org.apache.doris.nereids.trees.expressions.functions.scalar.YearsAdd;
import org.apache.doris.nereids.trees.expressions.functions.scalar.YearsDiff;
import org.apache.doris.nereids.trees.expressions.functions.scalar.YearsSub;
import org.apache.doris.nereids.trees.expressions.literal.DateLiteral;
import org.apache.doris.nereids.trees.expressions.literal.Literal;
import org.apache.doris.nereids.trees.expressions.visitor.ExpressionVisitor;
import org.apache.doris.nereids.types.DataType;
import org.apache.doris.qe.ConnectContext;
import org.apache.doris.statistics.ColumnStatistic;
import org.apache.doris.statistics.ColumnStatisticBuilder;
import org.apache.doris.statistics.Statistics;
import com.google.common.base.Preconditions;
import org.apache.commons.collections.CollectionUtils;
import org.apache.log4j.LogManager;
import org.apache.log4j.Logger;
import java.time.Instant;
import java.time.LocalDate;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.util.List;
/**
 * Estimates column statistics for expressions that do not produce a boolean value.
 */
public class ExpressionEstimation extends ExpressionVisitor<ColumnStatistic, Statistics> {
public static final Logger LOG = LogManager.getLogger(ExpressionEstimation.class);
public static final long DAYS_FROM_0_TO_1970 = 719528;
public static final long DAYS_FROM_0_TO_9999 = 3652424;
private static final ExpressionEstimation INSTANCE = new ExpressionEstimation();
/**
 * Estimates the column statistic of {@code expression} against the given input statistics.
 * The returned statistic is newly created or a copy of existing stats.
 *
 * <p>When the visitor yields {@code null}, or when estimation throws and FE debug mode is off,
 * an "unknown" statistic created from the expression's data type is returned instead.
 *
 * @param expression the expression to estimate
 * @param stats the statistics of the input the expression is evaluated on
 * @return the estimated column statistic, never the visitor's {@code null}
 */
public static ColumnStatistic estimate(Expression expression, Statistics stats) {
    try {
        ColumnStatistic estimated = expression.accept(INSTANCE, stats);
        return estimated != null
                ? estimated
                : ColumnStatistic.createUnknownByDataType(expression.getDataType());
    } catch (Exception e) {
        // In regression tests feDebug is on, so the failure is surfaced instead of being hidden.
        ConnectContext connectContext = ConnectContext.get();
        boolean surfaceFailure = connectContext != null && connectContext.getSessionVariable().feDebug;
        if (surfaceFailure) {
            throw e;
        }
        LOG.warn("ExpressionEstimation failed : " + expression, e);
        return ColumnStatistic.createUnknownByDataType(expression.getDataType());
    }
}
/**
 * Fallback for expressions without a dedicated visitor: uses statistics already recorded for
 * the expression, otherwise delegates to its first child, otherwise reports UNKNOWN.
 */
@Override
public ColumnStatistic visit(Expression expr, Statistics context) {
    ColumnStatistic recorded = context.findColumnStatistics(expr);
    if (recorded != null) {
        return recorded;
    }
    List<Expression> operands = expr.children();
    return CollectionUtils.isEmpty(operands)
            ? ColumnStatistic.UNKNOWN
            : expr.child(0).accept(this, context);
}
/**
 * Estimates a CASE WHEN expression. NDV starts at the number of branches (WHEN clauses plus the
 * optional default) and is raised to the largest NDV of any branch result; the average size is
 * the widest branch result type (at least 1). Min/max are left unbounded and nulls set to 0.
 */
@Override
public ColumnStatistic visitCaseWhen(CaseWhen caseWhen, Statistics context) {
    List<WhenClause> branches = caseWhen.getWhenClauses();
    boolean hasDefault = caseWhen.getDefaultValue().isPresent();
    double ndv = hasDefault ? branches.size() + 1 : branches.size();
    double width = 1;
    for (WhenClause branch : branches) {
        Expression branchResult = branch.getResult();
        ndv = Math.max(ndv, ExpressionEstimation.estimate(branchResult, context).ndv);
        width = Math.max(width, branchResult.getDataType().width());
    }
    if (hasDefault) {
        Expression fallback = caseWhen.getDefaultValue().get();
        ndv = Math.max(ndv, ExpressionEstimation.estimate(fallback, context).ndv);
        width = Math.max(width, fallback.getDataType().width());
    }
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder();
    builder.setNdv(ndv);
    builder.setMinValue(Double.NEGATIVE_INFINITY);
    builder.setMaxValue(Double.POSITIVE_INFINITY);
    builder.setAvgSizeByte(width);
    builder.setNumNulls(0);
    return builder.build();
}
/**
 * Estimates an IF expression from its two result branches (children 1 and 2): NDV is at least 2
 * and at least the larger branch NDV; the average size is the wider branch type (at least 1).
 * Min/max are left unbounded and nulls set to 0.
 */
@Override
public ColumnStatistic visitIf(If ifClause, Statistics context) {
    double ndv = 2;
    double width = 1;
    // child(1) is the "then" result, child(2) the "else" result
    for (int branch = 1; branch <= 2; branch++) {
        Expression branchResult = ifClause.child(branch);
        ndv = Math.max(ndv, ExpressionEstimation.estimate(branchResult, context).ndv);
        width = Math.max(width, branchResult.getDataType().width());
    }
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder();
    builder.setNdv(ndv);
    builder.setMinValue(Double.NEGATIVE_INFINITY);
    builder.setMaxValue(Double.POSITIVE_INFINITY);
    builder.setAvgSizeByte(width);
    builder.setNumNulls(0);
    return builder.build();
}
/**
 * Estimates a CAST: reuses statistics already recorded for the cast itself, otherwise adapts the
 * child's statistics to the target type via {@code castMinMax}.
 */
@Override
public ColumnStatistic visitCast(Cast cast, Statistics context) {
    ColumnStatistic recorded = context.findColumnStatistics(cast);
    if (recorded == null) {
        ColumnStatistic childColStats = cast.child().accept(this, context);
        Preconditions.checkNotNull(childColStats, "childColStats is null");
        return castMinMax(childColStats, cast.getDataType());
    }
    return recorded;
}
/**
 * Adapts the min/max of {@code colStats} to {@code targetType}.
 * <ul>
 *   <li>string bounds cast to a date-like type: both bounds are parsed as dates; if either parse
 *       fails this case is abandoned;</li>
 *   <li>numeric bounds cast to a numeric type: the statistics are returned unchanged;</li>
 *   <li>anything else: a copy with the bounds cleared and set to infinity.</li>
 * </ul>
 */
private ColumnStatistic castMinMax(ColumnStatistic colStats, DataType targetType) {
    // cast str to date/datetime
    boolean stringBounds = colStats.minExpr instanceof StringLiteral
            && colStats.maxExpr instanceof StringLiteral;
    if (stringBounds && targetType.isDateLikeType()) {
        ColumnStatisticBuilder dateBuilder = new ColumnStatisticBuilder(colStats);
        boolean converted;
        try {
            DateLiteral lower = new DateLiteral(colStats.minExpr.getStringValue());
            long lowerValue = lower.getValue();
            dateBuilder.setMinValue(lowerValue);
            dateBuilder.setMinExpr(lower.toLegacyLiteral());

            DateLiteral upper = new DateLiteral(colStats.maxExpr.getStringValue());
            long upperValue = upper.getValue();
            dateBuilder.setMaxValue(upperValue);
            dateBuilder.setMaxExpr(upper.toLegacyLiteral());
            converted = true;
        } catch (AnalysisException ignored) {
            // a bound is not a valid date string: fall through to the generic handling below
            converted = false;
        }
        if (converted) {
            return dateBuilder.build();
        }
    }
    // cast numeric to numeric
    if (colStats.minExpr instanceof NumericLiteralExpr
            && colStats.maxExpr instanceof NumericLiteralExpr
            && targetType.isNumericType()) {
        return colStats;
    }
    // cast other data types, set min/max infinity
    ColumnStatisticBuilder unboundedBuilder = new ColumnStatisticBuilder(colStats);
    unboundedBuilder.setMinExpr(null).setMinValue(Double.NEGATIVE_INFINITY);
    unboundedBuilder.setMaxExpr(null).setMaxValue(Double.POSITIVE_INFINITY);
    return unboundedBuilder.build();
}
/**
 * Estimates a literal: min and max are both the literal's double value, with NDV 1 and no nulls,
 * or NDV 0 and one null for a NULL literal. Literals of unsupported types yield UNKNOWN.
 */
@Override
public ColumnStatistic visitLiteral(Literal literal, Statistics context) {
    boolean unsupported = ColumnStatistic.UNSUPPORTED_TYPE.contains(literal.getDataType().toCatalogDataType());
    if (unsupported) {
        return ColumnStatistic.UNKNOWN;
    }
    double value = literal.getDouble();
    boolean isNull = literal.isNullLiteral();
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder();
    builder.setMaxValue(value);
    builder.setMinValue(value);
    builder.setNdv(isNull ? 0 : 1);
    builder.setNumNulls(isNull ? 1 : 0);
    builder.setAvgSizeByte(literal.getDataType().width());
    builder.setMinExpr(literal.toLegacyLiteral());
    builder.setMaxExpr(literal.toLegacyLiteral());
    return builder.build();
}
/**
 * Looks up the statistics recorded for the slot in the input statistics; the lookup result is
 * returned as is.
 */
@Override
public ColumnStatistic visitSlotReference(SlotReference slotReference, Statistics context) {
    ColumnStatistic slotStats = context.findColumnStatistics(slotReference);
    return slotStats;
}
/**
 * Estimates a binary arithmetic expression (add, subtract, multiply, divide, integral divide,
 * mod) from the statistics of its two operands.
 *
 * <p>NDV is the larger operand NDV. The null count assumes the operands are independent and that
 * the result is NULL whenever either operand is NULL. Min/max are derived by interval
 * arithmetic on the operand bounds; zero divisor bounds are replaced by 1 to avoid dividing by
 * zero. Any other operator yields UNKNOWN.
 *
 * @param binaryArithmetic the arithmetic expression
 * @param context statistics of the input the expression is evaluated on
 * @return the estimated statistics of the expression result
 */
@Override
public ColumnStatistic visitBinaryArithmetic(BinaryArithmetic binaryArithmetic, Statistics context) {
    ColumnStatistic leftColStats = binaryArithmetic.left().accept(this, context);
    ColumnStatistic rightColStats = binaryArithmetic.right().accept(this, context);
    double ndv = Math.max(leftColStats.ndv, rightColStats.ndv);
    double rowCount = context.getRowCount();
    // P(result is null) = 1 - P(left not null) * P(right not null).
    // With no rows there is nothing to be null, and dividing by rowCount would produce NaN.
    double numNulls = 0;
    if (rowCount > 0) {
        double leftNonNullRatio = 1 - leftColStats.numNulls / rowCount;
        double rightNonNullRatio = 1 - rightColStats.numNulls / rowCount;
        numNulls = rowCount * (1 - leftNonNullRatio * rightNonNullRatio);
    }
    double leftMax = leftColStats.maxValue;
    double rightMax = rightColStats.maxValue;
    double leftMin = leftColStats.minValue;
    double rightMin = rightColStats.minValue;
    int exprResultTypeWidth = binaryArithmetic.getDataType().width();
    double dataSize = exprResultTypeWidth * rowCount;
    if (binaryArithmetic instanceof Add) {
        return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
                .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(leftMin + rightMin)
                .setMaxValue(leftMax + rightMax)
                .setMinExpr(null).setMaxExpr(null).build();
    }
    if (binaryArithmetic instanceof Subtract) {
        return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
                .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(leftMin - rightMax)
                .setMaxValue(leftMax - rightMin).setMinExpr(null)
                .setMaxExpr(null).build();
    }
    // TODO: stat for multiply and divide produced by below algorithm may have huge deviation with reality.
    if (binaryArithmetic instanceof Multiply) {
        // the extremes of a product over two intervals are reached at the interval corners
        double minMin = leftMin * rightMin;
        double minMax = leftMin * rightMax;
        double maxMin = leftMax * rightMin;
        double maxMax = leftMax * rightMax;
        double min = Math.min(Math.min(minMin, minMax), Math.min(maxMin, maxMax));
        double max = Math.max(Math.max(minMin, minMax), Math.max(maxMin, maxMax));
        return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
                .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(min).setMaxValue(max)
                .setMaxExpr(null).setMinExpr(null).build();
    }
    if (binaryArithmetic instanceof Divide || binaryArithmetic instanceof IntegralDivide) {
        double safeRightMin = noneZeroDivisor(rightMin);
        double safeRightMax = noneZeroDivisor(rightMax);
        double minMin = leftMin / safeRightMin;
        double minMax = leftMin / safeRightMax;
        double maxMin = leftMax / safeRightMin;
        double maxMax = leftMax / safeRightMax;
        double min = Math.min(Math.min(minMin, minMax), Math.min(maxMin, maxMax));
        double max = Math.max(Math.max(minMin, minMax), Math.max(maxMin, maxMax));
        // data size is width * rowCount, like every other operator handled here
        return new ColumnStatisticBuilder().setNdv(ndv).setAvgSizeByte(leftColStats.avgSizeByte)
                .setNumNulls(numNulls).setDataSize(dataSize).setMinValue(min)
                .setMaxValue(max).build();
    }
    if (binaryArithmetic instanceof Mod) {
        // the bounds are set symmetrically to the largest absolute value of the divisor
        double min = -Math.max(Math.abs(rightMin), Math.abs(rightMax));
        double max = -min;
        return new ColumnStatisticBuilder().setNdv(ndv)
                .setAvgSizeByte(exprResultTypeWidth)
                .setDataSize(dataSize)
                .setNumNulls(numNulls)
                .setMaxValue(max)
                .setMinValue(min)
                .build();
    }
    return ColumnStatistic.UNKNOWN;
}
/**
 * Returns {@code d} unless it is zero, in which case 1.0 is returned so that it can be used as a
 * divisor.
 */
private double noneZeroDivisor(double d) {
    if (d == 0.0) {
        return 1.0;
    }
    return d;
}
/**
 * Estimates MIN(child): a copy of the child's statistics, or UNKNOWN carrying the result type
 * width when the child's statistics are unknown.
 */
@Override
public ColumnStatistic visitMin(Min min, Statistics context) {
    Expression argument = min.child();
    ColumnStatistic argumentStats = argument.accept(this, context);
    // if this is scalar agg, we will update count and ndv to 1 when visiting group clause
    return argumentStats.isUnKnown
            ? ColumnStatistic.UNKNOWN.withAvgSizeByte(min.getDataType().width())
            : new ColumnStatisticBuilder(argumentStats).build();
}
/**
 * Estimates MAX(child): a copy of the child's statistics, or UNKNOWN carrying the result type
 * width when the child's statistics are unknown.
 */
@Override
public ColumnStatistic visitMax(Max max, Statistics context) {
    Expression argument = max.child();
    ColumnStatistic argumentStats = argument.accept(this, context);
    // if this is scalar agg, we will update count and ndv to 1 when visiting group clause
    return argumentStats.isUnKnown
            ? ColumnStatistic.UNKNOWN.withAvgSizeByte(max.getDataType().width())
            : new ColumnStatisticBuilder(argumentStats).build();
}
/**
 * Estimates COUNT as UNKNOWN statistics carrying the width of the result type.
 */
@Override
public ColumnStatistic visitCount(Count count, Statistics context) {
    // for scalar agg, ndv and row count will be normalized by 1 in StatsCalculator.computeAggregate()
    return ColumnStatistic.UNKNOWN.withAvgSizeByte((double) count.getDataType().width());
}
/**
 * Estimates SUM as UNKNOWN statistics carrying the width of the result type.
 */
// TODO: return a proper estimated stat after supports histogram
@Override
public ColumnStatistic visitSum(Sum sum, Statistics context) {
    int resultWidth = sum.getDataType().width();
    return ColumnStatistic.UNKNOWN.withAvgSizeByte(resultWidth);
}
/**
 * Estimates AVG as UNKNOWN statistics carrying the width of the result type.
 */
// TODO: return a proper estimated stat after supports histogram
@Override
public ColumnStatistic visitAvg(Avg avg, Statistics context) {
    int resultWidth = avg.getDataType().width();
    return ColumnStatistic.UNKNOWN.withAvgSizeByte(resultWidth);
}
/**
 * Estimates YEAR(child) with a fixed range of 1970..2038 (one distinct value per year), a
 * 4-byte width and the child's null count.
 */
@Override
public ColumnStatistic visitYear(Year year, Statistics context) {
    ColumnStatistic childStat = year.child().accept(this, context);
    double rowCount = context.getRowCount();
    final long firstYear = 1970;
    final long lastYear = 2038;
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder();
    builder.setNdv(lastYear - firstYear + 1);
    builder.setAvgSizeByte(4);
    builder.setNumNulls(childStat.numNulls);
    builder.setDataSize(4 * rowCount);
    builder.setMinValue(firstYear);
    builder.setMaxValue(lastYear);
    builder.setMinExpr(null);
    return builder.build();
}
/**
 * Estimates YEAR_OF_WEEK(child) with a fixed range of 1970..2038 (one distinct value per year),
 * a 4-byte width and the child's null count.
 */
@Override
public ColumnStatistic visitYearOfWeek(YearOfWeek yearOfWeek, Statistics context) {
    ColumnStatistic childStat = yearOfWeek.child().accept(this, context);
    double rowCount = context.getRowCount();
    final long firstYear = 1970;
    final long lastYear = 2038;
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder();
    builder.setNdv(lastYear - firstYear + 1);
    builder.setAvgSizeByte(4);
    builder.setNumNulls(childStat.numNulls);
    builder.setDataSize(4 * rowCount);
    builder.setMinValue(firstYear);
    builder.setMaxValue(lastYear);
    builder.setMinExpr(null);
    return builder.build();
}
/**
 * Estimates WEEKOFYEAR(child): values lie in [1, 53], so there are at most 53 distinct values.
 * The null count is taken from the child; size figures are based on the result type width.
 *
 * @param weekOfYear the WEEKOFYEAR call
 * @param context statistics of the input the expression is evaluated on
 * @return the estimated statistics of the week number
 */
@Override
public ColumnStatistic visitWeekOfYear(WeekOfYear weekOfYear, Statistics context) {
    ColumnStatistic childStat = weekOfYear.child().accept(this, context);
    int width = weekOfYear.getDataType().width();
    return new ColumnStatisticBuilder(childStat)
            // NDV must agree with the [1, 53] range set below
            .setNdv(53)
            .setMinValue(1)
            .setMaxValue(53)
            // the child's bound expressions describe dates, not week numbers
            .setMinExpr(null)
            .setMaxExpr(null)
            .setNumNulls(childStat.numNulls)
            .setAvgSizeByte(width)
            // total size is width * rowCount, as for the other date-part functions
            .setDataSize(width * context.getRowCount())
            .build();
}
/**
 * Approximates SUBSTRING by the statistics of its first argument (the source string).
 */
// TODO: find a proper way to predicate stat of substring
@Override
public ColumnStatistic visitSubstring(Substring substring, Statistics context) {
    Expression source = substring.child(0);
    return source.accept(this, context);
}
/**
 * An alias has the statistics of the expression it names.
 */
@Override
public ColumnStatistic visitAlias(Alias alias, Statistics context) {
    ColumnStatistic aliasedStats = alias.child().accept(this, context);
    return aliasedStats;
}
/**
 * Virtual slot references are always reported as UNKNOWN.
 */
@Override
public ColumnStatistic visitVirtualReference(VirtualSlotReference virtualSlotReference, Statistics context) {
    ColumnStatistic unknown = ColumnStatistic.UNKNOWN;
    return unknown;
}
/**
 * Default for bound functions that have no dedicated estimator in this class: UNKNOWN.
 */
@Override
public ColumnStatistic visitBoundFunction(BoundFunction boundFunction, Statistics context) {
    ColumnStatistic unknown = ColumnStatistic.UNKNOWN;
    return unknown;
}
/**
 * An aggregate expression wrapper has the statistics of the expression it wraps.
 */
@Override
public ColumnStatistic visitAggregateExpression(AggregateExpression aggregateExpression,
        Statistics context) {
    ColumnStatistic wrappedStats = aggregateExpression.child().accept(this, context);
    return wrappedStats;
}
/**
 * Estimates a comparison predicate: starts from the left operand's statistics, sets NDV to 2
 * and uses the larger (non-NaN) null count of the two operands.
 */
@Override
public ColumnStatistic visitComparisonPredicate(ComparisonPredicate cp, Statistics context) {
    ColumnStatistic lhs = cp.left().accept(this, context);
    ColumnStatistic rhs = cp.right().accept(this, context);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(lhs);
    builder.setNumNulls(StatsMathUtil.maxNonNaN(lhs.numNulls, rhs.numNulls));
    builder.setNdv(2);
    return builder.build();
}
/**
 * Estimates an OR: starts from the first child's statistics, sets NDV to 2 and uses the largest
 * (non-NaN) null count over all children, with a floor of 1.
 */
@Override
public ColumnStatistic visitOr(Or or, Statistics inputStats) {
    List<Expression> disjuncts = or.children();
    // TODO: this algorithm is not right, fix it later
    ColumnStatistic head = disjuncts.get(0).accept(this, inputStats);
    double maxNull = StatsMathUtil.maxNonNaN(head.numNulls, 1);
    for (Expression disjunct : disjuncts.subList(1, disjuncts.size())) {
        ColumnStatistic disjunctStats = disjunct.accept(this, inputStats);
        maxNull = StatsMathUtil.maxNonNaN(maxNull, disjunctStats.numNulls);
    }
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(head);
    builder.setNumNulls(maxNull);
    builder.setNdv(2);
    return builder.build();
}
/**
 * Estimates an AND: starts from the first child's statistics, sets NDV to 2 and uses the largest
 * (non-NaN) null count over all children, with a floor of 1.
 */
@Override
public ColumnStatistic visitAnd(And and, Statistics inputStats) {
    List<Expression> conjuncts = and.children();
    // TODO: this algorithm is not right, fix it later
    ColumnStatistic head = conjuncts.get(0).accept(this, inputStats);
    double maxNull = StatsMathUtil.maxNonNaN(head.numNulls, 1);
    for (Expression conjunct : conjuncts.subList(1, conjuncts.size())) {
        ColumnStatistic conjunctStats = conjunct.accept(this, inputStats);
        maxNull = StatsMathUtil.maxNonNaN(maxNull, conjunctStats.numNulls);
    }
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(head);
    builder.setNumNulls(maxNull);
    builder.setNdv(2);
    return builder.build();
}
/**
 * Estimates timestamp arithmetic: ADD and SUBTRACT are handled by {@code dateAdd} and
 * {@code dateSub}; any other operator falls back to the left operand's statistics.
 */
@Override
public ColumnStatistic visitTimestampArithmetic(TimestampArithmetic arithmetic, Statistics context) {
    Operator op = arithmetic.getOp();
    ColumnStatistic result;
    switch (op) {
        case ADD:
            result = dateAdd(arithmetic, context);
            break;
        case SUBTRACT:
            result = dateSub(arithmetic, context);
            break;
        default:
            result = arithmetic.left().accept(this, context);
            break;
    }
    return result;
}
/**
 * Mark-join slot references are always reported as UNKNOWN.
 */
@Override
public ColumnStatistic visitMarkJoinReference(
        MarkJoinSlotReference markJoinSlotReference, Statistics context) {
    ColumnStatistic unknown = ColumnStatistic.UNKNOWN;
    return unknown;
}
/**
 * Approximates NULLIF(left, right) by the statistics of its left operand.
 *
 * @param nullIf the NULLIF call
 * @param context statistics of the input the expression is evaluated on
 * @return the statistics estimated for the left operand
 */
@Override
public ColumnStatistic visitNullIf(NullIf nullIf, Statistics context) {
    Expression leftChild = nullIf.left();
    return leftChild.accept(this, context);
}
/**
 * Approximates LEAST(...) by the statistics of its first argument.
 */
@Override
public ColumnStatistic visitLeast(Least least, Statistics context) {
    Expression firstArgument = least.child(0);
    return firstArgument.accept(this, context);
}
/**
 * Estimates ASCII(child): values in [0, 127] with 128 distinct values, the child's null count
 * and sizes based on the result type width.
 */
@Override
public ColumnStatistic visitAscii(Ascii ascii, Statistics context) {
    DataType resultType = ascii.getDataType();
    ColumnStatistic argumentStats = ascii.child().accept(this, context);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder();
    builder.setDataSize(resultType.width() * context.getRowCount());
    builder.setNdv(128);
    builder.setMinValue(0);
    builder.setMaxValue(127);
    builder.setNumNulls(argumentStats.numNulls);
    builder.setAvgSizeByte(resultType.width());
    return builder.build();
}
/**
 * Estimates QUARTER(child): values in [1, 4] with 4 distinct values, the child's null count and
 * sizes based on the result type width.
 */
@Override
public ColumnStatistic visitQuarter(Quarter quarter, Statistics context) {
    DataType resultType = quarter.getDataType();
    ColumnStatistic argumentStats = quarter.child().accept(this, context);
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder();
    builder.setNdv(4);
    builder.setMinValue(1);
    builder.setMaxValue(4);
    builder.setNumNulls(argumentStats.numNulls);
    builder.setAvgSizeByte(resultType.width());
    builder.setDataSize(resultType.width() * context.getRowCount());
    return builder.build();
}
/**
 * Estimates DAYOFMONTH(child): values in [1, 31] with 31 distinct values. Remaining fields are
 * copied from the child; sizes are based on the result type width.
 *
 * @param dayOfMonth the DAYOFMONTH call
 * @param context statistics of the input the expression is evaluated on
 * @return the estimated statistics of the day-of-month value
 */
@Override
public ColumnStatistic visitDayOfMonth(DayOfMonth dayOfMonth, Statistics context) {
    DataType returnType = dayOfMonth.getDataType();
    ColumnStatistic childColumnStats = dayOfMonth.child().accept(this, context);
    return new ColumnStatisticBuilder(childColumnStats).setNdv(31)
            .setAvgSizeByte(returnType.width())
            .setDataSize(returnType.width() * context.getRowCount())
            // the lower bound must be set explicitly, otherwise the child's date minimum is kept
            .setMinValue(1)
            .setMaxValue(31).build();
}
/**
 * Estimates DAYOFWEEK(child): values in [1, 7] with 7 distinct values. Remaining fields are
 * copied from the child; sizes are based on the result type width, in line with the other
 * date-part estimators of this class.
 *
 * @param dayOfWeek the DAYOFWEEK call
 * @param context statistics of the input the expression is evaluated on
 * @return the estimated statistics of the day-of-week value
 */
@Override
public ColumnStatistic visitDayOfWeek(DayOfWeek dayOfWeek, Statistics context) {
    ColumnStatistic childColumnStats = dayOfWeek.child().accept(this, context);
    int width = dayOfWeek.getDataType().width();
    return new ColumnStatisticBuilder(childColumnStats)
            .setNdv(7)
            .setMinValue(1)
            .setMaxValue(7)
            // size the result by its own type rather than by the child's date type
            .setAvgSizeByte(width)
            .setDataSize(width * context.getRowCount())
            .build();
}
/**
 * Estimates DAYOFYEAR(child): values in [1, 366] with 366 distinct values. Remaining fields are
 * copied from the child; sizes are based on the result type width.
 */
@Override
public ColumnStatistic visitDayOfYear(DayOfYear dayOfYear, Statistics context) {
    ColumnStatistic argumentStats = dayOfYear.child().accept(this, context);
    int width = dayOfYear.getDataType().width();
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(argumentStats);
    builder.setNdv(366);
    builder.setMinValue(1);
    builder.setMaxValue(366);
    builder.setAvgSizeByte(width);
    builder.setDataSize(width * context.getRowCount());
    return builder.build();
}
/**
 * Estimates HOUR(child): values in [0, 23] with 24 distinct values. Remaining fields are copied
 * from the child; sizes are based on the result type width.
 */
@Override
public ColumnStatistic visitHour(Hour hour, Statistics context) {
    ColumnStatistic argumentStats = hour.child().accept(this, context);
    int width = hour.getDataType().width();
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(argumentStats);
    builder.setNdv(24);
    builder.setMinValue(0);
    builder.setMaxValue(23);
    builder.setAvgSizeByte(width);
    builder.setDataSize(width * context.getRowCount());
    return builder.build();
}
/**
 * Estimates MINUTE(child): values in [0, 59] with 60 distinct values. Remaining fields are
 * copied from the child; sizes are based on the result type width.
 */
@Override
public ColumnStatistic visitMinute(Minute minute, Statistics context) {
    ColumnStatistic argumentStats = minute.child().accept(this, context);
    int width = minute.getDataType().width();
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(argumentStats);
    builder.setNdv(60);
    builder.setMinValue(0);
    builder.setMaxValue(59);
    builder.setAvgSizeByte(width);
    builder.setDataSize(width * context.getRowCount());
    return builder.build();
}
/**
 * Estimates SECOND(child): values in [0, 59] with 60 distinct values. Remaining fields are
 * copied from the child; sizes are based on the result type width.
 */
@Override
public ColumnStatistic visitSecond(Second second, Statistics context) {
    ColumnStatistic argumentStats = second.child().accept(this, context);
    int width = second.getDataType().width();
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(argumentStats);
    builder.setNdv(60);
    builder.setMinValue(0);
    builder.setMaxValue(59);
    builder.setAvgSizeByte(width);
    builder.setDataSize(width * context.getRowCount());
    return builder.build();
}
/**
 * Estimates TO_DATE(child): copies the child's statistics with sizes based on the result type
 * width and, when the child's min/max are valid, truncates both bounds to the start of their day
 * in the system default time zone. If that conversion fails, the bounds become unbounded.
 */
@Override
public ColumnStatistic visitToDate(ToDate toDate, Statistics context) {
    ColumnStatistic childColumnStats = toDate.child().accept(this, context);
    int width = toDate.getDataType().width();
    ColumnStatisticBuilder resultBuilder = new ColumnStatisticBuilder(childColumnStats)
            .setAvgSizeByte(width)
            .setDataSize(width * context.getRowCount());
    if (childColumnStats.isMinMaxInvalid()) {
        return resultBuilder.build();
    }
    double lower;
    double upper;
    try {
        // min/max value is finite, but they may be too large to convert to date
        lower = getDatetimeFromLong((long) childColumnStats.minValue).toLocalDate()
                .atStartOfDay(ZoneId.systemDefault()).toEpochSecond();
        upper = getDatetimeFromLong((long) childColumnStats.maxValue).toLocalDate()
                .atStartOfDay(ZoneId.systemDefault()).toEpochSecond();
    } catch (Exception e) {
        // ignore DateTimeException
        lower = Double.NEGATIVE_INFINITY;
        upper = Double.POSITIVE_INFINITY;
    }
    resultBuilder.setMaxValue(upper);
    resultBuilder.setMinValue(lower);
    return resultBuilder.build();
}
/**
 * Interprets {@code dateTime} as seconds since the epoch and converts it to a local date-time in
 * the system default time zone.
 */
private LocalDateTime getDatetimeFromLong(long dateTime) {
    Instant instant = Instant.ofEpochSecond(dateTime);
    return LocalDateTime.ofInstant(instant, ZoneId.systemDefault());
}
/**
 * Estimates TO_DAYS(child): copies the child's statistics with sizes based on the result type
 * width and, when the child's min/max are valid, converts both bounds to a day number (epoch day
 * plus {@code DAYS_FROM_0_TO_1970}). If that conversion fails, the bounds become unbounded.
 */
@Override
public ColumnStatistic visitToDays(ToDays toDays, Statistics context) {
    ColumnStatistic childColumnStats = toDays.child().accept(this, context);
    int width = toDays.getDataType().width();
    ColumnStatisticBuilder resultBuilder = new ColumnStatisticBuilder(childColumnStats)
            .setAvgSizeByte(width)
            .setDataSize(width * context.getRowCount());
    if (childColumnStats.isMinMaxInvalid()) {
        return resultBuilder.build();
    }
    double lower;
    double upper;
    try {
        long lowerEpochDay = getDatetimeFromLong((long) childColumnStats.minValue).toLocalDate().toEpochDay();
        lower = lowerEpochDay + (double) DAYS_FROM_0_TO_1970;
        long upperEpochDay = getDatetimeFromLong((long) childColumnStats.maxValue).toLocalDate().toEpochDay();
        upper = upperEpochDay + (double) DAYS_FROM_0_TO_1970;
    } catch (Exception e) {
        // ignore DateTimeException
        lower = Double.NEGATIVE_INFINITY;
        upper = Double.POSITIVE_INFINITY;
    }
    resultBuilder.setMaxValue(upper);
    resultBuilder.setMinValue(lower);
    return resultBuilder.build();
}
/**
 * Estimates FROM_DAYS(child): copies the child's statistics, converts both day-number bounds to
 * epoch seconds (clamped to the range from 1970-01-01 up to {@code DAYS_FROM_0_TO_9999}) and
 * bases sizes on the result type width.
 */
@Override
public ColumnStatistic visitFromDays(FromDays fromDays, Statistics context) {
    ColumnStatistic childColumnStats = fromDays.child().accept(this, context);
    double lower = clampedDayNumberToEpochSecond(childColumnStats.minValue);
    double upper = clampedDayNumberToEpochSecond(childColumnStats.maxValue);
    int width = fromDays.getDataType().width();
    return new ColumnStatisticBuilder(childColumnStats)
            .setMinValue(lower)
            .setMaxValue(upper)
            .setAvgSizeByte(width)
            .setDataSize(width * context.getRowCount()).build();
}

/**
 * Converts a day number to the epoch second at which that day starts in the system default time
 * zone. Day numbers below {@code DAYS_FROM_0_TO_1970} map to epoch day 0, and day numbers above
 * {@code DAYS_FROM_0_TO_9999} map to that upper limit.
 */
private double clampedDayNumberToEpochSecond(double dayNumber) {
    long epochDay;
    if (dayNumber < DAYS_FROM_0_TO_1970) {
        epochDay = 0;
    } else if (dayNumber > DAYS_FROM_0_TO_9999) {
        epochDay = DAYS_FROM_0_TO_9999 - DAYS_FROM_0_TO_1970;
    } else {
        epochDay = (long) (dayNumber - DAYS_FROM_0_TO_1970);
    }
    return LocalDate.ofEpochDay(epochDay).atStartOfDay(ZoneId.systemDefault()).toEpochSecond();
}
/**
 * Estimates ABS(child). The upper bound is the larger absolute value of the child's bounds. The
 * lower bound is the smaller absolute value when the child's range lies entirely on one side of
 * zero, and 0 otherwise. Sizes are based on the result type width.
 *
 * <p>NDV is {@code max - min + 1} when the resulting range is finite; for unbounded or NaN
 * ranges the child's NDV is kept so that NDV never becomes infinite or NaN.
 *
 * @param abs the ABS call
 * @param context statistics of the input the expression is evaluated on
 * @return the estimated statistics of the absolute value
 */
@Override
public ColumnStatistic visitAbs(Abs abs, Statistics context) {
    ColumnStatistic childColumnStats = abs.child().accept(this, context);
    double childMin = childColumnStats.minValue;
    double childMax = childColumnStats.maxValue;
    double absOfMin = Math.abs(childMin);
    double absOfMax = Math.abs(childMax);
    double max = Math.max(absOfMin, absOfMax);
    boolean sameSign = childMin < 0 && childMax < 0 || childMin >= 0 && childMax >= 0;
    // e.g. [-5, -2] maps to [2, 5]; a range that contains zero maps to [0, max]
    double min = sameSign ? Math.min(absOfMin, absOfMax) : 0;
    ColumnStatisticBuilder columnStatisticBuilder = new ColumnStatisticBuilder(childColumnStats)
            .setMinValue(min)
            .setMaxValue(max)
            .setAvgSizeByte(abs.getDataType().width())
            .setDataSize(abs.getDataType().width() * context.getRowCount());
    if (Double.isFinite(max)) {
        columnStatisticBuilder.setNdv(max - min + 1);
    }
    return columnStatisticBuilder.build();
}
/**
 * Estimates ACOS(child): copies the child's statistics, bounds the value to [0, PI] and bases
 * sizes on the result type width.
 */
@Override
public ColumnStatistic visitAcos(Acos acos, Statistics context) {
    ColumnStatistic argumentStats = acos.child().accept(this, context);
    int width = acos.getDataType().width();
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(argumentStats);
    builder.setMinValue(0);
    builder.setMaxValue(Math.PI);
    builder.setAvgSizeByte(width);
    builder.setDataSize(width * context.getRowCount());
    return builder.build();
}
/**
 * Estimates ASIN(child): copies the child's statistics, bounds the value to [-PI/2, PI/2] and
 * bases sizes on the result type width.
 */
@Override
public ColumnStatistic visitAsin(Asin asin, Statistics context) {
    ColumnStatistic argumentStats = asin.child().accept(this, context);
    int width = asin.getDataType().width();
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(argumentStats);
    builder.setMinValue(-Math.PI / 2);
    builder.setMaxValue(Math.PI / 2);
    builder.setAvgSizeByte(width);
    builder.setDataSize(width * context.getRowCount());
    return builder.build();
}
/**
 * Estimates ATAN(child): copies the child's statistics, bounds the value to [-PI/2, PI/2] and
 * bases sizes on the result type width.
 */
@Override
public ColumnStatistic visitAtan(Atan atan, Statistics context) {
    ColumnStatistic argumentStats = atan.child().accept(this, context);
    int width = atan.getDataType().width();
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(argumentStats);
    builder.setMinValue(-Math.PI / 2);
    builder.setMaxValue(Math.PI / 2);
    builder.setAvgSizeByte(width);
    builder.setDataSize(width * context.getRowCount());
    return builder.build();
}
/**
 * Estimates SQRT(child): copies the child's statistics, bounds the value to
 * [0, sqrt(child max)] and bases sizes on the result type width.
 *
 * <p>A negative child maximum is treated as 0 before taking the root, so the upper bound never
 * becomes NaN.
 *
 * @param sqrt the SQRT call
 * @param context statistics of the input the expression is evaluated on
 * @return the estimated statistics of the square root
 */
@Override
public ColumnStatistic visitSqrt(Sqrt sqrt, Statistics context) {
    ColumnStatistic columnStatistic = sqrt.child().accept(this, context);
    // Math.sqrt of a negative number is NaN; clamp so the bound stays a real number
    double upperBound = Math.sqrt(Math.max(0, columnStatistic.maxValue));
    return new ColumnStatisticBuilder(columnStatistic)
            .setMinValue(0)
            .setMaxValue(upperBound)
            .setAvgSizeByte(sqrt.getDataType().width())
            .setDataSize(sqrt.getDataType().width() * context.getRowCount()).build();
}
/**
 * Estimates RADIANS(child): copies the child's statistics, converts both bounds from degrees to
 * radians and bases sizes on the result type width.
 */
@Override
public ColumnStatistic visitRadians(Radians radians, Statistics context) {
    ColumnStatistic argumentStats = radians.child().accept(this, context);
    int width = radians.getDataType().width();
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(argumentStats);
    builder.setMinValue(Math.toRadians(argumentStats.minValue));
    builder.setMaxValue(Math.toRadians(argumentStats.maxValue));
    builder.setAvgSizeByte(width);
    builder.setDataSize(width * context.getRowCount());
    return builder.build();
}
/**
 * Estimates RANDOM(): values in [0, 1], no nulls, sizes based on the result type width.
 */
@Override
public ColumnStatistic visitRandom(Random random, Statistics context) {
    int width = random.getDataType().width();
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder();
    builder.setMinValue(0);
    builder.setMaxValue(1);
    builder.setNumNulls(0);
    builder.setAvgSizeByte(width);
    builder.setDataSize(width * context.getRowCount());
    return builder.build();
}
/**
 * Estimates the negation of the child: copies the child's statistics, negates and reorders the
 * bounds, and bases sizes on the result type width.
 */
@Override
public ColumnStatistic visitNegative(Negative negative, Statistics context) {
    ColumnStatistic argumentStats = negative.child(0).accept(this, context);
    double negatedMin = -argumentStats.minValue;
    double negatedMax = -argumentStats.maxValue;
    int width = negative.getDataType().width();
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(argumentStats);
    builder.setMinValue(Math.min(negatedMin, negatedMax));
    builder.setMaxValue(Math.max(negatedMin, negatedMax));
    builder.setAvgSizeByte(width);
    builder.setDataSize(width * context.getRowCount());
    return builder.build();
}
/**
 * Estimates YEARS_ADD via {@code dateAdd}.
 */
@Override
public ColumnStatistic visitYearsAdd(YearsAdd yearsAdd, Statistics context) {
    ColumnStatistic shifted = dateAdd(yearsAdd, context);
    return shifted;
}
/**
 * Estimates MONTHS_ADD via {@code dateAdd}.
 */
@Override
public ColumnStatistic visitMonthsAdd(MonthsAdd monthsAdd, Statistics context) {
    ColumnStatistic shifted = dateAdd(monthsAdd, context);
    return shifted;
}
/**
 * Estimates DAYS_ADD via {@code dateAdd}.
 */
@Override
public ColumnStatistic visitDaysAdd(DaysAdd daysAdd, Statistics context) {
    ColumnStatistic shifted = dateAdd(daysAdd, context);
    return shifted;
}
/**
 * Estimates MINUTES_ADD via {@code dateAdd}.
 */
@Override
public ColumnStatistic visitMinutesAdd(MinutesAdd minutesAdd, Statistics context) {
    ColumnStatistic shifted = dateAdd(minutesAdd, context);
    return shifted;
}
/**
 * Estimates SECONDS_ADD via {@code dateAdd}.
 */
@Override
public ColumnStatistic visitSecondsAdd(SecondsAdd secondsAdd, Statistics context) {
    ColumnStatistic shifted = dateAdd(secondsAdd, context);
    return shifted;
}
/**
 * Estimates YEARS_SUB via {@code dateSub}.
 */
@Override
public ColumnStatistic visitYearsSub(YearsSub yearsSub, Statistics context) {
    ColumnStatistic shifted = dateSub(yearsSub, context);
    return shifted;
}
/**
 * Estimates MONTHS_SUB via {@code dateSub}.
 */
@Override
public ColumnStatistic visitMonthsSub(MonthsSub monthsSub, Statistics context) {
    ColumnStatistic shifted = dateSub(monthsSub, context);
    return shifted;
}
/**
 * Estimates DAYS_SUB via {@code dateSub}.
 */
@Override
public ColumnStatistic visitDaysSub(DaysSub daysSub, Statistics context) {
    ColumnStatistic shifted = dateSub(daysSub, context);
    return shifted;
}
/**
 * Estimates HOURS_SUB via {@code dateSub}.
 */
@Override
public ColumnStatistic visitHoursSub(HoursSub hoursSub, Statistics context) {
    ColumnStatistic shifted = dateSub(hoursSub, context);
    return shifted;
}
/**
 * Estimates MINUTES_SUB via {@code dateSub}.
 */
@Override
public ColumnStatistic visitMinutesSub(MinutesSub minutesSub, Statistics context) {
    ColumnStatistic shifted = dateSub(minutesSub, context);
    return shifted;
}
/**
 * Estimates SECONDS_SUB via {@code dateSub}.
 */
@Override
public ColumnStatistic visitSecondsSub(SecondsSub secondsSub, Statistics context) {
    ColumnStatistic shifted = dateSub(secondsSub, context);
    return shifted;
}
/**
 * Estimates "date + amount" for an expression whose child 0 is the date and child 1 the amount:
 * copies the date's statistics, adds the amount's bounds to the date's bounds (min to min, max
 * to max) and bases sizes on the result type width.
 */
private ColumnStatistic dateAdd(Expression date, Statistics context) {
    ColumnStatistic base = date.child(0).accept(this, context);
    ColumnStatistic amount = date.child(1).accept(this, context);
    int width = date.getDataType().width();
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(base);
    builder.setMinValue(base.minValue + amount.minValue);
    builder.setMaxValue(base.maxValue + amount.maxValue);
    builder.setAvgSizeByte(width);
    builder.setDataSize(width * context.getRowCount());
    return builder.build();
}
/**
 * Estimates "date - amount" for an expression whose child 0 is the date and child 1 the amount.
 * Copies the date's statistics and bases sizes on the result type width.
 *
 * <p>The smallest result is the smallest date minus the largest amount, and the largest result
 * is the largest date minus the smallest amount.
 *
 * @param date the subtraction expression (child 0: date, child 1: amount)
 * @param context statistics of the input the expression is evaluated on
 * @return the estimated statistics of the shifted date
 */
private ColumnStatistic dateSub(Expression date, Statistics context) {
    ColumnStatistic leftChild = date.child(0).accept(this, context);
    ColumnStatistic rightChild = date.child(1).accept(this, context);
    return new ColumnStatisticBuilder(leftChild)
            .setMinValue(leftChild.minValue - rightChild.maxValue)
            .setMaxValue(leftChild.maxValue - rightChild.minValue)
            .setAvgSizeByte(date.getDataType().width())
            .setDataSize(date.getDataType().width() * context.getRowCount()).build();
}
/**
 * Estimates a date difference expressed in units of {@code interval}: copies the statistics of
 * child 0, sets the bounds to the extreme differences between child 0 and child 1 divided by
 * {@code interval}, and bases sizes on the result type width.
 */
private ColumnStatistic dateDiff(double interval, Expression date, Statistics context) {
    ColumnStatistic minuend = date.child(0).accept(this, context);
    ColumnStatistic subtrahend = date.child(1).accept(this, context);
    double smallestDiff = (minuend.minValue - subtrahend.maxValue) / interval;
    double largestDiff = (minuend.maxValue - subtrahend.minValue) / interval;
    int width = date.getDataType().width();
    ColumnStatisticBuilder builder = new ColumnStatisticBuilder(minuend);
    builder.setMinValue(smallestDiff);
    builder.setMaxValue(largestDiff);
    builder.setAvgSizeByte(width);
    builder.setDataSize(width * context.getRowCount());
    return builder.build();
}
/**
 * Estimates YEARS_DIFF via {@code dateDiff}, using 365 days' worth of seconds as the interval.
 */
@Override
public ColumnStatistic visitYearsDiff(YearsDiff yearsDiff, Statistics context) {
    double secondsPerYear = 3600 * 24 * 365;
    return dateDiff(secondsPerYear, yearsDiff, context);
}
/**
 * Estimates MONTHS_DIFF via {@code dateDiff}, using 31 days' worth of seconds as the interval.
 */
@Override
public ColumnStatistic visitMonthsDiff(MonthsDiff monthsDiff, Statistics context) {
    double secondsPerMonth = 3600 * 24 * 31;
    return dateDiff(secondsPerMonth, monthsDiff, context);
}
/**
 * Estimates WEEKS_DIFF via {@code dateDiff}, using 7 days' worth of seconds as the interval.
 */
@Override
public ColumnStatistic visitWeeksDiff(WeeksDiff weeksDiff, Statistics context) {
    double secondsPerWeek = 3600 * 24 * 7;
    return dateDiff(secondsPerWeek, weeksDiff, context);
}
/**
 * Estimates DAYS_DIFF via {@code dateDiff}, using one day's worth of seconds as the interval.
 */
@Override
public ColumnStatistic visitDaysDiff(DaysDiff daysDiff, Statistics context) {
    double secondsPerDay = 3600 * 24;
    return dateDiff(secondsPerDay, daysDiff, context);
}
/**
 * Estimates HOURS_DIFF via {@code dateDiff}, using 3600 seconds as the interval.
 */
@Override
public ColumnStatistic visitHoursDiff(HoursDiff hoursDiff, Statistics context) {
    double secondsPerHour = 3600;
    return dateDiff(secondsPerHour, hoursDiff, context);
}
/**
 * Estimates MINUTES_DIFF via {@code dateDiff}, using 60 seconds as the interval.
 */
@Override
public ColumnStatistic visitMinutesDiff(MinutesDiff minutesDiff, Statistics context) {
    double secondsPerMinute = 60;
    return dateDiff(secondsPerMinute, minutesDiff, context);
}
/**
 * Estimates SECONDS_DIFF via {@code dateDiff}, using an interval of one second.
 */
@Override
public ColumnStatistic visitSecondsDiff(SecondsDiff secondsDiff, Statistics context) {
    double oneSecond = 1;
    return dateDiff(oneSecond, secondsDiff, context);
}
}