be/src/exprs/aggregate/aggregate_function_sem.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <math.h> |
21 | | |
22 | | #include "core/assert_cast.h" |
23 | | #include "core/column/column.h" |
24 | | #include "core/data_type/data_type.h" |
25 | | #include "core/data_type/data_type_decimal.h" |
26 | | #include "core/data_type/define_primitive_type.h" |
27 | | #include "core/data_type/primitive_type.h" |
28 | | #include "core/types.h" |
29 | | #include "core/value/decimalv2_value.h" |
30 | | #include "exprs/aggregate/aggregate_function.h" |
31 | | |
32 | | namespace doris { |
33 | | #include "common/compile_check_begin.h" |
34 | | |
35 | | class Arena; |
36 | | class BufferReadable; |
37 | | class BufferWritable; |
38 | | |
39 | | /** |
40 | | * SEM = sqrt(variance / count) = sqrt(m2 / (count * (count - 1))) |
41 | | * It uses Welford’s method for numerically stable one-pass computation of the mean and variance. |
42 | | */ |
43 | | struct AggregateFunctionSemData { |
44 | | double mean {}; |
45 | | double m2 {}; // Cumulative sum of squares |
46 | | UInt64 count = 0; |
47 | | |
48 | | // Let the old mean be mean_{n-1} and we receive a new value x_n. |
49 | | // The new mean should be: mean_n = mean_{n-1} + (x_n - mean_{n-1}) / n |
50 | | // The new M2 should capture total squared deviations from the new mean: |
51 | | // M2_n = M2_{n-1} + (x_n - mean_{n-1}) * (x_n - mean_n) |
52 | 166 | void add(const double& value) { |
53 | 166 | count++; |
54 | 166 | double delta = value - mean; |
55 | 166 | mean += delta / static_cast<double>(count); |
56 | 166 | double delta2 = value - mean; |
57 | 166 | m2 += delta * delta2; |
58 | 166 | } |
59 | | |
60 | | // Suppose we have dataset A (count_a, mean_a, M2_a) and B (count_b, mean_b, M2_b). |
61 | | // When merging: |
62 | | // - The total count is count_a + count_b |
63 | | // - The new mean is the weighted average of the two means |
64 | | // - The new M2 accumulates both variances and an adjustment term to account for |
65 | | // the difference in means between A and B: |
66 | | // M2 = M2_a + M2_b + delta^2 * count_a * count_b / total_count |
67 | | // where delta = mean_b - mean_a |
68 | 42 | void merge(const AggregateFunctionSemData& rhs) { |
69 | 42 | UInt64 total_count = count + rhs.count; |
70 | 42 | double delta = rhs.mean - mean; |
71 | 42 | mean = (mean * static_cast<double>(count) + rhs.mean * static_cast<double>(rhs.count)) / |
72 | 42 | static_cast<double>(total_count); |
73 | 42 | m2 += rhs.m2 + delta * delta * static_cast<double>(count) * static_cast<double>(rhs.count) / |
74 | 42 | static_cast<double>(total_count); |
75 | 42 | count = total_count; |
76 | 42 | } |
77 | | |
78 | 42 | void write(BufferWritable& buf) const { |
79 | 42 | buf.write_binary(mean); |
80 | 42 | buf.write_binary(m2); |
81 | 42 | buf.write_binary(count); |
82 | 42 | } |
83 | | |
84 | 42 | void read(BufferReadable& buf) { |
85 | 42 | buf.read_binary(mean); |
86 | 42 | buf.read_binary(m2); |
87 | 42 | buf.read_binary(count); |
88 | 42 | } |
89 | | |
90 | 3 | void reset() { |
91 | 3 | mean = {}; |
92 | 3 | m2 = {}; |
93 | 3 | count = 0; |
94 | 3 | } |
95 | | |
96 | 27 | double result() const { |
97 | 27 | if (count < 2) { |
98 | 1 | return 0; |
99 | 1 | } |
100 | 26 | double dCount = static_cast<double>(count); |
101 | 26 | double result = std::sqrt(m2 / (dCount * (dCount - 1.0))); |
102 | 26 | return result; |
103 | 27 | } |
104 | | }; |
105 | | |
106 | | template <typename Data> |
107 | | class AggregateFunctionSem final |
108 | | : public IAggregateFunctionDataHelper<Data, AggregateFunctionSem<Data>>, |
109 | | UnaryExpression, |
110 | | NullableAggregateFunction { |
111 | | public: |
112 | | AggregateFunctionSem(const DataTypes& argument_types_) |
113 | 24 | : IAggregateFunctionDataHelper<Data, AggregateFunctionSem<Data>>(argument_types_) {} |
114 | | |
115 | 5 | String get_name() const override { return "sem"; } |
116 | | |
117 | 53 | DataTypePtr get_return_type() const override { return std::make_shared<DataTypeFloat64>(); } |
118 | | |
119 | | void add(AggregateDataPtr __restrict place, const IColumn** columns, ssize_t row_num, |
120 | 166 | Arena&) const override { |
121 | 166 | const auto& column = |
122 | 166 | assert_cast<const ColumnFloat64&, TypeCheckOnRelease::DISABLE>(*columns[0]); |
123 | 166 | this->data(place).add((double)column.get_data()[row_num]); |
124 | 166 | } |
125 | | |
126 | 3 | void reset(AggregateDataPtr place) const override { this->data(place).reset(); } |
127 | | |
128 | | void merge(AggregateDataPtr __restrict place, ConstAggregateDataPtr rhs, |
129 | 42 | Arena&) const override { |
130 | 42 | this->data(place).merge(this->data(rhs)); |
131 | 42 | } |
132 | | |
133 | 42 | void serialize(ConstAggregateDataPtr __restrict place, BufferWritable& buf) const override { |
134 | 42 | this->data(place).write(buf); |
135 | 42 | } |
136 | | |
137 | | void deserialize(AggregateDataPtr __restrict place, BufferReadable& buf, |
138 | 42 | Arena&) const override { |
139 | 42 | this->data(place).read(buf); |
140 | 42 | } |
141 | | |
142 | 27 | void insert_result_into(ConstAggregateDataPtr __restrict place, IColumn& to) const override { |
143 | 27 | auto& column = assert_cast<ColumnFloat64&>(to); |
144 | 27 | column.get_data().push_back(this->data(place).result()); |
145 | 27 | } |
146 | | }; |
147 | | |
148 | | #include "common/compile_check_end.h" |
149 | | } // namespace doris |