/root/doris/be/src/util/tdigest.h
| Line | Count | Source | 
| 1 |  | // Licensed to the Apache Software Foundation (ASF) under one | 
| 2 |  | // or more contributor license agreements.  See the NOTICE file | 
| 3 |  | // distributed with this work for additional information | 
| 4 |  | // regarding copyright ownership.  The ASF licenses this file | 
| 5 |  | // to you under the Apache License, Version 2.0 (the | 
| 6 |  | // "License"); you may not use this file except in compliance | 
| 7 |  | // with the License.  You may obtain a copy of the License at | 
| 8 |  | // | 
| 9 |  | //   http://www.apache.org/licenses/LICENSE-2.0 | 
| 10 |  | // | 
| 11 |  | // Unless required by applicable law or agreed to in writing, | 
| 12 |  | // software distributed under the License is distributed on an | 
| 13 |  | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | 
| 14 |  | // KIND, either express or implied.  See the License for the | 
| 15 |  | // specific language governing permissions and limitations | 
| 16 |  | // under the License. | 
| 17 |  |  | 
| 18 |  | /* | 
| 19 |  |  * Licensed to Derrick R. Burns under one or more | 
| 20 |  |  * contributor license agreements.  See the NOTICES file distributed with | 
| 21 |  |  * this work for additional information regarding copyright ownership. | 
| 22 |  |  * The ASF licenses this file to You under the Apache License, Version 2.0 | 
| 23 |  |  * (the "License"); you may not use this file except in compliance with | 
| 24 |  |  * the License.  You may obtain a copy of the License at | 
| 25 |  |  * | 
| 26 |  |  *     http://www.apache.org/licenses/LICENSE-2.0 | 
| 27 |  |  * | 
| 28 |  |  * Unless required by applicable law or agreed to in writing, software | 
| 29 |  |  * distributed under the License is distributed on an "AS IS" BASIS, | 
| 30 |  |  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | 
| 31 |  |  * See the License for the specific language governing permissions and | 
| 32 |  |  * limitations under the License. | 
| 33 |  |  */ | 
| 34 |  |  | 
| 35 |  | // T-Digest :  Percentile and Quantile Estimation of Big Data | 
| 36 |  | // A new data structure for accurate on-line accumulation of rank-based statistics | 
| 37 |  | // such as quantiles and trimmed means. | 
| 38 |  | // See original paper: "Computing extremely accurate quantiles using t-digest" | 
| 39 |  | // by Ted Dunning and Otmar Ertl for more details | 
| 40 |  | // https://github.com/tdunning/t-digest/blob/07b8f2ca2be8d0a9f04df2feadad5ddc1bb73c88/docs/t-digest-paper/histo.pdf. | 
| 41 |  | // https://github.com/derrickburns/tdigest | 
| 42 |  |  | 
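For orientation, here is a minimal usage sketch of the `TDigest` class declared in this header. It relies only on the public API visible below (`add`, `compress`, `quantile`, `cdf`); the include path, the compression value of 100, and building against the Doris BE tree are illustrative assumptions, not part of this file.

```cpp
#include <cstdio>

#include "util/tdigest.h" // assumed include path inside the Doris BE source tree

int main() {
    doris::TDigest digest(100.0f); // compression knob: larger -> more centroids, better accuracy
    for (int i = 1; i <= 10000; ++i) {
        // add() buffers into _unprocessed and merges lazily once the buffer limit is reached
        digest.add(static_cast<doris::Value>(i));
    }
    digest.compress(); // force a merge of any still-buffered centroids
    std::printf("approx p50 = %f\n", digest.quantile(0.5f));
    std::printf("approx cdf(2500) = %f\n", digest.cdf(2500.0f));
    return 0;
}
```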
| 43 |  | #pragma once | 
| 44 |  |  | 
| 45 |  | #include <pdqsort.h> | 
| 46 |  |  | 
| 47 |  | #include <algorithm> | 
| 48 |  | #include <cfloat> | 
| 49 |  | #include <cmath> | 
| 50 |  | #include <iostream> | 
| 51 |  | #include <memory> | 
| 52 |  | #include <queue> | 
| 53 |  | #include <utility> | 
| 54 |  | #include <vector> | 
| 55 |  |  | 
| 56 |  | #include "common/factory_creator.h" | 
| 57 |  | #include "common/logging.h" | 
| 58 |  | #include "udf/udf.h" | 
| 59 |  | #include "util/debug_util.h" | 
| 60 |  |  | 
| 61 |  | namespace doris { | 
| 62 |  |  | 
| 63 |  | using Value = float; | 
| 64 |  | using Weight = float; | 
| 65 |  | using Index = size_t; | 
| 66 |  |  | 
| 67 |  | const size_t kHighWater = 40000; | 
| 68 |  |  | 
| 69 |  | class Centroid { | 
| 70 |  | public: | 
| 71 | 2.04k |     Centroid() : Centroid(0.0, 0.0) {} | 
| 72 |  |  | 
| 73 | 15.2k |     Centroid(Value mean, Weight weight) : _mean(mean), _weight(weight) {} | 
| 74 |  |  | 
| 75 | 312k |     Value mean() const noexcept { return _mean; } | 
| 76 |  |  | 
| 77 | 17.1k |     Weight weight() const noexcept { return _weight; } | 
| 78 |  |  | 
| 79 | 2.40k |     Value& mean() noexcept { return _mean; } | 
| 80 |  |  | 
| 81 | 1.20k |     Weight& weight() noexcept { return _weight; } | 
| 82 |  |  | 
| 83 | 9.78k |     void add(const Centroid& c) { | 
| 84 | 9.78k |         DCHECK_GT(c._weight, 0); | 
| 85 | 9.78k |         if (_weight != 0.0) {  Branch (85:13): [True: 9.78k, False: 0] | 
| 86 | 9.78k |             _weight += c._weight; | 
| 87 | 9.78k |             _mean += c._weight * (c._mean - _mean) / _weight; | 
| 88 | 9.78k |         } else { | 
| 89 | 0 |             _weight = c._weight; | 
| 90 | 0 |             _mean = c._mean; | 
| 91 | 0 |         } | 
| 92 | 9.78k |     } | 
| 93 |  |  | 
| 94 |  | private: | 
| 95 |  |     Value _mean = 0; | 
| 96 |  |     Weight _weight = 0; | 
| 97 |  | }; | 
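Centroid::add above folds another centroid in with an incremental weighted-mean update: the mean moves toward the added mean in proportion to the added weight. A small sketch of the arithmetic, using only the Centroid class as declared above; the concrete numbers and the include path are illustrative.

```cpp
#include <cassert>

#include "util/tdigest.h" // assumed include path

int main() {
    doris::Centroid a(1.0f, 1.0f); // mean 1, weight 1
    doris::Centroid b(3.0f, 3.0f); // mean 3, weight 3
    a.add(b);
    // weight: 1 + 3 = 4; mean: 1 + 3 * (3 - 1) / 4 = 2.5, the weighted average of 1 and 3
    assert(a.weight() == 4.0f);
    assert(a.mean() == 2.5f);
    return 0;
}
```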
| 98 |  |  | 
| 99 |  | struct CentroidList { | 
| 100 | 1 |     CentroidList(const std::vector<Centroid>& s) : iter(s.cbegin()), end(s.cend()) {} | 
| 101 |  |     std::vector<Centroid>::const_iterator iter; | 
| 102 |  |     std::vector<Centroid>::const_iterator end; | 
| 103 |  |  | 
| 104 | 100 |     bool advance() { return ++iter != end; } | 
| 105 |  | }; | 
| 106 |  |  | 
| 107 |  | class CentroidListComparator { | 
| 108 |  | public: | 
| 109 | 2 |     CentroidListComparator() {} | 
| 110 |  |  | 
| 111 | 0 |     bool operator()(const CentroidList& left, const CentroidList& right) const { | 
| 112 | 0 |         return left.iter->mean() > right.iter->mean(); | 
| 113 | 0 |     } | 
| 114 |  | }; | 
| 115 |  |  | 
| 116 |  | using CentroidListQueue = | 
| 117 |  |         std::priority_queue<CentroidList, std::vector<CentroidList>, CentroidListComparator>; | 
| 118 |  |  | 
| 119 |  | struct CentroidComparator { | 
| 120 | 156k |     bool operator()(const Centroid& a, const Centroid& b) const { return a.mean() < b.mean(); } | 
| 121 |  | }; | 
| 122 |  |  | 
| 123 |  | class TDigest { | 
| 124 |  |     ENABLE_FACTORY_CREATOR(TDigest); | 
| 125 |  |  | 
| 126 |  |     class TDigestComparator { | 
| 127 |  |     public: | 
| 128 | 2 |         TDigestComparator() {} | 
| 129 |  |  | 
| 130 | 0 |         bool operator()(const TDigest* left, const TDigest* right) const { | 
| 131 | 0 |             return left->totalSize() > right->totalSize(); | 
| 132 | 0 |         } | 
| 133 |  |     }; | 
| 134 |  |     using TDigestQueue = | 
| 135 |  |             std::priority_queue<const TDigest*, std::vector<const TDigest*>, TDigestComparator>; | 
| 136 |  |  | 
| 137 |  | public: | 
| 138 | 0 |     TDigest() : TDigest(10000) {} | 
| 139 |  |  | 
| 140 | 13 |     explicit TDigest(Value compression) : TDigest(compression, 0) {} | 
| 141 |  |  | 
| 142 | 13 |     TDigest(Value compression, Index bufferSize) : TDigest(compression, bufferSize, 0) {} | 
| 143 |  |  | 
| 144 |  |     TDigest(Value compression, Index unmergedSize, Index mergedSize) | 
| 145 |  |             : _compression(compression), | 
| 146 |  |               _max_processed(processedSize(mergedSize, compression)), | 
| 147 | 13 |               _max_unprocessed(unprocessedSize(unmergedSize, compression)) { | 
| 148 | 13 |         _processed.reserve(_max_processed); | 
| 149 | 13 |         _unprocessed.reserve(_max_unprocessed + 1); | 
| 150 | 13 |     } | 
| 151 |  |  | 
| 152 |  |     TDigest(std::vector<Centroid>&& processed, std::vector<Centroid>&& unprocessed, | 
| 153 |  |             Value compression, Index unmergedSize, Index mergedSize) | 
| 154 | 0 |             : TDigest(compression, unmergedSize, mergedSize) { | 
| 155 | 0 |         _processed = std::move(processed); | 
| 156 | 0 |         _unprocessed = std::move(unprocessed); | 
| 157 | 0 |  | 
| 158 | 0 |         _processed_weight = weight(_processed); | 
| 159 | 0 |         _unprocessed_weight = weight(_unprocessed); | 
| 160 | 0 |         if (_processed.size() > 0) { | 
| 161 | 0 |             _min = std::min(_min, _processed[0].mean()); | 
| 162 | 0 |             _max = std::max(_max, (_processed.cend() - 1)->mean()); | 
| 163 | 0 |         } | 
| 164 | 0 |         updateCumulative(); | 
| 165 | 0 |     } | 
| 166 |  |  | 
| 167 | 0 |     static Weight weight(std::vector<Centroid>& centroids) noexcept { | 
| 168 | 0 |         Weight w = 0.0; | 
| 169 | 0 |         for (auto centroid : centroids) { | 
| 170 | 0 |             w += centroid.weight(); | 
| 171 | 0 |         } | 
| 172 | 0 |         return w; | 
| 173 | 0 |     } | 
| 174 |  |  | 
| 175 | 0 |     TDigest& operator=(TDigest&& o) { | 
| 176 | 0 |         _compression = o._compression; | 
| 177 | 0 |         _max_processed = o._max_processed; | 
| 178 | 0 |         _max_unprocessed = o._max_unprocessed; | 
| 179 | 0 |         _processed_weight = o._processed_weight; | 
| 180 | 0 |         _unprocessed_weight = o._unprocessed_weight; | 
| 181 | 0 |         _processed = std::move(o._processed); | 
| 182 | 0 |         _unprocessed = std::move(o._unprocessed); | 
| 183 | 0 |         _cumulative = std::move(o._cumulative); | 
| 184 | 0 |         _min = o._min; | 
| 185 | 0 |         _max = o._max; | 
| 186 | 0 |         return *this; | 
| 187 | 0 |     } | 
| 188 |  |  | 
| 189 |  |     TDigest(TDigest&& o) | 
| 190 |  |             : TDigest(std::move(o._processed), std::move(o._unprocessed), o._compression, | 
| 191 | 0 |                       o._max_unprocessed, o._max_processed) {} | 
| 192 |  |  | 
| 193 | 13 |     static inline Index processedSize(Index size, Value compression) noexcept { | 
| 194 | 13 |         return (size == 0) ? static_cast<Index>(2 * std::ceil(compression)) : size;   Branch (194:16): [True: 13, False: 0] | 
| 195 | 13 |     } | 
| 196 |  |  | 
| 197 | 13 |     static inline Index unprocessedSize(Index size, Value compression) noexcept { | 
| 198 | 13 |         return (size == 0) ? static_cast<Index>(8 * std::ceil(compression)) : size;   Branch (198:16): [True: 13, False: 0] | 
| 199 | 13 |     } | 
| 200 |  |  | 
| 201 |  |     // merge in another t-digest | 
| 202 | 1 |     void merge(const TDigest* other) { | 
| 203 | 1 |         std::vector<const TDigest*> others {other}; | 
| 204 | 1 |         add(others.cbegin(), others.cend()); | 
| 205 | 1 |     } | 
| 206 |  |  | 
| 207 | 3 |     const std::vector<Centroid>& processed() const { return _processed; } | 
| 208 |  |  | 
| 209 | 0 |     const std::vector<Centroid>& unprocessed() const { return _unprocessed; } | 
| 210 |  |  | 
| 211 | 0 |     Index maxUnprocessed() const { return _max_unprocessed; } | 
| 212 |  |  | 
| 213 | 0 |     Index maxProcessed() const { return _max_processed; } | 
| 214 |  |  | 
| 215 | 1 |     void add(std::vector<const TDigest*> digests) { add(digests.cbegin(), digests.cend()); } | 
| 216 |  |  | 
| 217 |  |     // merge in a vector of tdigests in the most efficient manner possible | 
| 218 |  |     // in constant space | 
| 219 |  |     // works for any value of kHighWater | 
| 220 |  |     void add(std::vector<const TDigest*>::const_iterator iter, | 
| 221 | 2 |              std::vector<const TDigest*>::const_iterator end) { | 
| 222 | 2 |         if (iter != end) {  Branch (222:13): [True: 2, False: 0] | 
| 223 | 2 |             auto size = std::distance(iter, end); | 
| 224 | 2 |             TDigestQueue pq(TDigestComparator {}); | 
| 225 | 4 |             for (; iter != end; iter++) {  Branch (225:20): [True: 2, False: 2] | 
| 226 | 2 |                 pq.push((*iter)); | 
| 227 | 2 |             } | 
| 228 | 2 |             std::vector<const TDigest*> batch; | 
| 229 | 2 |             batch.reserve(size); | 
| 230 |  |  | 
| 231 | 2 |             size_t totalSize = 0; | 
| 232 | 4 |             while (!pq.empty()) {  Branch (232:20): [True: 2, False: 2] | 
| 233 | 2 |                 auto td = pq.top(); | 
| 234 | 2 |                 batch.push_back(td); | 
| 235 | 2 |                 pq.pop(); | 
| 236 | 2 |                 totalSize += td->totalSize(); | 
| 237 | 2 |                 if (totalSize >= kHighWater || pq.empty()) {  Branch (237:21): [True: 0, False: 2]  Branch (237:48): [True: 2, False: 0] | 
| 238 | 2 |                     mergeProcessed(batch); | 
| 239 | 2 |                     mergeUnprocessed(batch); | 
| 240 | 2 |                     processIfNecessary(); | 
| 241 | 2 |                     batch.clear(); | 
| 242 | 2 |                     totalSize = 0; | 
| 243 | 2 |                 } | 
| 244 | 2 |             } | 
| 245 | 2 |             updateCumulative(); | 
| 246 | 2 |         } | 
| 247 | 2 |     } | 
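The iterator overload of add() above merges whole digests: it pushes them into a priority queue ordered by totalSize(), drains them smallest-first into batches capped by kHighWater, and merges each batch before processing. A hedged sketch of combining digests built from separate streams; the shard names, sizes, and include path are illustrative.

```cpp
#include <cstdio>
#include <vector>

#include "util/tdigest.h" // assumed include path

int main() {
    doris::TDigest shard_a(100.0f);
    doris::TDigest shard_b(100.0f);
    for (int i = 0; i < 5000; ++i) shard_a.add(static_cast<doris::Value>(i));
    for (int i = 5000; i < 10000; ++i) shard_b.add(static_cast<doris::Value>(i));

    doris::TDigest total(100.0f);
    total.merge(&shard_a); // single-digest convenience wrapper around add(begin, end)
    std::vector<const doris::TDigest*> rest {&shard_b};
    total.add(rest); // vector overload; accepts any number of digests
    std::printf("combined approx p90 = %f\n", total.quantile(0.9f));
    return 0;
}
```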
| 248 |  |  | 
| 249 | 0 |     Weight processedWeight() const { return _processed_weight; } | 
| 250 |  |  | 
| 251 | 0 |     Weight unprocessedWeight() const { return _unprocessed_weight; } | 
| 252 |  |  | 
| 253 | 41 |     bool haveUnprocessed() const { return _unprocessed.size() > 0; } | 
| 254 |  |  | 
| 255 | 2 |     size_t totalSize() const { return _processed.size() + _unprocessed.size(); } | 
| 256 |  |  | 
| 257 | 2 |     long totalWeight() const { return static_cast<long>(_processed_weight + _unprocessed_weight); } | 
| 258 |  |  | 
| 259 |  |     // return the cdf on the t-digest | 
| 260 | 11 |     Value cdf(Value x) { | 
| 261 | 11 |         if (haveUnprocessed() || isDirty()) process();   Branch (261:13): [True: 0, False: 11]  Branch (261:34): [True: 0, False: 11] | 
| 262 | 11 |         return cdfProcessed(x); | 
| 263 | 11 |     } | 
| 264 |  |  | 
| 265 | 13.2k |     bool isDirty() { | 
| 266 | 13.2k |         return _processed.size() > _max_processed || _unprocessed.size() > _max_unprocessed;   Branch (266:16): [True: 0, False: 13.2k]  Branch (266:54): [True: 1, False: 13.2k] | 
| 267 | 13.2k |     } | 
| 268 |  |  | 
| 269 |  |     // return the cdf on the processed values | 
| 270 | 11 |     Value cdfProcessed(Value x) const { | 
| 271 | 11 |         VLOG_CRITICAL << "cdf value " << x; | 
| 272 | 11 |         VLOG_CRITICAL << "processed size " << _processed.size(); | 
| 273 | 11 |         if (_processed.size() == 0) {  Branch (273:13): [True: 0, False: 11] | 
| 274 |  |             // no data to examine | 
| 275 | 0 |             VLOG_CRITICAL << "no processed values"; | 
| 276 |  |  | 
| 277 | 0 |             return 0.0; | 
| 278 | 11 |         } else if (_processed.size() == 1) {  Branch (278:20): [True: 0, False: 11] | 
| 279 | 0 |             VLOG_CRITICAL << "one processed value " | 
| 280 | 0 |                           << " _min " << _min << " _max " << _max; | 
| 281 |  |             // exactly one centroid, should have _max==_min | 
| 282 | 0 |             auto width = _max - _min; | 
| 283 | 0 |             if (x < _min) {  Branch (283:17): [True: 0, False: 0] | 
| 284 | 0 |                 return 0.0; | 
| 285 | 0 |             } else if (x > _max) {  Branch (285:24): [True: 0, False: 0] | 
| 286 | 0 |                 return 1.0; | 
| 287 | 0 |             } else if (x - _min <= width) {  Branch (287:24): [True: 0, False: 0] | 
| 288 |  |                 // _min and _max are too close together to do any viable interpolation | 
| 289 | 0 |                 return 0.5; | 
| 290 | 0 |             } else { | 
| 291 |  |                 // interpolate if somehow we have weight > 0 and _max != _min | 
| 292 | 0 |                 return (x - _min) / (_max - _min); | 
| 293 | 0 |             } | 
| 294 | 11 |         } else { | 
| 295 | 11 |             auto n = _processed.size(); | 
| 296 | 11 |             if (x <= _min) {  Branch (296:17): [True: 1, False: 10] | 
| 297 | 1 |                 VLOG_CRITICAL << "below _min " | 
| 298 | 0 |                               << " _min " << _min << " x " << x; | 
| 299 | 1 |                 return 0; | 
| 300 | 1 |             } | 
| 301 |  |  | 
| 302 | 10 |             if (x >= _max) {  Branch (302:17): [True: 3, False: 7] | 
| 303 | 3 |                 VLOG_CRITICAL << "above _max " | 
| 304 | 0 |                               << " _max " << _max << " x " << x; | 
| 305 | 3 |                 return 1; | 
| 306 | 3 |             } | 
| 307 |  |  | 
| 308 |  |             // check for the left tail | 
| 309 | 7 |             if (x <= mean(0)) {  Branch (309:17): [True: 0, False: 7] | 
| 310 | 0 |                 VLOG_CRITICAL << "left tail " | 
| 311 | 0 |                               << " _min " << _min << " mean(0) " << mean(0) << " x " << x; | 
| 312 |  |  | 
| 313 |  |                 // note that this is different than mean(0) > _min ... this guarantees interpolation works | 
| 314 | 0 |                 if (mean(0) - _min > 0) {  Branch (314:21): [True: 0, False: 0] | 
| 315 | 0 |                     return (x - _min) / (mean(0) - _min) * weight(0) / _processed_weight / 2.0; | 
| 316 | 0 |                 } else { | 
| 317 | 0 |                     return 0; | 
| 318 | 0 |                 } | 
| 319 | 0 |             } | 
| 320 |  |  | 
| 321 |  |             // and the right tail | 
| 322 | 7 |             if (x >= mean(n - 1)) {  Branch (322:17): [True: 0, False: 7] | 
| 323 | 0 |                 VLOG_CRITICAL << "right tail" | 
| 324 | 0 |                               << " _max " << _max << " mean(n - 1) " << mean(n - 1) << " x " << x; | 
| 325 |  | 
 | 
| 326 | 0 |                 if (_max - mean(n - 1) > 0) {  Branch (326:21): [True: 0, False: 0]
 | 
| 327 | 0 |                     return 1.0 - (_max - x) / (_max - mean(n - 1)) * weight(n - 1) / | 
| 328 | 0 |                                          _processed_weight / 2.0; | 
| 329 | 0 |                 } else { | 
| 330 | 0 |                     return 1; | 
| 331 | 0 |                 } | 
| 332 | 0 |             } | 
| 333 |  |  | 
| 334 | 7 |             CentroidComparator cc; | 
| 335 | 7 |             auto iter = | 
| 336 | 7 |                     std::upper_bound(_processed.cbegin(), _processed.cend(), Centroid(x, 0), cc); | 
| 337 |  |  | 
| 338 | 7 |             auto i = std::distance(_processed.cbegin(), iter); | 
| 339 | 7 |             auto z1 = x - (iter - 1)->mean(); | 
| 340 | 7 |             auto z2 = (iter)->mean() - x; | 
| 341 | 7 |             DCHECK_LE(0.0, z1); | 
| 342 | 7 |             DCHECK_LE(0.0, z2); | 
| 343 | 7 |             VLOG_CRITICAL << "middle " | 
| 344 | 0 |                           << " z1 " << z1 << " z2 " << z2 << " x " << x; | 
| 345 |  |  | 
| 346 | 7 |             return weightedAverage(_cumulative[i - 1], z2, _cumulative[i], z1) / _processed_weight; | 
| 347 | 7 |         } | 
| 348 | 11 |     } | 
| 349 |  |  | 
| 350 |  |     // this returns a quantile on the t-digest | 
| 351 | 30 |     Value quantile(Value q) { | 
| 352 | 30 |         if (haveUnprocessed() || isDirty()) process();   Branch (352:13): [True: 4, False: 26]  Branch (352:34): [True: 0, False: 26] | 
| 353 | 30 |         return quantileProcessed(q); | 
| 354 | 30 |     } | 
| 355 |  |  | 
| 356 |  |     // this returns a quantile on the currently processed values without changing the t-digest | 
| 357 |  |     // the value will not represent the unprocessed values | 
| 358 | 30 |     Value quantileProcessed(Value q) const { | 
| 359 | 30 |         if (q < 0 || q > 1) {  Branch (359:13): [True: 0, False: 30]  Branch (359:22): [True: 0, False: 30] | 
| 360 | 0 |             VLOG_CRITICAL << "q should be in [0,1], got " << q; | 
| 361 | 0 |             return NAN; | 
| 362 | 0 |         } | 
| 363 |  |  | 
| 364 | 30 |         if (_processed.size() == 0) {  Branch (364:13): [True: 0, False: 30] | 
| 365 |  |             // no centroids means no data, no way to get a quantile | 
| 366 | 0 |             return NAN; | 
| 367 | 30 |         } else if (_processed.size() == 1) {  Branch (367:20): [True: 3, False: 27] | 
| 368 |  |             // with one data point, all quantiles lead to Rome | 
| 369 |  |  | 
| 370 | 3 |             return mean(0); | 
| 371 | 3 |         } | 
| 372 |  |  | 
| 373 |  |         // we know that there are at least two centroids now | 
| 374 | 27 |         auto n = _processed.size(); | 
| 375 |  |  | 
| 376 |  |     // if values were stored in a sorted array, index would be the offset we are interested in | 
| 377 | 27 |         const auto index = q * _processed_weight; | 
| 378 |  |  | 
| 379 |  |         // at the boundaries, we return _min or _max | 
| 380 | 27 |         if (index <= weight(0) / 2.0) {  Branch (380:13): [True: 5, False: 22] | 
| 381 | 5 |             DCHECK_GT(weight(0), 0); | 
| 382 | 5 |             return _min + 2.0 * index / weight(0) * (mean(0) - _min); | 
| 383 | 5 |         } | 
| 384 |  |  | 
| 385 | 22 |         auto iter = std::lower_bound(_cumulative.cbegin(), _cumulative.cend(), index); | 
| 386 |  |  | 
| 387 | 22 |         if (iter + 1 != _cumulative.cend()) {  Branch (387:13): [True: 19, False: 3] | 
| 388 | 19 |             auto i = std::distance(_cumulative.cbegin(), iter); | 
| 389 | 19 |             auto z1 = index - *(iter - 1); | 
| 390 | 19 |             auto z2 = *(iter)-index; | 
| 391 |  |             // VLOG_CRITICAL << "z2 " << z2 << " index " << index << " z1 " << z1; | 
| 392 | 19 |             return weightedAverage(mean(i - 1), z2, mean(i), z1); | 
| 393 | 19 |         } | 
| 394 |  |  | 
| 395 | 3 |         DCHECK_LE(index, _processed_weight); | 
| 396 | 3 |         DCHECK_GE(index, _processed_weight - weight(n - 1) / 2.0); | 
| 397 |  |  | 
| 398 | 3 |         auto z1 = index - _processed_weight - weight(n - 1) / 2.0; | 
| 399 | 3 |         auto z2 = weight(n - 1) / 2 - z1; | 
| 400 | 3 |         return weightedAverage(mean(n - 1), z1, _max, z2); | 
| 401 | 22 |     } | 
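For reference, the interior branch of quantileProcessed above amounts to the interpolation written out below, where W is _processed_weight, C_i the half-weight cumulative values built by updateCumulative, and m_i the centroid means; this restates the code rather than adding to it (weightedAverageSorted clamps the result into the bracketing means).

```latex
% Interior case of quantileProcessed (a restatement of the code above).
% W = _processed_weight, C_i = _cumulative[i], m_i = mean(i).
\begin{aligned}
\mathrm{index} &= q \cdot W, \qquad i = \text{first position with } C_i \ge \mathrm{index},\\
z_1 &= \mathrm{index} - C_{i-1}, \qquad z_2 = C_i - \mathrm{index},\\
\hat{x}_q &= \frac{m_{i-1}\, z_2 + m_i\, z_1}{z_1 + z_2}
  \quad\text{(clamped into } [\,m_{i-1},\, m_i\,] \text{ by weightedAverageSorted)}.
\end{aligned}
```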
| 402 |  |  | 
| 403 | 0 |     Value compression() const { return _compression; } | 
| 404 |  |  | 
| 405 | 3.17k |     void add(Value x) { add(x, 1); } | 
| 406 |  |  | 
| 407 | 3 |     void compress() { process(); } | 
| 408 |  |  | 
| 409 |  |     // add a single centroid to the unprocessed vector, processing previously unprocessed centroids if our limit has | 
| 410 |  |     // been reached. | 
| 411 | 13.1k |     bool add(Value x, Weight w) { | 
| 412 | 13.1k |         if (std::isnan(x)) {  Branch (412:13): [True: 0, False: 13.1k] | 
| 413 | 0 |             return false; | 
| 414 | 0 |         } | 
| 415 | 13.1k |         _unprocessed.push_back(Centroid(x, w)); | 
| 416 | 13.1k |         _unprocessed_weight += w; | 
| 417 | 13.1k |         processIfNecessary(); | 
| 418 | 13.1k |         return true; | 
| 419 | 13.1k |     } | 
| 420 |  |  | 
| 421 |  |     void add(std::vector<Centroid>::const_iterator iter, | 
| 422 | 0 |              std::vector<Centroid>::const_iterator end) { | 
| 423 | 0 |         while (iter != end) { | 
| 424 | 0 |             const size_t diff = std::distance(iter, end); | 
| 425 | 0 |             const size_t room = _max_unprocessed - _unprocessed.size(); | 
| 426 | 0 |             auto mid = iter + std::min(diff, room); | 
| 427 | 0 |             while (iter != mid) _unprocessed.push_back(*(iter++)); | 
| 428 | 0 |             if (_unprocessed.size() >= _max_unprocessed) { | 
| 429 | 0 |                 process(); | 
| 430 | 0 |             } | 
| 431 | 0 |         } | 
| 432 | 0 |     } | 
| 433 |  |  | 
| 434 | 5 |     uint32_t serialized_size() { | 
| 435 | 5 |         return sizeof(uint32_t) + sizeof(Value) * 5 + sizeof(Index) * 2 + sizeof(uint32_t) * 3 + | 
| 436 | 5 |                _processed.size() * sizeof(Centroid) + _unprocessed.size() * sizeof(Centroid) + | 
| 437 | 5 |                _cumulative.size() * sizeof(Weight); | 
| 438 | 5 |     } | 
| 439 |  |  | 
| 440 | 1 |     size_t serialize(uint8_t* writer) { | 
| 441 | 1 |         uint8_t* dst = writer; | 
| 442 | 1 |         uint32_t total_size = serialized_size(); | 
| 443 | 1 |         memcpy(writer, &total_size, sizeof(uint32_t)); | 
| 444 | 1 |         writer += sizeof(uint32_t); | 
| 445 | 1 |         memcpy(writer, &_compression, sizeof(Value)); | 
| 446 | 1 |         writer += sizeof(Value); | 
| 447 | 1 |         memcpy(writer, &_min, sizeof(Value)); | 
| 448 | 1 |         writer += sizeof(Value); | 
| 449 | 1 |         memcpy(writer, &_max, sizeof(Value)); | 
| 450 | 1 |         writer += sizeof(Value); | 
| 451 | 1 |         memcpy(writer, &_max_processed, sizeof(Index)); | 
| 452 | 1 |         writer += sizeof(Index); | 
| 453 | 1 |         memcpy(writer, &_max_unprocessed, sizeof(Index)); | 
| 454 | 1 |         writer += sizeof(Index); | 
| 455 | 1 |         memcpy(writer, &_processed_weight, sizeof(Value)); | 
| 456 | 1 |         writer += sizeof(Value); | 
| 457 | 1 |         memcpy(writer, &_unprocessed_weight, sizeof(Value)); | 
| 458 | 1 |         writer += sizeof(Value); | 
| 459 |  |  | 
| 460 | 1 |         uint32_t size = _processed.size(); | 
| 461 | 1 |         memcpy(writer, &size, sizeof(uint32_t)); | 
| 462 | 1 |         writer += sizeof(uint32_t); | 
| 463 | 1 |         for (int i = 0; i < size; i++) {  Branch (463:25): [True: 0, False: 1] | 
| 464 | 0 |             memcpy(writer, &_processed[i], sizeof(Centroid)); | 
| 465 | 0 |             writer += sizeof(Centroid); | 
| 466 | 0 |         } | 
| 467 |  |  | 
| 468 | 1 |         size = _unprocessed.size(); | 
| 469 | 1 |         memcpy(writer, &size, sizeof(uint32_t)); | 
| 470 | 1 |         writer += sizeof(uint32_t); | 
| 471 |  |         //TODO(weixiang): may be once memcpy is enough! | 
| 472 | 2.05k |         for (int i = 0; i < size; i++) {  Branch (472:25): [True: 2.04k, False: 1] | 
| 473 | 2.04k |             memcpy(writer, &_unprocessed[i], sizeof(Centroid)); | 
| 474 | 2.04k |             writer += sizeof(Centroid); | 
| 475 | 2.04k |         } | 
| 476 |  |  | 
| 477 | 1 |         size = _cumulative.size(); | 
| 478 | 1 |         memcpy(writer, &size, sizeof(uint32_t)); | 
| 479 | 1 |         writer += sizeof(uint32_t); | 
| 480 | 1 |         for (int i = 0; i < size; i++) {  Branch (480:25): [True: 0, False: 1] | 
| 481 | 0 |             memcpy(writer, &_cumulative[i], sizeof(Weight)); | 
| 482 | 0 |             writer += sizeof(Weight); | 
| 483 | 0 |         } | 
| 484 | 1 |         return writer - dst; | 
| 485 | 1 |     } | 
| 486 |  |  | 
| 487 | 1 |     void unserialize(const uint8_t* type_reader) { | 
| 488 | 1 |         uint32_t total_length = 0; | 
| 489 | 1 |         memcpy(&total_length, type_reader, sizeof(uint32_t)); | 
| 490 | 1 |         type_reader += sizeof(uint32_t); | 
| 491 | 1 |         memcpy(&_compression, type_reader, sizeof(Value)); | 
| 492 | 1 |         type_reader += sizeof(Value); | 
| 493 | 1 |         memcpy(&_min, type_reader, sizeof(Value)); | 
| 494 | 1 |         type_reader += sizeof(Value); | 
| 495 | 1 |         memcpy(&_max, type_reader, sizeof(Value)); | 
| 496 | 1 |         type_reader += sizeof(Value); | 
| 497 |  |  | 
| 498 | 1 |         memcpy(&_max_processed, type_reader, sizeof(Index)); | 
| 499 | 1 |         type_reader += sizeof(Index); | 
| 500 | 1 |         memcpy(&_max_unprocessed, type_reader, sizeof(Index)); | 
| 501 | 1 |         type_reader += sizeof(Index); | 
| 502 | 1 |         memcpy(&_processed_weight, type_reader, sizeof(Value)); | 
| 503 | 1 |         type_reader += sizeof(Value); | 
| 504 | 1 |         memcpy(&_unprocessed_weight, type_reader, sizeof(Value)); | 
| 505 | 1 |         type_reader += sizeof(Value); | 
| 506 |  |  | 
| 507 | 1 |         uint32_t size; | 
| 508 | 1 |         memcpy(&size, type_reader, sizeof(uint32_t)); | 
| 509 | 1 |         type_reader += sizeof(uint32_t); | 
| 510 | 1 |         _processed.resize(size); | 
| 511 | 1 |         for (int i = 0; i < size; i++) {  Branch (511:25): [True: 0, False: 1] | 
| 512 | 0 |             memcpy(&_processed[i], type_reader, sizeof(Centroid)); | 
| 513 | 0 |             type_reader += sizeof(Centroid); | 
| 514 | 0 |         } | 
| 515 | 1 |         memcpy(&size, type_reader, sizeof(uint32_t)); | 
| 516 | 1 |         type_reader += sizeof(uint32_t); | 
| 517 | 1 |         _unprocessed.resize(size); | 
| 518 | 2.05k |         for (int i = 0; i < size; i++) {  Branch (518:25): [True: 2.04k, False: 1] | 
| 519 | 2.04k |             memcpy(&_unprocessed[i], type_reader, sizeof(Centroid)); | 
| 520 | 2.04k |             type_reader += sizeof(Centroid); | 
| 521 | 2.04k |         } | 
| 522 | 1 |         memcpy(&size, type_reader, sizeof(uint32_t)); | 
| 523 | 1 |         type_reader += sizeof(uint32_t); | 
| 524 | 1 |         _cumulative.resize(size); | 
| 525 | 1 |         for (int i = 0; i < size; i++) {  Branch (525:25): [True: 0, False: 1] | 
| 526 | 0 |             memcpy(&_cumulative[i], type_reader, sizeof(Weight)); | 
| 527 | 0 |             type_reader += sizeof(Weight); | 
| 528 | 0 |         } | 
| 529 | 1 |     } | 
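serialize/unserialize above write a fixed header (total size, compression, min/max, buffer limits, processed/unprocessed weights) followed by the raw centroid and cumulative arrays. A hedged round-trip sketch; the buffer handling, data, and include path are illustrative.

```cpp
#include <cstdint>
#include <cstdio>
#include <vector>

#include "util/tdigest.h" // assumed include path

int main() {
    doris::TDigest src(100.0f);
    for (int i = 0; i < 1000; ++i) src.add(static_cast<doris::Value>(i));

    std::vector<uint8_t> buf(src.serialized_size()); // size covers header + all arrays
    size_t written = src.serialize(buf.data());

    doris::TDigest dst(100.0f);
    dst.unserialize(buf.data()); // restores limits, weights, and centroid vectors
    std::printf("wrote %zu bytes, approx p50 = %f\n", written, dst.quantile(0.5f));
    return 0;
}
```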
| 530 |  |  | 
| 531 |  | private: | 
| 532 |  |     Value _compression; | 
| 533 |  |  | 
| 534 |  |     Value _min = std::numeric_limits<Value>::max(); | 
| 535 |  |  | 
| 536 |  |     Value _max = std::numeric_limits<Value>::min(); | 
| 537 |  |  | 
| 538 |  |     Index _max_processed; | 
| 539 |  |  | 
| 540 |  |     Index _max_unprocessed; | 
| 541 |  |  | 
| 542 |  |     Value _processed_weight = 0.0; | 
| 543 |  |  | 
| 544 |  |     Value _unprocessed_weight = 0.0; | 
| 545 |  |  | 
| 546 |  |     std::vector<Centroid> _processed; | 
| 547 |  |  | 
| 548 |  |     std::vector<Centroid> _unprocessed; | 
| 549 |  |  | 
| 550 |  |     std::vector<Weight> _cumulative; | 
| 551 |  |  | 
| 552 |  |     // return mean of i-th centroid | 
| 553 | 63 |     Value mean(int i) const noexcept { return _processed[i].mean(); } | 
| 554 |  |  | 
| 555 |  |     // return weight of i-th centroid | 
| 556 | 2.57k |     Weight weight(int i) const noexcept { return _processed[i].weight(); } | 
| 557 |  |  | 
| 558 |  |     // append all unprocessed centroids into current unprocessed vector | 
| 559 | 2 |     void mergeUnprocessed(const std::vector<const TDigest*>& tdigests) { | 
| 560 | 2 |         if (tdigests.size() == 0) return;   Branch (560:13): [True: 0, False: 2] | 
| 561 |  |  | 
| 562 | 2 |         size_t total = _unprocessed.size(); | 
| 563 | 2 |         for (auto& td : tdigests) {  Branch (563:23): [True: 2, False: 2] | 
| 564 | 2 |             total += td->_unprocessed.size(); | 
| 565 | 2 |         } | 
| 566 |  |  | 
| 567 | 2 |         _unprocessed.reserve(total); | 
| 568 | 2 |         for (auto& td : tdigests) {  Branch (568:23): [True: 2, False: 2] | 
| 569 | 2 |             _unprocessed.insert(_unprocessed.end(), td->_unprocessed.cbegin(), | 
| 570 | 2 |                                 td->_unprocessed.cend()); | 
| 571 | 2 |             _unprocessed_weight += td->_unprocessed_weight; | 
| 572 | 2 |         } | 
| 573 | 2 |     } | 
| 574 |  |  | 
| 575 |  |     // merge all processed centroids together into a single sorted vector | 
| 576 | 2 |     void mergeProcessed(const std::vector<const TDigest*>& tdigests) { | 
| 577 | 2 |         if (tdigests.size() == 0) return;   Branch (577:13): [True: 0, False: 2]
 | 
| 578 |  |  | 
| 579 | 2 |         size_t total = 0; | 
| 580 | 2 |         CentroidListQueue pq(CentroidListComparator {}); | 
| 581 | 2 |         for (auto& td : tdigests) {  Branch (581:23): [True: 2, False: 2] | 
| 582 | 2 |             auto& sorted = td->_processed; | 
| 583 | 2 |             auto size = sorted.size(); | 
| 584 | 2 |             if (size > 0) {  Branch (584:17): [True: 1, False: 1] | 
| 585 | 1 |                 pq.push(CentroidList(sorted)); | 
| 586 | 1 |                 total += size; | 
| 587 | 1 |                 _processed_weight += td->_processed_weight; | 
| 588 | 1 |             } | 
| 589 | 2 |         } | 
| 590 | 2 |         if (total == 0) return;   Branch (590:13): [True: 1, False: 1] | 
| 591 |  |  | 
| 592 | 1 |         if (_processed.size() > 0) {  Branch (592:13): [True: 0, False: 1] | 
| 593 | 0 |             pq.push(CentroidList(_processed)); | 
| 594 | 0 |             total += _processed.size(); | 
| 595 | 0 |         } | 
| 596 |  |  | 
| 597 | 1 |         std::vector<Centroid> sorted; | 
| 598 | 1 |         VLOG_CRITICAL << "total " << total; | 
| 599 | 1 |         sorted.reserve(total); | 
| 600 |  |  | 
| 601 | 101 |         while (!pq.empty()) {  Branch (601:16): [True: 100, False: 1] | 
| 602 | 100 |             auto best = pq.top(); | 
| 603 | 100 |             pq.pop(); | 
| 604 | 100 |             sorted.push_back(*(best.iter)); | 
| 605 | 100 |             if (best.advance()) pq.push(best);   Branch (605:17): [True: 99, False: 1] | 
| 606 | 100 |         } | 
| 607 | 1 |         _processed = std::move(sorted); | 
| 608 | 1 |         if (_processed.size() > 0) {  Branch (608:13): [True: 1, False: 0] | 
| 609 | 1 |             _min = std::min(_min, _processed[0].mean()); | 
| 610 | 1 |             _max = std::max(_max, (_processed.cend() - 1)->mean()); | 
| 611 | 1 |         } | 
| 612 | 1 |     } | 
| 613 |  |  | 
| 614 | 13.1k |     void processIfNecessary() { | 
| 615 | 13.1k |         if (isDirty()) {  Branch (615:13): [True: 1, False: 13.1k] | 
| 616 | 1 |             process(); | 
| 617 | 1 |         } | 
| 618 | 13.1k |     } | 
| 619 |  |  | 
| 620 | 10 |     void updateCumulative() { | 
| 621 | 10 |         const auto n = _processed.size(); | 
| 622 | 10 |         _cumulative.clear(); | 
| 623 | 10 |         _cumulative.reserve(n + 1); | 
| 624 | 10 |         auto previous = 0.0; | 
| 625 | 2.53k |         for (Index i = 0; i < n; i++) {  Branch (625:27): [True: 2.52k, False: 10] | 
| 626 | 2.52k |             auto current = weight(i); | 
| 627 | 2.52k |             auto halfCurrent = current / 2.0; | 
| 628 | 2.52k |             _cumulative.push_back(previous + halfCurrent); | 
| 629 | 2.52k |             previous = previous + current; | 
| 630 | 2.52k |         } | 
| 631 | 10 |         _cumulative.push_back(previous); | 
| 632 | 10 |     } | 
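updateCumulative above stores, for each processed centroid, the total weight to its left plus half of its own weight, and appends the grand total; this is the array that cdfProcessed and quantileProcessed interpolate against. A standalone sketch of the layout (illustration only, not code from this header):

```cpp
#include <cstdio>
#include <vector>

// Standalone sketch of the _cumulative layout built by updateCumulative():
// entry i = (sum of weights of centroids before i) + weight(i) / 2, plus a final total.
int main() {
    std::vector<double> weights {2.0, 4.0, 6.0};
    std::vector<double> cumulative;
    double previous = 0.0;
    for (double w : weights) {
        cumulative.push_back(previous + w / 2.0); // weight-space midpoint of this centroid
        previous += w;
    }
    cumulative.push_back(previous); // total processed weight goes last
    for (double c : cumulative) std::printf("%g ", c); // prints: 1 4 9 12
    std::printf("\n");
    return 0;
}
```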
| 633 |  |  | 
| 634 |  |     // merges _unprocessed centroids and _processed centroids together and processes them | 
| 635 |  |     // when complete, _unprocessed will be empty and _processed will have at most _max_processed centroids | 
| 636 | 8 |     void process() { | 
| 637 | 8 |         CentroidComparator cc; | 
| 638 |  |         // select percentile_approx(lo_orderkey,0.5) from lineorder; | 
| 639 |  |         // We have tested both pdqsort and RadixSort: pdqsort performs better here, where the data is a struct (Centroid), | 
| 640 |  |         // while RadixSort wins when sorting plain types like int/float in a std::vector<T>. | 
| 641 | 8 |         pdqsort(_unprocessed.begin(), _unprocessed.end(), cc); | 
| 642 | 8 |         auto count = _unprocessed.size(); | 
| 643 | 8 |         _unprocessed.insert(_unprocessed.end(), _processed.cbegin(), _processed.cend()); | 
| 644 | 8 |         std::inplace_merge(_unprocessed.begin(), _unprocessed.begin() + count, _unprocessed.end(), | 
| 645 | 8 |                            cc); | 
| 646 |  |  | 
| 647 | 8 |         _processed_weight += _unprocessed_weight; | 
| 648 | 8 |         _unprocessed_weight = 0; | 
| 649 | 8 |         _processed.clear(); | 
| 650 |  |  | 
| 651 | 8 |         _processed.push_back(_unprocessed[0]); | 
| 652 | 8 |         Weight wSoFar = _unprocessed[0].weight(); | 
| 653 | 8 |         Weight wLimit = _processed_weight * integratedQ(1.0); | 
| 654 |  |  | 
| 655 | 8 |         auto end = _unprocessed.end(); | 
| 656 | 12.2k |         for (auto iter = _unprocessed.cbegin() + 1; iter < end; iter++) {  Branch (656:53): [True: 12.2k, False: 8] | 
| 657 | 12.2k |             auto& centroid = *iter; | 
| 658 | 12.2k |             Weight projectedW = wSoFar + centroid.weight(); | 
| 659 | 12.2k |             if (projectedW <= wLimit) {  Branch (659:17): [True: 9.78k, False: 2.41k] | 
| 660 | 9.78k |                 wSoFar = projectedW; | 
| 661 | 9.78k |                 (_processed.end() - 1)->add(centroid); | 
| 662 | 9.78k |             } else { | 
| 663 | 2.41k |                 auto k1 = integratedLocation(wSoFar / _processed_weight); | 
| 664 | 2.41k |                 wLimit = _processed_weight * integratedQ(k1 + 1.0); | 
| 665 | 2.41k |                 wSoFar += centroid.weight(); | 
| 666 | 2.41k |                 _processed.emplace_back(centroid); | 
| 667 | 2.41k |             } | 
| 668 | 12.2k |         } | 
| 669 | 8 |         _unprocessed.clear(); | 
| 670 | 8 |         _min = std::min(_min, _processed[0].mean()); | 
| 671 | 8 |         VLOG_CRITICAL << "new _min " << _min; | 
| 672 | 8 |         _max = std::max(_max, (_processed.cend() - 1)->mean()); | 
| 673 | 8 |         VLOG_CRITICAL << "new _max " << _max; | 
| 674 | 8 |         updateCumulative(); | 
| 675 | 8 |     } | 
| 676 |  |  | 
| 677 | 0 |     int checkWeights() { return checkWeights(_processed, _processed_weight); } | 
| 678 |  |  | 
| 679 | 0 |     size_t checkWeights(const std::vector<Centroid>& sorted, Value total) { | 
| 680 | 0 |         size_t badWeight = 0; | 
| 681 | 0 |         auto k1 = 0.0; | 
| 682 | 0 |         auto q = 0.0; | 
| 683 | 0 |         for (auto iter = sorted.cbegin(); iter != sorted.cend(); iter++) { | 
| 684 | 0 |             auto w = iter->weight(); | 
| 685 | 0 |             auto dq = w / total; | 
| 686 | 0 |             auto k2 = integratedLocation(q + dq); | 
| 687 | 0 |             if (k2 - k1 > 1 && w != 1) { | 
| 688 | 0 |                 VLOG_CRITICAL << "Oversize centroid at " << std::distance(sorted.cbegin(), iter) | 
| 689 | 0 |                               << " k1 " << k1 << " k2 " << k2 << " dk " << (k2 - k1) << " w " << w | 
| 690 | 0 |                               << " q " << q; | 
| 691 | 0 |                 badWeight++; | 
| 692 | 0 |             } | 
| 693 | 0 |             if (k2 - k1 > 1.5 && w != 1) { | 
| 694 | 0 |                 VLOG_CRITICAL << "Egregiously Oversize centroid at " | 
| 695 | 0 |                               << std::distance(sorted.cbegin(), iter) << " k1 " << k1 << " k2 " | 
| 696 | 0 |                               << k2 << " dk " << (k2 - k1) << " w " << w << " q " << q; | 
| 697 | 0 |                 badWeight++; | 
| 698 | 0 |             } | 
| 699 | 0 |             q += dq; | 
| 700 | 0 |             k1 = k2; | 
| 701 | 0 |         } | 
| 702 | 0 |  | 
| 703 | 0 |         return badWeight; | 
| 704 | 0 |     } | 
| 705 |  |  | 
| 706 |  |     /** | 
| 707 |  |     * Converts a quantile into a centroid scale value.  The centroid scale is nominally | 
| 708 |  |     * the number k of the centroid that a quantile point q should belong to.  Due to | 
| 709 |  |     * round-offs, however, we can't align things perfectly without splitting points | 
| 710 |  |     * and centroids.  We don't want to do that, so we have to allow for offsets. | 
| 711 |  |     * In the end, the criterion is that any quantile range that spans a centroid | 
| 712 |  |     * scale range more than one should be split across more than one centroid if | 
| 713 |  |     * possible.  This won't be possible if the quantile range refers to a single point | 
| 714 |  |     * or an already existing centroid. | 
| 715 |  |     * <p/> | 
| 716 |  |     * This mapping is steep near q=0 or q=1 so each centroid there will correspond to | 
| 717 |  |     * less q range.  Near q=0.5, the mapping is flatter so that centroids there will | 
| 718 |  |     * represent a larger chunk of quantiles. | 
| 719 |  |     * | 
| 720 |  |     * @param q The quantile scale value to be mapped. | 
| 721 |  |     * @return The centroid scale value corresponding to q. | 
| 722 |  |     */ | 
| 723 | 2.41k |     Value integratedLocation(Value q) const { | 
| 724 | 2.41k |         return _compression * (std::asin(2.0 * q - 1.0) + M_PI / 2) / M_PI; | 
| 725 | 2.41k |     } | 
| 726 |  |  | 
| 727 | 2.42k |     Value integratedQ(Value k) const { | 
| 728 | 2.42k |         return (std::sin(std::min(k, _compression) * M_PI / _compression - M_PI / 2) + 1) / 2; | 
| 729 | 2.42k |     } | 
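integratedLocation and integratedQ above implement an arcsine-based mapping between a quantile q and a centroid scale value k, and its inverse; as the comment above notes, it is steep near q = 0 and q = 1 and flat near q = 0.5, which is what keeps tail centroids small. Written out, with δ denoting _compression (a restatement of the code, not new material):

```latex
% integratedLocation: quantile -> centroid scale
k(q) = \delta \cdot \frac{\arcsin(2q - 1) + \tfrac{\pi}{2}}{\pi}
% integratedQ: centroid scale -> quantile, with k clamped to at most \delta
q(k) = \frac{\sin\!\left(\min(k, \delta)\,\tfrac{\pi}{\delta} - \tfrac{\pi}{2}\right) + 1}{2}
```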
| 730 |  |  | 
| 731 |  |     /** | 
| 732 |  |      * Same as {@link #weightedAverageSorted(Value, Value, Value, Value)} but flips | 
| 733 |  |      * the order of the variables if <code>x2</code> is greater than | 
| 734 |  |      * <code>x1</code>. | 
| 735 |  |     */ | 
| 736 | 29 |     static Value weightedAverage(Value x1, Value w1, Value x2, Value w2) { | 
| 737 | 29 |         return (x1 <= x2) ? weightedAverageSorted(x1, w1, x2, w2)   Branch (737:16): [True: 29, False: 0]
 | 
| 738 | 29 |                           : weightedAverageSorted(x2, w2, x1, w1); | 
| 739 | 29 |     } | 
| 740 |  |  | 
| 741 |  |     /** | 
| 742 |  |     * Compute the weighted average between <code>x1</code> with a weight of | 
| 743 |  |     * <code>w1</code> and <code>x2</code> with a weight of <code>w2</code>. | 
| 744 |  |     * This expects <code>x1</code> to be less than or equal to <code>x2</code> | 
| 745 |  |     * and is guaranteed to return a number between <code>x1</code> and | 
| 746 |  |     * <code>x2</code>. | 
| 747 |  |     */ | 
| 748 | 29 |     static Value weightedAverageSorted(Value x1, Value w1, Value x2, Value w2) { | 
| 749 | 29 |         DCHECK_LE(x1, x2); | 
| 750 | 29 |         const Value x = (x1 * w1 + x2 * w2) / (w1 + w2); | 
| 751 | 29 |         return std::max(x1, std::min(x, x2)); | 
| 752 | 29 |     } | 
| 753 |  |  | 
| 754 | 0 |     static Value interpolate(Value x, Value x0, Value x1) { return (x - x0) / (x1 - x0); } | 
| 755 |  |  | 
| 756 |  |     /** | 
| 757 |  |     * Computes an interpolated value of a quantile that is between two centroids. | 
| 758 |  |     * | 
| 759 |  |     * Index is the quantile desired multiplied by the total number of samples - 1. | 
| 760 |  |     * | 
| 761 |  |     * @param index              Denormalized quantile desired | 
| 762 |  |     * @param previousIndex      The denormalized quantile corresponding to the center of the previous centroid. | 
| 763 |  |     * @param nextIndex          The denormalized quantile corresponding to the center of the following centroid. | 
| 764 |  |     * @param previousMean       The mean of the previous centroid. | 
| 765 |  |     * @param nextMean           The mean of the following centroid. | 
| 766 |  |     * @return  The interpolated mean. | 
| 767 |  |     */ | 
| 768 |  |     static Value quantile(Value index, Value previousIndex, Value nextIndex, Value previousMean, | 
| 769 | 0 |                           Value nextMean) { | 
| 770 | 0 |         const auto delta = nextIndex - previousIndex; | 
| 771 | 0 |         const auto previousWeight = (nextIndex - index) / delta; | 
| 772 | 0 |         const auto nextWeight = (index - previousIndex) / delta; | 
| 773 | 0 |         return previousMean * previousWeight + nextMean * nextWeight; | 
| 774 | 0 |     } | 
| 775 |  | }; | 
| 776 |  |  | 
| 777 |  | } // namespace doris |