Coverage Report

Created: 2025-09-12 18:53

/root/doris/contrib/faiss/faiss/Clustering.h
Source listing; uncovered lines are annotated with "// execution count: 0 (uncovered)".

/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 *
 * This source code is licensed under the MIT license found in the
 * LICENSE file in the root directory of this source tree.
 */

/** Implementation of k-means clustering with many variants. */

#ifndef FAISS_CLUSTERING_H
#define FAISS_CLUSTERING_H
#include <faiss/Index.h>

#include <vector>

namespace faiss {

/** Class for the clustering parameters. Can be passed to the
 * constructor of the Clustering object.
 */
struct ClusteringParameters {
    /// number of clustering iterations
    int niter = 25;
    /// redo clustering this many times and keep the clusters with the best
    /// objective
    int nredo = 1;

    bool verbose = false;
    /// whether to normalize centroids after each iteration (useful for inner
    /// product clustering)
    bool spherical = false;
    /// round centroids coordinates to integer after each iteration?
    bool int_centroids = false;
    /// re-train index after each iteration?
    bool update_index = false;

    /// Use the subset of centroids provided as input and do not change them
    /// during iterations
    bool frozen_centroids = false;
    /// If fewer than this number of training vectors per centroid are
    /// provided, a warning is written. Note that fewer than 1 point per
    /// centroid raises an exception.
    int min_points_per_centroid = 39;
    /// limit on the number of training points per centroid; beyond this the
    /// training set is subsampled
    int max_points_per_centroid = 256;
    /// seed for the random number generator.
    /// negative values lead to seeding an internal rng with
    /// std::high_resolution_clock.
    int seed = 1234;

    /// when the training set is encoded, batch size of the codec decoder
    size_t decode_block_size = 32768;

    /// whether to check for NaNs in the input data
    bool check_input_data_for_NaNs = true;

    /// Whether to use a splitmix64-based random number generator for
    /// subsampling, which is faster, but may pick duplicate points.
    bool use_faster_subsampling = false;
};
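
// --- Usage sketch (editor's addition, not part of Clustering.h). ---
// A minimal, hedged example of overriding some of the defaults above before
// clustering; the particular values are illustrative, not recommendations.
#include <faiss/Clustering.h>

static faiss::ClusteringParameters make_params_example() {
    faiss::ClusteringParameters cp;
    cp.niter = 50;        // more k-means iterations than the default 25
    cp.nredo = 3;         // redo clustering 3 times, keep the best objective
    cp.spherical = true;  // L2-normalize centroids (inner-product clustering)
    cp.seed = 42;         // fixed seed for reproducible subsampling
    return cp;            // can be passed to the Clustering constructor below
}
// --- end usage sketch ---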

struct ClusteringIterationStats {
    float obj;   ///< objective value (sum of distances reported by the index)
    double time; ///< seconds for iteration
    double time_search;      ///< seconds for just search
    double imbalance_factor; ///< imbalance factor of iteration
    int nsplit;              ///< number of cluster splits
};

/** K-means clustering based on assignment - centroid update iterations
 *
 * The clustering is based on an Index object that assigns training
 * points to the centroids. Therefore, at each iteration the centroids
 * are added to the index.
 *
 * On output, the centroids table is set to the latest version
 * of the centroids and they are also added to the index. If the
 * centroids table is not empty on input, it is also used for
 * initialization.
 *
 */
struct Clustering : ClusteringParameters {
    size_t d; ///< dimension of the vectors
    size_t k; ///< nb of centroids

    /** centroids (k * d)
     * if centroids are set on input to train, they will be used as
     * initialization
     */
    std::vector<float> centroids;

    /// stats at every iteration of clustering
    std::vector<ClusteringIterationStats> iteration_stats;

    Clustering(int d, int k);
    Clustering(int d, int k, const ClusteringParameters& cp);

    /** run k-means training
     *
     * @param n          number of training vectors
     * @param x          training vectors, size n * d
     * @param index      index used for assignment
     * @param x_weights  weight associated to each vector: NULL or size n
     */
    virtual void train(
            idx_t n,
            const float* x,
            faiss::Index& index,
            const float* x_weights = nullptr);

    /** run with encoded vectors
     *
     * in addition to train()'s parameters, takes a codec as parameter
     * to decode the input vectors.
     *
     * @param codec      codec used to decode the vectors (nullptr =
     *                   vectors are in fact floats)
     */
    void train_encoded(
            idx_t nx,
            const uint8_t* x_in,
            const Index* codec,
            Index& index,
            const float* weights = nullptr);

    /// Post-process the centroids after each centroid update.
    /// Includes optional L2 normalization and nearest-integer rounding.
    void post_process_centroids();

    virtual ~Clustering() {}  // execution count: 0 (uncovered)
};
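
// --- Usage sketch (editor's addition, not part of Clustering.h). ---
// A minimal k-means run that uses an IndexFlatL2 for the assignment step,
// assuming n training vectors of dimension d in a contiguous float array x.
#include <faiss/Clustering.h>
#include <faiss/IndexFlat.h>
#include <cstdio>

static void clustering_example(size_t d, size_t k, size_t n, const float* x) {
    faiss::Clustering clus(d, k);
    clus.niter = 30;              // fields inherited from ClusteringParameters
    faiss::IndexFlatL2 index(d);  // assigns points to their nearest centroid
    clus.train(n, x, index);      // on return, the centroids are also in index

    // clus.centroids now holds k * d floats; stats are recorded per iteration.
    for (const auto& st : clus.iteration_stats) {
        std::printf("obj=%g imbalance=%g nsplit=%d\n",
                    st.obj, st.imbalance_factor, st.nsplit);
    }
}
// --- end usage sketch ---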

/** Exact 1D clustering algorithm
 *
 * Since it does not use an index, it does not overload the train() function
 */
struct Clustering1D : Clustering {
    explicit Clustering1D(int k);

    Clustering1D(int k, const ClusteringParameters& cp);

    void train_exact(idx_t n, const float* x);

    virtual ~Clustering1D() {}  // execution count: 0 (uncovered)
};
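
// --- Usage sketch (editor's addition, not part of Clustering.h). ---
// Exact clustering of 1D scalar data; note that train_exact() is called
// instead of train(), since no Index is involved.
#include <faiss/Clustering.h>
#include <vector>

static std::vector<float> cluster_scalars_example(
        const std::vector<float>& values, int k) {
    faiss::Clustering1D clus(k);
    clus.train_exact((faiss::idx_t)values.size(), values.data());
    return clus.centroids; // k centroids for the 1D data
}
// --- end usage sketch ---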

struct ProgressiveDimClusteringParameters : ClusteringParameters {
    int progressive_dim_steps; ///< number of incremental steps
    bool apply_pca;            ///< apply PCA on input

    ProgressiveDimClusteringParameters();
};

/** generates an index suitable for clustering when called */
struct ProgressiveDimIndexFactory {
    /// ownership transferred to caller
    virtual Index* operator()(int dim);

    virtual ~ProgressiveDimIndexFactory() {}  // execution count: 0 (uncovered)
};

/** K-means clustering with progressively increasing dimensions
 *
 * The clustering first happens in dim 1, then with exponentially increasing
 * dimension until d (in progressive_dim_steps steps). This is typically
 * applied after a PCA transformation (optional). Reference:
 *
 * "Improved Residual Vector Quantization for High-dimensional Approximate
 * Nearest Neighbor Search"
 *
 * Shicong Liu, Hongtao Lu, Junru Shao, AAAI'15
 *
 * https://arxiv.org/abs/1509.05195
 */
struct ProgressiveDimClustering : ProgressiveDimClusteringParameters {
    size_t d; ///< dimension of the vectors
    size_t k; ///< nb of centroids

    /** centroids (k * d) */
    std::vector<float> centroids;

    /// stats at every iteration of clustering
    std::vector<ClusteringIterationStats> iteration_stats;

    ProgressiveDimClustering(int d, int k);
    ProgressiveDimClustering(
            int d,
            int k,
            const ProgressiveDimClusteringParameters& cp);

    void train(idx_t n, const float* x, ProgressiveDimIndexFactory& factory);

    virtual ~ProgressiveDimClustering() {}  // execution count: 0 (uncovered)
};
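
// --- Usage sketch (editor's addition, not part of Clustering.h). ---
// Progressive-dimension k-means using the default ProgressiveDimIndexFactory
// to build the assignment index at each dimension step.
#include <faiss/Clustering.h>

static void progressive_clustering_example(
        size_t d, size_t k, size_t n, const float* x) {
    faiss::ProgressiveDimClustering clus(d, k);
    faiss::ProgressiveDimIndexFactory factory; // default index factory
    clus.train(n, x, factory);
    // clus.centroids now holds k * d floats in the full dimension d.
}
// --- end usage sketch ---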

/** simplified interface
 *
 * @param d dimension of the data
 * @param n nb of training vectors
 * @param k nb of output centroids
 * @param x training set (size n * d)
 * @param centroids output centroids (size k * d)
 * @return final quantization error
 */
float kmeans_clustering(
        size_t d,
        size_t n,
        size_t k,
        const float* x,
        float* centroids);
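
// --- Usage sketch (editor's addition, not part of Clustering.h). ---
// The simplified interface: cluster n vectors of dimension d into k
// centroids written into a caller-allocated buffer of size k * d.
#include <faiss/Clustering.h>
#include <vector>

static float kmeans_example(size_t d, size_t n, size_t k, const float* x) {
    std::vector<float> centroids(k * d);  // output buffer, k * d floats
    float err = faiss::kmeans_clustering(d, n, k, x, centroids.data());
    return err;                           // final quantization error
}
// --- end usage sketch ---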

} // namespace faiss

#endif