/root/doris/contrib/faiss/faiss/Clustering.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | /** Implementation of k-means clustering with many variants. */ |
9 | | |
10 | | #ifndef FAISS_CLUSTERING_H |
11 | | #define FAISS_CLUSTERING_H |
12 | | #include <faiss/Index.h> |
13 | | |
14 | | #include <vector> |
15 | | |
16 | | namespace faiss { |
17 | | |
18 | | /** Class for the clustering parameters. Can be passed to the |
19 | | * constructor of the Clustering object. |
20 | | */ |
21 | | struct ClusteringParameters { |
22 | | /// number of clustering iterations |
23 | | int niter = 25; |
24 | | /// redo clustering this many times and keep the clusters with the best |
25 | | /// objective |
26 | | int nredo = 1; |
27 | | |
28 | | bool verbose = false; |
29 | | /// whether to normalize centroids after each iteration (useful for inner |
30 | | /// product clustering) |
31 | | bool spherical = false; |
32 | | /// round centroids coordinates to integer after each iteration? |
33 | | bool int_centroids = false; |
34 | | /// re-train index after each iteration? |
35 | | bool update_index = false; |
36 | | |
37 | | /// Use the subset of centroids provided as input and do not change them |
38 | | /// during iterations |
39 | | bool frozen_centroids = false; |
40 | | /// If fewer than this number of training vectors per centroid are provided, |
41 | | /// writes a warning. Note that fewer than 1 point per centroid raises an |
42 | | /// exception. |
43 | | int min_points_per_centroid = 39; |
44 |       |     /// above this number of training vectors per centroid, the training set is subsampled |
45 | | int max_points_per_centroid = 256; |
46 | | /// seed for the random number generator. |
47 | | /// negative values lead to seeding an internal rng with |
48 |       |     /// std::chrono::high_resolution_clock. |
49 | | int seed = 1234; |
50 | | |
51 | | /// when the training set is encoded, batch size of the codec decoder |
52 | | size_t decode_block_size = 32768; |
53 | | |
54 | | /// whether to check for NaNs in an input data |
55 | | bool check_input_data_for_NaNs = true; |
56 | | |
57 |       |     /// Whether to use a splitmix64-based random number generator for subsampling, |
58 | | /// which is faster, but may pick duplicate points. |
59 | | bool use_faster_subsampling = false; |
60 | | }; |
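
// Usage sketch (added illustration, not part of the upstream header): the
// parameter struct is plain data, so fields are simply assigned before being
// passed to a Clustering constructor; the values below are arbitrary examples.
//
//   faiss::ClusteringParameters cp;
//   cp.niter = 40;          // more k-means iterations than the default 25
//   cp.nredo = 3;           // keep the best of 3 clustering runs
//   cp.spherical = true;    // L2-normalize centroids (inner-product setups)
//   cp.seed = 42;           // fixed seed for reproducible subsampling
//   faiss::Clustering clus(d, k, cp);  // d = vector dim, k = nb of centroids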
61 | | |
62 | | struct ClusteringIterationStats { |
63 |       |     float obj; ///< objective value (sum of distances reported by the index) |
64 | | double time; ///< seconds for iteration |
65 | | double time_search; ///< seconds for just search |
66 | | double imbalance_factor; ///< imbalance factor of iteration |
67 | | int nsplit; ///< number of cluster splits |
68 | | }; |
69 | | |
70 | | /** K-means clustering based on assignment - centroid update iterations |
71 | | * |
72 | | * The clustering is based on an Index object that assigns training |
73 | | * points to the centroids. Therefore, at each iteration the centroids |
74 | | * are added to the index. |
75 | | * |
76 |       |  * On output, the centroids table is set to the latest version |
77 |       |  * of the centroids and they are also added to the index. If the |
78 |       |  * centroids table is not empty on input, it is also used for |
79 | | * initialization. |
80 | | * |
81 | | */ |
82 | | struct Clustering : ClusteringParameters { |
83 | | size_t d; ///< dimension of the vectors |
84 | | size_t k; ///< nb of centroids |
85 | | |
86 | | /** centroids (k * d) |
87 | | * if centroids are set on input to train, they will be used as |
88 | | * initialization |
89 | | */ |
90 | | std::vector<float> centroids; |
91 | | |
92 | | /// stats at every iteration of clustering |
93 | | std::vector<ClusteringIterationStats> iteration_stats; |
94 | | |
95 | | Clustering(int d, int k); |
96 | | Clustering(int d, int k, const ClusteringParameters& cp); |
97 | | |
98 | | /** run k-means training |
99 | | * |
100 | | * @param x training vectors, size n * d |
101 | | * @param index index used for assignment |
102 |       |      * @param x_weights  weight associated with each vector: NULL or size n |
103 | | */ |
104 | | virtual void train( |
105 | | idx_t n, |
106 | | const float* x, |
107 | | faiss::Index& index, |
108 | | const float* x_weights = nullptr); |
109 | | |
110 | | /** run with encoded vectors |
111 | | * |
112 |       |      * in addition to train()'s parameters, takes a codec as parameter |
113 | | * to decode the input vectors. |
114 | | * |
115 | | * @param codec codec used to decode the vectors (nullptr = |
116 | | * vectors are in fact floats) |
117 | | */ |
118 | | void train_encoded( |
119 | | idx_t nx, |
120 | | const uint8_t* x_in, |
121 | | const Index* codec, |
122 | | Index& index, |
123 | | const float* weights = nullptr); |
124 | | |
125 | | /// Post-process the centroids after each centroid update. |
126 |       |     /// Includes optional L2 normalization and rounding to the nearest integer. |
127 | | void post_process_centroids(); |
128 | | |
129 | 0 | virtual ~Clustering() {} |
130 | | }; |
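
// Training sketch (added illustration, not part of the upstream header):
// k-means driven with an IndexFlatL2 as the assignment index. IndexFlatL2
// comes from <faiss/IndexFlat.h>; `xb` is assumed to point to n * d floats.
//
//   faiss::Clustering clus(d, k);
//   clus.verbose = true;
//   faiss::IndexFlatL2 assign_index(d);
//   clus.train(n, xb, assign_index);
//   // clus.centroids now holds k * d floats; the final objective is
//   // clus.iteration_stats.back().obj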
131 | | |
132 | | /** Exact 1D clustering algorithm |
133 | | * |
134 | | * Since it does not use an index, it does not overload the train() function |
135 | | */ |
136 | | struct Clustering1D : Clustering { |
137 | | explicit Clustering1D(int k); |
138 | | |
139 | | Clustering1D(int k, const ClusteringParameters& cp); |
140 | | |
141 | | void train_exact(idx_t n, const float* x); |
142 | | |
143 | 0 | virtual ~Clustering1D() {} |
144 | | }; |
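
// Sketch (added illustration): exact 1D clustering goes through train_exact()
// rather than train(); `vals` is assumed to hold n floats.
//
//   faiss::Clustering1D clus1d(k);
//   clus1d.train_exact(n, vals);
//   // clus1d.centroids holds the k centroids (d = 1)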
145 | | |
146 | | struct ProgressiveDimClusteringParameters : ClusteringParameters { |
147 | | int progressive_dim_steps; ///< number of incremental steps |
148 | | bool apply_pca; ///< apply PCA on input |
149 | | |
150 | | ProgressiveDimClusteringParameters(); |
151 | | }; |
152 | | |
153 | | /** generates an index suitable for clustering when called */ |
154 | | struct ProgressiveDimIndexFactory { |
155 | | /// ownership transferred to caller |
156 | | virtual Index* operator()(int dim); |
157 | | |
158 | 0 | virtual ~ProgressiveDimIndexFactory() {} |
159 | | }; |
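
// Sketch (added illustration): a custom factory derives from
// ProgressiveDimIndexFactory and overrides operator() to choose the assignment
// index; IndexHNSWFlat from <faiss/IndexHNSW.h> is only an example choice.
//
//   struct HNSWAssignFactory : faiss::ProgressiveDimIndexFactory {
//       faiss::Index* operator()(int dim) override {
//           return new faiss::IndexHNSWFlat(dim, 32);  // caller takes ownership
//       }
//   };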
160 | | |
161 | | /** K-means clustering with progressive dimensions used |
162 | | * |
163 | | * The clustering first happens in dim 1, then with exponentially increasing |
164 |       |  * dimension until d (progressive_dim_steps steps). This is typically applied after a PCA |
165 | | * transformation (optional). Reference: |
166 | | * |
167 | | * "Improved Residual Vector Quantization for High-dimensional Approximate |
168 | | * Nearest Neighbor Search" |
169 | | * |
170 | | * Shicong Liu, Hongtao Lu, Junru Shao, AAAI'15 |
171 | | * |
172 | | * https://arxiv.org/abs/1509.05195 |
173 | | */ |
174 | | struct ProgressiveDimClustering : ProgressiveDimClusteringParameters { |
175 | | size_t d; ///< dimension of the vectors |
176 | | size_t k; ///< nb of centroids |
177 | | |
178 | | /** centroids (k * d) */ |
179 | | std::vector<float> centroids; |
180 | | |
181 | | /// stats at every iteration of clustering |
182 | | std::vector<ClusteringIterationStats> iteration_stats; |
183 | | |
184 | | ProgressiveDimClustering(int d, int k); |
185 | | ProgressiveDimClustering( |
186 | | int d, |
187 | | int k, |
188 | | const ProgressiveDimClusteringParameters& cp); |
189 | | |
190 | | void train(idx_t n, const float* x, ProgressiveDimIndexFactory& factory); |
191 | | |
192 | 0 | virtual ~ProgressiveDimClustering() {} |
193 | | }; |
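
// Sketch (added illustration): progressive-dimension k-means builds its
// assignment indexes through a factory; the stock ProgressiveDimIndexFactory
// can be passed directly. `xb` is assumed to point to n * d floats.
//
//   faiss::ProgressiveDimClustering clus(d, k);
//   clus.progressive_dim_steps = 8;   // nb of incremental dimension steps
//   faiss::ProgressiveDimIndexFactory fac;
//   clus.train(n, xb, fac);
//   // clus.centroids holds k * d floats, as for plain Clustering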
194 | | |
195 | | /** simplified interface |
196 | | * |
197 | | * @param d dimension of the data |
198 | | * @param n nb of training vectors |
199 | | * @param k nb of output centroids |
200 | | * @param x training set (size n * d) |
201 | | * @param centroids output centroids (size k * d) |
202 | | * @return final quantization error |
203 | | */ |
204 | | float kmeans_clustering( |
205 | | size_t d, |
206 | | size_t n, |
207 | | size_t k, |
208 | | const float* x, |
209 | | float* centroids); |
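
// Sketch (added illustration): the simplified interface works on raw buffers
// and the caller owns the centroid storage; `xb` is assumed to hold n * d
// training floats.
//
//   std::vector<float> centroids(k * d);
//   float err = faiss::kmeans_clustering(d, n, k, xb, centroids.data());
//   // err is the final quantization error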
210 | | |
211 | | } // namespace faiss |
212 | | |
213 | | #endif |