contrib/faiss/faiss/utils/hamming_distance/avx2-inl.h
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | #ifndef HAMMING_AVX2_INL_H |
9 | | #define HAMMING_AVX2_INL_H |
10 | | |
11 | | // AVX2 version |
12 | | |
13 | | #include <cassert> |
14 | | #include <cstddef> |
15 | | #include <cstdint> |
16 | | |
17 | | #include <faiss/impl/platform_macros.h> |
18 | | |
19 | | #include <immintrin.h> |
20 | | |
21 | | namespace faiss { |
22 | | |
23 | | /* Elementary Hamming distance computation: unoptimized */ |
24 | | template <size_t nbits, typename T> |
25 | | inline T hamming(const uint8_t* bs1, const uint8_t* bs2) { |
26 | | const size_t nbytes = nbits / 8; |
27 | | size_t i; |
28 | | T h = 0; |
29 | | for (i = 0; i < nbytes; i++) { |
30 | | h += (T)hamdis_tab_ham_bytes[bs1[i] ^ bs2[i]]; |
31 | | } |
32 | | return h; |
33 | | } |
34 | | |
35 | | /* Hamming distances for multiples of 64 bits */ |
36 | | template <size_t nbits> |
37 | 0 | inline hamdis_t hamming(const uint64_t* bs1, const uint64_t* bs2) { |
38 | 0 | const size_t nwords = nbits / 64; |
39 | 0 | size_t i; |
40 | 0 | hamdis_t h = 0; |
41 | 0 | for (i = 0; i < nwords; i++) { |
42 | 0 | h += popcount64(bs1[i] ^ bs2[i]); |
43 | 0 | } |
44 | 0 | return h; |
45 | 0 | } |
46 | | |
47 | | /* specialized (optimized) functions */ |
48 | | template <> |
49 | 0 | inline hamdis_t hamming<64>(const uint64_t* pa, const uint64_t* pb) { |
50 | 0 | return popcount64(pa[0] ^ pb[0]); |
51 | 0 | } |
52 | | |
53 | | template <> |
54 | 0 | inline hamdis_t hamming<128>(const uint64_t* pa, const uint64_t* pb) { |
55 | 0 | return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]); |
56 | 0 | } |
57 | | |
58 | | template <> |
59 | 0 | inline hamdis_t hamming<256>(const uint64_t* pa, const uint64_t* pb) { |
60 | 0 | return popcount64(pa[0] ^ pb[0]) + popcount64(pa[1] ^ pb[1]) + |
61 | 0 | popcount64(pa[2] ^ pb[2]) + popcount64(pa[3] ^ pb[3]); |
62 | 0 | } |
63 | | |
64 | | /* Hamming distances for multiple of 64 bits */ |
65 | | inline hamdis_t hamming( |
66 | | const uint64_t* bs1, |
67 | | const uint64_t* bs2, |
68 | 0 | size_t nwords) { |
69 | 0 | hamdis_t h = 0; |
70 | 0 | for (size_t i = 0; i < nwords; i++) { |
71 | 0 | h += popcount64(bs1[i] ^ bs2[i]); |
72 | 0 | } |
73 | 0 | return h; |
74 | 0 | } |
75 | | |
76 | | /****************************************************************** |
77 | | * The HammingComputer series of classes compares a single code of |
78 | | * size 4 to 32 to incoming codes. They are intended for use as a |
79 | | * template class where it would be inefficient to switch on the code |
80 | | * size in the inner loop. Hopefully the compiler will inline the |
81 | | * hamming() functions and put the a0, a1, ... in registers. |
82 | | ******************************************************************/ |
83 | | |
84 | | struct HammingComputer4 { |
85 | | uint32_t a0; |
86 | | |
87 | 0 | HammingComputer4() {} |
88 | | |
89 | 0 | HammingComputer4(const uint8_t* a, int code_size) { |
90 | 0 | set(a, code_size); |
91 | 0 | } |
92 | | |
93 | 0 | void set(const uint8_t* a, int code_size) { |
94 | 0 | assert(code_size == 4); |
95 | 0 | a0 = *(uint32_t*)a; |
96 | 0 | } |
97 | | |
98 | 0 | inline int hamming(const uint8_t* b) const { |
99 | 0 | return popcount64(*(uint32_t*)b ^ a0); |
100 | 0 | } |
101 | | |
102 | 0 | inline static constexpr int get_code_size() { |
103 | 0 | return 4; |
104 | 0 | } |
105 | | }; |
106 | | |
107 | | struct HammingComputer8 { |
108 | | uint64_t a0; |
109 | | |
110 | 0 | HammingComputer8() {} |
111 | | |
112 | 0 | HammingComputer8(const uint8_t* a, int code_size) { |
113 | 0 | set(a, code_size); |
114 | 0 | } |
115 | | |
116 | 0 | void set(const uint8_t* a, int code_size) { |
117 | 0 | assert(code_size == 8); |
118 | 0 | a0 = *(uint64_t*)a; |
119 | 0 | } |
120 | | |
121 | 0 | inline int hamming(const uint8_t* b) const { |
122 | 0 | return popcount64(*(uint64_t*)b ^ a0); |
123 | 0 | } |
124 | | |
125 | 0 | inline static constexpr int get_code_size() { |
126 | 0 | return 8; |
127 | 0 | } |
128 | | }; |
129 | | |
130 | | struct HammingComputer16 { |
131 | | uint64_t a0, a1; |
132 | | |
133 | 0 | HammingComputer16() {} |
134 | | |
135 | 0 | HammingComputer16(const uint8_t* a8, int code_size) { |
136 | 0 | set(a8, code_size); |
137 | 0 | } |
138 | | |
139 | 0 | void set(const uint8_t* a8, int code_size) { |
140 | 0 | assert(code_size == 16); |
141 | 0 | const uint64_t* a = (uint64_t*)a8; |
142 | 0 | a0 = a[0]; |
143 | 0 | a1 = a[1]; |
144 | 0 | } |
145 | | |
146 | 0 | inline int hamming(const uint8_t* b8) const { |
147 | 0 | const uint64_t* b = (uint64_t*)b8; |
148 | 0 | return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1); |
149 | 0 | } |
150 | | |
151 | 0 | inline static constexpr int get_code_size() { |
152 | 0 | return 16; |
153 | 0 | } |
154 | | }; |
155 | | |
156 | | // when applied to an array, 1/2 of the 64-bit accesses are unaligned. |
157 | | // This incurs a penalty of ~10% wrt. fully aligned accesses. |
158 | | struct HammingComputer20 { |
159 | | uint64_t a0, a1; |
160 | | uint32_t a2; |
161 | | |
162 | 0 | HammingComputer20() {} |
163 | | |
164 | 0 | HammingComputer20(const uint8_t* a8, int code_size) { |
165 | 0 | set(a8, code_size); |
166 | 0 | } |
167 | | |
168 | 0 | void set(const uint8_t* a8, int code_size) { |
169 | 0 | assert(code_size == 20); |
170 | 0 | const uint64_t* a = (uint64_t*)a8; |
171 | 0 | a0 = a[0]; |
172 | 0 | a1 = a[1]; |
173 | 0 | a2 = a[2]; |
174 | 0 | } |
175 | | |
176 | 0 | inline int hamming(const uint8_t* b8) const { |
177 | 0 | const uint64_t* b = (uint64_t*)b8; |
178 | 0 | return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + |
179 | 0 | popcount64(*(uint32_t*)(b + 2) ^ a2); |
180 | 0 | } |
181 | | |
182 | 0 | inline static constexpr int get_code_size() { |
183 | 0 | return 20; |
184 | 0 | } |
185 | | }; |
186 | | |
187 | | struct HammingComputer32 { |
188 | | uint64_t a0, a1, a2, a3; |
189 | | |
190 | 0 | HammingComputer32() {} |
191 | | |
192 | 0 | HammingComputer32(const uint8_t* a8, int code_size) { |
193 | 0 | set(a8, code_size); |
194 | 0 | } |
195 | | |
196 | 0 | void set(const uint8_t* a8, int code_size) { |
197 | 0 | assert(code_size == 32); |
198 | 0 | const uint64_t* a = (uint64_t*)a8; |
199 | 0 | a0 = a[0]; |
200 | 0 | a1 = a[1]; |
201 | 0 | a2 = a[2]; |
202 | 0 | a3 = a[3]; |
203 | 0 | } |
204 | | |
205 | 0 | inline int hamming(const uint8_t* b8) const { |
206 | 0 | const uint64_t* b = (uint64_t*)b8; |
207 | 0 | return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + |
208 | 0 | popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3); |
209 | 0 | } |
210 | | |
211 | 0 | inline static constexpr int get_code_size() { |
212 | 0 | return 32; |
213 | 0 | } |
214 | | }; |
215 | | |
216 | | struct HammingComputer64 { |
217 | | uint64_t a0, a1, a2, a3, a4, a5, a6, a7; |
218 | | |
219 | 0 | HammingComputer64() {} |
220 | | |
221 | 0 | HammingComputer64(const uint8_t* a8, int code_size) { |
222 | 0 | set(a8, code_size); |
223 | 0 | } |
224 | | |
225 | 0 | void set(const uint8_t* a8, int code_size) { |
226 | 0 | assert(code_size == 64); |
227 | 0 | const uint64_t* a = (uint64_t*)a8; |
228 | 0 | a0 = a[0]; |
229 | 0 | a1 = a[1]; |
230 | 0 | a2 = a[2]; |
231 | 0 | a3 = a[3]; |
232 | 0 | a4 = a[4]; |
233 | 0 | a5 = a[5]; |
234 | 0 | a6 = a[6]; |
235 | 0 | a7 = a[7]; |
236 | 0 | } |
237 | | |
238 | 0 | inline int hamming(const uint8_t* b8) const { |
239 | 0 | const uint64_t* b = (uint64_t*)b8; |
240 | 0 | return popcount64(b[0] ^ a0) + popcount64(b[1] ^ a1) + |
241 | 0 | popcount64(b[2] ^ a2) + popcount64(b[3] ^ a3) + |
242 | 0 | popcount64(b[4] ^ a4) + popcount64(b[5] ^ a5) + |
243 | 0 | popcount64(b[6] ^ a6) + popcount64(b[7] ^ a7); |
244 | 0 | } |
245 | | |
246 | 0 | inline static constexpr int get_code_size() { |
247 | 0 | return 64; |
248 | 0 | } |
249 | | }; |
250 | | |
// Fallback computer for arbitrary code sizes: handles the code as
// quotient8 64-bit words plus remainder8 trailing bytes.
struct HammingComputerDefault {
    // Query code; not copied — caller must keep the buffer alive.
    const uint8_t* a8;
    // code_size / 8: number of whole 64-bit words in the code.
    int quotient8;
    // code_size % 8: number of trailing bytes after the last whole word.
    int remainder8;

    HammingComputerDefault() {}

    HammingComputerDefault(const uint8_t* a8, int code_size) {
        set(a8, code_size);
    }

    // Record the query code pointer and split code_size into word/byte parts.
    void set(const uint8_t* a8_2, int code_size) {
        this->a8 = a8_2;
        quotient8 = code_size / 8;
        remainder8 = code_size % 8;
    }

    // Hamming distance between the stored code and b8.
    // The word part is processed with a Duff's-device loop unrolled by 8;
    // the 1..7 tail bytes go through the byte popcount lookup table.
    int hamming(const uint8_t* b8) const {
        int accu = 0;

        // NOTE(review): reinterpret as 64-bit words; assumes unaligned
        // 64-bit loads are acceptable on the target (true on x86).
        const uint64_t* a64 = reinterpret_cast<const uint64_t*>(a8);
        const uint64_t* b64 = reinterpret_cast<const uint64_t*>(b8);
        int i = 0, len = quotient8;
        // Duff's device: `switch (len & 7)` jumps into the middle of the
        // unrolled body to consume len % 8 words first, after which each
        // pass of the while loop consumes exactly 8 words.
        switch (len & 7) {
            default:
                while (len > 7) {
                    len -= 8;
                    accu += popcount64(a64[i] ^ b64[i]);
                    i++;
                    [[fallthrough]];
                    case 7:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                        [[fallthrough]];
                    case 6:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                        [[fallthrough]];
                    case 5:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                        [[fallthrough]];
                    case 4:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                        [[fallthrough]];
                    case 3:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                        [[fallthrough]];
                    case 2:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                        [[fallthrough]];
                    case 1:
                        accu += popcount64(a64[i] ^ b64[i]);
                        i++;
                }
        }
        if (remainder8) {
            // Tail bytes: fall through from the highest remaining byte
            // down to byte 0, one table lookup each.
            const uint8_t* a = a8 + 8 * quotient8;
            const uint8_t* b = b8 + 8 * quotient8;
            switch (remainder8) {
                case 7:
                    accu += hamdis_tab_ham_bytes[a[6] ^ b[6]];
                    [[fallthrough]];
                case 6:
                    accu += hamdis_tab_ham_bytes[a[5] ^ b[5]];
                    [[fallthrough]];
                case 5:
                    accu += hamdis_tab_ham_bytes[a[4] ^ b[4]];
                    [[fallthrough]];
                case 4:
                    accu += hamdis_tab_ham_bytes[a[3] ^ b[3]];
                    [[fallthrough]];
                case 3:
                    accu += hamdis_tab_ham_bytes[a[2] ^ b[2]];
                    [[fallthrough]];
                case 2:
                    accu += hamdis_tab_ham_bytes[a[1] ^ b[1]];
                    [[fallthrough]];
                case 1:
                    accu += hamdis_tab_ham_bytes[a[0] ^ b[0]];
                    [[fallthrough]];
                default:
                    break;
            }
        }

        return accu;
    }

    inline int get_code_size() const {
        return quotient8 * 8 + remainder8;
    }
};
347 | | |
348 | | /*************************************************************************** |
349 | | * generalized Hamming = number of bytes that are different between |
350 | | * two codes. |
351 | | ***************************************************************************/ |
352 | | |
353 | 0 | inline int generalized_hamming_64(uint64_t a) { |
354 | 0 | a |= a >> 1; |
355 | 0 | a |= a >> 2; |
356 | 0 | a |= a >> 4; |
357 | 0 | a &= 0x0101010101010101UL; |
358 | 0 | return popcount64(a); |
359 | 0 | } |
360 | | |
361 | | struct GenHammingComputer8 { |
362 | | uint64_t a0; |
363 | | |
364 | 0 | GenHammingComputer8(const uint8_t* a, int code_size) { |
365 | 0 | assert(code_size == 8); |
366 | 0 | a0 = *(uint64_t*)a; |
367 | 0 | } |
368 | | |
369 | 0 | inline int hamming(const uint8_t* b) const { |
370 | 0 | return generalized_hamming_64(*(uint64_t*)b ^ a0); |
371 | 0 | } |
372 | | |
373 | 0 | inline static constexpr int get_code_size() { |
374 | 0 | return 8; |
375 | 0 | } |
376 | | }; |
377 | | |
// I'm not sure whether this version is faster or slower, tbh
// todo: test on different CPUs
380 | | struct GenHammingComputer16 { |
381 | | __m128i a; |
382 | | |
383 | 0 | GenHammingComputer16(const uint8_t* a8, int code_size) { |
384 | 0 | assert(code_size == 16); |
385 | 0 | a = _mm_loadu_si128((const __m128i_u*)a8); |
386 | 0 | } |
387 | | |
388 | 0 | inline int hamming(const uint8_t* b8) const { |
389 | 0 | const __m128i b = _mm_loadu_si128((const __m128i_u*)b8); |
390 | 0 | const __m128i cmp = _mm_cmpeq_epi8(a, b); |
391 | 0 | const auto movemask = _mm_movemask_epi8(cmp); |
392 | 0 | return 16 - popcount32(movemask); |
393 | 0 | } |
394 | | |
395 | 0 | inline static constexpr int get_code_size() { |
396 | 0 | return 16; |
397 | 0 | } |
398 | | }; |
399 | | |
400 | | struct GenHammingComputer32 { |
401 | | __m256i a; |
402 | | |
403 | 0 | GenHammingComputer32(const uint8_t* a8, int code_size) { |
404 | 0 | assert(code_size == 32); |
405 | 0 | a = _mm256_loadu_si256((const __m256i_u*)a8); |
406 | 0 | } |
407 | | |
408 | 0 | inline int hamming(const uint8_t* b8) const { |
409 | 0 | const __m256i b = _mm256_loadu_si256((const __m256i_u*)b8); |
410 | 0 | const __m256i cmp = _mm256_cmpeq_epi8(a, b); |
411 | 0 | const uint32_t movemask = _mm256_movemask_epi8(cmp); |
412 | 0 | return 32 - popcount32(movemask); |
413 | 0 | } |
414 | | |
415 | 0 | inline static constexpr int get_code_size() { |
416 | 0 | return 32; |
417 | 0 | } |
418 | | }; |
419 | | |
420 | | // A specialized version might be needed for the very long |
421 | | // GenHamming code_size. In such a case, one may accumulate |
422 | | // counts using _mm256_sub_epi8 and then compute a horizontal |
423 | | // sum (using _mm256_sad_epu8, maybe, in blocks of no larger |
424 | | // than 256 * 32 bytes). |
425 | | |
426 | | struct GenHammingComputerM8 { |
427 | | const uint64_t* a; |
428 | | int n; |
429 | | |
430 | 0 | GenHammingComputerM8(const uint8_t* a8, int code_size) { |
431 | 0 | assert(code_size % 8 == 0); |
432 | 0 | a = (uint64_t*)a8; |
433 | 0 | n = code_size / 8; |
434 | 0 | } |
435 | | |
436 | 0 | int hamming(const uint8_t* b8) const { |
437 | 0 | const uint64_t* b = (uint64_t*)b8; |
438 | 0 | int accu = 0; |
439 | |
|
440 | 0 | int i = 0; |
441 | 0 | int n4 = (n / 4) * 4; |
442 | 0 | for (; i < n4; i += 4) { |
443 | 0 | const __m256i av = _mm256_loadu_si256((const __m256i_u*)(a + i)); |
444 | 0 | const __m256i bv = _mm256_loadu_si256((const __m256i_u*)(b + i)); |
445 | 0 | const __m256i cmp = _mm256_cmpeq_epi8(av, bv); |
446 | 0 | const uint32_t movemask = _mm256_movemask_epi8(cmp); |
447 | 0 | accu += 32 - popcount32(movemask); |
448 | 0 | } |
449 | |
|
450 | 0 | for (; i < n; i++) |
451 | 0 | accu += generalized_hamming_64(a[i] ^ b[i]); |
452 | 0 | return accu; |
453 | 0 | } |
454 | | |
455 | 0 | inline int get_code_size() const { |
456 | 0 | return n * 8; |
457 | 0 | } |
458 | | }; |
459 | | |
460 | | } // namespace faiss |
461 | | |
462 | | #endif |