Coverage Report

Created: 2026-03-15 17:28

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exec/common/sip_hash.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/SipHash.h
19
// and modified by Doris
20
21
#pragma once
22
23
/** SipHash is a fast cryptographic hash function for short strings.
24
  * Taken from here: https://www.131002.net/siphash/
25
  *
26
  * This is SipHash 2-4 variant.
27
  *
28
  * Two changes are made:
29
  * - returns also 128 bits, not only 64;
30
  * - done streaming (can be calculated in parts).
31
  *
32
  * On short strings (URL, search phrases) more than 3 times faster than MD5 from OpenSSL.
33
  * (~ 700 MB/sec, 15 million strings per second)
34
  */
35
36
#include <string>
37
#include <type_traits>
38
39
#include "common/compiler_util.h" // IWYU pragma: keep
40
#include "core/types.h"
41
#include "util/unaligned.h"
42
43
namespace doris {
44
#include "common/compile_check_begin.h"
45
168M
#define ROTL(x, b) static_cast<UInt64>(((x) << (b)) | ((x) >> (64 - (b))))
46
47
#define SIPROUND           \
48
28.0M
    do {                   \
49
28.0M
        v0 += v1;          \
50
28.0M
        v1 = ROTL(v1, 13); \
51
28.0M
        v1 ^= v0;          \
52
28.0M
        v0 = ROTL(v0, 32); \
53
28.0M
        v2 += v3;          \
54
28.0M
        v3 = ROTL(v3, 16); \
55
28.0M
        v3 ^= v2;          \
56
28.0M
        v0 += v3;          \
57
28.0M
        v3 = ROTL(v3, 21); \
58
28.0M
        v3 ^= v0;          \
59
28.0M
        v2 += v1;          \
60
28.0M
        v1 = ROTL(v1, 17); \
61
28.0M
        v1 ^= v2;          \
62
28.0M
        v2 = ROTL(v2, 32); \
63
28.0M
    } while (0)
64
65
class SipHash {
66
private:
67
    /// State.
68
    UInt64 v0;
69
    UInt64 v1;
70
    UInt64 v2;
71
    UInt64 v3;
72
73
    /// How many bytes have been processed.
74
    UInt64 cnt;
75
76
    /// The current 8 bytes of input data.
77
    union {
78
        UInt64 current_word;
79
        UInt8 current_bytes[8];
80
    };
81
82
610k
    ALWAYS_INLINE void finalize() {
83
        /// In the last free byte, we write the remainder of the division by 256.
84
610k
        current_bytes[7] = uint8_t(cnt);
85
86
610k
        v3 ^= current_word;
87
610k
        SIPROUND;
88
610k
        SIPROUND;
89
610k
        v0 ^= current_word;
90
91
610k
        v2 ^= 0xff;
92
610k
        SIPROUND;
93
610k
        SIPROUND;
94
610k
        SIPROUND;
95
610k
        SIPROUND;
96
610k
    }
97
98
public:
99
    /// Arguments - seed.
100
610k
    SipHash(UInt64 k0 = 0, UInt64 k1 = 0) {
101
        /// Initialize the state with some random bytes and seed.
102
610k
        v0 = 0x736f6d6570736575ULL ^ k0;
103
610k
        v1 = 0x646f72616e646f6dULL ^ k1;
104
610k
        v2 = 0x6c7967656e657261ULL ^ k0;
105
610k
        v3 = 0x7465646279746573ULL ^ k1;
106
107
610k
        cnt = 0;
108
610k
        current_word = 0;
109
610k
    }
110
111
1.31M
    void update(const char* data, UInt64 size) {
112
1.31M
        const char* end = data + size;
113
114
        /// We'll finish to process the remainder of the previous update, if any.
115
1.31M
        if (cnt & 7) {
116
317k
            while (cnt & 7 && data < end) {
117
228k
                current_bytes[cnt & 7] = *data;
118
228k
                ++data;
119
228k
                ++cnt;
120
228k
            }
121
122
            /// If we still do not have enough bytes to an 8-byte word.
123
88.4k
            if (cnt & 7) return;
124
125
46.9k
            v3 ^= current_word;
126
46.9k
            SIPROUND;
127
46.9k
            SIPROUND;
128
46.9k
            v0 ^= current_word;
129
46.9k
        }
130
131
1.27M
        cnt += end - data;
132
133
13.4M
        while (data + 8 <= end) {
134
12.1M
            current_word = unaligned_load<UInt64>(data);
135
136
12.1M
            v3 ^= current_word;
137
12.1M
            SIPROUND;
138
12.1M
            SIPROUND;
139
12.1M
            v0 ^= current_word;
140
141
12.1M
            data += 8;
142
12.1M
        }
143
144
        /// Pad the remainder, which is missing up to an 8-byte word.
145
1.27M
        current_word = 0;
146
1.27M
        switch (end - data) {
147
4.94k
        case 7:
148
4.94k
            current_bytes[6] = data[6];
149
4.94k
            [[fallthrough]];
150
10.8k
        case 6:
151
10.8k
            current_bytes[5] = data[5];
152
10.8k
            [[fallthrough]];
153
16.5k
        case 5:
154
16.5k
            current_bytes[4] = data[4];
155
16.5k
            [[fallthrough]];
156
36.1k
        case 4:
157
36.1k
            current_bytes[3] = data[3];
158
36.1k
            [[fallthrough]];
159
43.3k
        case 3:
160
43.3k
            current_bytes[2] = data[2];
161
43.3k
            [[fallthrough]];
162
49.7k
        case 2:
163
49.7k
            current_bytes[1] = data[1];
164
49.7k
            [[fallthrough]];
165
655k
        case 1:
166
655k
            current_bytes[0] = data[0];
167
655k
            [[fallthrough]];
168
1.27M
        case 0:
169
1.27M
            break;
170
1.27M
        }
171
1.27M
    }
172
173
    template <typename T>
174
1.24M
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
1.24M
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
1.24M
    }
_ZN5doris7SipHash6updateINS_7DecimalIiEEEEvRKT_
Line
Count
Source
174
354
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
354
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
354
    }
_ZN5doris7SipHash6updateINS_7DecimalIlEEEEvRKT_
Line
Count
Source
174
576
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
576
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
576
    }
_ZN5doris7SipHash6updateINS_14DecimalV2ValueEEEvRKT_
Line
Count
Source
174
1.34k
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
1.34k
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
1.34k
    }
_ZN5doris7SipHash6updateINS_12Decimal128V3EEEvRKT_
Line
Count
Source
174
662
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
662
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
662
    }
_ZN5doris7SipHash6updateINS_7DecimalIN4wide7integerILm256EiEEEEEEvRKT_
Line
Count
Source
174
1.25k
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
1.25k
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
1.25k
    }
_ZN5doris7SipHash6updateIiEEvRKT_
Line
Count
Source
174
1.97k
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
1.97k
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
1.97k
    }
_ZN5doris7SipHash6updateIhEEvRKT_
Line
Count
Source
174
13.2k
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
13.2k
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
13.2k
    }
_ZN5doris7SipHash6updateIaEEvRKT_
Line
Count
Source
174
974
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
974
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
974
    }
_ZN5doris7SipHash6updateIsEEvRKT_
Line
Count
Source
174
888
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
888
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
888
    }
_ZN5doris7SipHash6updateIlEEvRKT_
Line
Count
Source
174
2.14k
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
2.14k
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
2.14k
    }
_ZN5doris7SipHash6updateInEEvRKT_
Line
Count
Source
174
674
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
674
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
674
    }
_ZN5doris7SipHash6updateIfEEvRKT_
Line
Count
Source
174
737
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
737
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
737
    }
_ZN5doris7SipHash6updateIdEEvRKT_
Line
Count
Source
174
821
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
821
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
821
    }
_ZN5doris7SipHash6updateIjEEvRKT_
Line
Count
Source
174
1.13k
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
1.13k
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
1.13k
    }
_ZN5doris7SipHash6updateIoEEvRKT_
Line
Count
Source
174
752
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
752
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
752
    }
_ZN5doris7SipHash6updateINS_16VecDateTimeValueEEEvRKT_
Line
Count
Source
174
816
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
816
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
816
    }
_ZN5doris7SipHash6updateINS_11DateV2ValueINS_15DateV2ValueTypeEEEEEvRKT_
Line
Count
Source
174
256
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
256
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
256
    }
_ZN5doris7SipHash6updateINS_11DateV2ValueINS_19DateTimeV2ValueTypeEEEEEvRKT_
Line
Count
Source
174
1.95k
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
1.95k
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
1.95k
    }
Unexecuted instantiation: _ZN5doris7SipHash6updateINS_16TimestampTzValueEEEvRKT_
_ZN5doris7SipHash6updateImEEvRKT_
Line
Count
Source
174
600k
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
600k
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
600k
    }
_ZN5doris7SipHash6updateIbEEvRKT_
Line
Count
Source
174
613k
    void update(const T& x) {
175
        if constexpr (std::is_same_v<T, std::string>) {
176
            throw Exception(ErrorCode::INTERNAL_ERROR, "String should not use SipHash!");
177
        }
178
613k
        update(reinterpret_cast<const char*>(&x), sizeof(x));
179
613k
    }
180
181
    /// Get the result in some form. This can only be done once!
182
183
604k
    void get128(char* out) {
184
604k
        finalize();
185
604k
        reinterpret_cast<UInt64*>(out)[0] = v0 ^ v1;
186
604k
        reinterpret_cast<UInt64*>(out)[1] = v2 ^ v3;
187
604k
    }
188
189
    /// template for avoiding 'unsigned long long' vs 'unsigned long' problem on old poco in macos
190
    template <typename T>
191
    ALWAYS_INLINE void get128(T& lo, T& hi) {
192
        static_assert(sizeof(T) == 8);
193
        finalize();
194
        lo = v0 ^ v1;
195
        hi = v2 ^ v3;
196
    }
197
198
5.41k
    UInt64 get64() {
199
5.41k
        finalize();
200
5.41k
        return v0 ^ v1 ^ v2 ^ v3;
201
5.41k
    }
202
203
    template <typename T>
204
600k
    ALWAYS_INLINE void get128(T& dst) {
205
600k
        static_assert(sizeof(T) == 16);
206
600k
        get128(reinterpret_cast<char*>(&dst));
207
600k
    }
208
};
209
210
#undef ROTL
211
#undef SIPROUND
212
213
#include <cstddef>
214
215
4.24k
inline void sip_hash128(const char* data, const size_t size, char* out) {
216
4.24k
    SipHash hash;
217
4.24k
    hash.update(data, size);
218
4.24k
    hash.get128(out);
219
4.24k
}
220
#include "common/compile_check_end.h"
221
} // namespace doris