/root/doris/be/src/gutil/endian.h
Line | Count | Source |
1 | | // Copyright 2005 Google Inc. |
2 | | // |
3 | | // Licensed to the Apache Software Foundation (ASF) under one |
4 | | // or more contributor license agreements. See the NOTICE file |
5 | | // distributed with this work for additional information |
6 | | // regarding copyright ownership. The ASF licenses this file |
7 | | // to you under the Apache License, Version 2.0 (the |
8 | | // "License"); you may not use this file except in compliance |
9 | | // with the License. You may obtain a copy of the License at |
10 | | // |
11 | | // http://www.apache.org/licenses/LICENSE-2.0 |
12 | | // |
13 | | // Unless required by applicable law or agreed to in writing, |
14 | | // software distributed under the License is distributed on an |
15 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
16 | | // KIND, either express or implied. See the License for the |
17 | | // specific language governing permissions and limitations |
18 | | // under the License. |
19 | | // |
20 | | // --- |
21 | | // |
22 | | // |
23 | | // Utility functions that depend on bytesex. We define htonll and ntohll, |
24 | | // as well as "Google" versions of all the standards: ghtonl, ghtons, and |
25 | | // so on. These functions do exactly the same as their standard variants, |
26 | | // but don't require including the dangerous netinet/in.h. |
27 | | // |
28 | | // Buffer routines will copy to and from buffers without causing |
29 | | // a bus error when the architecture requires different byte alignments |
30 | | |
31 | | #pragma once |
32 | | |
33 | | #include <assert.h> |
 | | #include <stdint.h> |
 | | #include <string.h> |
34 | | |
35 | | #include "vec/core/wide_integer.h" |
36 | | |
37 | | // Portable handling of unaligned loads, stores, and copies. |
38 | | // On some platforms, like ARM, the copy functions can be more efficient |
39 | | // than a load and a store. |
40 | | |
41 | | #if defined(__i386) || defined(ARCH_ATHLON) || defined(__x86_64__) || defined(_ARCH_PPC) |
42 | | |
43 | | // x86 and x86-64 can perform unaligned loads/stores directly; |
44 | | // modern PowerPC hardware can also do unaligned integer loads and stores; |
45 | | // but note: the FPU still sends unaligned loads and stores to a trap handler! |
46 | | |
47 | | #define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16_t*>(_p)) |
48 | 256k | #define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32_t*>(_p)) |
49 | 5.38k | #define UNALIGNED_LOAD64(_p) (*reinterpret_cast<const uint64_t*>(_p)) |
50 | | |
51 | | #define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16_t*>(_p) = (_val)) |
52 | 0 | #define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32_t*>(_p) = (_val)) |
53 | | #define UNALIGNED_STORE64(_p, _val) (*reinterpret_cast<uint64_t*>(_p) = (_val)) |
54 | | |
55 | | #elif defined(__arm__) && !defined(__ARM_ARCH_5__) && !defined(__ARM_ARCH_5T__) && \ |
56 | | !defined(__ARM_ARCH_5TE__) && !defined(__ARM_ARCH_5TEJ__) && !defined(__ARM_ARCH_6__) && \ |
57 | | !defined(__ARM_ARCH_6J__) && !defined(__ARM_ARCH_6K__) && !defined(__ARM_ARCH_6Z__) && \ |
58 | | !defined(__ARM_ARCH_6ZK__) && !defined(__ARM_ARCH_6T2__) |
59 | | |
60 | | // ARMv7 and newer support native unaligned accesses, but only of 16-bit |
61 | | // and 32-bit values (not 64-bit); older versions either raise a fatal signal, |
62 | | // do an unaligned read and rotate the words around a bit, or do the reads very |
63 | | // slowly (trip through kernel mode). There's no simple #define that says just |
64 | | // "ARMv7 or higher", so we have to filter away all ARMv5 and ARMv6 |
65 | | // sub-architectures. Newer gcc (>= 4.6) sets an __ARM_FEATURE_ALIGNED #define, |
66 | | // so in time, maybe we can move on to that. |
67 | | // |
68 | | // This is a mess, but there's not much we can do about it. |
69 | | |
70 | | #define UNALIGNED_LOAD16(_p) (*reinterpret_cast<const uint16_t*>(_p)) |
71 | | #define UNALIGNED_LOAD32(_p) (*reinterpret_cast<const uint32_t*>(_p)) |
72 | | |
73 | | #define UNALIGNED_STORE16(_p, _val) (*reinterpret_cast<uint16_t*>(_p) = (_val)) |
74 | | #define UNALIGNED_STORE32(_p, _val) (*reinterpret_cast<uint32_t*>(_p) = (_val)) |
75 | | |
76 | | // TODO(user): NEON supports unaligned 64-bit loads and stores. |
77 | | // See if that would be more efficient on platforms supporting it, |
78 | | // at least for copies. |
79 | | |
80 | | inline uint64_t UNALIGNED_LOAD64(const void* p) { |
81 | | uint64_t t; |
82 | | memcpy(&t, p, sizeof t); |
83 | | return t; |
84 | | } |
85 | | |
86 | | inline void UNALIGNED_STORE64(void* p, uint64_t v) { |
87 | | memcpy(p, &v, sizeof v); |
88 | | } |
89 | | |
90 | | #else |
91 | | |
92 | | #define NEED_ALIGNED_LOADS |
93 | | |
94 | | // These functions are provided for architectures that don't support |
95 | | // unaligned loads and stores. |
96 | | |
97 | | inline uint16_t UNALIGNED_LOAD16(const void* p) { |
98 | | uint16_t t; |
99 | | memcpy(&t, p, sizeof t); |
100 | | return t; |
101 | | } |
102 | | |
103 | | inline uint32_t UNALIGNED_LOAD32(const void* p) { |
104 | | uint32_t t; |
105 | | memcpy(&t, p, sizeof t); |
106 | | return t; |
107 | | } |
108 | | |
109 | | inline uint64_t UNALIGNED_LOAD64(const void* p) { |
110 | | uint64_t t; |
111 | | memcpy(&t, p, sizeof t); |
112 | | return t; |
113 | | } |
114 | | |
115 | | inline void UNALIGNED_STORE16(void* p, uint16_t v) { |
116 | | memcpy(p, &v, sizeof v); |
117 | | } |
118 | | |
119 | | inline void UNALIGNED_STORE32(void* p, uint32_t v) { |
120 | | memcpy(p, &v, sizeof v); |
121 | | } |
122 | | |
123 | | inline void UNALIGNED_STORE64(void* p, uint64_t v) { |
124 | | memcpy(p, &v, sizeof v); |
125 | | } |
126 | | |
127 | | #endif |
128 | | |
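 | | // Usage sketch (illustrative; buf is a hypothetical caller-owned buffer): |
 | | // whichever branch above was selected, the UNALIGNED_* helpers behave like |
 | | // ordinary loads/stores on a possibly misaligned pointer. |
 | | // |
 | | //   char buf[16]; |
 | | //   UNALIGNED_STORE32(buf + 1, 0x12345678u);  // misaligned store |
 | | //   uint32_t v = UNALIGNED_LOAD32(buf + 1);   // v == 0x12345678 |
 | | //   UNALIGNED_STORE64(buf + 3, 0x1122334455667788ULL); |
 | | //   uint64_t w = UNALIGNED_LOAD64(buf + 3);   // w == 0x1122334455667788 |
 | |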
129 | 62.8k | inline uint64_t gbswap_64(uint64_t host_int) { |
130 | 62.8k | #if defined(__GNUC__) && defined(__x86_64__) && !defined(__APPLE__) |
131 | | // Adapted from /usr/include/byteswap.h. Not available on Mac. |
132 | 62.8k | if (__builtin_constant_p(host_int)) { |
133 | 0 | return __bswap_constant_64(host_int); |
134 | 62.8k | } else { |
135 | 62.8k | uint64_t result; |
136 | 62.8k | __asm__("bswap %0" : "=r"(result) : "0"(host_int)); |
137 | 62.8k | return result; |
138 | 62.8k | } |
139 | | #elif defined(bswap_64) |
140 | | return bswap_64(host_int); |
141 | | #else |
142 | | return static_cast<uint64_t>(bswap_32(static_cast<uint32_t>(host_int >> 32))) | |
143 | | (static_cast<uint64_t>(bswap_32(static_cast<uint32_t>(host_int))) << 32); |
144 | | #endif // bswap_64 |
145 | 62.8k | } |
146 | | |
147 | 214 | inline unsigned __int128 gbswap_128(unsigned __int128 host_int) { |
148 | 214 | return static_cast<unsigned __int128>(bswap_64(static_cast<uint64_t>(host_int >> 64))) | |
149 | 214 | (static_cast<unsigned __int128>(bswap_64(static_cast<uint64_t>(host_int))) << 64); |
150 | 214 | } |
151 | | |
152 | 1 | inline wide::UInt256 gbswap_256(wide::UInt256 host_int) { |
153 | 1 | wide::UInt256 result {gbswap_64(host_int.items[3]), gbswap_64(host_int.items[2]), |
154 | 1 | gbswap_64(host_int.items[1]), gbswap_64(host_int.items[0])}; |
155 | 1 | return result; |
156 | 1 | } |
157 | | |
158 | | // Swap bytes of a 24-bit value. |
159 | 217 | inline uint32_t bswap_24(uint32_t x) { |
160 | 217 | return ((x & 0x0000ffULL) << 16) | ((x & 0x00ff00ULL)) | ((x & 0xff0000ULL) >> 16); |
161 | 217 | } |
162 | | |
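 | | // Usage sketch (illustrative): the gbswap_* and bswap_24 helpers reverse |
 | | // byte order unconditionally, independent of the host's endianness, e.g.: |
 | | // |
 | | //   gbswap_64(0x0102030405060708ULL) == 0x0807060504030201ULL |
 | | //   bswap_24(0x00123456u)            == 0x00563412u |
 | |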
163 | | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ |
164 | | |
165 | | // Definitions for ntohl etc. that don't require us to include |
166 | | // netinet/in.h. We wrap bswap_32 and bswap_16 in functions rather |
167 | | // than just #defining them because in debug mode, gcc doesn't |
168 | | // correctly handle the (rather involved) definitions of bswap_32. |
169 | | // gcc guarantees that inline functions are as fast as macros, so |
170 | | // this isn't a performance hit. |
171 | 0 | inline uint16_t ghtons(uint16_t x) { |
172 | 0 | return bswap_16(x); |
173 | 0 | } |
174 | 0 | inline uint32_t ghtonl(uint32_t x) { |
175 | 0 | return bswap_32(x); |
176 | 0 | } |
177 | 0 | inline uint64_t ghtonll(uint64_t x) { |
178 | 0 | return gbswap_64(x); |
179 | 0 | } |
180 | | |
181 | | #else |
182 | | |
183 | | // These definitions are simpler on big-endian machines |
184 | | // These are functions instead of macros to avoid self-assignment warnings |
185 | | // on calls such as "i = ghtonl(i);". This also provides type checking. |
186 | | inline uint16_t ghtons(uint16_t x) { |
187 | | return x; |
188 | | } |
189 | | inline uint32_t ghtonl(uint32_t x) { |
190 | | return x; |
191 | | } |
192 | | inline uint64_t ghtonll(uint64_t x) { |
193 | | return x; |
194 | | } |
195 | | |
196 | | #endif // bytesex |
197 | | |
198 | | // ntoh* and hton* are the same thing for any size and bytesex, |
199 | | // since the function is an involution, i.e., its own inverse. |
200 | | #if !defined(__APPLE__) |
201 | | // This one is safe to take as it's an extension |
202 | | #define htonll(x) ghtonll(x) |
203 | | #define ntohll(x) htonll(x) |
204 | | #endif |
205 | | |
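 | | // Usage sketch (illustrative): on a little-endian host, |
 | | // ghtonl(0x0A0B0C0Du) == 0x0D0C0B0Au, and applying the same call again |
 | | // restores the original value, since byte swapping is an involution: |
 | | // |
 | | //   uint64_t wire = ghtonll(0x0102030405060708ULL); // network order |
 | | //   uint64_t host = ghtonll(wire);                  // back to host order |
 | |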
206 | | // Utilities to convert numbers between the current host's native byte |
207 | | // order and little-endian byte order |
208 | | // |
209 | | // Load/Store methods are alignment safe |
210 | | class LittleEndian { |
211 | | public: |
212 | | // Conversion functions. |
213 | | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ |
214 | | |
215 | 0 | static uint16_t FromHost16(uint16_t x) { return x; } |
216 | 0 | static uint16_t ToHost16(uint16_t x) { return x; } |
217 | | |
218 | 0 | static uint32_t FromHost32(uint32_t x) { return x; } |
219 | 256k | static uint32_t ToHost32(uint32_t x) { return x; } |
220 | | |
221 | 0 | static uint64_t FromHost64(uint64_t x) { return x; } |
222 | 4.18k | static uint64_t ToHost64(uint64_t x) { return x; } |
223 | | |
224 | 0 | static unsigned __int128 FromHost128(unsigned __int128 x) { return x; } |
225 | 0 | static unsigned __int128 ToHost128(unsigned __int128 x) { return x; } |
226 | | |
227 | 0 | static wide::UInt256 FromHost256(wide::UInt256 x) { return x; } |
228 | 0 | static wide::UInt256 ToHost256(wide::UInt256 x) { return x; } |
229 | | |
230 | 0 | static bool IsLittleEndian() { return true; } |
231 | | |
232 | | #else |
233 | | |
234 | | static uint16_t FromHost16(uint16_t x) { return bswap_16(x); } |
235 | | static uint16_t ToHost16(uint16_t x) { return bswap_16(x); } |
236 | | |
237 | | static uint32_t FromHost32(uint32_t x) { return bswap_32(x); } |
238 | | static uint32_t ToHost32(uint32_t x) { return bswap_32(x); } |
239 | | |
240 | | static uint64_t FromHost64(uint64_t x) { return gbswap_64(x); } |
241 | | static uint64_t ToHost64(uint64_t x) { return gbswap_64(x); } |
242 | | |
243 | | static unsigned __int128 FromHost128(unsigned __int128 x) { return gbswap_128(x); } |
244 | | static unsigned __int128 ToHost128(unsigned __int128 x) { return gbswap_128(x); } |
245 | | |
246 | | static wide::UInt256 FromHost256(wide::UInt256 x) { return gbswap_256(x); } |
247 | | static wide::UInt256 ToHost256(wide::UInt256 x) { return gbswap_256(x); } |
248 | | |
249 | | static bool IsLittleEndian() { return false; } |
250 | | |
251 | | #endif /* ENDIAN */ |
252 | | |
253 | | // Functions to do unaligned loads and stores in little-endian order. |
254 | 0 | static uint16_t Load16(const void* p) { return ToHost16(UNALIGNED_LOAD16(p)); } |
255 | | |
256 | 0 | static void Store16(void* p, uint16_t v) { UNALIGNED_STORE16(p, FromHost16(v)); } |
257 | | |
258 | 256k | static uint32_t Load32(const void* p) { return ToHost32(UNALIGNED_LOAD32(p)); } |
259 | | |
260 | 0 | static void Store32(void* p, uint32_t v) { UNALIGNED_STORE32(p, FromHost32(v)); } |
261 | | |
262 | 4.18k | static uint64_t Load64(const void* p) { return ToHost64(UNALIGNED_LOAD64(p)); } |
263 | | |
264 | | // Build a uint64_t from 1-8 bytes. |
265 | | // 8 * len least significant bits are loaded from the memory with |
266 | | // LittleEndian order. The 64 - 8 * len most significant bits are |
267 | | // set all to 0. |
268 | | // In latex-friendly words, this function returns: |
269 | | // $\sum_{i=0}^{len-1} p[i] 256^{i}$, where p[i] is unsigned. |
270 | | // |
271 | | // This function is equivalent to: |
272 | | // uint64_t val = 0; |
273 | | // memcpy(&val, p, len); |
274 | | // return ToHost64(val); |
275 | | // TODO(user): write a small benchmark and benchmark the speed |
276 | | // of a memcpy based approach. |
277 | | // |
278 | | // For speed reasons this function does not work for len == 0. |
279 | | // The caller needs to guarantee that 1 <= len <= 8. |
280 | 0 | static uint64_t Load64VariableLength(const void* const p, int len) { |
281 | 0 | assert(len >= 1 && len <= 8); |
282 | 0 | const char* const buf = static_cast<const char*>(p); |
283 | 0 | uint64_t val = 0; |
284 | 0 | --len; |
285 | 0 | do { |
 | | // Cast through uint8_t so that bytes >= 0x80 are not sign-extended |
 | | // when char is a signed type. |
286 | 0 | val = (val << 8) | static_cast<uint8_t>(buf[len]); |
287 | 0 | // (--len >= 0) is about 10 % faster than (len--) in some benchmarks. |
288 | 0 | } while (--len >= 0); |
289 | 0 | // No ToHost64(...) needed. The bytes are accessed in little-endian manner |
290 | 0 | // on every architecture. |
291 | 0 | return val; |
292 | 0 | } |
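 | |
 | | // Example (illustrative): with p pointing at bytes {0x01, 0x02, 0x03}, |
 | | // Load64VariableLength(p, 3) returns 0x030201: byte p[0] ends up as the |
 | | // least significant byte of the result. |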
293 | | |
294 | 0 | static void Store64(void* p, uint64_t v) { UNALIGNED_STORE64(p, FromHost64(v)); } |
295 | | }; |
296 | | |
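 | | // Usage sketch (illustrative): decoding a fixed-layout little-endian |
 | | // record from a byte buffer, e.g. a 4-byte length followed by an 8-byte |
 | | // id (the names here are hypothetical, not part of this header): |
 | | // |
 | | //   const char* rec = ...;                        // >= 12 readable bytes |
 | | //   uint32_t length = LittleEndian::Load32(rec); |
 | | //   uint64_t id     = LittleEndian::Load64(rec + 4); |
 | |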
297 | | // Utilities to convert numbers between the current host's native byte |
298 | | // order and big-endian byte order (same as network byte order) |
299 | | // |
300 | | // Load/Store methods are alignment safe |
301 | | class BigEndian { |
302 | | public: |
303 | | #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ |
304 | | |
305 | 249 | static uint16_t FromHost16(uint16_t x) { return bswap_16(x); } |
306 | 1 | static uint16_t ToHost16(uint16_t x) { return bswap_16(x); } |
307 | | |
308 | 217 | static uint32_t FromHost24(uint32_t x) { return bswap_24(x); } |
309 | 0 | static uint32_t ToHost24(uint32_t x) { return bswap_24(x); } |
310 | | |
311 | 978k | static uint32_t FromHost32(uint32_t x) { return bswap_32(x); } |
312 | 1 | static uint32_t ToHost32(uint32_t x) { return bswap_32(x); } |
313 | | |
314 | 61.6k | static uint64_t FromHost64(uint64_t x) { return gbswap_64(x); } |
315 | 1.19k | static uint64_t ToHost64(uint64_t x) { return gbswap_64(x); } |
316 | | |
317 | 213 | static unsigned __int128 FromHost128(unsigned __int128 x) { return gbswap_128(x); } |
318 | 1 | static unsigned __int128 ToHost128(unsigned __int128 x) { return gbswap_128(x); } |
319 | | |
320 | 0 | static wide::UInt256 FromHost256(wide::UInt256 x) { return gbswap_256(x); } |
321 | 1 | static wide::UInt256 ToHost256(wide::UInt256 x) { return gbswap_256(x); } |
322 | | |
323 | 0 | static bool IsLittleEndian() { return true; } |
324 | | |
325 | | #else |
326 | | |
327 | | static uint16_t FromHost16(uint16_t x) { return x; } |
328 | | static uint16_t ToHost16(uint16_t x) { return x; } |
329 | | |
330 | | static uint32_t FromHost24(uint32_t x) { return x; } |
331 | | static uint32_t ToHost24(uint32_t x) { return x; } |
332 | | |
333 | | static uint32_t FromHost32(uint32_t x) { return x; } |
334 | | static uint32_t ToHost32(uint32_t x) { return x; } |
335 | | |
336 | | static uint64_t FromHost64(uint64_t x) { return x; } |
337 | | static uint64_t ToHost64(uint64_t x) { return x; } |
338 | | |
 | | static unsigned __int128 FromHost128(unsigned __int128 x) { return x; } |
 | | static unsigned __int128 ToHost128(unsigned __int128 x) { return x; } |
 | |
339 | | static wide::UInt256 FromHost256(wide::UInt256 x) { return x; } |
340 | | static wide::UInt256 ToHost256(wide::UInt256 x) { return x; } |
341 | | |
342 | | static bool IsLittleEndian() { return false; } |
343 | | |
344 | | #endif /* ENDIAN */ |
345 | | // Functions to do unaligned loads and stores in big-endian order. |
346 | 0 | static uint16_t Load16(const void* p) { return ToHost16(UNALIGNED_LOAD16(p)); } |
347 | | |
348 | 0 | static void Store16(void* p, uint16_t v) { UNALIGNED_STORE16(p, FromHost16(v)); } |
349 | | |
350 | 0 | static uint32_t Load32(const void* p) { return ToHost32(UNALIGNED_LOAD32(p)); } |
351 | | |
352 | 0 | static void Store32(void* p, uint32_t v) { UNALIGNED_STORE32(p, FromHost32(v)); } |
353 | | |
354 | 0 | static uint64_t Load64(const void* p) { return ToHost64(UNALIGNED_LOAD64(p)); } |
355 | | |
356 | | // Build a uint64_t from the last len bytes of the 64-bit big-endian |
357 | | // word at p. The 8 * len least significant bits of the result hold |
358 | | // those bytes in big-endian order; the 64 - 8 * len most significant |
359 | | // bits are all set to 0. In latex-friendly words, this function |
360 | | // returns: |
361 | | //   $\sum_{i=0}^{len-1} p[7-i] 256^{i}$, where p[i] is unsigned. |
362 | | // |
363 | | // Note: unlike LittleEndian::Load64VariableLength, this always reads a |
364 | | // full (possibly unaligned) 64-bit word starting at p, so the caller |
365 | | // must guarantee that at least 8 bytes are readable at p; the bytes |
366 | | // that contribute to the result are p[8 - len] .. p[7], not |
367 | | // p[0] .. p[len - 1]. |
368 | | // |
369 | | // For speed reasons this function does not work for len == 0. |
370 | | // The caller needs to guarantee that 1 <= len <= 8. |
372 | 0 | static uint64_t Load64VariableLength(const void* const p, int len) { |
373 | 0 | assert(len >= 1 && len <= 8); |
374 | 0 | uint64_t val = Load64(p); |
375 | 0 | uint64_t mask = 0; |
376 | 0 | --len; |
377 | 0 | do { |
378 | 0 | mask = (mask << 8) | 0xff; |
379 | 0 | // (--len >= 0) is about 10 % faster than (len--) in some benchmarks. |
380 | 0 | } while (--len >= 0); |
381 | 0 | return val & mask; |
382 | 0 | } |
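 | |
 | | // Example (illustrative): with p pointing at 8 readable bytes |
 | | // {0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08}, |
 | | // Load64VariableLength(p, 3) returns 0x060708, i.e. the last 3 bytes of |
 | | // the 8-byte window in big-endian order. |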
383 | | |
384 | 0 | static void Store64(void* p, uint64_t v) { UNALIGNED_STORE64(p, FromHost64(v)); } |
385 | | }; // BigEndian |
386 | | |
387 | | // Network byte order is big-endian |
388 | | typedef BigEndian NetworkByteOrder; |
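 | |
 | | // Usage sketch (illustrative; buf is a hypothetical caller-owned buffer): |
 | | // writing a 32-bit value in network (big-endian) byte order into a wire |
 | | // buffer, and reading it back: |
 | | // |
 | | //   char buf[4]; |
 | | //   NetworkByteOrder::Store32(buf, 0x0A0B0C0Du); // buf = {0x0A, 0x0B, 0x0C, 0x0D} |
 | | //   uint32_t v = NetworkByteOrder::Load32(buf);  // v == 0x0A0B0C0D |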