/root/doris/be/src/gutil/strings/fastmem.h
Line | Count | Source |
1 | | // Copyright 2008 Google Inc. All Rights Reserved. |
2 | | // |
3 | | // Fast memory copying and comparison routines. |
4 | | // strings::fastmemcmp_inlined() replaces memcmp() |
5 | | // strings::memcpy_inlined() replaces memcpy() |
6 | | // strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0 |
7 | | // |
8 | | // strings::*_inlined() routines are inline versions of the |
9 | | // routines exported by this module. Sometimes using the inlined |
10 | | // routines exported by this module. Sometimes the inlined versions |
11 | | // are faster; measure before switching to them. |
12 | | // Performance measurement: |
13 | | // strings::fastmemcmp_inlined |
14 | | // Analysis: memcmp, fastmemcmp_inlined, fastmemcmp |
15 | | // 2012-01-30 |
16 | | |
17 | | #pragma once |
18 | | |
19 | | #include <stddef.h> |
20 | | #include <stdint.h> |
21 | | #include <stdio.h> |
22 | | #include <string.h> |
23 | | |
24 | | #include "gutil/integral_types.h" |
25 | | #include "gutil/port.h" |
26 | | |
27 | | namespace strings { |
28 | | |
29 | | // Return true if the n bytes at a equal the n bytes at b. |
30 | | // The regions are allowed to overlap. |
31 | | // |
32 | | // The performance is similar to the performance of memcmp(), but faster for |
33 | | // moderately-sized inputs, or inputs that share a common prefix and differ |
34 | | // somewhere in their last 8 bytes. Further optimizations can be added later |
35 | | // if it makes sense to do so. |
36 | 0 | inline bool memeq(const void* a_v, const void* b_v, size_t n) { |
37 | 0 | const uint8_t* a = reinterpret_cast<const uint8_t*>(a_v); |
38 | 0 | const uint8_t* b = reinterpret_cast<const uint8_t*>(b_v); |
39 | |
40 | 0 | size_t n_rounded_down = n & ~static_cast<size_t>(7); |
41 | 0 | if (PREDICT_FALSE(n_rounded_down == 0)) { // n <= 7 |
42 | 0 | return memcmp(a, b, n) == 0; |
43 | 0 | } |
44 | | // n >= 8 |
45 | 0 | uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); |
46 | 0 | uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8); |
47 | 0 | if ((u | v) != 0) { // The first or last 8 bytes differ. |
48 | 0 | return false; |
49 | 0 | } |
50 | 0 | a += 8; |
51 | 0 | b += 8; |
52 | 0 | n = n_rounded_down - 8; |
53 | 0 | if (n > 128) { |
54 | | // As of 2012, memcmp on x86-64 uses a big unrolled loop with SSE2 |
55 | | // instructions, and while we could try to do something faster, it |
56 | | // doesn't seem worth pursuing. |
57 | 0 | return memcmp(a, b, n) == 0; |
58 | 0 | } |
59 | 0 | for (; n >= 16; n -= 16) { |
60 | 0 | uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); |
61 | 0 | uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8); |
62 | 0 | if ((x | y) != 0) { |
63 | 0 | return false; |
64 | 0 | } |
65 | 0 | a += 16; |
66 | 0 | b += 16; |
67 | 0 | } |
68 | | // n must be 0 or 8 now because it was a multiple of 8 at the top of the loop. |
69 | 0 | return n == 0 || UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b); |
70 | 0 | } |
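// A minimal usage sketch for memeq(): a drop-in for memcmp(a, b, n) == 0 when
// only equality is needed. The function name KeysEqual below is hypothetical,
// not something defined by this header.
//
//   #include "gutil/strings/fastmem.h"
//
//   bool KeysEqual(const char* a, const char* b, size_t len) {
//       // Same result as memcmp(a, b, len) == 0; the first and last 8 bytes are
//       // checked up front, so keys that differ near either end return early.
//       return strings::memeq(a, b, len);
//   }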
71 | | |
72 | 0 | inline int fastmemcmp_inlined(const void* a_void, const void* b_void, size_t n) { |
73 | 0 | const uint8_t* a = reinterpret_cast<const uint8_t*>(a_void); |
74 | 0 | const uint8_t* b = reinterpret_cast<const uint8_t*>(b_void); |
75 | 0 |
76 | 0 | if (n >= 64) { |
77 | 0 | return memcmp(a, b, n); |
78 | 0 | } |
79 | 0 | const void* a_limit = a + n; |
80 | 0 | const size_t sizeof_uint64 = sizeof(uint64); // NOLINT(runtime/sizeof) |
81 | 0 | while (a + sizeof_uint64 <= a_limit && UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b)) { |
82 | 0 | a += sizeof_uint64; |
83 | 0 | b += sizeof_uint64; |
84 | 0 | } |
85 | 0 | const size_t sizeof_uint32 = sizeof(uint32); // NOLINT(runtime/sizeof) |
86 | 0 | if (a + sizeof_uint32 <= a_limit && UNALIGNED_LOAD32(a) == UNALIGNED_LOAD32(b)) { |
87 | 0 | a += sizeof_uint32; |
88 | 0 | b += sizeof_uint32; |
89 | 0 | } |
90 | 0 | while (a < a_limit) { |
91 | 0 | int d = static_cast<uint32>(*a++) - static_cast<uint32>(*b++); |
92 | 0 | if (d) return d; |
93 | 0 | } |
94 | 0 | return 0; |
95 | 0 | } |
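// A minimal usage sketch for fastmemcmp_inlined(): a drop-in for memcmp() when an
// ordering (not just equality) is needed, e.g. in a comparator. CompareKeys below
// is a hypothetical caller, not part of this header.
//
//   #include <algorithm>
//   #include <string>
//   #include "gutil/strings/fastmem.h"
//
//   int CompareKeys(const std::string& a, const std::string& b) {
//       size_t common = std::min(a.size(), b.size());
//       int c = strings::fastmemcmp_inlined(a.data(), b.data(), common);
//       if (c != 0) return c;
//       // When one key is a prefix of the other, the shorter key sorts first.
//       return a.size() < b.size() ? -1 : (a.size() > b.size() ? 1 : 0);
//   }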
96 | | |
97 | | // The standard memcpy is slow for small sizes that vary at run time. |
98 | | // This implementation inlines the optimal code path for each size from 1 to 16. |
99 | | // To avoid code bloat, don't use it in spots that are not performance-critical, |
100 | | // or when you don't expect sizes <= 16 to occur frequently. |
101 | 630k | inline void memcpy_inlined(void* dst, const void* src, size_t size) { |
102 | | // Compiler inlines code with minimal amount of data movement when third |
103 | | // parameter of memcpy is a constant. |
104 | 630k | switch (size) { |
105 | 0 | case 1: |
106 | 0 | memcpy(dst, src, 1); |
107 | 0 | break; |
108 | 0 | case 2: |
109 | 0 | memcpy(dst, src, 2); |
110 | 0 | break; |
111 | 0 | case 3: |
112 | 0 | memcpy(dst, src, 3); |
113 | 0 | break; |
114 | 0 | case 4: |
115 | 0 | memcpy(dst, src, 4); |
116 | 0 | break; |
117 | 15.9k | case 5: |
118 | 15.9k | memcpy(dst, src, 5); |
119 | 15.9k | break; |
120 | 127 | case 6: |
121 | 127 | memcpy(dst, src, 6); |
122 | 127 | break; |
123 | 16.2k | case 7: |
124 | 16.2k | memcpy(dst, src, 7); |
125 | 16.2k | break; |
126 | 37.5k | case 8: |
127 | 37.5k | memcpy(dst, src, 8); |
128 | 37.5k | break; |
129 | 146k | case 9: |
130 | 146k | memcpy(dst, src, 9); |
131 | 146k | break; |
132 | 140k | case 10: |
133 | 140k | memcpy(dst, src, 10); |
134 | 140k | break; |
135 | 0 | case 11: |
136 | 0 | memcpy(dst, src, 11); |
137 | 0 | break; |
138 | 753 | case 12: |
139 | 753 | memcpy(dst, src, 12); |
140 | 753 | break; |
141 | 92 | case 13: |
142 | 92 | memcpy(dst, src, 13); |
143 | 92 | break; |
144 | 124k | case 14: |
145 | 124k | memcpy(dst, src, 14); |
146 | 124k | break; |
147 | 139k | case 15: |
148 | 139k | memcpy(dst, src, 15); |
149 | 139k | break; |
150 | 75 | case 16: |
151 | 75 | memcpy(dst, src, 16); |
152 | 75 | break; |
153 | 9.20k | default: |
154 | 9.20k | memcpy(dst, src, size); |
155 | 9.20k | break; |
156 | 630k | } |
157 | 630k | } |
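// A minimal usage sketch for memcpy_inlined(): when the third argument of memcpy()
// is a compile-time constant, the compiler lowers it to a few moves instead of a
// library call, which is what the switch above exploits. AppendValue below is a
// hypothetical caller, not part of this header.
//
//   #include "gutil/strings/fastmem.h"
//
//   void AppendValue(char* dst, const char* val, size_t len) {
//       // For len <= 16 this resolves to a constant-size memcpy that the compiler
//       // can inline; larger lengths fall through to the regular memcpy() call.
//       strings::memcpy_inlined(dst, val, len);
//   }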
158 | | |
159 | | } // namespace strings |