Coverage Report

Created: 2024-11-22 12:06

/root/doris/be/src/gutil/strings/fastmem.h
 Line |  Count | Source
    1 |        | // Copyright 2008 Google Inc. All Rights Reserved.
    2 |        | //
    3 |        | // Fast memory copying and comparison routines.
    4 |        | //   strings::fastmemcmp_inlined() replaces memcmp()
    5 |        | //   strings::memcpy_inlined() replaces memcpy()
    6 |        | //   strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0
    7 |        | //
    8 |        | // strings::*_inlined() routines are inline versions of the
    9 |        | // routines exported by this module.  Sometimes using the inlined
   10 |        | // versions is faster.  Measure before using the inlined versions.
   11 |        | //
   12 |        | // Performance measurement:
   13 |        | //   strings::fastmemcmp_inlined
   14 |        | //     Analysis: memcmp, fastmemcmp_inlined, fastmemcmp
   15 |        | //     2012-01-30
   16 |        |
   17 |        | #pragma once
   18 |        |
   19 |        | #include <stddef.h>
   20 |        | #include <stdint.h>
   21 |        | #include <stdio.h>
   22 |        | #include <string.h>
   23 |        |
   24 |        | #include "gutil/integral_types.h"
   25 |        | #include "gutil/port.h"
   26 |        |
   27 |        | namespace strings {
   28 |        |
   29 |        | // Return true if the n bytes at a equal the n bytes at b.
   30 |        | // The regions are allowed to overlap.
   31 |        | //
   32 |        | // The performance is similar to that of memcmp(), but faster for
   33 |        | // moderately-sized inputs, or for inputs that share a common prefix and differ
   34 |        | // somewhere in their last 8 bytes. Further optimizations can be added later
   35 |        | // if it makes sense to do so.
   36 |      0 | inline bool memeq(const void* a_v, const void* b_v, size_t n) {
   37 |      0 |     const uint8_t* a = reinterpret_cast<const uint8_t*>(a_v);
   38 |      0 |     const uint8_t* b = reinterpret_cast<const uint8_t*>(b_v);
   39 |        |
   40 |      0 |     size_t n_rounded_down = n & ~static_cast<size_t>(7);
   41 |      0 |     if (PREDICT_FALSE(n_rounded_down == 0)) { // n <= 7
   42 |      0 |         return memcmp(a, b, n) == 0;
   43 |      0 |     }
   44 |        |     // n >= 8
   45 |      0 |     uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
   46 |      0 |     uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8);
   47 |      0 |     if ((u | v) != 0) { // The first or last 8 bytes differ.
   48 |      0 |         return false;
   49 |      0 |     }
   50 |      0 |     a += 8;
   51 |      0 |     b += 8;
   52 |      0 |     n = n_rounded_down - 8;
   53 |      0 |     if (n > 128) {
   54 |        |         // As of 2012, memcmp on x86-64 uses a big unrolled loop with SSE2
   55 |        |         // instructions, and while we could try to do something faster, it
   56 |        |         // doesn't seem worth pursuing.
   57 |      0 |         return memcmp(a, b, n) == 0;
   58 |      0 |     }
   59 |      0 |     for (; n >= 16; n -= 16) {
   60 |      0 |         uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b);
   61 |      0 |         uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8);
   62 |      0 |         if ((x | y) != 0) {
   63 |      0 |             return false;
   64 |      0 |         }
   65 |      0 |         a += 16;
   66 |      0 |         b += 16;
   67 |      0 |     }
   68 |        |     // n must be 0 or 8 now because it was a multiple of 8 at the top of the loop.
   69 |      0 |     return n == 0 || UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b);
   70 |      0 | }
   71 |        |
   72 |      0 | inline int fastmemcmp_inlined(const void* a_void, const void* b_void, size_t n) {
   73 |      0 |     const uint8_t* a = reinterpret_cast<const uint8_t*>(a_void);
   74 |      0 |     const uint8_t* b = reinterpret_cast<const uint8_t*>(b_void);
   75 |      0 |
   76 |      0 |     if (n >= 64) {
   77 |      0 |         return memcmp(a, b, n);
   78 |      0 |     }
   79 |      0 |     const void* a_limit = a + n;
   80 |      0 |     const size_t sizeof_uint64 = sizeof(uint64); // NOLINT(runtime/sizeof)
   81 |      0 |     while (a + sizeof_uint64 <= a_limit && UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b)) {
   82 |      0 |         a += sizeof_uint64;
   83 |      0 |         b += sizeof_uint64;
   84 |      0 |     }
   85 |      0 |     const size_t sizeof_uint32 = sizeof(uint32); // NOLINT(runtime/sizeof)
   86 |      0 |     if (a + sizeof_uint32 <= a_limit && UNALIGNED_LOAD32(a) == UNALIGNED_LOAD32(b)) {
   87 |      0 |         a += sizeof_uint32;
   88 |      0 |         b += sizeof_uint32;
   89 |      0 |     }
   90 |      0 |     while (a < a_limit) {
   91 |      0 |         int d = static_cast<uint32>(*a++) - static_cast<uint32>(*b++);
   92 |      0 |         if (d) return d;
   93 |      0 |     }
   94 |      0 |     return 0;
   95 |      0 | }
   96 |        |
   97 |        | // The standard memcpy operation is slow for small sizes that vary at runtime.
   98 |        | // This implementation inlines the optimal realization for sizes 1 to 16.
   99 |        | // To avoid code bloat, don't use it in spots that are not performance-critical,
  100 |        | // or where sizes <= 16 are not expected to be very frequent.
  101 |   630k | inline void memcpy_inlined(void* dst, const void* src, size_t size) {
  102 |        |     // Compiler inlines code with minimal amount of data movement when third
  103 |        |     // parameter of memcpy is a constant.
  104 |   630k |     switch (size) {
  105 |      0 |     case 1:
  106 |      0 |         memcpy(dst, src, 1);
  107 |      0 |         break;
  108 |      0 |     case 2:
  109 |      0 |         memcpy(dst, src, 2);
  110 |      0 |         break;
  111 |      0 |     case 3:
  112 |      0 |         memcpy(dst, src, 3);
  113 |      0 |         break;
  114 |      0 |     case 4:
  115 |      0 |         memcpy(dst, src, 4);
  116 |      0 |         break;
  117 |  15.9k |     case 5:
  118 |  15.9k |         memcpy(dst, src, 5);
  119 |  15.9k |         break;
  120 |    127 |     case 6:
  121 |    127 |         memcpy(dst, src, 6);
  122 |    127 |         break;
  123 |  16.2k |     case 7:
  124 |  16.2k |         memcpy(dst, src, 7);
  125 |  16.2k |         break;
  126 |  37.5k |     case 8:
  127 |  37.5k |         memcpy(dst, src, 8);
  128 |  37.5k |         break;
  129 |   146k |     case 9:
  130 |   146k |         memcpy(dst, src, 9);
  131 |   146k |         break;
  132 |   140k |     case 10:
  133 |   140k |         memcpy(dst, src, 10);
  134 |   140k |         break;
  135 |      0 |     case 11:
  136 |      0 |         memcpy(dst, src, 11);
  137 |      0 |         break;
  138 |    753 |     case 12:
  139 |    753 |         memcpy(dst, src, 12);
  140 |    753 |         break;
  141 |     92 |     case 13:
  142 |     92 |         memcpy(dst, src, 13);
  143 |     92 |         break;
  144 |   124k |     case 14:
  145 |   124k |         memcpy(dst, src, 14);
  146 |   124k |         break;
  147 |   139k |     case 15:
  148 |   139k |         memcpy(dst, src, 15);
  149 |   139k |         break;
  150 |     75 |     case 16:
  151 |     75 |         memcpy(dst, src, 16);
  152 |     75 |         break;
  153 |  9.20k |     default:
  154 |  9.20k |         memcpy(dst, src, size);
  155 |  9.20k |         break;
  156 |   630k |     }
  157 |   630k | }
  158 |        |
  159 |        | } // namespace strings
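
Usage note. The header comment in the listing describes these routines as drop-in replacements for memcmp()/memcpy(); the sketch below shows how a caller in the backend might use them. It is a minimal illustration, not part of the report: the wrapper names (rows_equal, compare_keys, copy_small_value) are hypothetical, and only the strings::* calls come from the header listed above. In this coverage run only memcpy_inlined was exercised (630k calls, almost all with size <= 16); memeq and fastmemcmp_inlined show zero hits.

    #include <cstddef>

    #include "gutil/strings/fastmem.h"

    // Hypothetical helpers, for illustration only.
    bool rows_equal(const char* a, const char* b, size_t len) {
        // Equivalent to memcmp(a, b, len) == 0, tuned for moderate sizes.
        return strings::memeq(a, b, len);
    }

    int compare_keys(const char* a, const char* b, size_t len) {
        // Same contract as memcmp(): negative, zero, or positive.
        return strings::fastmemcmp_inlined(a, b, len);
    }

    void copy_small_value(void* dst, const void* src, size_t len) {
        // Drop-in memcpy() replacement; pays off when len is frequently <= 16.
        strings::memcpy_inlined(dst, src, len);
    }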