/root/doris/be/src/gutil/strings/fastmem.h
Line | Count | Source |
1 | | // Copyright 2008 Google Inc. All Rights Reserved. |
2 | | // |
3 | | // Fast memory copying and comparison routines. |
4 | | // strings::fastmemcmp_inlined() replaces memcmp() |
5 | | // strings::memcpy_inlined() replaces memcpy() |
6 | | // strings::memeq(a, b, n) replaces memcmp(a, b, n) == 0 |
7 | | // |
8 | | // strings::*_inlined() routines are inline versions of the |
9 | | // routines exported by this module. Sometimes using the inlined |
10 | | // routines exported by this module. Sometimes the inlined versions |
11 | | // are faster; measure before switching to them. |
12 | | // Performance measurement: |
13 | | // strings::fastmemcmp_inlined |
14 | | // Analysis: memcmp, fastmemcmp_inlined, fastmemcmp |
15 | | // 2012-01-30 |
16 | | |
17 | | #pragma once |
18 | | |
19 | | #include <stddef.h> |
20 | | #include <stdint.h> |
21 | | #include <stdio.h> |
22 | | #include <string.h> |
23 | | |
24 | | #include "gutil/integral_types.h" |
25 | | #include "gutil/port.h" |
26 | | |
27 | | namespace strings { |
28 | | |
29 | | // Return true if the n bytes at a equal the n bytes at b. |
30 | | // The regions are allowed to overlap. |
31 | | // |
32 | | // The performance is similar to the performance of memcmp(), but faster for |
33 | | // moderately-sized inputs, or inputs that share a common prefix and differ |
34 | | // somewhere in their last 8 bytes. Further optimizations can be added later |
35 | | // if it makes sense to do so. |
36 | 0 | inline bool memeq(const void* a_v, const void* b_v, size_t n) { |
37 | 0 | const uint8_t* a = reinterpret_cast<const uint8_t*>(a_v); |
38 | 0 | const uint8_t* b = reinterpret_cast<const uint8_t*>(b_v); |
39 | |
40 | 0 | size_t n_rounded_down = n & ~static_cast<size_t>(7); |
41 | 0 | if (PREDICT_FALSE(n_rounded_down == 0)) { // n <= 7 |
42 | 0 | return memcmp(a, b, n) == 0; |
43 | 0 | } |
44 | | // n >= 8 |
45 | 0 | uint64 u = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); |
46 | 0 | uint64 v = UNALIGNED_LOAD64(a + n - 8) ^ UNALIGNED_LOAD64(b + n - 8); |
47 | 0 | if ((u | v) != 0) { // The first or last 8 bytes differ. |
48 | 0 | return false; |
49 | 0 | } |
50 | 0 | a += 8; |
51 | 0 | b += 8; |
52 | 0 | n = n_rounded_down - 8; |
53 | 0 | if (n > 128) { |
54 | | // As of 2012, memcmp on x86-64 uses a big unrolled loop with SSE2 |
55 | | // instructions, and while we could try to do something faster, it |
56 | | // doesn't seem worth pursuing. |
57 | 0 | return memcmp(a, b, n) == 0; |
58 | 0 | } |
59 | 0 | for (; n >= 16; n -= 16) { |
60 | 0 | uint64 x = UNALIGNED_LOAD64(a) ^ UNALIGNED_LOAD64(b); |
61 | 0 | uint64 y = UNALIGNED_LOAD64(a + 8) ^ UNALIGNED_LOAD64(b + 8); |
62 | 0 | if ((x | y) != 0) { |
63 | 0 | return false; |
64 | 0 | } |
65 | 0 | a += 16; |
66 | 0 | b += 16; |
67 | 0 | } |
68 | | // n must be 0 or 8 now because it was a multiple of 8 at the top of the loop. |
69 | 0 | return n == 0 || UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b); |
70 | 0 | } |
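// A minimal usage sketch for memeq(): a drop-in for memcmp(a, b, n) == 0 when
// only equality is needed. The function name KeysEqual below is hypothetical,
// not something defined by this header.
//
//   #include "gutil/strings/fastmem.h"
//
//   bool KeysEqual(const char* a, const char* b, size_t len) {
//       // Same result as memcmp(a, b, len) == 0; the first and last 8 bytes are
//       // checked up front, so keys that differ near either end return early.
//       return strings::memeq(a, b, len);
//   }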
71 | | |
72 | 0 | inline int fastmemcmp_inlined(const void* a_void, const void* b_void, size_t n) { |
73 | 0 | const uint8_t* a = reinterpret_cast<const uint8_t*>(a_void); |
74 | 0 | const uint8_t* b = reinterpret_cast<const uint8_t*>(b_void); |
75 | 0 |
76 | 0 | if (n >= 64) { |
77 | 0 | return memcmp(a, b, n); |
78 | 0 | } |
79 | 0 | const void* a_limit = a + n; |
80 | 0 | const size_t sizeof_uint64 = sizeof(uint64); // NOLINT(runtime/sizeof) |
81 | 0 | while (a + sizeof_uint64 <= a_limit && UNALIGNED_LOAD64(a) == UNALIGNED_LOAD64(b)) { |
82 | 0 | a += sizeof_uint64; |
83 | 0 | b += sizeof_uint64; |
84 | 0 | } |
85 | 0 | const size_t sizeof_uint32 = sizeof(uint32); // NOLINT(runtime/sizeof) |
86 | 0 | if (a + sizeof_uint32 <= a_limit && UNALIGNED_LOAD32(a) == UNALIGNED_LOAD32(b)) { |
87 | 0 | a += sizeof_uint32; |
88 | 0 | b += sizeof_uint32; |
89 | 0 | } |
90 | 0 | while (a < a_limit) { |
91 | 0 | int d = static_cast<uint32>(*a++) - static_cast<uint32>(*b++); |
92 | 0 | if (d) return d; |
93 | 0 | } |
94 | 0 | return 0; |
95 | 0 | } |
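// A minimal usage sketch for fastmemcmp_inlined(): a drop-in for memcmp() when an
// ordering (not just equality) is needed, e.g. in a comparator. CompareKeys below
// is a hypothetical caller, not part of this header.
//
//   #include <algorithm>
//   #include <string>
//   #include "gutil/strings/fastmem.h"
//
//   int CompareKeys(const std::string& a, const std::string& b) {
//       size_t common = std::min(a.size(), b.size());
//       int c = strings::fastmemcmp_inlined(a.data(), b.data(), common);
//       if (c != 0) return c;
//       // When one key is a prefix of the other, the shorter key sorts first.
//       return a.size() < b.size() ? -1 : (a.size() > b.size() ? 1 : 0);
//   }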
96 | | |
97 | | // The standard memcpy is slow for small sizes that vary at run time. |
98 | | // This implementation inlines the optimal code path for each size from 1 to 16. |
99 | | // To avoid code bloat, don't use it in spots that are not performance-critical, |
100 | | // or when you don't expect sizes <= 16 to occur frequently. |
101 | 630k | inline void memcpy_inlined(void* dst, const void* src, size_t size) { |
102 | | // Compiler inlines code with minimal amount of data movement when third |
103 | | // parameter of memcpy is a constant. |
104 | 630k | switch (size) { |
105 | 0 | case 1: |
106 | 0 | memcpy(dst, src, 1); |
107 | 0 | break; |
108 | 0 | case 2: |
109 | 0 | memcpy(dst, src, 2); |
110 | 0 | break; |
111 | 0 | case 3: |
112 | 0 | memcpy(dst, src, 3); |
113 | 0 | break; |
114 | 0 | case 4: |
115 | 0 | memcpy(dst, src, 4); |
116 | 0 | break; |
117 | 15.9k | case 5: |
118 | 15.9k | memcpy(dst, src, 5); |
119 | 15.9k | break; |
120 | 127 | case 6: |
121 | 127 | memcpy(dst, src, 6); |
122 | 127 | break; |
123 | 16.2k | case 7: |
124 | 16.2k | memcpy(dst, src, 7); |
125 | 16.2k | break; |
126 | 37.5k | case 8: |
127 | 37.5k | memcpy(dst, src, 8); |
128 | 37.5k | break; |
129 | 146k | case 9: |
130 | 146k | memcpy(dst, src, 9); |
131 | 146k | break; |
132 | 140k | case 10: |
133 | 140k | memcpy(dst, src, 10); |
134 | 140k | break; |
135 | 0 | case 11: |
136 | 0 | memcpy(dst, src, 11); |
137 | 0 | break; |
138 | 753 | case 12: |
139 | 753 | memcpy(dst, src, 12); |
140 | 753 | break; |
141 | 92 | case 13: |
142 | 92 | memcpy(dst, src, 13); |
143 | 92 | break; |
144 | 124k | case 14: |
145 | 124k | memcpy(dst, src, 14); |
146 | 124k | break; |
147 | 139k | case 15: |
148 | 139k | memcpy(dst, src, 15); |
149 | 139k | break; |
150 | 75 | case 16: |
151 | 75 | memcpy(dst, src, 16); |
152 | 75 | break; |
153 | 9.20k | default: |
154 | 9.20k | memcpy(dst, src, size); |
155 | 9.20k | break; |
156 | 630k | } |
157 | 630k | } |
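// A minimal usage sketch for memcpy_inlined(): when the third argument of memcpy()
// is a compile-time constant, the compiler lowers it to a few moves instead of a
// library call, which is what the switch above exploits. AppendValue below is a
// hypothetical caller, not part of this header.
//
//   #include "gutil/strings/fastmem.h"
//
//   void AppendValue(char* dst, const char* val, size_t len) {
//       // For len <= 16 this resolves to a constant-size memcpy that the compiler
//       // can inline; larger lengths fall through to the regular memcpy() call.
//       strings::memcpy_inlined(dst, val, len);
//   }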
158 | | |
159 | | } // namespace strings |