/root/doris/be/src/util/memcpy_inlined.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #pragma once |
21 | | #ifdef __AVX2__ |
22 | | #include <emmintrin.h> |
23 | | #include <immintrin.h> |
24 | | #endif |
25 | | |
26 | | #include <stddef.h> |
27 | | #include <stdint.h> |
28 | | #include <stdio.h> |
29 | | #include <string.h> |
30 | | |
31 | | #include "common/compiler_util.h" |
32 | | #include "gutil/integral_types.h" |
33 | | #include "gutil/port.h" |
34 | | |
35 | | namespace doris { |
36 | | |
/// Copies `size` bytes from `_src` to `_dst` using size-specialised code paths.
///
/// Both pointers are declared `__restrict`, so the source and destination ranges must not overlap.
/// Sizes of 0..16 bytes are handled with at most two fixed-width moves that may overlap each other.
/// Larger sizes use AVX2 loads/stores or `rep movsb` when `__AVX2__` is defined,
/// and fall back to a plain `memcpy` call otherwise.
///
/// @param _dst  destination buffer, at least `size` bytes long
/// @param _src  source buffer, at least `size` bytes long
/// @param size  number of bytes to copy; 0 copies nothing
37 | | ALWAYS_INLINE inline void memcpy_inlined(void* __restrict _dst, const void* __restrict _src, |
38 | 630k | size_t size) { |
39 | 630k | auto dst = static_cast<uint8_t*>(_dst); |
40 | 630k | auto src = static_cast<const uint8_t*>(_src); |
41 | | |
/// Re-entry point: the AVX2 paths jump back here (`goto tail`) to finish the remaining bytes.
/// Nothing jumps here in the non-AVX2 build, hence `[[maybe_unused]]`.
42 | 630k | [[maybe_unused]] tail : |
43 | | /// Small sizes and tails after the loop for large sizes. |
44 | | /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application. |
45 | | /// This order of branches is from the disassembly of glibc's code. |
46 | | /// We copy chunks of possibly uneven size with two overlapping movs. |
47 | | /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3]. |
48 | 630k | if (size <= 16) { |
49 | 621k | if (size >= 8) { |
50 | | /// Chunks of 8..16 bytes. |
51 | 589k | __builtin_memcpy(dst + size - 8, src + size - 8, 8); |
52 | 589k | __builtin_memcpy(dst, src, 8); |
53 | 589k | } else if (size >= 4) { |
54 | | /// Chunks of 4..7 bytes. |
55 | 32.3k | __builtin_memcpy(dst + size - 4, src + size - 4, 4); |
56 | 32.3k | __builtin_memcpy(dst, src, 4); |
57 | 32.3k | } else if (size >= 2) { |
58 | | /// Chunks of 2..3 bytes. |
59 | 3 | __builtin_memcpy(dst + size - 2, src + size - 2, 2); |
60 | 3 | __builtin_memcpy(dst, src, 2); |
61 | 3 | } else if (size >= 1) { |
62 | | /// A single byte. |
63 | 0 | *dst = *src; |
64 | 0 | } |
65 | | /// No bytes remaining. |
66 | 621k | } |
67 | 9.32k | else { |
68 | | #ifdef __AVX2__ |
/// Medium sizes: 17..256 bytes.
69 | | if (size <= 256) { |
/// 17..32 bytes: copy the first 16 bytes as two 8-byte moves,
/// then let the small-size code at `tail` copy the remaining 1..16 bytes.
70 | | if (size <= 32) { |
71 | | __builtin_memcpy(dst, src, 8); |
72 | | __builtin_memcpy(dst + 8, src + 8, 8); |
73 | | size -= 16; |
74 | | dst += 16; |
75 | | src += 16; |
76 | | goto tail; |
77 | | } |
78 | | |
79 | | /// 33..256 bytes: copy 32 bytes at a time from the beginning while more than 32 bytes remain. |
80 | | /// The final unaligned store after the loop covers the last 32 bytes and may overwrite part of what the loop already copied. |
81 | | /// This is Ok, similar to the code for small sizes above. |
82 | | while (size > 32) { |
83 | | _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), |
84 | | _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src))); |
85 | | dst += 32; |
86 | | src += 32; |
87 | | size -= 32; |
88 | | } |
89 | | |
/// Here 1 <= size <= 32 and the loop has advanced dst/src by at least 32 bytes,
/// so stepping back to `dst + size - 32` stays inside the buffers.
90 | | _mm256_storeu_si256( |
91 | | reinterpret_cast<__m256i*>(dst + size - 32), |
92 | | _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + size - 32))); |
93 | | } else { |
/// Large sizes: more than 256 bytes.
/// Between 512 KiB and 2 MiB (inclusive) the whole copy is done by a single `rep movsb`.
/// dst, src and size are bound to the D, S and c registers as both inputs and outputs;
/// the "memory" clobber tells the compiler that memory is read and written by the asm.
94 | | if (size >= 512 * 1024 && size <= 2048 * 1024) { |
95 | | asm volatile("rep movsb" |
96 | | : "=D"(dst), "=S"(src), "=c"(size) |
97 | | : "0"(dst), "1"(src), "2"(size) |
98 | | : "memory"); |
99 | | } else { |
/// Number of bytes (0..31) needed to bring dst up to the next 32-byte boundary.
100 | | size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31; |
101 | | |
/// Copy one unaligned 32-byte block over the misaligned head, then advance by `padding` only.
/// dst becomes 32-byte aligned; the bytes past `padding` are simply copied again below.
102 | | if (padding > 0) { |
103 | | __m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)); |
104 | | _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head); |
105 | | dst += padding; |
106 | | src += padding; |
107 | | size -= padding; |
108 | | } |
109 | | |
110 | | /// Aligned unrolled copy. We will use half of available AVX registers. |
111 | | /// In general src and dst cannot both be aligned at the same time. |
112 | | /// So, we will use aligned stores (dst was aligned above) and unaligned loads. |
113 | | __m256i c0, c1, c2, c3, c4, c5, c6, c7; |
114 | | |
/// Each iteration moves 256 bytes: eight 32-byte loads followed by eight 32-byte aligned stores.
115 | | while (size >= 256) { |
116 | | c0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)); |
117 | | c1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 32)); |
118 | | c2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 64)); |
119 | | c3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 96)); |
120 | | c4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 128)); |
121 | | c5 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 160)); |
122 | | c6 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 192)); |
123 | | c7 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 224)); |
124 | | src += 256; |
125 | | |
126 | | _mm256_store_si256((reinterpret_cast<__m256i*>(dst)), c0); |
127 | | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 32)), c1); |
128 | | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 64)), c2); |
129 | | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 96)), c3); |
130 | | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 128)), c4); |
131 | | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 160)), c5); |
132 | | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 192)), c6); |
133 | | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 224)), c7); |
134 | | dst += 256; |
135 | | |
136 | | size -= 256; |
137 | | } |
138 | | |
/// Fewer than 256 bytes remain; finish them through the small/medium code above.
139 | | goto tail; |
140 | | } |
141 | | } |
142 | | #else |
/// No AVX2: defer to memcpy for every size above 16 bytes.
143 | 9.32k | memcpy(dst, src, size); |
144 | 9.32k | #endif |
145 | 9.32k | } |
146 | 630k | } |
147 | | } // namespace doris |