/root/doris/be/src/util/memcpy_inlined.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #pragma once |
21 | | #ifdef __AVX2__ |
22 | | #include <emmintrin.h> |
23 | | #include <immintrin.h> |
24 | | #endif |
25 | | |
26 | | #include <stddef.h> |
27 | | #include <stdint.h> |
28 | | #include <stdio.h> |
29 | | #include <string.h> |
30 | | |
31 | | #include "common/compiler_util.h" |
32 | | #include "gutil/port.h" |
33 | | |
34 | | namespace doris { |
35 | | |
/// Copies `size` bytes from `_src` to `_dst`; returns nothing.
/// Both pointers are declared `__restrict`, i.e. the caller promises that the two ranges do not overlap.
/// ALWAYS_INLINE is a macro defined outside this file (presumably it forces inlining - confirm in its header).
/// Size handling, as implemented below:
///   0..16 bytes      - at most two fixed-size copies that may overlap each other (the `tail` section).
///   With __AVX2__ defined:
///   17..32 bytes     - copy the first 16 bytes, then jump back to `tail` for the remaining 1..16.
///   33..256 bytes    - 32-byte unaligned vector copies in a loop plus one final 32-byte copy of the end.
///   above 256 bytes  - `rep movsb` when 512 KiB <= size <= 2 MiB, otherwise a 256-byte unrolled loop
///                      with 32-byte aligned stores, then a jump back to `tail` for the remainder.
///   Without __AVX2__: anything above 16 bytes is passed to memcpy().
36 | | ALWAYS_INLINE inline void memcpy_inlined(void* __restrict _dst, const void* __restrict _src, |
37 | 703k | size_t size) { |
38 | 703k | auto dst = static_cast<uint8_t*>(_dst); |
39 | 703k | auto src = static_cast<const uint8_t*>(_src); |
40 | | |
/// `tail` is re-entered by the `goto tail` statements in the AVX2 branch after dst, src and size were advanced.
/// Both gotos are inside `#ifdef __AVX2__`, so the label is unreferenced in the other build: hence [[maybe_unused]].
41 | 772k | [[maybe_unused]] tail : |
42 | | /// Small sizes (0..16 bytes) and the remainders left over by the larger-size paths below. |
43 | | /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application. |
44 | | /// This order of branches is from the disassembly of glibc's code. |
45 | | /// We copy chunks of possibly uneven size with two fixed-size copies that may overlap: one anchored at the end, one at the start. |
46 | | /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3]. |
47 | 772k | if (size <= 16) { |
48 | 660k | if (size >= 8) { |
49 | | /// Chunks of 8..16 bytes: last 8 bytes, then first 8 bytes. |
50 | 609k | __builtin_memcpy(dst + size - 8, src + size - 8, 8); |
51 | 609k | __builtin_memcpy(dst, src, 8); |
52 | 609k | } else if (size >= 4) { |
53 | | /// Chunks of 4..7 bytes: last 4 bytes, then first 4 bytes. |
54 | 46.3k | __builtin_memcpy(dst + size - 4, src + size - 4, 4); |
55 | 46.3k | __builtin_memcpy(dst, src, 4); |
56 | 46.3k | } else if (size >= 2) { |
57 | | /// Chunks of 2..3 bytes: last 2 bytes, then first 2 bytes. |
58 | 4.02k | __builtin_memcpy(dst + size - 2, src + size - 2, 2); |
59 | 4.02k | __builtin_memcpy(dst, src, 2); |
60 | 4.02k | } else if (size >= 1) { |
61 | | /// A single byte. |
62 | 581 | *dst = *src; |
63 | 581 | } |
64 | | /// Otherwise size == 0: no bytes remaining, nothing is read or written. |
65 | 660k | } |
66 | 112k | else { |
67 | 112k | #ifdef __AVX2__ |
68 | 112k | if (size <= 256) { |
/// Sizes 17..32: copy the first 16 bytes as two 8-byte copies, advance, and let `tail` copy the remaining 1..16 bytes.
69 | 61.9k | if (size <= 32) { |
70 | 18.6k | __builtin_memcpy(dst, src, 8); |
71 | 18.6k | __builtin_memcpy(dst + 8, src + 8, 8); |
72 | 18.6k | size -= 16; |
73 | 18.6k | dst += 16; |
74 | 18.6k | src += 16; |
75 | 18.6k | goto tail; |
76 | 18.6k | } |
77 | | |
78 | | /// Sizes 33..256: copy 32 bytes at a time from the beginning while more than 32 bytes remain (1..32 bytes are left after the loop). |
79 | | /// The store after the loop then copies the last 32 bytes of the range; it may overwrite some bytes the loop already wrote. |
80 | | /// This is Ok, similar to the code for small sizes above. |
81 | 211k | while (size > 32) { |
82 | 168k | _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), |
83 | 168k | _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src))); |
84 | 168k | dst += 32; |
85 | 168k | src += 32; |
86 | 168k | size -= 32; |
87 | 168k | } |
88 | | |
/// The loop ran at least once, so `dst + size - 32` is not before the start of the destination range.
89 | 43.3k | _mm256_storeu_si256( |
90 | 43.3k | reinterpret_cast<__m256i*>(dst + size - 32), |
91 | 43.3k | _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + size - 32))); |
92 | 50.2k | } else { |
/// Sizes above 256 bytes. Between 512 KiB and 2 MiB (inclusive) the copy is done by a single `rep movsb`.
/// The asm constraints tie dst, src and size to the D, S and c registers as both inputs and outputs,
/// so the three variables are updated by the instruction. This path does not jump back to `tail`.
/// NOTE(review): the reason for this particular size window is not visible here - presumably tuned by measurement.
93 | 50.2k | if (size >= 512 * 1024 && size <= 2048 * 1024) { |
94 | 2 | asm volatile("rep movsb" |
95 | 2 | : "=D"(dst), "=S"(src), "=c"(size) |
96 | 2 | : "0"(dst), "1"(src), "2"(size) |
97 | 2 | : "memory"); |
98 | 50.2k | } else { |
/// Number of bytes from dst up to the next 32-byte boundary; 0 when dst is already 32-byte aligned.
99 | 50.2k | size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31; |
100 | | |
/// Copy a full unaligned 32-byte head but advance by `padding` only, which makes dst 32-byte aligned.
/// The head bytes beyond `padding` are written again by the code that follows.
101 | 50.2k | if (padding > 0) { |
102 | 45.4k | __m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)); |
103 | 45.4k | _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head); |
104 | 45.4k | dst += padding; |
105 | 45.4k | src += padding; |
106 | 45.4k | size -= padding; |
107 | 45.4k | } |
108 | | |
109 | | /// Aligned unrolled copy, 256 bytes per iteration through eight 256-bit values. We will use half of available AVX registers. |
110 | | /// It's not possible to have both src and dst aligned in general; only dst was aligned above. |
111 | | /// So, we will use aligned stores (_mm256_store_si256) and unaligned loads (_mm256_loadu_si256). |
112 | 50.2k | __m256i c0, c1, c2, c3, c4, c5, c6, c7; |
113 | | |
114 | 2.34M | while (size >= 256) { |
115 | 2.29M | c0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)); |
116 | 2.29M | c1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 32)); |
117 | 2.29M | c2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 64)); |
118 | 2.29M | c3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 96)); |
119 | 2.29M | c4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 128)); |
120 | 2.29M | c5 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 160)); |
121 | 2.29M | c6 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 192)); |
122 | 2.29M | c7 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 224)); |
123 | 2.29M | src += 256; |
124 | | |
125 | 2.29M | _mm256_store_si256((reinterpret_cast<__m256i*>(dst)), c0); |
126 | 2.29M | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 32)), c1); |
127 | 2.29M | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 64)), c2); |
128 | 2.29M | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 96)), c3); |
129 | 2.29M | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 128)), c4); |
130 | 2.29M | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 160)), c5); |
131 | 2.29M | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 192)), c6); |
132 | 2.29M | _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 224)), c7); |
133 | 2.29M | dst += 256; |
134 | | |
135 | 2.29M | size -= 256; |
136 | 2.29M | } |
137 | | |
/// Fewer than 256 bytes remain here; they are copied by re-entering the size dispatch at `tail`.
138 | 50.2k | goto tail; |
139 | 50.2k | } |
140 | 50.2k | } |
141 | | #else |
/// Build without __AVX2__: everything larger than 16 bytes is handed to memcpy().
142 | | memcpy(dst, src, size); |
143 | | #endif |
144 | 112k | } |
145 | 772k | } |
146 | | } // namespace doris |