/root/doris/be/src/util/memcpy_inlined.h
| Line | Count | Source | 
| 1 |  | // Licensed to the Apache Software Foundation (ASF) under one | 
| 2 |  | // or more contributor license agreements.  See the NOTICE file | 
| 3 |  | // distributed with this work for additional information | 
| 4 |  | // regarding copyright ownership.  The ASF licenses this file | 
| 5 |  | // to you under the Apache License, Version 2.0 (the | 
| 6 |  | // "License"); you may not use this file except in compliance | 
| 7 |  | // with the License.  You may obtain a copy of the License at | 
| 8 |  | // | 
| 9 |  | //   http://www.apache.org/licenses/LICENSE-2.0 | 
| 10 |  | // | 
| 11 |  | // Unless required by applicable law or agreed to in writing, | 
| 12 |  | // software distributed under the License is distributed on an | 
| 13 |  | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | 
| 14 |  | // KIND, either express or implied.  See the License for the | 
| 15 |  | // specific language governing permissions and limitations | 
| 16 |  | // under the License. | 
| 17 |  |  | 
| 18 |  | #pragma once | 
| 19 |  |  | 
| 20 |  | #pragma once | 
| 21 |  | #ifdef __AVX2__ | 
| 22 |  | #include <emmintrin.h> | 
| 23 |  | #include <immintrin.h> | 
| 24 |  | #endif | 
| 25 |  |  | 
| 26 |  | #include <stddef.h> | 
| 27 |  | #include <stdint.h> | 
| 28 |  | #include <stdio.h> | 
| 29 |  | #include <string.h> | 
| 30 |  |  | 
| 31 |  | #include "common/compiler_util.h" | 
| 32 |  |  | 
| 33 |  | namespace doris { | 
| 34 |  | /** Copies `size` bytes from `_src` to `_dst`. Both pointers are `__restrict`-qualified, so the buffers are expected not to overlap. Returns nothing. */ | 
| 35 |  | ALWAYS_INLINE inline void memcpy_inlined(void* __restrict _dst, const void* __restrict _src, | 
| 36 | 1.09M |                                          size_t size) { | 
| 37 | 1.09M |     auto dst = static_cast<uint8_t*>(_dst); | 
| 38 | 1.09M |     auto src = static_cast<const uint8_t*>(_src); | 
| 39 |  | /* `tail` is re-entered through `goto` from the AVX2 paths below with whatever is left to copy; it is [[maybe_unused]] because the non-AVX2 build never jumps to it. */ | 
| 40 | 1.27M |     [[maybe_unused]] tail : | 
| 41 |  |             /// Small sizes, and the remainders left over by the larger-size paths. | 
| 42 |  |             /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application. | 
| 43 |  |             /// This order of branches is from the disassembly of glibc's code. | 
| 44 |  |             /// Each chunk of possibly uneven size is copied with two fixed-width moves that may overlap. | 
| 45 |  |             /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3]. | 
| 46 | 1.27M |             if (size <= 16) { | 
| 47 | 889k |         if (size >= 8) { | 
| 48 |  |             /// Chunks of 8..16 bytes: the last 8 bytes, then the first 8 bytes. | 
| 49 | 779k |             __builtin_memcpy(dst + size - 8, src + size - 8, 8); | 
| 50 | 779k |             __builtin_memcpy(dst, src, 8); | 
| 51 | 779k |         } else if (size >= 4) { | 
| 52 |  |             /// Chunks of 4..7 bytes: the last 4 bytes, then the first 4 bytes. | 
| 53 | 83.3k |             __builtin_memcpy(dst + size - 4, src + size - 4, 4); | 
| 54 | 83.3k |             __builtin_memcpy(dst, src, 4); | 
| 55 | 83.3k |         } else if (size >= 2) { | 
| 56 |  |             /// Chunks of 2..3 bytes: the last 2 bytes, then the first 2 bytes. | 
| 57 | 21.4k |             __builtin_memcpy(dst + size - 2, src + size - 2, 2); | 
| 58 | 21.4k |             __builtin_memcpy(dst, src, 2); | 
| 59 | 21.4k |         } else if (size >= 1) { | 
| 60 |  |             /// A single byte. | 
| 61 | 3.40k |             *dst = *src; | 
| 62 | 3.40k |         } | 
| 63 |  |         /// Otherwise size is 0 and nothing is copied. | 
| 64 | 889k |     } | 
| 65 | 382k |     else { | 
| 66 | 382k | #ifdef __AVX2__ | 
| 67 | 382k |         if (size <= 256) { | 
| 68 | 273k |             if (size <= 32) { /* 17..32 bytes: move the first 16, then finish the remaining 1..16 through `tail`. */ | 
| 69 | 66.9k |                 __builtin_memcpy(dst, src, 8); | 
| 70 | 66.9k |                 __builtin_memcpy(dst + 8, src + 8, 8); | 
| 71 | 66.9k |                 size -= 16; | 
| 72 | 66.9k |                 dst += 16; | 
| 73 | 66.9k |                 src += 16; | 
| 74 | 66.9k |                 goto tail; | 
| 75 | 66.9k |             } | 
| 76 |  |  | 
| 77 |  |             /// 33..256 bytes: copy 32 bytes at a time from the beginning while more than 32 remain. | 
| 78 |  |             /// The final 32-byte move after the loop ends exactly at the end of the buffer, so it may rewrite bytes the loop already copied. | 
| 79 |  |             /// This is Ok, similar to the code for small sizes above. | 
| 80 | 783k |             while (size > 32) { | 
| 81 | 577k |                 _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), | 
| 82 | 577k |                                     _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src))); | 
| 83 | 577k |                 dst += 32; | 
| 84 | 577k |                 src += 32; | 
| 85 | 577k |                 size -= 32; | 
| 86 | 577k |             } | 
| 87 |  |  | 
| 88 | 206k |             _mm256_storeu_si256( | 
| 89 | 206k |                     reinterpret_cast<__m256i*>(dst + size - 32), | 
| 90 | 206k |                     _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + size - 32))); | 
| 91 | 206k |         } else { | 
| 92 | 109k |             if (size >= 512 * 1024 && size <= 2048 * 1024) { /* 512 KiB..2 MiB: the whole copy is done by one `rep movsb`; nothing goes back to `tail`. */ | 
| 93 | 2 |                 asm volatile("rep movsb" | 
| 94 | 2 |                              : "=D"(dst), "=S"(src), "=c"(size) | 
| 95 | 2 |                              : "0"(dst), "1"(src), "2"(size) | 
| 96 | 2 |                              : "memory"); | 
| 97 | 109k |             } else { | 
| 98 | 109k |                 size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31; | 
| 99 |  | /* `padding` is the number of bytes from dst up to the next 32-byte boundary (0 when dst is already 32-byte aligned). */ | 
| 100 | 109k |                 if (padding > 0) { | 
| 101 | 100k |                     __m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)); | 
| 102 | 100k |                     _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head); | 
| 103 | 100k |                     dst += padding; | 
| 104 | 100k |                     src += padding; | 
| 105 | 100k |                     size -= padding; | 
| 106 | 100k |                 } | 
| 107 |  | /* When taken, the head move above wrote a full 32 bytes while the pointers advanced only by `padding`, so the stores below rewrite part of it. */ | 
| 108 |  |                 /// Aligned unrolled copy, 256 bytes per iteration. We will use half of available AVX registers. | 
| 109 |  |                 /// In general src and dst cannot both be aligned at the same time. | 
| 110 |  |                 /// So, we will use aligned stores and unaligned loads. | 
| 111 | 109k |                 __m256i c0, c1, c2, c3, c4, c5, c6, c7; | 
| 112 |  |  | 
| 113 | 5.94M |                 while (size >= 256) { | 
| 114 | 5.83M |                     c0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)); | 
| 115 | 5.83M |                     c1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 32)); | 
| 116 | 5.83M |                     c2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 64)); | 
| 117 | 5.83M |                     c3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 96)); | 
| 118 | 5.83M |                     c4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 128)); | 
| 119 | 5.83M |                     c5 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 160)); | 
| 120 | 5.83M |                     c6 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 192)); | 
| 121 | 5.83M |                     c7 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 224)); | 
| 122 | 5.83M |                     src += 256; | 
| 123 |  |  | 
| 124 | 5.83M |                     _mm256_store_si256((reinterpret_cast<__m256i*>(dst)), c0); | 
| 125 | 5.83M |                     _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 32)), c1); | 
| 126 | 5.83M |                     _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 64)), c2); | 
| 127 | 5.83M |                     _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 96)), c3); | 
| 128 | 5.83M |                     _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 128)), c4); | 
| 129 | 5.83M |                     _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 160)), c5); | 
| 130 | 5.83M |                     _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 192)), c6); | 
| 131 | 5.83M |                     _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 224)), c7); | 
| 132 | 5.83M |                     dst += 256; | 
| 133 |  |  | 
| 134 | 5.83M |                     size -= 256; | 
| 135 | 5.83M |                 } | 
| 136 |  | /* Fewer than 256 bytes remain here; they are finished through `tail`. */ | 
| 137 | 109k |                 goto tail; | 
| 138 | 109k |             } | 
| 139 | 109k |         } | 
| 140 |  | #else | 
| 141 |  |         memcpy(dst, src, size); /* Without AVX2, every size above 16 bytes is handed to the library memcpy. */ | 
| 142 |  | #endif | 
| 143 | 382k |     } | 
| 144 | 1.27M | } | 
| 145 |  | } // namespace doris |