Coverage Report

Created: 2026-03-13 12:15

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/memcpy_inlined.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#pragma once
21
#ifdef __AVX2__
22
#include <emmintrin.h>
23
#include <immintrin.h>
24
#endif
25
26
#include <stddef.h>
27
#include <stdint.h>
28
#include <stdio.h>
29
#include <string.h>
30
31
#include "common/compiler_util.h"
32
33
namespace doris {
34
35
/// Copies `size` bytes from `_src` to `_dst`.
///
/// Both pointers are declared `__restrict`, so the two ranges are expected not to overlap.
/// Sizes of at most 16 bytes are copied with at most two fixed-size moves that may overlap each other.
/// Larger sizes use AVX2 loads and stores when `__AVX2__` is defined and fall back to `memcpy` otherwise.
ALWAYS_INLINE inline void memcpy_inlined(void* __restrict _dst, const void* __restrict _src,
36
154M
                                         size_t size) {
37
154M
    auto dst = static_cast<uint8_t*>(_dst);
38
154M
    auto src = static_cast<const uint8_t*>(_src);
39
40
184M
    [[maybe_unused]] tail :
41
            /// Small sizes and tails after the loop for large sizes.
42
            /// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application.
43
            /// This order of branches is from the disassembly of glibc's code.
44
            /// We copy chunks of possibly uneven size with two overlapping movs.
45
            /// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3].
46
184M
            if (size <= 16) {
47
135M
        if (size >= 8) {
48
            /// Chunks of 8..16 bytes.
49
46.6M
            __builtin_memcpy(dst + size - 8, src + size - 8, 8);
50
46.6M
            __builtin_memcpy(dst, src, 8);
51
89.3M
        } else if (size >= 4) {
52
            /// Chunks of 4..7 bytes.
53
26.2M
            __builtin_memcpy(dst + size - 4, src + size - 4, 4);
54
26.2M
            __builtin_memcpy(dst, src, 4);
55
63.0M
        } else if (size >= 2) {
56
            /// Chunks of 2..3 bytes.
57
10.8M
            __builtin_memcpy(dst + size - 2, src + size - 2, 2);
58
10.8M
            __builtin_memcpy(dst, src, 2);
59
52.1M
        } else if (size >= 1) {
60
            /// A single byte.
61
19.9M
            *dst = *src;
62
19.9M
        }
63
        /// No bytes remaining.
64
135M
    }
65
48.8M
    else {
66
48.8M
#ifdef __AVX2__
67
48.8M
        if (size <= 256) {
68
46.4M
            if (size <= 32) {
                /// Chunks of 17..32 bytes: copy the first 16 bytes as two 8-byte moves,
                /// then let `tail` handle the remaining 1..16 bytes.
69
27.8M
                __builtin_memcpy(dst, src, 8);
70
27.8M
                __builtin_memcpy(dst + 8, src + 8, 8);
71
27.8M
                size -= 16;
72
27.8M
                dst += 16;
73
27.8M
                src += 16;
74
27.8M
                goto tail;
75
27.8M
            }
76
77
            /// Chunks of 33..256 bytes: copy 32 bytes at a time from the beginning while more than 32 bytes remain.
78
            /// The unaligned 32-byte store after the loop covers the last 32 bytes and may overwrite bytes the loop already copied.
79
            /// This is Ok, similar to the code for small sizes above.
80
55.7M
            while (size > 32) {
81
37.1M
                _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst),
82
37.1M
                                    _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)));
83
37.1M
                dst += 32;
84
37.1M
                src += 32;
85
37.1M
                size -= 32;
86
37.1M
            }
87
88
18.5M
            _mm256_storeu_si256(
89
18.5M
                    reinterpret_cast<__m256i*>(dst + size - 32),
90
18.5M
                    _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + size - 32)));
91
18.5M
        } else {
92
2.39M
            if (size >= 512 * 1024 && size <= 2048 * 1024) {
                /// Sizes from 512 KiB to 2 MiB (inclusive) are copied with a single `rep movsb`.
                /// dst, src and size are both inputs and outputs of the asm (constraints D, S and c).
                /// This branch does not jump back to `tail`.
93
50
                asm volatile("rep movsb"
94
50
                             : "=D"(dst), "=S"(src), "=c"(size)
95
50
                             : "0"(dst), "1"(src), "2"(size)
96
50
                             : "memory");
97
2.39M
            } else {
                /// Number of bytes needed to advance dst to the next 32-byte boundary (0 if it is already there).
98
2.39M
                size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31;
99
100
2.83M
                if (padding > 0) {
                    /// Copy a full unaligned 32-byte head but advance by `padding` only;
                    /// the bytes past `padding` are written again by the code below.
101
2.83M
                    __m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
102
2.83M
                    _mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head);
103
2.83M
                    dst += padding;
104
2.83M
                    src += padding;
105
2.83M
                    size -= padding;
106
2.83M
                }
107
108
                /// Aligned unrolled copy. We will use half of available AVX registers.
109
                /// Only dst has been aligned above; src is not guaranteed to be aligned as well.
110
                /// So, we will use aligned stores and unaligned loads.
111
2.39M
                __m256i c0, c1, c2, c3, c4, c5, c6, c7;
112
113
14.7M
                while (size >= 256) {
114
12.3M
                    c0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
115
12.3M
                    c1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 32));
116
12.3M
                    c2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 64));
117
12.3M
                    c3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 96));
118
12.3M
                    c4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 128));
119
12.3M
                    c5 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 160));
120
12.3M
                    c6 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 192));
121
12.3M
                    c7 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 224));
122
12.3M
                    src += 256;
123
124
12.3M
                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst)), c0);
125
12.3M
                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 32)), c1);
126
12.3M
                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 64)), c2);
127
12.3M
                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 96)), c3);
128
12.3M
                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 128)), c4);
129
12.3M
                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 160)), c5);
130
12.3M
                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 192)), c6);
131
12.3M
                    _mm256_store_si256((reinterpret_cast<__m256i*>(dst + 224)), c7);
132
12.3M
                    dst += 256;
133
134
12.3M
                    size -= 256;
135
12.3M
                }
136
137
                /// Fewer than 256 bytes remain here; finish them through `tail`.
2.39M
                goto tail;
138
2.39M
            }
139
2.39M
        }
140
#else
141
        /// Without AVX2, sizes above 16 bytes are delegated to memcpy.
        memcpy(dst, src, size);
142
#endif
143
48.8M
    }
144
184M
}
145
} // namespace doris