be/src/util/memcpy_inlined.h

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#pragma once
#ifdef __AVX2__
#include <emmintrin.h>
#include <immintrin.h>
#endif

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "common/compiler_util.h"

namespace doris {

/// Inlined memcpy: copies `size` bytes from `_src` to `_dst`.
///
/// Both pointers are declared `__restrict`, i.e. the two ranges must not overlap.
/// Sizes up to 16 bytes are copied with at most two fixed-width moves that may overlap each other.
/// Larger sizes use AVX2 loads/stores when the code is compiled with `__AVX2__` defined,
/// and are forwarded to the library `memcpy` otherwise.
ALWAYS_INLINE inline void memcpy_inlined(void* __restrict _dst, const void* __restrict _src,
                                         size_t size) {
    auto* out = static_cast<uint8_t*>(_dst);
    const auto* in = static_cast<const uint8_t*>(_src);

    /// Every pass through this loop either completes the copy (`return`) or consumes a prefix
    /// and comes back (`continue`) so that the leftover is copied by the code for smaller sizes.
    for (;;) {
        /// Small sizes, and the leftovers of larger copies.
        /// The order of branches matters; it follows the disassembly of glibc's code, although
        /// the optimal order really depends on the distribution of sizes in the application.
        /// A chunk of uneven size is copied with two overlapping moves: e.g. for 5 bytes
        /// [0, 1, 2, 3, 4] the last four bytes [1, 2, 3, 4] go first, then the first four [0, 1, 2, 3].
        if (size <= 16) {
            if (size >= 8) {
                /// 8..16 bytes.
                const size_t last = size - 8;
                __builtin_memcpy(out + last, in + last, 8);
                __builtin_memcpy(out, in, 8);
            } else if (size >= 4) {
                /// 4..7 bytes.
                const size_t last = size - 4;
                __builtin_memcpy(out + last, in + last, 4);
                __builtin_memcpy(out, in, 4);
            } else if (size >= 2) {
                /// 2..3 bytes.
                const size_t last = size - 2;
                __builtin_memcpy(out + last, in + last, 2);
                __builtin_memcpy(out, in, 2);
            } else if (size != 0) {
                /// Exactly one byte.
                out[0] = in[0];
            }
            /// size == 0: nothing left to copy.
            return;
        }

#ifdef __AVX2__
        if (size <= 256) {
            if (size <= 32) {
                /// 17..32 bytes: copy the first 16 bytes as two 8-byte moves,
                /// the remaining 1..16 bytes are handled by the small-size code.
                __builtin_memcpy(out, in, 8);
                __builtin_memcpy(out + 8, in + 8, 8);
                out += 16;
                in += 16;
                size -= 16;
                continue;
            }

            /// 33..256 bytes: copy 32 bytes at a time from the beginning while more than
            /// 32 bytes remain.
            for (; size > 32; size -= 32, out += 32, in += 32) {
                const __m256i block = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in));
                _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), block);
            }

            /// 1..32 bytes remain. Copy the 32 bytes that end exactly at the end of the range;
            /// this may rewrite bytes the loop has already copied, which is fine, same as the
            /// overlapping moves for small sizes.
            const size_t back = 32 - size;
            const __m256i last_block =
                    _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in - back));
            _mm256_storeu_si256(reinterpret_cast<__m256i*>(out - back), last_block);
            return;
        }

        if (size >= 512 * 1024 && size <= 2048 * 1024) {
            /// Between 512 KiB and 2 MiB (inclusive) the whole copy is done by `rep movsb`.
            /// The instruction advances both pointers and counts `size` down to zero,
            /// hence all three operands are read-write.
            asm volatile("rep movsb" : "+D"(out), "+S"(in), "+c"(size) : : "memory");
            return;
        }

        /// Number of bytes up to the next 32-byte boundary of the destination (0 if already there).
        const size_t misalignment = reinterpret_cast<size_t>(out) & 31;
        const size_t padding = (32 - misalignment) & 31;

        if (padding > 0) {
            /// Copy a full unaligned 32-byte block (size > 256 here, so it fits) and then
            /// advance only by `padding`: the destination becomes 32-byte aligned and the
            /// extra bytes are simply copied again by the loop below.
            const __m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in));
            _mm256_storeu_si256(reinterpret_cast<__m256i*>(out), head);
            out += padding;
            in += padding;
            size -= padding;
        }

        /// Unrolled copy of 256 bytes per iteration through eight 256-bit registers.
        /// Only the destination was aligned above and the source generally cannot be aligned
        /// at the same time, so loads are unaligned while stores are aligned.
        while (size >= 256) {
            const __m256i r0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in));
            const __m256i r1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in + 32));
            const __m256i r2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in + 64));
            const __m256i r3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in + 96));
            const __m256i r4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in + 128));
            const __m256i r5 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in + 160));
            const __m256i r6 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in + 192));
            const __m256i r7 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(in + 224));

            _mm256_store_si256(reinterpret_cast<__m256i*>(out), r0);
            _mm256_store_si256(reinterpret_cast<__m256i*>(out + 32), r1);
            _mm256_store_si256(reinterpret_cast<__m256i*>(out + 64), r2);
            _mm256_store_si256(reinterpret_cast<__m256i*>(out + 96), r3);
            _mm256_store_si256(reinterpret_cast<__m256i*>(out + 128), r4);
            _mm256_store_si256(reinterpret_cast<__m256i*>(out + 160), r5);
            _mm256_store_si256(reinterpret_cast<__m256i*>(out + 192), r6);
            _mm256_store_si256(reinterpret_cast<__m256i*>(out + 224), r7);

            in += 256;
            out += 256;
            size -= 256;
        }

        /// Fewer than 256 bytes remain; they are copied by the code for smaller sizes.
        continue;
#else
        /// Without AVX2 everything above 16 bytes is delegated to the library memcpy.
        memcpy(out, in, size);
        return;
#endif
    }
}
} // namespace doris

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17
18		#pragma once
19
20		#pragma once
21		#ifdef __AVX2__
22		#include <emmintrin.h>
23		#include <immintrin.h>
24		#endif
25
26		#include <stddef.h>
27		#include <stdint.h>
28		#include <stdio.h>
29		#include <string.h>
30
31		#include "common/compiler_util.h"
32
33		namespace doris {
34
35		ALWAYS_INLINE inline void memcpy_inlined(void* __restrict _dst, const void* __restrict _src,
36	154M	size_t size) {
37	154M	auto dst = static_cast<uint8_t*>(_dst);
38	154M	auto src = static_cast<const uint8_t*>(_src);
39
40	184M	[[maybe_unused]] tail :
41		/// Small sizes and tails after the loop for large sizes.
42		/// The order of branches is important but in fact the optimal order depends on the distribution of sizes in your application.
43		/// This order of branches is from the disassembly of glibc's code.
44		/// We copy chunks of possibly uneven size with two overlapping movs.
45		/// Example: to copy 5 bytes [0, 1, 2, 3, 4] we will copy tail [1, 2, 3, 4] first and then head [0, 1, 2, 3].
46	184M	if (size <= 16) {
47	135M	if (size >= 8) {
48		/// Chunks of 8..16 bytes.
49	46.6M	__builtin_memcpy(dst + size - 8, src + size - 8, 8);
50	46.6M	__builtin_memcpy(dst, src, 8);
51	89.3M	} else if (size >= 4) {
52		/// Chunks of 4..7 bytes.
53	26.2M	__builtin_memcpy(dst + size - 4, src + size - 4, 4);
54	26.2M	__builtin_memcpy(dst, src, 4);
55	63.0M	} else if (size >= 2) {
56		/// Chunks of 2..3 bytes.
57	10.8M	__builtin_memcpy(dst + size - 2, src + size - 2, 2);
58	10.8M	__builtin_memcpy(dst, src, 2);
59	52.1M	} else if (size >= 1) {
60		/// A single byte.
61	19.9M	*dst = *src;
62	19.9M	}
63		/// No bytes remaining.
64	135M	}
65	48.8M	else {
66	48.8M	#ifdef __AVX2__
67	48.8M	if (size <= 256) {
68	46.4M	if (size <= 32) {
69	27.8M	__builtin_memcpy(dst, src, 8);
70	27.8M	__builtin_memcpy(dst + 8, src + 8, 8);
71	27.8M	size -= 16;
72	27.8M	dst += 16;
73	27.8M	src += 16;
74	27.8M	goto tail;
75	27.8M	}
76
77		/// Then we will copy every 16 bytes from the beginning in a loop.
78		/// The last loop iteration will possibly overwrite some part of already copied last 32 bytes.
79		/// This is Ok, similar to the code for small sizes above.
80	55.7M	while (size > 32) {
81	37.1M	_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst),
82	37.1M	_mm256_loadu_si256(reinterpret_cast<const __m256i*>(src)));
83	37.1M	dst += 32;
84	37.1M	src += 32;
85	37.1M	size -= 32;
86	37.1M	}
87
88	18.5M	_mm256_storeu_si256(
89	18.5M	reinterpret_cast<__m256i*>(dst + size - 32),
90	18.5M	_mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + size - 32)));
91	18.5M	} else {
92	2.39M	if (size >= 512 * 1024 && size <= 2048 * 1024) {
93	50	asm volatile("rep movsb"
94	50	: "=D"(dst), "=S"(src), "=c"(size)
95	50	: "0"(dst), "1"(src), "2"(size)
96	50	: "memory");
97	2.39M	} else {
98	2.39M	size_t padding = (32 - (reinterpret_cast<size_t>(dst) & 31)) & 31;
99
100	2.83M	if (padding > 0) {
101	2.83M	__m256i head = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
102	2.83M	_mm256_storeu_si256(reinterpret_cast<__m256i*>(dst), head);
103	2.83M	dst += padding;
104	2.83M	src += padding;
105	2.83M	size -= padding;
106	2.83M	}
107
108		/// Aligned unrolled copy. We will use half of available AVX registers.
109		/// It's not possible to have both src and dst aligned.
110		/// So, we will use aligned stores and unaligned loads.
111	2.39M	__m256i c0, c1, c2, c3, c4, c5, c6, c7;
112
113	14.7M	while (size >= 256) {
114	12.3M	c0 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src));
115	12.3M	c1 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 32));
116	12.3M	c2 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 64));
117	12.3M	c3 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 96));
118	12.3M	c4 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 128));
119	12.3M	c5 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 160));
120	12.3M	c6 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 192));
121	12.3M	c7 = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(src + 224));
122	12.3M	src += 256;
123
124	12.3M	_mm256_store_si256((reinterpret_cast<__m256i*>(dst)), c0);
125	12.3M	_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 32)), c1);
126	12.3M	_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 64)), c2);
127	12.3M	_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 96)), c3);
128	12.3M	_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 128)), c4);
129	12.3M	_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 160)), c5);
130	12.3M	_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 192)), c6);
131	12.3M	_mm256_store_si256((reinterpret_cast<__m256i*>(dst + 224)), c7);
132	12.3M	dst += 256;
133
134	12.3M	size -= 256;
135	12.3M	}
136
137	2.39M	goto tail;
138	2.39M	}
139	2.39M	}
140		#else
141		memcpy(dst, src, size);
142		#endif
143	48.8M	}
144	184M	}
145		} // namespace doris

Coverage Report

Created: 2026-03-13 12:15