be/src/util/simd/vstring_function.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #ifdef __AVX2__ |
21 | | #include <immintrin.h> |
22 | | #elif defined(__ARM_FEATURE_SVE) |
23 | | #include <arm_sve.h> |
24 | | #endif |
25 | | #include <unistd.h> |
26 | | |
27 | | #include <array> |
28 | | #include <cstddef> |
29 | | #include <cstdint> |
30 | | |
31 | | #include "core/string_ref.h" |
32 | | #include "util/simd/lower_upper_impl.h" |
33 | | #include "util/sse_util.hpp" |
34 | | |
35 | | namespace doris { |
36 | | |
37 | | static constexpr std::array<uint8_t, 256> UTF8_BYTE_LENGTH = { |
38 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
39 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
40 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
41 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
42 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
43 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
44 | | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
45 | | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, |
46 | | 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 6, 6}; |
47 | | |
48 | 9.60M | inline uint8_t get_utf8_byte_length(uint8_t character) { |
49 | 9.60M | return UTF8_BYTE_LENGTH[character]; |
50 | 9.60M | } |
51 | | |
52 | | // copy from https://github.com/lemire/fastvalidate-utf-8/blob/master/include/simdasciicheck.h |
53 | | // The function returns true (1) if all chars passed in src are |
54 | | // 7-bit values (0x00..0x7F). Otherwise, it returns false (0). |
55 | 0 | inline bool validate_ascii_fast(const char* src, size_t len) { |
56 | 0 | size_t i = 0; |
57 | 0 | __m128i has_error = _mm_setzero_si128(); |
58 | 0 | if (len >= 16) { |
59 | 0 | for (; i <= len - 16; i += 16) { |
60 | 0 | __m128i current_bytes = _mm_loadu_si128((const __m128i*)(src + i)); |
61 | 0 | has_error = _mm_or_si128(has_error, current_bytes); |
62 | 0 | } |
63 | 0 | } |
64 | 0 | int error_mask = _mm_movemask_epi8(has_error); |
65 | 0 |
|
66 | 0 | char tail_has_error = 0; |
67 | 0 | for (; i < len; i++) { |
68 | 0 | tail_has_error |= src[i]; |
69 | 0 | } |
70 | 0 | error_mask |= (tail_has_error & 0x80); |
71 | 0 |
|
72 | 0 | return !error_mask; |
73 | 0 | } |
74 | | |
75 | | #ifdef __AVX2__ |
76 | | #include <x86intrin.h> |
77 | | // The function returns true (1) if all chars passed in src are |
78 | | // 7-bit values (0x00..0x7F). Otherwise, it returns false (0). |
79 | 31.0k | inline bool validate_ascii_fast_avx(const char* src, size_t len) { |
80 | 31.0k | size_t i = 0; |
81 | 31.0k | __m256i has_error = _mm256_setzero_si256(); |
82 | 31.0k | if (len >= 32) { |
83 | 284k | for (; i <= len - 32; i += 32) { |
84 | 282k | __m256i current_bytes = _mm256_loadu_si256((const __m256i*)(src + i)); |
85 | 282k | has_error = _mm256_or_si256(has_error, current_bytes); |
86 | 282k | } |
87 | 1.32k | } |
88 | 31.0k | int error_mask = _mm256_movemask_epi8(has_error); |
89 | | |
90 | 31.0k | char tail_has_error = 0; |
91 | 189k | for (; i < len; i++) { |
92 | 158k | tail_has_error |= src[i]; |
93 | 158k | } |
94 | 31.0k | error_mask |= (tail_has_error & 0x80); |
95 | | |
96 | 31.0k | return !error_mask; |
97 | 31.0k | } |
98 | | #elif defined(__ARM_FEATURE_SVE) |
99 | | inline bool validate_ascii_fast_sve(const char* src, size_t len) { |
100 | | for (size_t i = 0; i < len; i += svcntb()) { |
101 | | svbool_t pg = svwhilelt_b8(i, len); |
102 | | svuint8_t v = svld1_u8(pg, reinterpret_cast<const uint8_t*>(src + i)); |
103 | | // Check sign bit set => byte < 0 as int8 => non-ASCII |
104 | | svbool_t neg = svcmplt_n_s8(pg, svreinterpret_s8(v), 0); |
105 | | if (svptest_any(pg, neg)) { |
106 | | return false; |
107 | | } |
108 | | } |
109 | | |
110 | | return true; |
111 | | } |
112 | | #endif |
113 | | |
114 | | namespace simd { |
115 | | |
116 | | class VStringFunctions { |
117 | | public: |
118 | | #if defined(__SSE2__) || defined(__aarch64__) |
119 | | /// n equals to 16 chars length |
120 | | static constexpr auto REGISTER_SIZE = sizeof(__m128i); |
121 | | #endif |
122 | | |
123 | | template <bool trim_single> |
124 | | static inline const unsigned char* rtrim(const unsigned char* begin, const unsigned char* end, |
125 | 305 | const StringRef& remove_str) { |
126 | 305 | if (remove_str.size == 0) { |
127 | 0 | return end; |
128 | 0 | } |
129 | 305 | const auto* p = end; |
130 | | |
131 | 305 | if constexpr (trim_single) { |
132 | 250 | const auto ch = remove_str.data[0]; |
133 | 250 | #if defined(__AVX2__) |
134 | 250 | constexpr auto AVX2_BYTES = sizeof(__m256i); |
135 | 250 | const auto size = end - begin; |
136 | 250 | const auto* const avx2_begin = end - size / AVX2_BYTES * AVX2_BYTES; |
137 | 250 | const auto spaces = _mm256_set1_epi8(ch); |
138 | 251 | for (p = end - AVX2_BYTES; p >= avx2_begin; p -= AVX2_BYTES) { |
139 | 5 | uint32_t masks = _mm256_movemask_epi8( |
140 | 5 | _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)p), spaces)); |
141 | 5 | if ((~masks)) { |
142 | 4 | break; |
143 | 4 | } |
144 | 5 | } |
145 | 250 | p += AVX2_BYTES; |
146 | | #elif defined(__ARM_FEATURE_SVE) |
147 | | const auto size = static_cast<size_t>(end - begin); |
148 | | const size_t vl = svcntb(); |
149 | | if (size >= vl) { |
150 | | const auto* const sve_begin = end - (size / vl) * vl; |
151 | | svbool_t pg = svptrue_b8(); |
152 | | for (p = end - vl; p >= sve_begin; p -= vl) { |
153 | | svuint8_t v = svld1_u8(pg, p); |
154 | | svbool_t neq = svcmpne_n_u8(pg, v, static_cast<uint8_t>(ch)); |
155 | | if (svptest_any(pg, neq)) { |
156 | | break; |
157 | | } |
158 | | } |
159 | | p += vl; |
160 | | } |
161 | | #endif |
162 | 493 | for (; (p - 1) >= begin && *(p - 1) == ch; p--) { |
163 | 243 | } |
164 | 250 | return p; |
165 | 250 | } |
166 | | |
167 | 0 | const auto remove_size = remove_str.size; |
168 | 305 | const auto* const remove_data = remove_str.data; |
169 | 309 | while (p - begin >= remove_size) { |
170 | 57 | if (memcmp(p - remove_size, remove_data, remove_size) == 0) { |
171 | 4 | p -= remove_str.size; |
172 | 53 | } else { |
173 | 53 | break; |
174 | 53 | } |
175 | 57 | } |
176 | 305 | return p; |
177 | 305 | } _ZN5doris4simd16VStringFunctions5rtrimILb1EEEPKhS4_S4_RKNS_9StringRefE Line | Count | Source | 125 | 250 | const StringRef& remove_str) { | 126 | 250 | if (remove_str.size == 0) { | 127 | 0 | return end; | 128 | 0 | } | 129 | 250 | const auto* p = end; | 130 | | | 131 | 250 | if constexpr (trim_single) { | 132 | 250 | const auto ch = remove_str.data[0]; | 133 | 250 | #if defined(__AVX2__) | 134 | 250 | constexpr auto AVX2_BYTES = sizeof(__m256i); | 135 | 250 | const auto size = end - begin; | 136 | 250 | const auto* const avx2_begin = end - size / AVX2_BYTES * AVX2_BYTES; | 137 | 250 | const auto spaces = _mm256_set1_epi8(ch); | 138 | 251 | for (p = end - AVX2_BYTES; p >= avx2_begin; p -= AVX2_BYTES) { | 139 | 5 | uint32_t masks = _mm256_movemask_epi8( | 140 | 5 | _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)p), spaces)); | 141 | 5 | if ((~masks)) { | 142 | 4 | break; | 143 | 4 | } | 144 | 5 | } | 145 | 250 | p += AVX2_BYTES; | 146 | | #elif defined(__ARM_FEATURE_SVE) | 147 | | const auto size = static_cast<size_t>(end - begin); | 148 | | const size_t vl = svcntb(); | 149 | | if (size >= vl) { | 150 | | const auto* const sve_begin = end - (size / vl) * vl; | 151 | | svbool_t pg = svptrue_b8(); | 152 | | for (p = end - vl; p >= sve_begin; p -= vl) { | 153 | | svuint8_t v = svld1_u8(pg, p); | 154 | | svbool_t neq = svcmpne_n_u8(pg, v, static_cast<uint8_t>(ch)); | 155 | | if (svptest_any(pg, neq)) { | 156 | | break; | 157 | | } | 158 | | } | 159 | | p += vl; | 160 | | } | 161 | | #endif | 162 | 493 | for (; (p - 1) >= begin && *(p - 1) == ch; p--) { | 163 | 243 | } | 164 | 250 | return p; | 165 | 250 | } | 166 | | | 167 | 0 | const auto remove_size = remove_str.size; | 168 | 250 | const auto* const remove_data = remove_str.data; | 169 | 250 | while (p - begin >= remove_size) { | 170 | 0 | if (memcmp(p - remove_size, remove_data, remove_size) == 0) { | 171 | 0 | p -= remove_str.size; | 172 | 0 | } else { | 173 | 0 | break; | 174 | 0 | } | 175 | 0 | } | 176 | 250 | return p; | 177 | 250 | } |
_ZN5doris4simd16VStringFunctions5rtrimILb0EEEPKhS4_S4_RKNS_9StringRefE Line | Count | Source | 125 | 55 | const StringRef& remove_str) { | 126 | 55 | if (remove_str.size == 0) { | 127 | 0 | return end; | 128 | 0 | } | 129 | 55 | const auto* p = end; | 130 | | | 131 | | if constexpr (trim_single) { | 132 | | const auto ch = remove_str.data[0]; | 133 | | #if defined(__AVX2__) | 134 | | constexpr auto AVX2_BYTES = sizeof(__m256i); | 135 | | const auto size = end - begin; | 136 | | const auto* const avx2_begin = end - size / AVX2_BYTES * AVX2_BYTES; | 137 | | const auto spaces = _mm256_set1_epi8(ch); | 138 | | for (p = end - AVX2_BYTES; p >= avx2_begin; p -= AVX2_BYTES) { | 139 | | uint32_t masks = _mm256_movemask_epi8( | 140 | | _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)p), spaces)); | 141 | | if ((~masks)) { | 142 | | break; | 143 | | } | 144 | | } | 145 | | p += AVX2_BYTES; | 146 | | #elif defined(__ARM_FEATURE_SVE) | 147 | | const auto size = static_cast<size_t>(end - begin); | 148 | | const size_t vl = svcntb(); | 149 | | if (size >= vl) { | 150 | | const auto* const sve_begin = end - (size / vl) * vl; | 151 | | svbool_t pg = svptrue_b8(); | 152 | | for (p = end - vl; p >= sve_begin; p -= vl) { | 153 | | svuint8_t v = svld1_u8(pg, p); | 154 | | svbool_t neq = svcmpne_n_u8(pg, v, static_cast<uint8_t>(ch)); | 155 | | if (svptest_any(pg, neq)) { | 156 | | break; | 157 | | } | 158 | | } | 159 | | p += vl; | 160 | | } | 161 | | #endif | 162 | | for (; (p - 1) >= begin && *(p - 1) == ch; p--) { | 163 | | } | 164 | | return p; | 165 | | } | 166 | | | 167 | 55 | const auto remove_size = remove_str.size; | 168 | 55 | const auto* const remove_data = remove_str.data; | 169 | 59 | while (p - begin >= remove_size) { | 170 | 57 | if (memcmp(p - remove_size, remove_data, remove_size) == 0) { | 171 | 4 | p -= remove_str.size; | 172 | 53 | } else { | 173 | 53 | break; | 174 | 53 | } | 175 | 57 | } | 176 | 55 | return p; | 177 | 55 | } |
|
178 | | |
179 | | template <bool trim_single> |
180 | | static inline const unsigned char* ltrim(const unsigned char* begin, const unsigned char* end, |
181 | 305 | const StringRef& remove_str) { |
182 | 305 | if (remove_str.size == 0) { |
183 | 0 | return begin; |
184 | 0 | } |
185 | 305 | const auto* p = begin; |
186 | | |
187 | 305 | if constexpr (trim_single) { |
188 | 200 | const auto ch = remove_str.data[0]; |
189 | 200 | #if defined(__AVX2__) |
190 | 200 | constexpr auto AVX2_BYTES = sizeof(__m256i); |
191 | 200 | const auto size = end - begin; |
192 | 200 | const auto* const avx2_end = begin + size / AVX2_BYTES * AVX2_BYTES; |
193 | 200 | const auto spaces = _mm256_set1_epi8(ch); |
194 | 200 | for (; p < avx2_end; p += AVX2_BYTES) { |
195 | 6 | uint32_t masks = _mm256_movemask_epi8( |
196 | 6 | _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)p), spaces)); |
197 | 6 | if ((~masks)) { |
198 | 6 | break; |
199 | 6 | } |
200 | 6 | } |
201 | | #elif defined(__ARM_FEATURE_SVE) |
202 | | const auto size = static_cast<size_t>(end - begin); |
203 | | const size_t vl = svcntb(); |
204 | | if (size >= vl) { |
205 | | const auto* const sve_end = begin + (size / vl) * vl; |
206 | | svbool_t pg = svptrue_b8(); |
207 | | for (; p < sve_end; p += vl) { |
208 | | svuint8_t v = svld1_u8(pg, p); |
209 | | svbool_t eq = svcmpne_n_u8(pg, v, static_cast<uint8_t>(ch)); |
210 | | if (svptest_any(pg, eq)) { |
211 | | break; |
212 | | } |
213 | | } |
214 | | } |
215 | | #endif |
216 | 441 | for (; p < end && *p == ch; ++p) { |
217 | 241 | } |
218 | 200 | return p; |
219 | 200 | } |
220 | | |
221 | 0 | const auto remove_size = remove_str.size; |
222 | 305 | const auto* const remove_data = remove_str.data; |
223 | 409 | while (end - p >= remove_size) { |
224 | 207 | if (memcmp(p, remove_data, remove_size) == 0) { |
225 | 104 | p += remove_str.size; |
226 | 104 | } else { |
227 | 103 | break; |
228 | 103 | } |
229 | 207 | } |
230 | 305 | return p; |
231 | 305 | } _ZN5doris4simd16VStringFunctions5ltrimILb1EEEPKhS4_S4_RKNS_9StringRefE Line | Count | Source | 181 | 200 | const StringRef& remove_str) { | 182 | 200 | if (remove_str.size == 0) { | 183 | 0 | return begin; | 184 | 0 | } | 185 | 200 | const auto* p = begin; | 186 | | | 187 | 200 | if constexpr (trim_single) { | 188 | 200 | const auto ch = remove_str.data[0]; | 189 | 200 | #if defined(__AVX2__) | 190 | 200 | constexpr auto AVX2_BYTES = sizeof(__m256i); | 191 | 200 | const auto size = end - begin; | 192 | 200 | const auto* const avx2_end = begin + size / AVX2_BYTES * AVX2_BYTES; | 193 | 200 | const auto spaces = _mm256_set1_epi8(ch); | 194 | 200 | for (; p < avx2_end; p += AVX2_BYTES) { | 195 | 6 | uint32_t masks = _mm256_movemask_epi8( | 196 | 6 | _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)p), spaces)); | 197 | 6 | if ((~masks)) { | 198 | 6 | break; | 199 | 6 | } | 200 | 6 | } | 201 | | #elif defined(__ARM_FEATURE_SVE) | 202 | | const auto size = static_cast<size_t>(end - begin); | 203 | | const size_t vl = svcntb(); | 204 | | if (size >= vl) { | 205 | | const auto* const sve_end = begin + (size / vl) * vl; | 206 | | svbool_t pg = svptrue_b8(); | 207 | | for (; p < sve_end; p += vl) { | 208 | | svuint8_t v = svld1_u8(pg, p); | 209 | | svbool_t eq = svcmpne_n_u8(pg, v, static_cast<uint8_t>(ch)); | 210 | | if (svptest_any(pg, eq)) { | 211 | | break; | 212 | | } | 213 | | } | 214 | | } | 215 | | #endif | 216 | 441 | for (; p < end && *p == ch; ++p) { | 217 | 241 | } | 218 | 200 | return p; | 219 | 200 | } | 220 | | | 221 | 0 | const auto remove_size = remove_str.size; | 222 | 200 | const auto* const remove_data = remove_str.data; | 223 | 200 | while (end - p >= remove_size) { | 224 | 0 | if (memcmp(p, remove_data, remove_size) == 0) { | 225 | 0 | p += remove_str.size; | 226 | 0 | } else { | 227 | 0 | break; | 228 | 0 | } | 229 | 0 | } | 230 | 200 | return p; | 231 | 200 | } |
_ZN5doris4simd16VStringFunctions5ltrimILb0EEEPKhS4_S4_RKNS_9StringRefE Line | Count | Source | 181 | 105 | const StringRef& remove_str) { | 182 | 105 | if (remove_str.size == 0) { | 183 | 0 | return begin; | 184 | 0 | } | 185 | 105 | const auto* p = begin; | 186 | | | 187 | | if constexpr (trim_single) { | 188 | | const auto ch = remove_str.data[0]; | 189 | | #if defined(__AVX2__) | 190 | | constexpr auto AVX2_BYTES = sizeof(__m256i); | 191 | | const auto size = end - begin; | 192 | | const auto* const avx2_end = begin + size / AVX2_BYTES * AVX2_BYTES; | 193 | | const auto spaces = _mm256_set1_epi8(ch); | 194 | | for (; p < avx2_end; p += AVX2_BYTES) { | 195 | | uint32_t masks = _mm256_movemask_epi8( | 196 | | _mm256_cmpeq_epi8(_mm256_loadu_si256((__m256i*)p), spaces)); | 197 | | if ((~masks)) { | 198 | | break; | 199 | | } | 200 | | } | 201 | | #elif defined(__ARM_FEATURE_SVE) | 202 | | const auto size = static_cast<size_t>(end - begin); | 203 | | const size_t vl = svcntb(); | 204 | | if (size >= vl) { | 205 | | const auto* const sve_end = begin + (size / vl) * vl; | 206 | | svbool_t pg = svptrue_b8(); | 207 | | for (; p < sve_end; p += vl) { | 208 | | svuint8_t v = svld1_u8(pg, p); | 209 | | svbool_t eq = svcmpne_n_u8(pg, v, static_cast<uint8_t>(ch)); | 210 | | if (svptest_any(pg, eq)) { | 211 | | break; | 212 | | } | 213 | | } | 214 | | } | 215 | | #endif | 216 | | for (; p < end && *p == ch; ++p) { | 217 | | } | 218 | | return p; | 219 | | } | 220 | | | 221 | 105 | const auto remove_size = remove_str.size; | 222 | 105 | const auto* const remove_data = remove_str.data; | 223 | 209 | while (end - p >= remove_size) { | 224 | 207 | if (memcmp(p, remove_data, remove_size) == 0) { | 225 | 104 | p += remove_str.size; | 226 | 104 | } else { | 227 | 103 | break; | 228 | 103 | } | 229 | 207 | } | 230 | 105 | return p; | 231 | 105 | } |
|
232 | | |
233 | | // Iterate a UTF-8 string without exceeding a given length n. |
234 | | // The function returns two values: |
235 | | // the first represents the byte length traversed, and the second represents the char length traversed. |
236 | | static inline std::pair<size_t, size_t> iterate_utf8_with_limit_length(const char* begin, |
237 | | const char* end, |
238 | 3.22k | size_t n) { |
239 | 3.22k | const char* p = begin; |
240 | 3.22k | int char_size = 0; |
241 | | |
242 | 3.22k | size_t i = 0; |
243 | 8.95k | for (; i < n && p < end; ++i, p += char_size) { |
244 | 5.73k | char_size = UTF8_BYTE_LENGTH[static_cast<uint8_t>(*p)]; |
245 | 5.73k | } |
246 | | |
247 | 3.22k | return {p - begin, i}; |
248 | 3.22k | } |
249 | | |
250 | | // Gcc will do auto simd in this function |
251 | | // if input empty, return true |
252 | 31.0k | static bool is_ascii(const StringRef& str) { |
253 | 31.0k | #ifdef __AVX2__ |
254 | 31.0k | return validate_ascii_fast_avx(str.data, str.size); |
255 | | #elif defined(__ARM_FEATURE_SVE) |
256 | | return validate_ascii_fast_sve(str.data, str.size); |
257 | | #endif |
258 | 0 | return validate_ascii_fast(str.data, str.size); |
259 | 31.0k | } |
260 | | |
261 | 124 | static void reverse(const StringRef& str, std::string* dst) { |
262 | 124 | if (is_ascii(str)) { |
263 | 87 | int64_t begin = 0; |
264 | 87 | int64_t end = str.size; |
265 | 87 | int64_t result_end = dst->size() - 1; |
266 | | |
267 | | // auto SIMD here |
268 | 87 | auto* __restrict l = dst->data(); |
269 | 87 | auto* __restrict r = str.data; |
270 | 844 | for (; begin < end; ++begin, --result_end) { |
271 | 757 | l[result_end] = r[begin]; |
272 | 757 | } |
273 | 87 | } else { |
274 | 37 | char* dst_data = dst->data(); |
275 | 306 | for (size_t i = 0, char_size = 0; i < str.size; i += char_size) { |
276 | 269 | char_size = UTF8_BYTE_LENGTH[(unsigned char)(str.data)[i]]; |
277 | | // there exists occasion where the last character is an illegal UTF-8 one which returns |
278 | | // a char_size larger than the actual space, which would cause offset execeeding the buffer range |
279 | | // for example, consider str.size=4, i = 3, then the last char returns char_size 2, then |
280 | | // the str.data + offset would exceed the buffer range |
281 | 269 | size_t offset = i + char_size; |
282 | 269 | if (offset > str.size) { |
283 | 3 | offset = str.size; |
284 | 3 | } |
285 | 269 | std::copy(str.data + i, str.data + offset, dst_data + str.size - offset); |
286 | 269 | } |
287 | 37 | } |
288 | 124 | } |
289 | | |
290 | 1.05k | static void hex_encode(const unsigned char* src_str, size_t length, char* dst_str) { |
291 | 1.05k | static constexpr auto hex_table = "0123456789ABCDEF"; |
292 | 1.05k | auto src_str_end = src_str + length; |
293 | | |
294 | 1.05k | #if defined(__SSE2__) || defined(__aarch64__) |
295 | 1.05k | constexpr auto step = sizeof(uint64_t); |
296 | 1.05k | if (src_str + step < src_str_end) { |
297 | 918 | const auto hex_map = _mm_loadu_si128(reinterpret_cast<const __m128i*>(hex_table)); |
298 | 918 | const auto mask_map = _mm_set1_epi8(0x0F); |
299 | | |
300 | 926 | do { |
301 | 926 | auto data = _mm_loadu_si64(src_str); |
302 | 926 | auto hex_loc = |
303 | 926 | _mm_and_si128(_mm_unpacklo_epi8(_mm_srli_epi64(data, 4), data), mask_map); |
304 | 926 | _mm_storeu_si128(reinterpret_cast<__m128i*>(dst_str), |
305 | 926 | _mm_shuffle_epi8(hex_map, hex_loc)); |
306 | | |
307 | 926 | src_str += step; |
308 | 926 | dst_str += step * 2; |
309 | 926 | } while (src_str + step < src_str_end); |
310 | 918 | } |
311 | 1.05k | #endif |
312 | 1.05k | char res[2]; |
313 | | // hex(str) str length is n, result must be 2 * n length |
314 | 8.55k | for (; src_str < src_str_end; src_str += 1, dst_str += 2) { |
315 | | // low 4 bits |
316 | 7.50k | *(res + 1) = hex_table[src_str[0] & 0x0F]; |
317 | | // high 4 bits |
318 | 7.50k | *res = hex_table[(src_str[0] >> 4)]; |
319 | 7.50k | std::copy(res, res + 2, dst_str); |
320 | 7.50k | } |
321 | 1.05k | } |
322 | | |
323 | 247 | static void to_lower(const uint8_t* src, int64_t len, uint8_t* dst) { |
324 | 247 | if (len <= 0) { |
325 | 3 | return; |
326 | 3 | } |
327 | 244 | LowerUpperImpl<'A', 'Z'> lowerUpper; |
328 | 244 | lowerUpper.transfer(src, src + len, dst); |
329 | 244 | } |
330 | | |
331 | 88 | static void to_upper(const uint8_t* src, int64_t len, uint8_t* dst) { |
332 | 88 | if (len <= 0) { |
333 | 3 | return; |
334 | 3 | } |
335 | 85 | LowerUpperImpl<'a', 'z'> lowerUpper; |
336 | 85 | lowerUpper.transfer(src, src + len, dst); |
337 | 85 | } |
338 | | |
339 | 723 | static inline size_t get_char_len(const char* src, size_t len, std::vector<size_t>& str_index) { |
340 | 723 | size_t char_len = 0; |
341 | 4.74k | for (size_t i = 0, char_size = 0; i < len; i += char_size) { |
342 | 4.02k | char_size = UTF8_BYTE_LENGTH[(unsigned char)src[i]]; |
343 | 4.02k | str_index.push_back(i); |
344 | 4.02k | ++char_len; |
345 | 4.02k | } |
346 | 723 | return char_len; |
347 | 723 | } |
348 | | |
349 | | // utf8-encoding: |
350 | | // - 1-byte: 0xxx_xxxx; |
351 | | // - 2-byte: 110x_xxxx 10xx_xxxx; |
352 | | // - 3-byte: 1110_xxxx 10xx_xxxx 10xx_xxxx; |
353 | | // - 4-byte: 1111_0xxx 10xx_xxxx 10xx_xxxx 10xx_xxxx. |
354 | | // Counting utf8 chars in a byte string is equivalent to counting first byte of utf chars, that |
355 | | // is to say, counting bytes which do not match 10xx_xxxx pattern. |
356 | | // All 0xxx_xxxx, 110x_xxxx, 1110_xxxx and 1111_0xxx are greater than 1011_1111 when use int8_t arithmetic, |
357 | | // so just count bytes greater than 1011_1111 in a byte string as the result of utf8_length. |
358 | | // get_char_len is used to return the UTF-8 length of a string. |
359 | | // The return value will never exceed len. |
360 | | template <typename T> |
361 | 1.29k | static inline T get_char_len(const char* src, T len) { |
362 | 1.29k | T char_len = 0; |
363 | 1.29k | const char* p = src; |
364 | 1.29k | const char* end = p + len; |
365 | 1.29k | #if defined(__SSE2__) || defined(__aarch64__) |
366 | 1.29k | constexpr auto bytes_sse2 = sizeof(__m128i); |
367 | 1.29k | const auto src_end_sse2 = p + (len & ~(bytes_sse2 - 1)); |
368 | | // threshold = 1011_1111 |
369 | 1.29k | const auto threshold = _mm_set1_epi8(0xBF); |
370 | 1.51k | for (; p < src_end_sse2; p += bytes_sse2) { |
371 | 223 | char_len += __builtin_popcount(_mm_movemask_epi8(_mm_cmpgt_epi8( |
372 | 223 | _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)), threshold))); |
373 | 223 | } |
374 | 1.29k | #endif |
375 | | // process remaining bytes the number of which not exceed bytes_sse2 at the |
376 | | // tail of string, one by one. |
377 | 10.1k | for (; p < end; ++p) { |
378 | 8.87k | char_len += static_cast<int8_t>(*p) > static_cast<int8_t>(0xBF); |
379 | 8.87k | } |
380 | 1.29k | return char_len; |
381 | 1.29k | } _ZN5doris4simd16VStringFunctions12get_char_lenIiEET_PKcS3_ Line | Count | Source | 361 | 947 | static inline T get_char_len(const char* src, T len) { | 362 | 947 | T char_len = 0; | 363 | 947 | const char* p = src; | 364 | 947 | const char* end = p + len; | 365 | 947 | #if defined(__SSE2__) || defined(__aarch64__) | 366 | 947 | constexpr auto bytes_sse2 = sizeof(__m128i); | 367 | 947 | const auto src_end_sse2 = p + (len & ~(bytes_sse2 - 1)); | 368 | | // threshold = 1011_1111 | 369 | 947 | const auto threshold = _mm_set1_epi8(0xBF); | 370 | 1.16k | for (; p < src_end_sse2; p += bytes_sse2) { | 371 | 215 | char_len += __builtin_popcount(_mm_movemask_epi8(_mm_cmpgt_epi8( | 372 | 215 | _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)), threshold))); | 373 | 215 | } | 374 | 947 | #endif | 375 | | // process remaining bytes the number of which not exceed bytes_sse2 at the | 376 | | // tail of string, one by one. | 377 | 7.57k | for (; p < end; ++p) { | 378 | 6.63k | char_len += static_cast<int8_t>(*p) > static_cast<int8_t>(0xBF); | 379 | 6.63k | } | 380 | 947 | return char_len; | 381 | 947 | } |
_ZN5doris4simd16VStringFunctions12get_char_lenImEET_PKcS3_ Line | Count | Source | 361 | 345 | static inline T get_char_len(const char* src, T len) { | 362 | 345 | T char_len = 0; | 363 | 345 | const char* p = src; | 364 | 345 | const char* end = p + len; | 365 | 345 | #if defined(__SSE2__) || defined(__aarch64__) | 366 | 345 | constexpr auto bytes_sse2 = sizeof(__m128i); | 367 | 345 | const auto src_end_sse2 = p + (len & ~(bytes_sse2 - 1)); | 368 | | // threshold = 1011_1111 | 369 | 345 | const auto threshold = _mm_set1_epi8(0xBF); | 370 | 353 | for (; p < src_end_sse2; p += bytes_sse2) { | 371 | 8 | char_len += __builtin_popcount(_mm_movemask_epi8(_mm_cmpgt_epi8( | 372 | 8 | _mm_loadu_si128(reinterpret_cast<const __m128i*>(p)), threshold))); | 373 | 8 | } | 374 | 345 | #endif | 375 | | // process remaining bytes the number of which not exceed bytes_sse2 at the | 376 | | // tail of string, one by one. | 377 | 2.58k | for (; p < end; ++p) { | 378 | 2.24k | char_len += static_cast<int8_t>(*p) > static_cast<int8_t>(0xBF); | 379 | 2.24k | } | 380 | 345 | return char_len; | 381 | 345 | } |
|
382 | | }; |
383 | | } // namespace simd |
384 | | } // namespace doris |