be/src/util/simd/lower_upper_impl.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <stdint.h> |
21 | | |
22 | | #include <iostream> |
23 | | #include <string> |
24 | | |
25 | | #include "util/sse_util.hpp" |
26 | | |
27 | | // Adapted from ClickHouse: https://clickhouse.tech/codebrowser/html_report//ClickHouse/src/Functions/LowerUpperImpl.h.html |
28 | | // Doris otherwise handles only one character at a time; this function uses SIMD to process several characters at once. |
29 | | namespace doris::simd { |
30 | | |
/// Byte-wise ASCII case flipper.
///
/// Every input byte that lies in the inclusive range
/// [not_case_lower_bound, not_case_upper_bound] has its case bit toggled;
/// every other byte is copied through unchanged. Instantiating with
/// <'A', 'Z'> therefore lower-cases, and with <'a', 'z'> upper-cases.
template <char not_case_lower_bound, char not_case_upper_bound>
class LowerUpperImpl {
public:
    /// Converts the bytes in [src, src_end) and writes the result to dst.
    ///
    /// @param src      first input byte
    /// @param src_end  one past the last input byte
    /// @param dst      output buffer; exactly (src_end - src) bytes are written
    static void transfer(const uint8_t* src, const uint8_t* src_end, uint8_t* dst) {
        // 'A' and 'a' differ in a single bit, so XOR-ing with it toggles the case.
        constexpr int case_bit = 'A' ^ 'a';

#if defined(__SSE2__) || defined(__aarch64__)
        constexpr auto lane_bytes = sizeof(__m128i);

        // Bytes that do not fill a whole vector are left for the scalar loop below.
        const auto tail_len = (src_end - src) % lane_bytes;
        const uint8_t* const simd_end = src_end - tail_len;

        // Exclusive bounds, so two strict comparisons express the inclusive range.
        const __m128i floor_vec = _mm_set1_epi8(not_case_lower_bound - 1);
        const __m128i ceil_vec = _mm_set1_epi8(not_case_upper_bound + 1);
        const __m128i case_bit_vec = _mm_set1_epi8(case_bit);

        while (src < simd_end) {
            const __m128i block = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src));

            // The comparisons are signed per byte and yield 0xFF in matching lanes.
            const __m128i above_floor = _mm_cmpgt_epi8(block, floor_vec);
            const __m128i below_ceil = _mm_cmpgt_epi8(ceil_vec, block);
            const __m128i in_range = _mm_and_si128(above_floor, below_ceil);

            // Only lanes inside the range keep the case bit in their XOR mask.
            const __m128i toggle = _mm_and_si128(in_range, case_bit_vec);
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dst), _mm_xor_si128(block, toggle));

            src += lane_bytes;
            dst += lane_bytes;
        }
#endif

        // Scalar path: handles the remaining tail, or the whole input when the
        // vector path is compiled out.
        while (src < src_end) {
            const uint8_t byte = *src++;
            const bool in_range = byte >= not_case_lower_bound && byte <= not_case_upper_bound;
            *dst++ = in_range ? static_cast<uint8_t>(byte ^ case_bit) : byte;
        }
    }
};
62 | | } // namespace doris::simd |