be/src/exec/common/string_utils/string_utils.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/Common/StringUtils/StringUtils.h |
19 | | // and modified by Doris |
20 | | |
21 | | #pragma once |
22 | | |
23 | | #include <cstddef> |
24 | | #include <cstring> |
25 | | #include <string> |
26 | | |
27 | | #include "common/compiler_util.h" |
28 | | |
29 | | /// More efficient than the libc equivalents because these functions do not respect the locale. For some functions, though, a table-based implementation could be better. |
30 | | |
/// Returns true when the byte value of `c` is below 0x80, i.e. `c` is a 7-bit ASCII character.
/// The value is examined as `unsigned char`, so the result does not depend on the signedness of `char`.
inline bool is_ascii(char c) {
    const unsigned char byte = static_cast<unsigned char>(c);
    return byte < 0x80;
}
34 | | |
/// Returns true when `c` is an ASCII letter: 'a'..'z' or 'A'..'Z'.
inline bool is_alpha_ascii(char c) {
    const bool is_lower = 'a' <= c && c <= 'z';
    const bool is_upper = 'A' <= c && c <= 'Z';
    return is_lower || is_upper;
}
38 | | |
/// Returns true when `c` is an ASCII decimal digit, '0'..'9'.
inline bool is_numeric_ascii(char c) {
    /// A plain two-sided range check is kept on purpose. According to the original note it is
    /// faster than
    ///     return UInt8(UInt8(c) - UInt8('0')) < UInt8(10);
    /// on Intel CPUs when compiled by gcc 8.
    if (c < '0') {
        return false;
    }
    return c <= '9';
}
45 | | |
46 | 33.7k | inline bool is_alpha_numeric_ascii(char c) { |
47 | 33.7k | return is_alpha_ascii(c) || is_numeric_ascii(c); |
48 | 33.7k | } |
49 | | |
50 | 250 | inline bool is_word_char_ascii(char c) { |
51 | 250 | return is_alpha_numeric_ascii(c) || c == '_'; |
52 | 250 | } |
53 | | |
54 | 250 | inline bool is_valid_identifier_begin(char c) { |
55 | 250 | return is_alpha_ascii(c) || c == '_'; |
56 | 250 | } |
57 | | |
58 | 33.0k | inline bool is_non_alnum(char c) { |
59 | 33.0k | return !is_alpha_numeric_ascii(c); |
60 | 33.0k | } |
61 | | |
62 | 0 | inline bool is_tz_name_part_ascii(char c) { |
63 | 0 | return is_alpha_ascii(c) || c == '_'; |
64 | 0 | } |
65 | | |
/// Returns true only for the forward slash character '/'.
inline bool is_slash_ascii(char c) {
    const char slash = '/';
    return slash == c;
}
69 | | |
70 | | // Our own definition of "isspace" that optimize on the ' ' branch. |
71 | 2.67M | inline bool is_whitespace_ascii(char c) { |
72 | 2.67M | return LIKELY(c == ' ') || |
73 | 2.67M | UNLIKELY(c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r'); |
74 | 2.67M | } |
75 | | |
76 | 306 | inline bool is_not_whitespace_ascii(char c) { |
77 | 306 | return !is_whitespace_ascii(c); |
78 | 306 | } |
79 | | |
80 | 424 | inline bool is_hex_ascii(char c) { |
81 | 424 | return (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') || is_numeric_ascii(c); |
82 | 424 | } |