be/src/exprs/function/url/domain.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/domain.h |
19 | | // and modified by Doris |
20 | | |
21 | | #pragma once |
22 | | |
23 | | #include <cstring> |
24 | | |
25 | | #include "exec/common/string_utils/string_utils.h" |
26 | | #include "exprs/function/url/find_symbols.h" |
27 | | #include "exprs/function/url/protocol.h" |
28 | | #include "exprs/function/url/tldLookup.h" |
29 | | |
30 | | namespace doris { |
31 | | |
32 | | inline StringRef check_and_return_host(const Pos& pos, const Pos& dot_pos, |
33 | 18 | const Pos& start_of_host) { |
34 | 18 | if (!dot_pos || start_of_host >= pos || pos - dot_pos == 1) return StringRef {}; |
35 | | |
36 | 14 | auto after_dot = *(dot_pos + 1); |
37 | 14 | if (after_dot == ':' || after_dot == '/' || after_dot == '?' || after_dot == '#') |
38 | 0 | return StringRef {}; |
39 | | |
40 | 14 | return StringRef(start_of_host, pos - start_of_host); |
41 | 14 | } |
42 | | |
43 | | /// Extracts host from given url. |
44 | | /// |
45 | | /// @return empty StringRef if the host is not valid (i.e. it does not have dot, or there no symbol after dot). |
46 | 18 | inline StringRef get_url_host(const char* data, size_t size) { |
47 | 18 | Pos pos = data; |
48 | 18 | Pos end = data + size; |
49 | | |
50 | 18 | if (*pos == '/' && *(pos + 1) == '/') { |
51 | 4 | pos += 2; |
52 | 14 | } else { |
53 | 14 | Pos scheme_end = data + std::min(size, 16UL); |
54 | 86 | for (++pos; pos < scheme_end; ++pos) { |
55 | 80 | if (!is_alpha_numeric_ascii(*pos)) { |
56 | 14 | switch (*pos) { |
57 | 6 | case '.': |
58 | 6 | case '-': |
59 | 6 | case '+': |
60 | 6 | break; |
61 | 0 | case ' ': /// restricted symbols |
62 | 0 | case '\t': |
63 | 0 | case '<': |
64 | 0 | case '>': |
65 | 0 | case '%': |
66 | 0 | case '{': |
67 | 0 | case '}': |
68 | 0 | case '|': |
69 | 0 | case '\\': |
70 | 0 | case '^': |
71 | 0 | case '~': |
72 | 0 | case '[': |
73 | 0 | case ']': |
74 | 0 | case ';': |
75 | 0 | case '=': |
76 | 0 | case '&': |
77 | 0 | return StringRef {}; |
78 | 8 | default: |
79 | 8 | goto exloop; |
80 | 14 | } |
81 | 14 | } |
82 | 80 | } |
83 | 14 | exloop: |
84 | 14 | if ((scheme_end - pos) > 2 && *pos == ':' && *(pos + 1) == '/' && *(pos + 2) == '/') |
85 | 6 | pos += 3; |
86 | 8 | else |
87 | 8 | pos = data; |
88 | 14 | } |
89 | | |
90 | 18 | Pos dot_pos = nullptr; |
91 | 18 | const auto* start_of_host = pos; |
92 | 236 | for (; pos < end; ++pos) { |
93 | 226 | switch (*pos) { |
94 | 28 | case '.': |
95 | 28 | dot_pos = pos; |
96 | 28 | break; |
97 | 6 | case ':': /// end symbols |
98 | 6 | case '/': |
99 | 8 | case '?': |
100 | 8 | case '#': |
101 | 8 | return check_and_return_host(pos, dot_pos, start_of_host); |
102 | 4 | case '@': /// myemail@gmail.com |
103 | 4 | start_of_host = pos + 1; |
104 | 4 | break; |
105 | 0 | case ' ': /// restricted symbols in whole URL |
106 | 0 | case '\t': |
107 | 0 | case '<': |
108 | 0 | case '>': |
109 | 0 | case '%': |
110 | 0 | case '{': |
111 | 0 | case '}': |
112 | 0 | case '|': |
113 | 0 | case '\\': |
114 | 0 | case '^': |
115 | 0 | case '~': |
116 | 0 | case '[': |
117 | 0 | case ']': |
118 | 0 | case ';': |
119 | 0 | case '=': |
120 | 0 | case '&': |
121 | 0 | return StringRef {}; |
122 | 226 | } |
123 | 226 | } |
124 | | |
125 | 10 | return check_and_return_host(pos, dot_pos, start_of_host); |
126 | 18 | } |
127 | | |
128 | | template <bool without_www> |
129 | | struct ExtractDomain { |
130 | 2 | static size_t get_reserve_length_for_element() { return 15; }_ZN5doris13ExtractDomainILb0EE30get_reserve_length_for_elementEv Line | Count | Source | 130 | 1 | static size_t get_reserve_length_for_element() { return 15; } |
_ZN5doris13ExtractDomainILb1EE30get_reserve_length_for_elementEv Line | Count | Source | 130 | 1 | static size_t get_reserve_length_for_element() { return 15; } |
|
131 | | |
132 | 18 | static void execute(Pos data, size_t size, Pos& res_data, size_t& res_size) { |
133 | 18 | StringRef host = get_url_host(data, size); |
134 | | |
135 | 18 | if (host.size == 0) { |
136 | 4 | res_data = data; |
137 | 4 | res_size = 0; |
138 | 14 | } else { |
139 | 14 | if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4)) |
140 | 5 | host = {host.data + 4, host.size - 4}; |
141 | | |
142 | 14 | res_data = host.data; |
143 | 14 | res_size = host.size; |
144 | 14 | } |
145 | 18 | } _ZN5doris13ExtractDomainILb0EE7executeEPKcmRS3_Rm Line | Count | Source | 132 | 9 | static void execute(Pos data, size_t size, Pos& res_data, size_t& res_size) { | 133 | 9 | StringRef host = get_url_host(data, size); | 134 | | | 135 | 9 | if (host.size == 0) { | 136 | 2 | res_data = data; | 137 | 2 | res_size = 0; | 138 | 7 | } else { | 139 | 7 | if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4)) | 140 | 0 | host = {host.data + 4, host.size - 4}; | 141 | | | 142 | 7 | res_data = host.data; | 143 | 7 | res_size = host.size; | 144 | 7 | } | 145 | 9 | } |
_ZN5doris13ExtractDomainILb1EE7executeEPKcmRS3_Rm Line | Count | Source | 132 | 9 | static void execute(Pos data, size_t size, Pos& res_data, size_t& res_size) { | 133 | 9 | StringRef host = get_url_host(data, size); | 134 | | | 135 | 9 | if (host.size == 0) { | 136 | 2 | res_data = data; | 137 | 2 | res_size = 0; | 138 | 7 | } else { | 139 | 7 | if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4)) | 140 | 5 | host = {host.data + 4, host.size - 4}; | 141 | | | 142 | 7 | res_data = host.data; | 143 | 7 | res_size = host.size; | 144 | 7 | } | 145 | 9 | } |
|
146 | | }; |
147 | | |
148 | | struct ExtractTopLevelDomain { |
149 | 0 | static size_t get_reserve_length_for_element() { return 5; } |
150 | | |
151 | 0 | static void execute(const char* data, size_t size, const char*& res_data, size_t& res_size) { |
152 | 0 | res_data = data; |
153 | 0 | res_size = 0; |
154 | 0 | StringRef host = get_url_host(data, size); |
155 | |
|
156 | 0 | if (host.size == 0) { |
157 | 0 | return; |
158 | 0 | } else { |
159 | 0 | auto host_view = host.to_string_view(); |
160 | 0 | if (host_view[host_view.size() - 1] == '.') { |
161 | 0 | host_view.remove_suffix(1); |
162 | 0 | } |
163 | |
|
164 | 0 | const auto* host_end = host_view.data() + host_view.size(); |
165 | 0 | const char* last_dot = find_last_symbols_or_null<'.'>(host_view.data(), host_end); |
166 | 0 | if (!last_dot) { |
167 | 0 | return; |
168 | 0 | } |
169 | | |
170 | | /// For IPv4 addresses select nothing. |
171 | | /// |
172 | | /// NOTE: it is safe to access last_dot[1] |
173 | | /// since getURLHost() will not return a host if there is symbol after dot. |
174 | 0 | if (is_numeric_ascii(last_dot[1])) { |
175 | 0 | return; |
176 | 0 | } |
177 | | |
178 | 0 | res_data = last_dot + 1; |
179 | 0 | res_size = host_end - res_data; |
180 | 0 | } |
181 | 0 | } |
182 | | }; |
183 | | |
184 | | struct ExtractFirstSignificantSubdomain { |
185 | 0 | static size_t get_reserve_length_for_element() { return 10; } |
186 | | |
187 | | static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size, |
188 | 0 | Pos* out_domain_end = nullptr) { |
189 | 0 | res_data = data; |
190 | 0 | res_size = 0; |
191 | |
|
192 | 0 | Pos tmp; |
193 | 0 | size_t domain_length = 0; |
194 | 0 | ExtractDomain<true>::execute(data, size, tmp, domain_length); |
195 | |
|
196 | 0 | if (domain_length == 0) { |
197 | 0 | return; |
198 | 0 | } |
199 | 0 | if (out_domain_end) { |
200 | 0 | *out_domain_end = tmp + domain_length; |
201 | 0 | } |
202 | | |
203 | | /// cut useless dot |
204 | 0 | if (tmp[domain_length - 1] == '.') { |
205 | 0 | --domain_length; |
206 | 0 | } |
207 | |
|
208 | 0 | res_data = tmp; |
209 | 0 | res_size = domain_length; |
210 | |
|
211 | 0 | const auto* begin = tmp; |
212 | 0 | const auto* end = begin + domain_length; |
213 | 0 | std::array<const char*, 3> last_periods {}; |
214 | |
|
215 | 0 | const auto* pos = find_first_symbols<'.'>(begin, end); |
216 | 0 | while (pos < end) { |
217 | 0 | last_periods[2] = last_periods[1]; |
218 | 0 | last_periods[1] = last_periods[0]; |
219 | 0 | last_periods[0] = pos; |
220 | 0 | pos = find_first_symbols<'.'>(pos + 1, end); |
221 | 0 | } |
222 | |
|
223 | 0 | if (!last_periods[0]) { |
224 | 0 | return; |
225 | 0 | } |
226 | | |
227 | 0 | if (!last_periods[1]) { |
228 | 0 | res_size = last_periods[0] - begin; |
229 | 0 | return; |
230 | 0 | } |
231 | | |
232 | 0 | if (!last_periods[2]) { |
233 | 0 | last_periods[2] = begin - 1; |
234 | 0 | } |
235 | |
|
236 | 0 | const auto* end_of_level_domain = find_first_symbols<'/'>(last_periods[0], end); |
237 | 0 | if (!end_of_level_domain) { |
238 | 0 | end_of_level_domain = end; |
239 | 0 | } |
240 | |
|
241 | 0 | auto host_len = static_cast<size_t>(end_of_level_domain - last_periods[1] - 1); |
242 | 0 | StringRef host {last_periods[1] + 1, host_len}; |
243 | 0 | if (tldLookup::is_valid(host.data, host.size)) { |
244 | 0 | res_data += last_periods[2] + 1 - begin; |
245 | 0 | res_size = last_periods[1] - last_periods[2] - 1; |
246 | 0 | } else { |
247 | 0 | res_data += last_periods[1] + 1 - begin; |
248 | 0 | res_size = last_periods[0] - last_periods[1] - 1; |
249 | 0 | } |
250 | 0 | } |
251 | | }; |
252 | | |
253 | | struct CutToFirstSignificantSubdomain { |
254 | 0 | static size_t get_reserve_length_for_element() { return 15; } |
255 | | |
256 | 0 | static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size) { |
257 | 0 | res_data = data; |
258 | 0 | res_size = 0; |
259 | |
|
260 | 0 | Pos tmp_data = data; |
261 | 0 | size_t tmp_length; |
262 | 0 | Pos domain_end = data; |
263 | 0 | ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end); |
264 | |
|
265 | 0 | if (tmp_length == 0) { |
266 | 0 | return; |
267 | 0 | } |
268 | 0 | res_data = tmp_data; |
269 | 0 | res_size = domain_end - tmp_data; |
270 | 0 | } |
271 | | }; |
272 | | } // namespace doris |