/root/doris/be/src/util/url_parser.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "util/url_parser.h" |
19 | | |
20 | | #include <ctype.h> |
21 | | #include <stdint.h> |
22 | | |
23 | | #include <algorithm> |
24 | | #include <string> |
25 | | |
26 | | #include "runtime/string_search.hpp" |
27 | | #include "vec/common/string_ref.h" |
28 | | |
29 | | namespace doris { |
30 | | |
// Names of the URL parts a caller may request; get_url_part() compares the
// (upper-cased) request against these. Each length argument equals the literal's length.
31 | | const StringRef UrlParser::_s_url_authority(const_cast<char*>("AUTHORITY"), 9); |
32 | | const StringRef UrlParser::_s_url_file(const_cast<char*>("FILE"), 4); |
33 | | const StringRef UrlParser::_s_url_host(const_cast<char*>("HOST"), 4); |
34 | | const StringRef UrlParser::_s_url_path(const_cast<char*>("PATH"), 4); |
35 | | const StringRef UrlParser::_s_url_protocol(const_cast<char*>("PROTOCOL"), 8); |
36 | | const StringRef UrlParser::_s_url_query(const_cast<char*>("QUERY"), 5); |
37 | | const StringRef UrlParser::_s_url_ref(const_cast<char*>("REF"), 3); |
38 | | const StringRef UrlParser::_s_url_userinfo(const_cast<char*>("USERINFO"), 8); |
39 | | const StringRef UrlParser::_s_url_port(const_cast<char*>("PORT"), 4); |
// Delimiters used to split a URL into its components.
40 | | const StringRef UrlParser::_s_protocol(const_cast<char*>("://"), 3); |
41 | | const StringRef UrlParser::_s_at(const_cast<char*>("@"), 1); |
42 | | const StringRef UrlParser::_s_slash(const_cast<char*>("/"), 1); |
43 | | const StringRef UrlParser::_s_colon(const_cast<char*>(":"), 1); |
44 | | const StringRef UrlParser::_s_question(const_cast<char*>("?"), 1); |
45 | | const StringRef UrlParser::_s_hash(const_cast<char*>("#"), 1); |
// One searcher per delimiter above, each built from the address of its pattern.
// NOTE(review): presumably the StringSearch constructor reads the pattern, in which
// case these must stay below the delimiter definitions — confirm in string_search.hpp.
46 | | const StringSearch UrlParser::_s_protocol_search(&_s_protocol); |
47 | | const StringSearch UrlParser::_s_at_search(&_s_at); |
48 | | const StringSearch UrlParser::_s_slash_search(&_s_slash); |
49 | | const StringSearch UrlParser::_s_colon_search(&_s_colon); |
50 | | const StringSearch UrlParser::_s_question_search(&_s_question); |
51 | | const StringSearch UrlParser::_s_hash_search(&_s_hash); |
52 | | |
// Extracts a single component (`part`) of `url` into `*result`.
//
// `*result` is reset to (nullptr, 0) on entry and only overwritten once the
// requested part has been located.
// Returns false when the URL has no "://" separator, when QUERY, REF, USERINFO or
// PORT is requested but not present, or when `part` is INVALID. FILE and PATH on a
// URL without a '/' after the authority return true with the empty result.
//
// Several cases pass the raw search() result as the length given to substring();
// this relies on substring() treating a negative length as "up to the end of the
// string" (assumption — confirm in StringRef).
53 | 19 | bool UrlParser::parse_url(const StringRef& url, UrlPart part, StringRef* result) { |
54 | 19 | result->data = nullptr; |
55 | 19 | result->size = 0; |
56 | | // Remove leading and trailing spaces. |
57 | 19 | StringRef trimmed_url = url.trim(); |
58 | | |
59 | | // Every part requires the "://" protocol separator to be present. |
60 | 19 | int32_t protocol_pos = _s_protocol_search.search(&trimmed_url); |
61 | 19 | if (protocol_pos < 0) { |
62 | 2 | return false; |
63 | 2 | } |
64 | | |
65 | | // Positioned to first char after '://'. |
66 | 17 | StringRef protocol_end = trimmed_url.substring(protocol_pos + _s_protocol.size); |
67 | | |
68 | 17 | switch (part) { |
69 | 2 | case AUTHORITY: { |
70 | | // Authority runs up to the first '/' (or to the end if there is none). |
71 | 2 | int32_t end_pos = _s_slash_search.search(&protocol_end); |
72 | 2 | *result = protocol_end.substring(0, end_pos); |
73 | 2 | break; |
74 | 0 | } |
75 | | |
76 | 2 | case FILE: |
77 | 4 | case PATH: { |
78 | | // Find first '/'. |
79 | 4 | int32_t start_pos = _s_slash_search.search(&protocol_end); |
80 | | |
81 | 4 | if (start_pos < 0) { |
82 | | // Return empty string. This is what Hive does. |
83 | 0 | return true; |
84 | 0 | } |
85 | | |
86 | 4 | StringRef path_start = protocol_end.substring(start_pos); |
87 | 4 | int32_t end_pos; |
88 | | |
89 | 4 | if (part == FILE) { |
90 | | // FILE keeps the query string: end at '#'. |
91 | 2 | end_pos = _s_hash_search.search(&path_start); |
92 | 2 | } else { |
93 | | // PATH ends at the next '?' or '#'. |
94 | 2 | end_pos = _s_question_search.search(&path_start); |
95 | | |
96 | 2 | if (end_pos < 0) { |
97 | | // No '?' was found, look for '#'. |
98 | 2 | end_pos = _s_hash_search.search(&path_start); |
99 | 2 | } |
100 | 2 | } |
101 | | |
102 | 4 | *result = path_start.substring(0, end_pos); |
103 | 4 | break; |
104 | 4 | } |
105 | | |
106 | 2 | case HOST: { |
107 | | // Find '@'. |
108 | 2 | int32_t start_pos = _s_at_search.search(&protocol_end); |
109 | | |
110 | 2 | if (start_pos < 0) { |
111 | | // No '@' was found, i.e., no user:pass info was given, start right after "://". |
112 | 2 | start_pos = 0; |
113 | 2 | } else { |
114 | | // Skip '@'. |
115 | 0 | start_pos += _s_at.size; |
116 | 0 | } |
117 | | |
118 | 2 | StringRef host_start = protocol_end.substring(start_pos); |
119 | | // Cut at the first '?' so a ':' or '/' inside the query is not mistaken for a delimiter. |
120 | 2 | int32_t query_start_pos = _s_question_search.search(&host_start); |
121 | 2 | if (query_start_pos > 0) { |
122 | 2 | host_start = host_start.substring(0, query_start_pos); |
123 | 2 | } |
124 | | // Find ':' to strip out port. |
125 | 2 | int32_t end_pos = _s_colon_search.search(&host_start); |
126 | | |
127 | 2 | if (end_pos < 0) { |
128 | | // No port was given. search for '/' to determine ending position. |
129 | 2 | end_pos = _s_slash_search.search(&host_start); |
130 | 2 | } |
131 | | |
132 | 2 | *result = host_start.substring(0, end_pos); |
133 | 2 | break; |
134 | 4 | } |
135 | | |
136 | 2 | case PROTOCOL: { |
137 | 2 | *result = trimmed_url.substring(0, protocol_pos); |
138 | 2 | break; |
139 | 4 | } |
140 | | |
141 | 3 | case QUERY: { |
142 | | // Find first '?'. |
143 | 3 | int32_t start_pos = _s_question_search.search(&protocol_end); |
144 | | |
145 | 3 | if (start_pos < 0) { |
146 | | // Indicate no query was found. |
147 | 0 | return false; |
148 | 0 | } |
149 | | |
150 | 3 | StringRef query_start = protocol_end.substring(start_pos + _s_question.size); |
151 | | // The query ends at the next '#'. |
152 | 3 | int32_t end_pos = _s_hash_search.search(&query_start); |
153 | 3 | *result = query_start.substring(0, end_pos); |
154 | 3 | break; |
155 | 3 | } |
156 | | |
157 | 2 | case REF: { |
158 | | // Find '#'. |
159 | 2 | int32_t start_pos = _s_hash_search.search(&protocol_end); |
160 | | |
161 | 2 | if (start_pos < 0) { |
162 | | // Indicate no ref (fragment) was found. |
163 | 2 | return false; |
164 | 2 | } |
165 | | |
166 | 0 | *result = protocol_end.substring(start_pos + _s_hash.size); |
167 | 0 | break; |
168 | 2 | } |
169 | | |
170 | 0 | case USERINFO: { |
171 | | // Find '@'. |
172 | 0 | int32_t end_pos = _s_at_search.search(&protocol_end); |
173 | | |
174 | 0 | if (end_pos < 0) { |
175 | | // Indicate no user and pass were given. |
176 | 0 | return false; |
177 | 0 | } |
178 | | |
179 | 0 | *result = protocol_end.substring(0, end_pos); |
180 | 0 | break; |
181 | 0 | } |
182 | | |
183 | 2 | case PORT: { |
184 | | // Find '@'. |
185 | 2 | int32_t start_pos = _s_at_search.search(&protocol_end); |
186 | | |
187 | 2 | if (start_pos < 0) { |
188 | | // No '@' was found, i.e., no user:pass info was given, start right after "://". |
189 | 2 | start_pos = 0; |
190 | 2 | } else { |
191 | | // Skip '@'. |
192 | 0 | start_pos += _s_at.size; |
193 | 0 | } |
194 | | |
195 | 2 | StringRef host_start = protocol_end.substring(start_pos); |
196 | | // Find the ':' that introduces the port. NOTE(review): the search covers everything after the userinfo, so a ':' in the path or query also matches when no port is given. |
197 | 2 | int32_t end_pos = _s_colon_search.search(&host_start); |
198 | | //no port found |
199 | 2 | if (end_pos < 0) { |
200 | 1 | return false; |
201 | 1 | } |
202 | | // end_pos is an offset into host_start, so slice host_start; slicing protocol_end would be off by the userinfo length. |
203 | 1 | StringRef port_start_str = host_start.substring(end_pos + _s_colon.size); |
204 | 1 | int32_t port_end_pos = _s_slash_search.search(&port_start_str); |
205 | | //if '/' not found, try to find '?' |
206 | 1 | if (port_end_pos < 0) { |
207 | 0 | port_end_pos = _s_question_search.search(&port_start_str); |
208 | 0 | } |
209 | 1 | *result = port_start_str.substring(0, port_end_pos); |
210 | 1 | break; |
211 | 2 | } |
212 | | |
213 | 0 | case INVALID: |
214 | 0 | return false; |
215 | 17 | } |
216 | | |
217 | 14 | return true; |
218 | 17 | } |
219 | | |
// Looks up the value of query parameter `key` in `url` and stores it in `*result`.
//
// Only `part == QUERY` is supported; any other part returns false immediately.
// The key is searched for in the whole trimmed URL (not only after the '?'), and an
// occurrence counts only if it is preceded by '?' or '&' and followed by '='.
// The value runs from just after '=' to the next '#' or '&', or to the end of the string.
// Returns true when a value was stored; on false `*result` is left untouched.
220 | | bool UrlParser::parse_url_key(const StringRef& url, UrlPart part, const StringRef& key, |
221 | 4 | StringRef* result) { |
222 | | // Part must be query to ask for a specific query key. |
223 | 4 | if (part != QUERY) { |
224 | 1 | return false; |
225 | 1 | } |
226 | | |
227 | | // Remove leading and trailing spaces. |
228 | 3 | StringRef trimmed_url = url.trim(); |
229 | | |
230 | | // Search for the key in the url, ignoring malformed URLs for now. |
231 | 3 | StringSearch key_search(&key); |
232 | | |
233 | 3 | while (trimmed_url.size > 0) { |
234 | | // Search for the key in the current substring. |
235 | 3 | int32_t key_pos = key_search.search(&trimmed_url); |
236 | 3 | bool match = true; |
237 | | |
238 | 3 | if (key_pos < 0) { |
239 | 1 | return false; |
240 | 1 | } |
241 | | |
242 | | // Key pos must be != 0 because it must be preceded by a '?' or a '&'. |
243 | | // Check that the char before key_pos is either '?' or '&'. |
244 | 2 | if (key_pos == 0 || |
245 | 2 | (trimmed_url.data[key_pos - 1] != '?' && trimmed_url.data[key_pos - 1] != '&')) { |
246 | 0 | match = false; |
247 | 0 | } |
248 | | |
249 | | // Advance substring beyond matching key (done even for a rejected match, so the loop makes progress). |
250 | 2 | trimmed_url = trimmed_url.substring(key_pos + key.size); |
251 | | |
252 | 2 | if (!match) { |
253 | 0 | continue; |
254 | 0 | } |
255 | | |
256 | 2 | if (trimmed_url.size <= 0) { |
257 | 0 | break; |
258 | 0 | } |
259 | | |
260 | | // Next character must be '=', otherwise the match cannot be a key in the query part. |
261 | 2 | if (trimmed_url.data[0] != '=') { |
262 | 0 | continue; |
263 | 0 | } |
264 | | |
265 | 2 | int32_t pos = 1; |
266 | | |
267 | | // Find ending position of key's value by matching '#' or '&'. |
268 | 4 | while (pos < trimmed_url.size) { |
269 | 4 | switch (trimmed_url.data[pos]) { |
270 | 2 | case '#': |
271 | 2 | case '&': |
272 | 2 | *result = trimmed_url.substring(1, pos - 1); |
273 | 2 | return true; |
274 | 4 | } |
275 | | |
276 | 2 | ++pos; |
277 | 2 | } |
278 | | |
279 | | // No terminator found: the value extends to the end of the string (skipping the leading '='). |
280 | 0 | *result = trimmed_url.substring(1); |
281 | 0 | return true; |
282 | 2 | } |
283 | | |
284 | 0 | return false; |
285 | 3 | } |
286 | | |
// Maps a part name such as "HOST" or "query" to the corresponding UrlPart value.
//
// The name is upper-cased on a private copy before comparison, so matching is
// case-insensitive. Returns INVALID for an empty or unrecognised name.
287 | 23 | UrlParser::UrlPart UrlParser::get_url_part(const StringRef& part) { |
288 | | // Quick filter on requested URL part, based on first character. |
289 | | // Upper-case a copy of the request first; go through unsigned char because passing a negative char to toupper is undefined. |
290 | 23 | std::string part_str = part.to_string(); |
291 | 23 | transform(part_str.begin(), part_str.end(), part_str.begin(), [](unsigned char c) { return static_cast<char>(::toupper(c)); }); |
292 | 23 | StringRef newPart = StringRef(part_str); |
293 | 23 | switch (newPart.data[0]) { |
294 | 2 | case 'A': { |
295 | 2 | if (!newPart.eq(_s_url_authority)) { |
296 | 0 | return INVALID; |
297 | 0 | } |
298 | | |
299 | 2 | return AUTHORITY; |
300 | 2 | } |
301 | | |
302 | 2 | case 'F': { |
303 | 2 | if (!newPart.eq(_s_url_file)) { |
304 | 0 | return INVALID; |
305 | 0 | } |
306 | | |
307 | 2 | return FILE; |
308 | 2 | } |
309 | | |
310 | 5 | case 'H': { |
311 | 5 | if (!newPart.eq(_s_url_host)) { |
312 | 0 | return INVALID; |
313 | 0 | } |
314 | | |
315 | 5 | return HOST; |
316 | 5 | } |
317 | | |
318 | 6 | case 'P': { |
319 | 6 | if (newPart.eq(_s_url_path)) { |
320 | 2 | return PATH; |
321 | 4 | } else if (newPart.eq(_s_url_protocol)) { |
322 | 2 | return PROTOCOL; |
323 | 2 | } else if (newPart.eq(_s_url_port)) { |
324 | 2 | return PORT; |
325 | 2 | } else { |
326 | 0 | return INVALID; |
327 | 0 | } |
328 | 6 | } |
329 | | |
330 | 6 | case 'Q': { |
331 | 6 | if (!newPart.eq(_s_url_query)) { |
332 | 0 | return INVALID; |
333 | 0 | } |
334 | | |
335 | 6 | return QUERY; |
336 | 6 | } |
337 | | |
338 | 2 | case 'R': { |
339 | 2 | if (!newPart.eq(_s_url_ref)) { |
340 | 0 | return INVALID; |
341 | 0 | } |
342 | | |
343 | 2 | return REF; |
344 | 2 | } |
345 | | |
346 | 0 | case 'U': { |
347 | 0 | if (!newPart.eq(_s_url_userinfo)) { |
348 | 0 | return INVALID; |
349 | 0 | } |
350 | | |
351 | 0 | return USERINFO; |
352 | 0 | } |
353 | | |
354 | 0 | default: |
355 | 0 | return INVALID; |
356 | 23 | } |
357 | 23 | } |
358 | | |
// Returns the value of query parameter `name` in `url`, or an empty StringRef ("", 0)
// when the URL has no '?', or no `key=value` pair whose key equals `name`.
//
// The query is the text between the first '?' and the first '#' (or the end of the
// URL when there is no '#'). It is split on '&'; each piece is split at its first '='.
// Pieces without '=' are skipped, and the first piece whose key matches wins.
// NOTE(review): if '#' appears before '?', the length passed to substring() below is
// negative; the outcome depends on how StringRef::substring handles that — confirm.
359 | 10 | StringRef UrlParser::extract_url(StringRef url, StringRef name) { |
360 | 10 | StringRef result("", 0); |
361 | | // Remove leading and trailing spaces. |
362 | 10 | StringRef trimmed_url = url.trim(); |
363 | | // Find '?'. |
364 | 10 | int32_t question_pos = _s_question_search.search(&trimmed_url); |
365 | 10 | if (question_pos < 0) { |
366 | | // This URL has no parameters. |
367 | | // Example: https://doris.apache.org/ |
368 | 1 | return result; |
369 | 1 | } |
370 | | |
371 | | // Find '#'; the query stops there when one is present. |
372 | 9 | int32_t hash_pos = _s_hash_search.search(&trimmed_url); |
373 | 9 | StringRef sub_url; |
374 | 9 | if (hash_pos < 0) { |
375 | 2 | sub_url = trimmed_url.substring(question_pos + 1, trimmed_url.size - question_pos - 1); |
376 | 7 | } else { |
377 | 7 | sub_url = trimmed_url.substring(question_pos + 1, hash_pos - question_pos - 1); |
378 | 7 | } |
379 | | |
380 | | // Walk the '&'-separated pairs, split each at '=', and return the value of the target parameter. |
381 | | // Example: k1=aa&k2=bb&k3=cc&test=dd |
382 | 9 | int64_t and_pod; |
383 | 9 | auto len = sub_url.size; |
384 | 9 | StringRef key_url; |
385 | 20 | while (true) { |
386 | 20 | if (len <= 0) { |
387 | 4 | break; |
388 | 4 | } |
389 | 16 | and_pod = sub_url.find_first_of('&'); |
390 | 16 | if (and_pod != -1) { |
391 | 10 | key_url = sub_url.substring(0, and_pod); |
392 | 10 | sub_url = sub_url.substring(and_pod + 1, len - and_pod - 1); |
393 | 10 | } else { |
394 | 6 | auto end_pos = sub_url.find_first_of('#'); |
395 | 6 | key_url = end_pos == -1 ? sub_url : sub_url.substring(0, end_pos); |
396 | 6 | sub_url = result; |
397 | 6 | } |
398 | 16 | len = sub_url.size; |
399 | | |
400 | 16 | auto eq_pod = key_url.find_first_of('='); |
401 | 16 | if (eq_pod == -1) { |
402 | | // Piece without '=' (e.g. the "k1" in k1&k2=bb): skip it. |
403 | 1 | continue; |
404 | 1 | } |
405 | 15 | int32_t key_len = key_url.size; |
406 | 15 | auto key = key_url.substring(0, eq_pod); |
407 | 15 | if (name == key) { |
408 | 5 | return key_url.substring(eq_pod + 1, key_len - eq_pod - 1); |
409 | 5 | } |
410 | 15 | } |
411 | 4 | return result; |
412 | 9 | } |
413 | | } // namespace doris |