be/src/util/url_parser.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include "util/url_parser.h" |
19 | | |
20 | | #include <ctype.h> |
21 | | #include <stdint.h> |
22 | | |
23 | | #include <algorithm> |
24 | | #include <string> |
25 | | |
26 | | #include "core/string_ref.h" |
27 | | #include "util/string_search.hpp" |
28 | | |
29 | | namespace doris { |
30 | | const StringRef UrlParser::_s_url_authority("AUTHORITY", 9); |
31 | | const StringRef UrlParser::_s_url_file("FILE", 4); |
32 | | const StringRef UrlParser::_s_url_host("HOST", 4); |
33 | | const StringRef UrlParser::_s_url_path("PATH", 4); |
34 | | const StringRef UrlParser::_s_url_protocol("PROTOCOL", 8); |
35 | | const StringRef UrlParser::_s_url_query("QUERY", 5); |
36 | | const StringRef UrlParser::_s_url_ref("REF", 3); |
37 | | const StringRef UrlParser::_s_url_userinfo("USERINFO", 8); |
38 | | const StringRef UrlParser::_s_url_port("PORT", 4); |
39 | | const StringRef UrlParser::_s_protocol("://", 3); |
40 | | const StringRef UrlParser::_s_at("@", 1); |
41 | | const StringRef UrlParser::_s_slash("/", 1); |
42 | | const StringRef UrlParser::_s_colon(":", 1); |
43 | | const StringRef UrlParser::_s_question("?", 1); |
44 | | const StringRef UrlParser::_s_hash("#", 1); |
45 | | const StringSearch UrlParser::_s_protocol_search(&_s_protocol); |
46 | | const StringSearch UrlParser::_s_at_search(&_s_at); |
47 | | const StringSearch UrlParser::_s_slash_search(&_s_slash); |
48 | | const StringSearch UrlParser::_s_colon_search(&_s_colon); |
49 | | const StringSearch UrlParser::_s_question_search(&_s_question); |
50 | | const StringSearch UrlParser::_s_hash_search(&_s_hash); |
51 | | |
52 | 84 | bool UrlParser::parse_url(const StringRef& url, UrlPart part, StringRef* result) { |
53 | 84 | result->data = nullptr; |
54 | 84 | result->size = 0; |
55 | | // Remove leading and trailing spaces. |
56 | 84 | StringRef trimmed_url = url.trim(); |
57 | | |
58 | | // All parts require checking for the _s_protocol. |
59 | 84 | int32_t protocol_pos = _s_protocol_search.search(&trimmed_url); |
60 | 84 | if (protocol_pos < 0) { |
61 | 8 | return false; |
62 | 8 | } |
63 | | |
64 | | // Positioned to first char after '://'. |
65 | 76 | StringRef protocol_end = trimmed_url.substring(protocol_pos + _s_protocol.size); |
66 | | |
67 | 76 | switch (part) { |
68 | 8 | case AUTHORITY: { |
69 | | // Find first '/'. |
70 | 8 | int32_t end_pos = _s_slash_search.search(&protocol_end); |
71 | 8 | *result = protocol_end.substring(0, end_pos); |
72 | 8 | break; |
73 | 0 | } |
74 | | |
75 | 12 | case FILE: |
76 | 20 | case PATH: { |
77 | | // Find first '/'. |
78 | 20 | int32_t start_pos = _s_slash_search.search(&protocol_end); |
79 | | |
80 | 20 | if (start_pos < 0) { |
81 | | // Return empty string. This is what Hive does. |
82 | 4 | return true; |
83 | 4 | } |
84 | | |
85 | 16 | StringRef path_start = protocol_end.substring(start_pos); |
86 | 16 | int32_t end_pos; |
87 | | |
88 | 16 | if (part == FILE) { |
89 | | // End _s_at '#'. |
90 | 8 | end_pos = _s_hash_search.search(&path_start); |
91 | 8 | } else { |
92 | | // End string _s_at next '?' or '#'. |
93 | 8 | end_pos = _s_question_search.search(&path_start); |
94 | | |
95 | 8 | if (end_pos < 0) { |
96 | | // No '?' was found, look for '#'. |
97 | 8 | end_pos = _s_hash_search.search(&path_start); |
98 | 8 | } |
99 | 8 | } |
100 | | |
101 | 16 | *result = path_start.substring(0, end_pos); |
102 | 16 | break; |
103 | 20 | } |
104 | | |
105 | 12 | case HOST: { |
106 | | // Find '@'. |
107 | 12 | int32_t start_pos = _s_at_search.search(&protocol_end); |
108 | | |
109 | 12 | if (start_pos < 0) { |
110 | | // No '@' was found, i.e., no user:pass info was given, start after _s_protocol. |
111 | 12 | start_pos = 0; |
112 | 12 | } else { |
113 | | // Skip '@'. |
114 | 0 | start_pos += _s_at.size; |
115 | 0 | } |
116 | | |
117 | 12 | StringRef host_start = protocol_end.substring(start_pos); |
118 | | // Find first '?'. |
119 | 12 | int32_t query_start_pos = _s_question_search.search(&host_start); |
120 | 12 | if (query_start_pos > 0) { |
121 | 12 | host_start = host_start.substring(0, query_start_pos); |
122 | 12 | } |
123 | | // Find ':' to strip out port. |
124 | 12 | int32_t end_pos = _s_colon_search.search(&host_start); |
125 | | |
126 | 12 | if (end_pos < 0) { |
127 | | // No port was given. search for '/' to determine ending position. |
128 | 12 | end_pos = _s_slash_search.search(&host_start); |
129 | 12 | } |
130 | | |
131 | 12 | *result = host_start.substring(0, end_pos); |
132 | 12 | break; |
133 | 20 | } |
134 | | |
135 | 8 | case PROTOCOL: { |
136 | 8 | *result = trimmed_url.substring(0, protocol_pos); |
137 | 8 | break; |
138 | 20 | } |
139 | | |
140 | 12 | case QUERY: { |
141 | | // Find first '?'. |
142 | 12 | int32_t start_pos = _s_question_search.search(&protocol_end); |
143 | | |
144 | 12 | if (start_pos < 0) { |
145 | | // Indicate no query was found. |
146 | 0 | return false; |
147 | 0 | } |
148 | | |
149 | 12 | StringRef query_start = protocol_end.substring(start_pos + _s_question.size); |
150 | | // End string _s_at next '#'. |
151 | 12 | int32_t end_pos = _s_hash_search.search(&query_start); |
152 | 12 | *result = query_start.substring(0, end_pos); |
153 | 12 | break; |
154 | 12 | } |
155 | | |
156 | 8 | case REF: { |
157 | | // Find '#'. |
158 | 8 | int32_t start_pos = _s_hash_search.search(&protocol_end); |
159 | | |
160 | 8 | if (start_pos < 0) { |
161 | | // Indicate no user and pass were given. |
162 | 8 | return false; |
163 | 8 | } |
164 | | |
165 | 0 | *result = protocol_end.substring(start_pos + _s_hash.size); |
166 | 0 | break; |
167 | 8 | } |
168 | | |
169 | 0 | case USERINFO: { |
170 | | // Find '@'. |
171 | 0 | int32_t end_pos = _s_at_search.search(&protocol_end); |
172 | |
|
173 | 0 | if (end_pos < 0) { |
174 | | // Indicate no user and pass were given. |
175 | 0 | return false; |
176 | 0 | } |
177 | | |
178 | 0 | *result = protocol_end.substring(0, end_pos); |
179 | 0 | break; |
180 | 0 | } |
181 | | |
182 | 8 | case PORT: { |
183 | | // Find '@'. |
184 | 8 | int32_t start_pos = _s_at_search.search(&protocol_end); |
185 | | |
186 | 8 | if (start_pos < 0) { |
187 | | // No '@' was found, i.e., no user:pass info was given, start after _s_protocol. |
188 | 8 | start_pos = 0; |
189 | 8 | } else { |
190 | | // Skip '@'. |
191 | 0 | start_pos += _s_at.size; |
192 | 0 | } |
193 | | |
194 | 8 | StringRef host_start = protocol_end.substring(start_pos); |
195 | | // Find ':' to strip out port. |
196 | 8 | int32_t end_pos = _s_colon_search.search(&host_start); |
197 | | //no port found |
198 | 8 | if (end_pos < 0) { |
199 | 4 | return false; |
200 | 4 | } |
201 | | |
202 | 4 | StringRef port_start_str = host_start.substring(end_pos + _s_colon.size); |
203 | 4 | int32_t port_end_pos = _s_slash_search.search(&port_start_str); |
204 | | //if '/' not found, try to find '?' |
205 | 4 | if (port_end_pos < 0) { |
206 | 0 | port_end_pos = _s_question_search.search(&port_start_str); |
207 | 0 | } |
208 | 4 | *result = port_start_str.substring(0, port_end_pos); |
209 | 4 | break; |
210 | 8 | } |
211 | | |
212 | 0 | case INVALID: |
213 | 0 | return false; |
214 | 76 | } |
215 | | |
216 | 60 | return true; |
217 | 76 | } |
218 | | |
219 | | bool UrlParser::parse_url_key(const StringRef& url, UrlPart part, const StringRef& key, |
220 | 32 | StringRef* result) { |
221 | | // Part must be query to ask for a specific query key. |
222 | 32 | if (part != QUERY) { |
223 | 8 | return false; |
224 | 8 | } |
225 | | |
226 | | // Remove leading and trailing spaces. |
227 | 24 | StringRef trimmed_url = url.trim(); |
228 | | |
229 | | // Search for the key in the url, ignoring malformed URLs for now. |
230 | 24 | StringSearch key_search(&key); |
231 | | |
232 | 24 | while (trimmed_url.size > 0) { |
233 | | // Search for the key in the current substring. |
234 | 24 | int32_t key_pos = key_search.search(&trimmed_url); |
235 | 24 | bool match = true; |
236 | | |
237 | 24 | if (key_pos < 0) { |
238 | 8 | return false; |
239 | 8 | } |
240 | | |
241 | | // Key pos must be != 0 because it must be preceded by a '?' or a '&'. |
242 | | // Check that the char before key_pos is either '?' or '&'. |
243 | 16 | if (key_pos == 0 || |
244 | 16 | (trimmed_url.data[key_pos - 1] != '?' && trimmed_url.data[key_pos - 1] != '&')) { |
245 | 0 | match = false; |
246 | 0 | } |
247 | | |
248 | | // Advance substring beyond matching key. |
249 | 16 | trimmed_url = trimmed_url.substring(key_pos + key.size); |
250 | | |
251 | 16 | if (!match) { |
252 | 0 | continue; |
253 | 0 | } |
254 | | |
255 | 16 | if (trimmed_url.size <= 0) { |
256 | 0 | break; |
257 | 0 | } |
258 | | |
259 | | // Next character must be '=', otherwise the match cannot be a key in the query part. |
260 | 16 | if (trimmed_url.data[0] != '=') { |
261 | 0 | continue; |
262 | 0 | } |
263 | | |
264 | 16 | int32_t pos = 1; |
265 | | |
266 | | // Find ending position of key's value by matching '#' or '&'. |
267 | 32 | while (pos < trimmed_url.size) { |
268 | 32 | switch (trimmed_url.data[pos]) { |
269 | 16 | case '#': |
270 | 16 | case '&': |
271 | 16 | *result = trimmed_url.substring(1, pos - 1); |
272 | 16 | return true; |
273 | 32 | } |
274 | | |
275 | 16 | ++pos; |
276 | 16 | } |
277 | | |
278 | | // Ending position is end of string. |
279 | 0 | *result = trimmed_url.substring(1); |
280 | 0 | return true; |
281 | 16 | } |
282 | | |
283 | 0 | return false; |
284 | 24 | } |
285 | | |
286 | 116 | UrlParser::UrlPart UrlParser::get_url_part(const StringRef& part) { |
287 | | // Quick filter on requested URL part, based on first character. |
288 | | // Hive requires the requested URL part to be all upper case. |
289 | 116 | std::string part_str = part.to_string(); |
290 | 116 | transform(part_str.begin(), part_str.end(), part_str.begin(), ::toupper); |
291 | 116 | StringRef newPart = StringRef(part_str); |
292 | 116 | switch (newPart.data[0]) { |
293 | 8 | case 'A': { |
294 | 8 | if (!newPart.eq(_s_url_authority)) { |
295 | 0 | return INVALID; |
296 | 0 | } |
297 | | |
298 | 8 | return AUTHORITY; |
299 | 8 | } |
300 | | |
301 | 12 | case 'F': { |
302 | 12 | if (!newPart.eq(_s_url_file)) { |
303 | 0 | return INVALID; |
304 | 0 | } |
305 | | |
306 | 12 | return FILE; |
307 | 12 | } |
308 | | |
309 | 28 | case 'H': { |
310 | 28 | if (!newPart.eq(_s_url_host)) { |
311 | 0 | return INVALID; |
312 | 0 | } |
313 | | |
314 | 28 | return HOST; |
315 | 28 | } |
316 | | |
317 | 24 | case 'P': { |
318 | 24 | if (newPart.eq(_s_url_path)) { |
319 | 8 | return PATH; |
320 | 16 | } else if (newPart.eq(_s_url_protocol)) { |
321 | 8 | return PROTOCOL; |
322 | 8 | } else if (newPart.eq(_s_url_port)) { |
323 | 8 | return PORT; |
324 | 8 | } else { |
325 | 0 | return INVALID; |
326 | 0 | } |
327 | 24 | } |
328 | | |
329 | 36 | case 'Q': { |
330 | 36 | if (!newPart.eq(_s_url_query)) { |
331 | 0 | return INVALID; |
332 | 0 | } |
333 | | |
334 | 36 | return QUERY; |
335 | 36 | } |
336 | | |
337 | 8 | case 'R': { |
338 | 8 | if (!newPart.eq(_s_url_ref)) { |
339 | 0 | return INVALID; |
340 | 0 | } |
341 | | |
342 | 8 | return REF; |
343 | 8 | } |
344 | | |
345 | 0 | case 'U': { |
346 | 0 | if (!newPart.eq(_s_url_userinfo)) { |
347 | 0 | return INVALID; |
348 | 0 | } |
349 | | |
350 | 0 | return USERINFO; |
351 | 0 | } |
352 | | |
353 | 0 | default: |
354 | 0 | return INVALID; |
355 | 116 | } |
356 | 116 | } |
357 | | |
358 | 40 | StringRef UrlParser::extract_url(StringRef url, StringRef name) { |
359 | 40 | StringRef result("", 0); |
360 | | // Remove leading and trailing spaces. |
361 | 40 | StringRef trimmed_url = url.trim(); |
362 | | // find '?' |
363 | 40 | int32_t question_pos = _s_question_search.search(&trimmed_url); |
364 | 40 | if (question_pos < 0) { |
365 | | // this url no parameters. |
366 | | // Example: https://doris.apache.org/ |
367 | 4 | return result; |
368 | 4 | } |
369 | | |
370 | | // find '#' |
371 | 36 | int32_t hash_pos = _s_hash_search.search(&trimmed_url); |
372 | 36 | StringRef sub_url; |
373 | 36 | if (hash_pos < 0) { |
374 | 8 | sub_url = trimmed_url.substring(question_pos + 1, trimmed_url.size - question_pos - 1); |
375 | 28 | } else { |
376 | 28 | sub_url = trimmed_url.substring(question_pos + 1, hash_pos - question_pos - 1); |
377 | 28 | } |
378 | | |
379 | | // find '&' and '=', and extract target parameter |
380 | | // Example: k1=aa&k2=bb&k3=cc&test=dd |
381 | 36 | int64_t and_pod; |
382 | 36 | auto len = sub_url.size; |
383 | 36 | StringRef key_url; |
384 | 80 | while (true) { |
385 | 80 | if (len <= 0) { |
386 | 16 | break; |
387 | 16 | } |
388 | 64 | and_pod = sub_url.find_first_of('&'); |
389 | 64 | if (and_pod != -1) { |
390 | 40 | key_url = sub_url.substring(0, and_pod); |
391 | 40 | sub_url = sub_url.substring(and_pod + 1, len - and_pod - 1); |
392 | 40 | } else { |
393 | 24 | auto end_pos = sub_url.find_first_of('#'); |
394 | 24 | key_url = end_pos == -1 ? sub_url : sub_url.substring(0, end_pos); |
395 | 24 | sub_url = result; |
396 | 24 | } |
397 | 64 | len = sub_url.size; |
398 | | |
399 | 64 | auto eq_pod = key_url.find_first_of('='); |
400 | 64 | if (eq_pod == -1) { |
401 | | // invalid url. like: k1&k2=bb |
402 | 4 | continue; |
403 | 4 | } |
404 | 60 | auto key_len = key_url.size; |
405 | 60 | auto key = key_url.substring(0, eq_pod); |
406 | 60 | if (name == key) { |
407 | 20 | return key_url.substring(eq_pod + 1, key_len - eq_pod - 1); |
408 | 20 | } |
409 | 60 | } |
410 | 16 | return result; |
411 | 36 | } |
412 | | } // namespace doris |