Coverage Report

Created: 2024-11-20 19:28

/root/doris/be/src/util/url_parser.cpp
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "util/url_parser.h"
19
20
#include <ctype.h>
21
#include <stdint.h>
22
23
#include <algorithm>
24
#include <string>
25
26
#include "runtime/string_search.hpp"
27
#include "vec/common/string_ref.h"
28
29
namespace doris {
30
31
// Upper-case names of the URL parts. get_url_part() compares the (upper-cased)
// requested part name against these. The second constructor argument of each
// definition equals the character count of its literal.
const StringRef UrlParser::_s_url_authority(const_cast<char*>("AUTHORITY"), 9);
31
const StringRef UrlParser::_s_url_file(const_cast<char*>("FILE"), 4);
32
const StringRef UrlParser::_s_url_host(const_cast<char*>("HOST"), 4);
33
const StringRef UrlParser::_s_url_path(const_cast<char*>("PATH"), 4);
34
const StringRef UrlParser::_s_url_protocol(const_cast<char*>("PROTOCOL"), 8);
35
const StringRef UrlParser::_s_url_query(const_cast<char*>("QUERY"), 5);
36
const StringRef UrlParser::_s_url_ref(const_cast<char*>("REF"), 3);
37
const StringRef UrlParser::_s_url_userinfo(const_cast<char*>("USERINFO"), 8);
38
const StringRef UrlParser::_s_url_port(const_cast<char*>("PORT"), 4);
39
// Delimiters looked for while splitting a URL. parse_url() also uses their
// .size to skip over a delimiter once it has been found.
const StringRef UrlParser::_s_protocol(const_cast<char*>("://"), 3);
40
const StringRef UrlParser::_s_at(const_cast<char*>("@"), 1);
41
const StringRef UrlParser::_s_slash(const_cast<char*>("/"), 1);
42
const StringRef UrlParser::_s_colon(const_cast<char*>(":"), 1);
43
const StringRef UrlParser::_s_question(const_cast<char*>("?"), 1);
44
const StringRef UrlParser::_s_hash(const_cast<char*>("#"), 1);
45
// One StringSearch per delimiter above, each constructed from the address of
// the corresponding delimiter object.
const StringSearch UrlParser::_s_protocol_search(&_s_protocol);
46
const StringSearch UrlParser::_s_at_search(&_s_at);
47
const StringSearch UrlParser::_s_slash_search(&_s_slash);
48
const StringSearch UrlParser::_s_colon_search(&_s_colon);
49
const StringSearch UrlParser::_s_question_search(&_s_question);
50
const StringSearch UrlParser::_s_hash_search(&_s_hash);
52
53
19
bool UrlParser::parse_url(const StringRef& url, UrlPart part, StringRef* result) {
54
19
    result->data = nullptr;
55
19
    result->size = 0;
56
    // Remove leading and trailing spaces.
57
19
    StringRef trimmed_url = url.trim();
58
59
    // All parts require checking for the _s_protocol.
60
19
    int32_t protocol_pos = _s_protocol_search.search(&trimmed_url);
61
19
    if (protocol_pos < 0) {
62
2
        return false;
63
2
    }
64
65
    // Positioned to first char after '://'.
66
17
    StringRef protocol_end = trimmed_url.substring(protocol_pos + _s_protocol.size);
67
68
17
    switch (part) {
69
2
    case AUTHORITY: {
70
        // Find first '/'.
71
2
        int32_t end_pos = _s_slash_search.search(&protocol_end);
72
2
        *result = protocol_end.substring(0, end_pos);
73
2
        break;
74
0
    }
75
76
2
    case FILE:
77
4
    case PATH: {
78
        // Find first '/'.
79
4
        int32_t start_pos = _s_slash_search.search(&protocol_end);
80
81
4
        if (start_pos < 0) {
82
            // Return empty string. This is what Hive does.
83
0
            return true;
84
0
        }
85
86
4
        StringRef path_start = protocol_end.substring(start_pos);
87
4
        int32_t end_pos;
88
89
4
        if (part == FILE) {
90
            // End _s_at '#'.
91
2
            end_pos = _s_hash_search.search(&path_start);
92
2
        } else {
93
            // End string _s_at next '?' or '#'.
94
2
            end_pos = _s_question_search.search(&path_start);
95
96
2
            if (end_pos < 0) {
97
                // No '?' was found, look for '#'.
98
2
                end_pos = _s_hash_search.search(&path_start);
99
2
            }
100
2
        }
101
102
4
        *result = path_start.substring(0, end_pos);
103
4
        break;
104
4
    }
105
106
2
    case HOST: {
107
        // Find '@'.
108
2
        int32_t start_pos = _s_at_search.search(&protocol_end);
109
110
2
        if (start_pos < 0) {
111
            // No '@' was found, i.e., no user:pass info was given, start after _s_protocol.
112
2
            start_pos = 0;
113
2
        } else {
114
            // Skip '@'.
115
0
            start_pos += _s_at.size;
116
0
        }
117
118
2
        StringRef host_start = protocol_end.substring(start_pos);
119
        // Find first '?'.
120
2
        int32_t query_start_pos = _s_question_search.search(&host_start);
121
2
        if (query_start_pos > 0) {
122
2
            host_start = host_start.substring(0, query_start_pos);
123
2
        }
124
        // Find ':' to strip out port.
125
2
        int32_t end_pos = _s_colon_search.search(&host_start);
126
127
2
        if (end_pos < 0) {
128
            // No port was given. search for '/' to determine ending position.
129
2
            end_pos = _s_slash_search.search(&host_start);
130
2
        }
131
132
2
        *result = host_start.substring(0, end_pos);
133
2
        break;
134
4
    }
135
136
2
    case PROTOCOL: {
137
2
        *result = trimmed_url.substring(0, protocol_pos);
138
2
        break;
139
4
    }
140
141
3
    case QUERY: {
142
        // Find first '?'.
143
3
        int32_t start_pos = _s_question_search.search(&protocol_end);
144
145
3
        if (start_pos < 0) {
146
            // Indicate no query was found.
147
0
            return false;
148
0
        }
149
150
3
        StringRef query_start = protocol_end.substring(start_pos + _s_question.size);
151
        // End string _s_at next '#'.
152
3
        int32_t end_pos = _s_hash_search.search(&query_start);
153
3
        *result = query_start.substring(0, end_pos);
154
3
        break;
155
3
    }
156
157
2
    case REF: {
158
        // Find '#'.
159
2
        int32_t start_pos = _s_hash_search.search(&protocol_end);
160
161
2
        if (start_pos < 0) {
162
            // Indicate no user and pass were given.
163
2
            return false;
164
2
        }
165
166
0
        *result = protocol_end.substring(start_pos + _s_hash.size);
167
0
        break;
168
2
    }
169
170
0
    case USERINFO: {
171
        // Find '@'.
172
0
        int32_t end_pos = _s_at_search.search(&protocol_end);
173
174
0
        if (end_pos < 0) {
175
            // Indicate no user and pass were given.
176
0
            return false;
177
0
        }
178
179
0
        *result = protocol_end.substring(0, end_pos);
180
0
        break;
181
0
    }
182
183
2
    case PORT: {
184
        // Find '@'.
185
2
        int32_t start_pos = _s_at_search.search(&protocol_end);
186
187
2
        if (start_pos < 0) {
188
            // No '@' was found, i.e., no user:pass info was given, start after _s_protocol.
189
2
            start_pos = 0;
190
2
        } else {
191
            // Skip '@'.
192
0
            start_pos += _s_at.size;
193
0
        }
194
195
2
        StringRef host_start = protocol_end.substring(start_pos);
196
        // Find ':' to strip out port.
197
2
        int32_t end_pos = _s_colon_search.search(&host_start);
198
        //no port found
199
2
        if (end_pos < 0) {
200
1
            return false;
201
1
        }
202
203
1
        StringRef port_start_str = protocol_end.substring(end_pos + _s_colon.size);
204
1
        int32_t port_end_pos = _s_slash_search.search(&port_start_str);
205
        //if '/' not found, try to find '?'
206
1
        if (port_end_pos < 0) {
207
0
            port_end_pos = _s_question_search.search(&port_start_str);
208
0
        }
209
1
        *result = port_start_str.substring(0, port_end_pos);
210
1
        break;
211
2
    }
212
213
0
    case INVALID:
214
0
        return false;
215
17
    }
216
217
14
    return true;
218
17
}
219
220
bool UrlParser::parse_url_key(const StringRef& url, UrlPart part, const StringRef& key,
221
4
                              StringRef* result) {
222
    // Part must be query to ask for a specific query key.
223
4
    if (part != QUERY) {
224
1
        return false;
225
1
    }
226
227
    // Remove leading and trailing spaces.
228
3
    StringRef trimmed_url = url.trim();
229
230
    // Search for the key in the url, ignoring malformed URLs for now.
231
3
    StringSearch key_search(&key);
232
233
3
    while (trimmed_url.size > 0) {
234
        // Search for the key in the current substring.
235
3
        int32_t key_pos = key_search.search(&trimmed_url);
236
3
        bool match = true;
237
238
3
        if (key_pos < 0) {
239
1
            return false;
240
1
        }
241
242
        // Key pos must be != 0 because it must be preceded by a '?' or a '&'.
243
        // Check that the char before key_pos is either '?' or '&'.
244
2
        if (key_pos == 0 ||
245
2
            (trimmed_url.data[key_pos - 1] != '?' && trimmed_url.data[key_pos - 1] != '&')) {
246
0
            match = false;
247
0
        }
248
249
        // Advance substring beyond matching key.
250
2
        trimmed_url = trimmed_url.substring(key_pos + key.size);
251
252
2
        if (!match) {
253
0
            continue;
254
0
        }
255
256
2
        if (trimmed_url.size <= 0) {
257
0
            break;
258
0
        }
259
260
        // Next character must be '=', otherwise the match cannot be a key in the query part.
261
2
        if (trimmed_url.data[0] != '=') {
262
0
            continue;
263
0
        }
264
265
2
        int32_t pos = 1;
266
267
        // Find ending position of key's value by matching '#' or '&'.
268
4
        while (pos < trimmed_url.size) {
269
4
            switch (trimmed_url.data[pos]) {
270
2
            case '#':
271
2
            case '&':
272
2
                *result = trimmed_url.substring(1, pos - 1);
273
2
                return true;
274
4
            }
275
276
2
            ++pos;
277
2
        }
278
279
        // Ending position is end of string.
280
0
        *result = trimmed_url.substring(1);
281
0
        return true;
282
2
    }
283
284
0
    return false;
285
3
}
286
287
23
/// Maps a part name such as "HOST" or "query" to the corresponding UrlPart.
///
/// The name is upper-cased before comparison, so matching is case-insensitive.
/// The first character is used as a quick filter; the full name is then
/// compared against the known part names.
///
/// @param part Requested part name.
/// @return The matching UrlPart, or INVALID when the name is empty or unknown.
UrlParser::UrlPart UrlParser::get_url_part(const StringRef& part) {
    std::string part_str = part.to_string();
    if (part_str.empty()) {
        return INVALID;
    }

    // toupper() has undefined behaviour for negative arguments other than EOF,
    // so each byte is passed as unsigned char (matters for non-ASCII input).
    std::transform(part_str.begin(), part_str.end(), part_str.begin(),
                   [](unsigned char c) { return static_cast<char>(::toupper(c)); });
    StringRef newPart = StringRef(part_str);

    switch (newPart.data[0]) {
    case 'A':
        return newPart.eq(_s_url_authority) ? AUTHORITY : INVALID;

    case 'F':
        return newPart.eq(_s_url_file) ? FILE : INVALID;

    case 'H':
        return newPart.eq(_s_url_host) ? HOST : INVALID;

    case 'P':
        // Three part names share the initial 'P'.
        if (newPart.eq(_s_url_path)) {
            return PATH;
        }
        if (newPart.eq(_s_url_protocol)) {
            return PROTOCOL;
        }
        if (newPart.eq(_s_url_port)) {
            return PORT;
        }
        return INVALID;

    case 'Q':
        return newPart.eq(_s_url_query) ? QUERY : INVALID;

    case 'R':
        return newPart.eq(_s_url_ref) ? REF : INVALID;

    case 'U':
        return newPart.eq(_s_url_userinfo) ? USERINFO : INVALID;

    default:
        return INVALID;
    }
}
358
359
10
/// Returns the value of query parameter `name` in `url`.
///
/// The parameter list is the text after the first '?', limited by the first
/// '#' when the URL has one. It is split on '&' into "key=value" items; the
/// value of the first item whose key equals `name` is returned.
///
/// @param url  URL to inspect; leading and trailing spaces are trimmed.
/// @param name Parameter name to look for.
/// @return The parameter's value, or an empty StringRef ("", 0) when the URL
///         has no '?' or no item with that key.
StringRef UrlParser::extract_url(StringRef url, StringRef name) {
    const StringRef empty_result("", 0);

    // Remove leading and trailing spaces.
    StringRef trimmed_url = url.trim();

    // No '?' means the URL carries no parameters, e.g. https://doris.apache.org/
    const int32_t question_pos = _s_question_search.search(&trimmed_url);
    if (question_pos < 0) {
        return empty_result;
    }

    // The parameter list starts after '?' and stops at '#' when there is one.
    const int32_t hash_pos = _s_hash_search.search(&trimmed_url);
    StringRef params =
            (hash_pos < 0)
                    ? trimmed_url.substring(question_pos + 1, trimmed_url.size - question_pos - 1)
                    : trimmed_url.substring(question_pos + 1, hash_pos - question_pos - 1);

    // Peel off one '&'-separated item per iteration.
    // Example parameter list: k1=aa&k2=bb&k3=cc&test=dd
    while (params.size > 0) {
        const auto params_len = params.size;
        StringRef item;

        const int64_t amp_pos = params.find_first_of('&');
        if (amp_pos == -1) {
            // Last item: take everything, minus anything from a '#' onwards.
            const auto fragment_pos = params.find_first_of('#');
            if (fragment_pos == -1) {
                item = params;
            } else {
                item = params.substring(0, fragment_pos);
            }
            params = empty_result;
        } else {
            item = params.substring(0, amp_pos);
            params = params.substring(amp_pos + 1, params_len - amp_pos - 1);
        }

        const auto eq_pos = item.find_first_of('=');
        if (eq_pos == -1) {
            // Item without '=' (e.g. the "k1" in "k1&k2=bb"): skip it.
            continue;
        }

        const int32_t item_len = item.size;
        const auto item_key = item.substring(0, eq_pos);
        if (name == item_key) {
            return item.substring(eq_pos + 1, item_len - eq_pos - 1);
        }
    }

    return empty_result;
}
413
} // namespace doris