Coverage Report

Created: 2026-04-14 20:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/util/url_parser.cpp
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#include "util/url_parser.h"
19
20
#include <ctype.h>
21
#include <stdint.h>
22
23
#include <algorithm>
24
#include <string>
25
26
#include "core/string_ref.h"
27
#include "util/string_search.hpp"
28
29
namespace doris {
30
// Upper-case part names that get_url_part() compares the (upper-cased) requested part against.
// The second constructor argument is the length of the literal.
const StringRef UrlParser::_s_url_authority("AUTHORITY", 9);
31
const StringRef UrlParser::_s_url_file("FILE", 4);
32
const StringRef UrlParser::_s_url_host("HOST", 4);
33
const StringRef UrlParser::_s_url_path("PATH", 4);
34
const StringRef UrlParser::_s_url_protocol("PROTOCOL", 8);
35
const StringRef UrlParser::_s_url_query("QUERY", 5);
36
const StringRef UrlParser::_s_url_ref("REF", 3);
37
const StringRef UrlParser::_s_url_userinfo("USERINFO", 8);
38
const StringRef UrlParser::_s_url_port("PORT", 4);
39
// Delimiter literals that separate the components of a URL.
const StringRef UrlParser::_s_protocol("://", 3);
40
const StringRef UrlParser::_s_at("@", 1);
41
const StringRef UrlParser::_s_slash("/", 1);
42
const StringRef UrlParser::_s_colon(":", 1);
43
const StringRef UrlParser::_s_question("?", 1);
44
const StringRef UrlParser::_s_hash("#", 1);
45
// One searcher per delimiter above, constructed once and shared by the parsing functions.
// Each keeps a pointer to its delimiter StringRef, so the delimiters must be defined first.
const StringSearch UrlParser::_s_protocol_search(&_s_protocol);
46
const StringSearch UrlParser::_s_at_search(&_s_at);
47
const StringSearch UrlParser::_s_slash_search(&_s_slash);
48
const StringSearch UrlParser::_s_colon_search(&_s_colon);
49
const StringSearch UrlParser::_s_question_search(&_s_question);
50
const StringSearch UrlParser::_s_hash_search(&_s_hash);
51
52
84
bool UrlParser::parse_url(const StringRef& url, UrlPart part, StringRef* result) {
53
84
    result->data = nullptr;
54
84
    result->size = 0;
55
    // Remove leading and trailing spaces.
56
84
    StringRef trimmed_url = url.trim();
57
58
    // All parts require checking for the _s_protocol.
59
84
    int32_t protocol_pos = _s_protocol_search.search(&trimmed_url);
60
84
    if (protocol_pos < 0) {
61
8
        return false;
62
8
    }
63
64
    // Positioned to first char after '://'.
65
76
    StringRef protocol_end = trimmed_url.substring(protocol_pos + _s_protocol.size);
66
67
76
    switch (part) {
68
8
    case AUTHORITY: {
69
        // Find first '/'.
70
8
        int32_t end_pos = _s_slash_search.search(&protocol_end);
71
8
        *result = protocol_end.substring(0, end_pos);
72
8
        break;
73
0
    }
74
75
12
    case FILE:
76
20
    case PATH: {
77
        // Find first '/'.
78
20
        int32_t start_pos = _s_slash_search.search(&protocol_end);
79
80
20
        if (start_pos < 0) {
81
            // Return empty string. This is what Hive does.
82
4
            return true;
83
4
        }
84
85
16
        StringRef path_start = protocol_end.substring(start_pos);
86
16
        int32_t end_pos;
87
88
16
        if (part == FILE) {
89
            // End _s_at '#'.
90
8
            end_pos = _s_hash_search.search(&path_start);
91
8
        } else {
92
            // End string _s_at next '?' or '#'.
93
8
            end_pos = _s_question_search.search(&path_start);
94
95
8
            if (end_pos < 0) {
96
                // No '?' was found, look for '#'.
97
8
                end_pos = _s_hash_search.search(&path_start);
98
8
            }
99
8
        }
100
101
16
        *result = path_start.substring(0, end_pos);
102
16
        break;
103
20
    }
104
105
12
    case HOST: {
106
        // Find '@'.
107
12
        int32_t start_pos = _s_at_search.search(&protocol_end);
108
109
12
        if (start_pos < 0) {
110
            // No '@' was found, i.e., no user:pass info was given, start after _s_protocol.
111
12
            start_pos = 0;
112
12
        } else {
113
            // Skip '@'.
114
0
            start_pos += _s_at.size;
115
0
        }
116
117
12
        StringRef host_start = protocol_end.substring(start_pos);
118
        // Find first '?'.
119
12
        int32_t query_start_pos = _s_question_search.search(&host_start);
120
12
        if (query_start_pos > 0) {
121
12
            host_start = host_start.substring(0, query_start_pos);
122
12
        }
123
        // Find ':' to strip out port.
124
12
        int32_t end_pos = _s_colon_search.search(&host_start);
125
126
12
        if (end_pos < 0) {
127
            // No port was given. search for '/' to determine ending position.
128
12
            end_pos = _s_slash_search.search(&host_start);
129
12
        }
130
131
12
        *result = host_start.substring(0, end_pos);
132
12
        break;
133
20
    }
134
135
8
    case PROTOCOL: {
136
8
        *result = trimmed_url.substring(0, protocol_pos);
137
8
        break;
138
20
    }
139
140
12
    case QUERY: {
141
        // Find first '?'.
142
12
        int32_t start_pos = _s_question_search.search(&protocol_end);
143
144
12
        if (start_pos < 0) {
145
            // Indicate no query was found.
146
0
            return false;
147
0
        }
148
149
12
        StringRef query_start = protocol_end.substring(start_pos + _s_question.size);
150
        // End string _s_at next '#'.
151
12
        int32_t end_pos = _s_hash_search.search(&query_start);
152
12
        *result = query_start.substring(0, end_pos);
153
12
        break;
154
12
    }
155
156
8
    case REF: {
157
        // Find '#'.
158
8
        int32_t start_pos = _s_hash_search.search(&protocol_end);
159
160
8
        if (start_pos < 0) {
161
            // Indicate no user and pass were given.
162
8
            return false;
163
8
        }
164
165
0
        *result = protocol_end.substring(start_pos + _s_hash.size);
166
0
        break;
167
8
    }
168
169
0
    case USERINFO: {
170
        // Find '@'.
171
0
        int32_t end_pos = _s_at_search.search(&protocol_end);
172
173
0
        if (end_pos < 0) {
174
            // Indicate no user and pass were given.
175
0
            return false;
176
0
        }
177
178
0
        *result = protocol_end.substring(0, end_pos);
179
0
        break;
180
0
    }
181
182
8
    case PORT: {
183
        // Find '@'.
184
8
        int32_t start_pos = _s_at_search.search(&protocol_end);
185
186
8
        if (start_pos < 0) {
187
            // No '@' was found, i.e., no user:pass info was given, start after _s_protocol.
188
8
            start_pos = 0;
189
8
        } else {
190
            // Skip '@'.
191
0
            start_pos += _s_at.size;
192
0
        }
193
194
8
        StringRef host_start = protocol_end.substring(start_pos);
195
        // Find ':' to strip out port.
196
8
        int32_t end_pos = _s_colon_search.search(&host_start);
197
        //no port found
198
8
        if (end_pos < 0) {
199
4
            return false;
200
4
        }
201
202
4
        StringRef port_start_str = host_start.substring(end_pos + _s_colon.size);
203
4
        int32_t port_end_pos = _s_slash_search.search(&port_start_str);
204
        //if '/' not found, try to find '?'
205
4
        if (port_end_pos < 0) {
206
0
            port_end_pos = _s_question_search.search(&port_start_str);
207
0
        }
208
4
        *result = port_start_str.substring(0, port_end_pos);
209
4
        break;
210
8
    }
211
212
0
    case INVALID:
213
0
        return false;
214
76
    }
215
216
60
    return true;
217
76
}
218
219
bool UrlParser::parse_url_key(const StringRef& url, UrlPart part, const StringRef& key,
220
32
                              StringRef* result) {
221
    // Part must be query to ask for a specific query key.
222
32
    if (part != QUERY) {
223
8
        return false;
224
8
    }
225
226
    // Remove leading and trailing spaces.
227
24
    StringRef trimmed_url = url.trim();
228
229
    // Search for the key in the url, ignoring malformed URLs for now.
230
24
    StringSearch key_search(&key);
231
232
24
    while (trimmed_url.size > 0) {
233
        // Search for the key in the current substring.
234
24
        int32_t key_pos = key_search.search(&trimmed_url);
235
24
        bool match = true;
236
237
24
        if (key_pos < 0) {
238
8
            return false;
239
8
        }
240
241
        // Key pos must be != 0 because it must be preceded by a '?' or a '&'.
242
        // Check that the char before key_pos is either '?' or '&'.
243
16
        if (key_pos == 0 ||
244
16
            (trimmed_url.data[key_pos - 1] != '?' && trimmed_url.data[key_pos - 1] != '&')) {
245
0
            match = false;
246
0
        }
247
248
        // Advance substring beyond matching key.
249
16
        trimmed_url = trimmed_url.substring(key_pos + key.size);
250
251
16
        if (!match) {
252
0
            continue;
253
0
        }
254
255
16
        if (trimmed_url.size <= 0) {
256
0
            break;
257
0
        }
258
259
        // Next character must be '=', otherwise the match cannot be a key in the query part.
260
16
        if (trimmed_url.data[0] != '=') {
261
0
            continue;
262
0
        }
263
264
16
        int32_t pos = 1;
265
266
        // Find ending position of key's value by matching '#' or '&'.
267
32
        while (pos < trimmed_url.size) {
268
32
            switch (trimmed_url.data[pos]) {
269
16
            case '#':
270
16
            case '&':
271
16
                *result = trimmed_url.substring(1, pos - 1);
272
16
                return true;
273
32
            }
274
275
16
            ++pos;
276
16
        }
277
278
        // Ending position is end of string.
279
0
        *result = trimmed_url.substring(1);
280
0
        return true;
281
16
    }
282
283
0
    return false;
284
24
}
285
286
116
// Maps a textual part name (e.g. "host", "QUERY") to the matching UrlPart value.
//
// The comparison is case-insensitive: the input is upper-cased before it is compared
// against the known part names. Returns INVALID for an empty or unrecognised name.
UrlParser::UrlPart UrlParser::get_url_part(const StringRef& part) {
    std::string part_str = part.to_string();
    if (part_str.empty()) {
        return INVALID;
    }

    // toupper() has undefined behaviour for negative arguments other than EOF, so each byte
    // is passed as unsigned char rather than as a (possibly signed) char.
    std::transform(part_str.begin(), part_str.end(), part_str.begin(),
                   [](unsigned char c) { return static_cast<char>(::toupper(c)); });
    StringRef newPart = StringRef(part_str);

    // Quick filter on the first character, then confirm with a full comparison.
    switch (newPart.data[0]) {
    case 'A':
        return newPart.eq(_s_url_authority) ? AUTHORITY : INVALID;

    case 'F':
        return newPart.eq(_s_url_file) ? FILE : INVALID;

    case 'H':
        return newPart.eq(_s_url_host) ? HOST : INVALID;

    case 'P': {
        // Three part names share the initial 'P'.
        if (newPart.eq(_s_url_path)) {
            return PATH;
        }
        if (newPart.eq(_s_url_protocol)) {
            return PROTOCOL;
        }
        if (newPart.eq(_s_url_port)) {
            return PORT;
        }
        return INVALID;
    }

    case 'Q':
        return newPart.eq(_s_url_query) ? QUERY : INVALID;

    case 'R':
        return newPart.eq(_s_url_ref) ? REF : INVALID;

    case 'U':
        return newPart.eq(_s_url_userinfo) ? USERINFO : INVALID;

    default:
        return INVALID;
    }
}
357
358
40
// Returns the value of the query parameter called `name` in `url`.
//
// The query is the text between the first '?' and the first '#' (or the end of the trimmed
// URL when there is no '#'). It is split on '&' into "key=value" pairs; pairs without '='
// are skipped. The value of the first pair whose key equals `name` is returned; an empty
// StringRef is returned when the URL has no '?' or no pair matches.
StringRef UrlParser::extract_url(StringRef url, StringRef name) {
    StringRef result("", 0);

    // Ignore surrounding spaces.
    StringRef trimmed_url = url.trim();

    // A URL without '?' carries no parameters, e.g. https://doris.apache.org/
    int32_t question_pos = _s_question_search.search(&trimmed_url);
    if (question_pos < 0) {
        return result;
    }

    // The query stops at '#' when one exists, otherwise at the end of the URL.
    int32_t hash_pos = _s_hash_search.search(&trimmed_url);
    StringRef remaining =
            hash_pos < 0
                    ? trimmed_url.substring(question_pos + 1, trimmed_url.size - question_pos - 1)
                    : trimmed_url.substring(question_pos + 1, hash_pos - question_pos - 1);

    // Peel one "key=value" pair off the front per iteration.
    // Example query: k1=aa&k2=bb&k3=cc&test=dd
    while (remaining.size > 0) {
        StringRef pair;
        int64_t amp_pos = remaining.find_first_of('&');
        if (amp_pos == -1) {
            // Last pair: take everything up to a '#', if any, and finish after this round.
            auto frag_pos = remaining.find_first_of('#');
            pair = frag_pos == -1 ? remaining : remaining.substring(0, frag_pos);
            remaining = result;
        } else {
            pair = remaining.substring(0, amp_pos);
            remaining = remaining.substring(amp_pos + 1, remaining.size - amp_pos - 1);
        }

        auto eq_pos = pair.find_first_of('=');
        if (eq_pos != -1) {
            auto pair_key = pair.substring(0, eq_pos);
            if (name == pair_key) {
                return pair.substring(eq_pos + 1, pair.size - eq_pos - 1);
            }
        }
        // Otherwise the pair has no '=' (e.g. "k1" in k1&k2=bb) or a different key: try the next.
    }
    return result;
}
412
} // namespace doris