Coverage Report

Created: 2026-03-15 15:59

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/url/domain.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/domain.h
19
// and modified by Doris
20
21
#pragma once
22
23
#include <cstring>
24
25
#include "exec/common/string_utils/string_utils.h"
26
#include "exprs/function/url/find_symbols.h"
27
#include "exprs/function/url/protocol.h"
28
#include "exprs/function/url/tldLookup.h"
29
30
namespace doris {
31
32
inline StringRef check_and_return_host(const Pos& pos, const Pos& dot_pos,
33
119
                                       const Pos& start_of_host) {
34
119
    if (!dot_pos || start_of_host >= pos || pos - dot_pos == 1) return StringRef {};
35
36
50
    auto after_dot = *(dot_pos + 1);
37
50
    if (after_dot == ':' || after_dot == '/' || after_dot == '?' || after_dot == '#')
38
0
        return StringRef {};
39
40
50
    return StringRef(start_of_host, pos - start_of_host);
41
50
}
42
43
/// Extracts host from given url.
44
///
45
/// @return empty StringRef if the host is not valid (i.e. it does not have a dot, or there is no symbol after the dot).
46
137
inline StringRef get_url_host(const char* data, size_t size) {
47
137
    Pos pos = data;
48
137
    Pos end = data + size;
49
50
137
    if (*pos == '/' && *(pos + 1) == '/') {
51
4
        pos += 2;
52
133
    } else {
53
133
        Pos scheme_end = data + std::min(size, 16UL);
54
886
        for (++pos; pos < scheme_end; ++pos) {
55
788
            if (!is_alpha_numeric_ascii(*pos)) {
56
86
                switch (*pos) {
57
51
                case '.':
58
51
                case '-':
59
51
                case '+':
60
51
                    break;
61
9
                case ' ': /// restricted symbols
62
9
                case '\t':
63
9
                case '<':
64
9
                case '>':
65
9
                case '%':
66
9
                case '{':
67
9
                case '}':
68
9
                case '|':
69
9
                case '\\':
70
9
                case '^':
71
9
                case '~':
72
9
                case '[':
73
9
                case ']':
74
9
                case ';':
75
9
                case '=':
76
9
                case '&':
77
9
                    return StringRef {};
78
26
                default:
79
26
                    goto exloop;
80
86
                }
81
86
            }
82
788
        }
83
124
    exloop:
84
124
        if ((scheme_end - pos) > 2 && *pos == ':' && *(pos + 1) == '/' && *(pos + 2) == '/')
85
24
            pos += 3;
86
100
        else
87
100
            pos = data;
88
124
    }
89
90
128
    Pos dot_pos = nullptr;
91
128
    const auto* start_of_host = pos;
92
1.34k
    for (; pos < end; ++pos) {
93
1.25k
        switch (*pos) {
94
115
        case '.':
95
115
            dot_pos = pos;
96
115
            break;
97
6
        case ':': /// end symbols
98
24
        case '/':
99
26
        case '?':
100
26
        case '#':
101
26
            return check_and_return_host(pos, dot_pos, start_of_host);
102
4
        case '@': /// myemail@gmail.com
103
4
            start_of_host = pos + 1;
104
4
            break;
105
9
        case ' ': /// restricted symbols in whole URL
106
9
        case '\t':
107
9
        case '<':
108
9
        case '>':
109
9
        case '%':
110
9
        case '{':
111
9
        case '}':
112
9
        case '|':
113
9
        case '\\':
114
9
        case '^':
115
9
        case '~':
116
9
        case '[':
117
9
        case ']':
118
9
        case ';':
119
9
        case '=':
120
9
        case '&':
121
9
            return StringRef {};
122
1.25k
        }
123
1.25k
    }
124
125
93
    return check_and_return_host(pos, dot_pos, start_of_host);
126
128
}
127
128
template <bool without_www>
129
struct ExtractDomain {
130
18
    static size_t get_reserve_length_for_element() { return 15; }
_ZN5doris13ExtractDomainILb0EE30get_reserve_length_for_elementEv
Line
Count
Source
130
9
    static size_t get_reserve_length_for_element() { return 15; }
_ZN5doris13ExtractDomainILb1EE30get_reserve_length_for_elementEv
Line
Count
Source
130
9
    static size_t get_reserve_length_for_element() { return 15; }
131
132
114
    static void execute(Pos data, size_t size, Pos& res_data, size_t& res_size) {
133
114
        StringRef host = get_url_host(data, size);
134
135
114
        if (host.size == 0) {
136
76
            res_data = data;
137
76
            res_size = 0;
138
76
        } else {
139
38
            if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4))
140
21
                host = {host.data + 4, host.size - 4};
141
142
38
            res_data = host.data;
143
38
            res_size = host.size;
144
38
        }
145
114
    }
_ZN5doris13ExtractDomainILb0EE7executeEPKcmRS3_Rm
Line
Count
Source
132
34
    static void execute(Pos data, size_t size, Pos& res_data, size_t& res_size) {
133
34
        StringRef host = get_url_host(data, size);
134
135
34
        if (host.size == 0) {
136
27
            res_data = data;
137
27
            res_size = 0;
138
27
        } else {
139
7
            if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4))
140
0
                host = {host.data + 4, host.size - 4};
141
142
7
            res_data = host.data;
143
7
            res_size = host.size;
144
7
        }
145
34
    }
_ZN5doris13ExtractDomainILb1EE7executeEPKcmRS3_Rm
Line
Count
Source
132
80
    static void execute(Pos data, size_t size, Pos& res_data, size_t& res_size) {
133
80
        StringRef host = get_url_host(data, size);
134
135
80
        if (host.size == 0) {
136
49
            res_data = data;
137
49
            res_size = 0;
138
49
        } else {
139
31
            if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4))
140
21
                host = {host.data + 4, host.size - 4};
141
142
31
            res_data = host.data;
143
31
            res_size = host.size;
144
31
        }
145
80
    }
146
};
147
148
struct ExtractTopLevelDomain {
149
19
    static size_t get_reserve_length_for_element() { return 5; }
150
151
23
    static void execute(const char* data, size_t size, const char*& res_data, size_t& res_size) {
152
23
        res_data = data;
153
23
        res_size = 0;
154
23
        StringRef host = get_url_host(data, size);
155
156
23
        if (host.size == 0) {
157
11
            return;
158
12
        } else {
159
12
            auto host_view = host.to_string_view();
160
12
            if (host_view[host_view.size() - 1] == '.') {
161
0
                host_view.remove_suffix(1);
162
0
            }
163
164
12
            const auto* host_end = host_view.data() + host_view.size();
165
12
            const char* last_dot = find_last_symbols_or_null<'.'>(host_view.data(), host_end);
166
12
            if (!last_dot) {
167
0
                return;
168
0
            }
169
170
            /// For IPv4 addresses select nothing.
171
            ///
172
            /// NOTE: it is safe to access last_dot[1]
173
            /// since get_url_host() will not return a host if there is no symbol after the dot.
174
12
            if (is_numeric_ascii(last_dot[1])) {
175
0
                return;
176
0
            }
177
178
12
            res_data = last_dot + 1;
179
12
            res_size = host_end - res_data;
180
12
        }
181
23
    }
182
};
183
184
struct ExtractFirstSignificantSubdomain {
185
19
    static size_t get_reserve_length_for_element() { return 10; }
186
187
    static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size,
188
46
                        Pos* out_domain_end = nullptr) {
189
46
        res_data = data;
190
46
        res_size = 0;
191
192
46
        Pos tmp;
193
46
        size_t domain_length = 0;
194
46
        ExtractDomain<true>::execute(data, size, tmp, domain_length);
195
196
46
        if (domain_length == 0) {
197
22
            return;
198
22
        }
199
24
        if (out_domain_end) {
200
12
            *out_domain_end = tmp + domain_length;
201
12
        }
202
203
        /// cut useless dot
204
24
        if (tmp[domain_length - 1] == '.') {
205
0
            --domain_length;
206
0
        }
207
208
24
        res_data = tmp;
209
24
        res_size = domain_length;
210
211
24
        const auto* begin = tmp;
212
24
        const auto* end = begin + domain_length;
213
24
        std::array<const char*, 3> last_periods {};
214
215
24
        const auto* pos = find_first_symbols<'.'>(begin, end);
216
66
        while (pos < end) {
217
42
            last_periods[2] = last_periods[1];
218
42
            last_periods[1] = last_periods[0];
219
42
            last_periods[0] = pos;
220
42
            pos = find_first_symbols<'.'>(pos + 1, end);
221
42
        }
222
223
24
        if (!last_periods[0]) {
224
0
            return;
225
0
        }
226
227
24
        if (!last_periods[1]) {
228
10
            res_size = last_periods[0] - begin;
229
10
            return;
230
10
        }
231
232
14
        if (!last_periods[2]) {
233
10
            last_periods[2] = begin - 1;
234
10
        }
235
236
14
        const auto* end_of_level_domain = find_first_symbols<'/'>(last_periods[0], end);
237
14
        if (!end_of_level_domain) {
238
0
            end_of_level_domain = end;
239
0
        }
240
241
14
        auto host_len = static_cast<size_t>(end_of_level_domain - last_periods[1] - 1);
242
14
        StringRef host {last_periods[1] + 1, host_len};
243
14
        if (tldLookup::is_valid(host.data, host.size)) {
244
10
            res_data += last_periods[2] + 1 - begin;
245
10
            res_size = last_periods[1] - last_periods[2] - 1;
246
10
        } else {
247
4
            res_data += last_periods[1] + 1 - begin;
248
4
            res_size = last_periods[0] - last_periods[1] - 1;
249
4
        }
250
14
    }
251
};
252
253
struct CutToFirstSignificantSubdomain {
254
19
    static size_t get_reserve_length_for_element() { return 15; }
255
256
23
    static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size) {
257
23
        res_data = data;
258
23
        res_size = 0;
259
260
23
        Pos tmp_data = data;
261
23
        size_t tmp_length;
262
23
        Pos domain_end = data;
263
23
        ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end);
264
265
23
        if (tmp_length == 0) {
266
11
            return;
267
11
        }
268
12
        res_data = tmp_data;
269
12
        res_size = domain_end - tmp_data;
270
12
    }
271
};
272
} // namespace doris