Coverage Report

Created: 2026-03-15 18:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/url/domain.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/domain.h
19
// and modified by Doris
20
21
#pragma once
22
23
#include <cstring>
24
25
#include "exec/common/string_utils/string_utils.h"
26
#include "exprs/function/url/find_symbols.h"
27
#include "exprs/function/url/protocol.h"
28
#include "exprs/function/url/tldLookup.h"
29
30
namespace doris {
31
32
/// Validates the host candidate [start_of_host, pos) and wraps it in a StringRef.
///
/// @param pos            one past the last character of the candidate host
/// @param dot_pos        position of the last '.' seen by the caller, or nullptr if none was seen
/// @param start_of_host  first character of the candidate host
/// @return the candidate host, or an empty StringRef when no dot was seen, the candidate
///         is empty, the dot is immediately before `pos`, or the character right after the
///         dot is one of ':', '/', '?', '#'.
inline StringRef check_and_return_host(const Pos& pos, const Pos& dot_pos,
                                       const Pos& start_of_host) {
    const bool dot_seen = dot_pos != nullptr;
    if (!dot_seen || pos <= start_of_host || dot_pos + 1 == pos) {
        return StringRef {};
    }

    // The dot is not the last scanned character here, so dot_pos[1] lies before `pos`.
    switch (dot_pos[1]) {
    case ':':
    case '/':
    case '?':
    case '#':
        return StringRef {};
    default:
        return StringRef(start_of_host, pos - start_of_host);
    }
}
42
43
/// Extracts host from given url.
44
///
45
/// @return empty StringRef if the host is not valid (i.e. it does not have dot, or there no symbol after dot).
46
18
inline StringRef get_url_host(const char* data, size_t size) {
47
18
    Pos pos = data;
48
18
    Pos end = data + size;
49
50
18
    if (*pos == '/' && *(pos + 1) == '/') {
51
4
        pos += 2;
52
14
    } else {
53
14
        Pos scheme_end = data + std::min(size, 16UL);
54
86
        for (++pos; pos < scheme_end; ++pos) {
55
80
            if (!is_alpha_numeric_ascii(*pos)) {
56
14
                switch (*pos) {
57
6
                case '.':
58
6
                case '-':
59
6
                case '+':
60
6
                    break;
61
0
                case ' ': /// restricted symbols
62
0
                case '\t':
63
0
                case '<':
64
0
                case '>':
65
0
                case '%':
66
0
                case '{':
67
0
                case '}':
68
0
                case '|':
69
0
                case '\\':
70
0
                case '^':
71
0
                case '~':
72
0
                case '[':
73
0
                case ']':
74
0
                case ';':
75
0
                case '=':
76
0
                case '&':
77
0
                    return StringRef {};
78
8
                default:
79
8
                    goto exloop;
80
14
                }
81
14
            }
82
80
        }
83
14
    exloop:
84
14
        if ((scheme_end - pos) > 2 && *pos == ':' && *(pos + 1) == '/' && *(pos + 2) == '/')
85
6
            pos += 3;
86
8
        else
87
8
            pos = data;
88
14
    }
89
90
18
    Pos dot_pos = nullptr;
91
18
    const auto* start_of_host = pos;
92
236
    for (; pos < end; ++pos) {
93
226
        switch (*pos) {
94
28
        case '.':
95
28
            dot_pos = pos;
96
28
            break;
97
6
        case ':': /// end symbols
98
6
        case '/':
99
8
        case '?':
100
8
        case '#':
101
8
            return check_and_return_host(pos, dot_pos, start_of_host);
102
4
        case '@': /// myemail@gmail.com
103
4
            start_of_host = pos + 1;
104
4
            break;
105
0
        case ' ': /// restricted symbols in whole URL
106
0
        case '\t':
107
0
        case '<':
108
0
        case '>':
109
0
        case '%':
110
0
        case '{':
111
0
        case '}':
112
0
        case '|':
113
0
        case '\\':
114
0
        case '^':
115
0
        case '~':
116
0
        case '[':
117
0
        case ']':
118
0
        case ';':
119
0
        case '=':
120
0
        case '&':
121
0
            return StringRef {};
122
226
        }
123
226
    }
124
125
10
    return check_and_return_host(pos, dot_pos, start_of_host);
126
18
}
127
128
template <bool without_www>
129
struct ExtractDomain {
130
2
    static size_t get_reserve_length_for_element() { return 15; }
_ZN5doris13ExtractDomainILb0EE30get_reserve_length_for_elementEv
Line
Count
Source
130
1
    static size_t get_reserve_length_for_element() { return 15; }
_ZN5doris13ExtractDomainILb1EE30get_reserve_length_for_elementEv
Line
Count
Source
130
1
    static size_t get_reserve_length_for_element() { return 15; }
131
132
18
    static void execute(Pos data, size_t size, Pos& res_data, size_t& res_size) {
133
18
        StringRef host = get_url_host(data, size);
134
135
18
        if (host.size == 0) {
136
4
            res_data = data;
137
4
            res_size = 0;
138
14
        } else {
139
14
            if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4))
140
5
                host = {host.data + 4, host.size - 4};
141
142
14
            res_data = host.data;
143
14
            res_size = host.size;
144
14
        }
145
18
    }
_ZN5doris13ExtractDomainILb0EE7executeEPKcmRS3_Rm
Line
Count
Source
132
9
    static void execute(Pos data, size_t size, Pos& res_data, size_t& res_size) {
133
9
        StringRef host = get_url_host(data, size);
134
135
9
        if (host.size == 0) {
136
2
            res_data = data;
137
2
            res_size = 0;
138
7
        } else {
139
7
            if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4))
140
0
                host = {host.data + 4, host.size - 4};
141
142
7
            res_data = host.data;
143
7
            res_size = host.size;
144
7
        }
145
9
    }
_ZN5doris13ExtractDomainILb1EE7executeEPKcmRS3_Rm
Line
Count
Source
132
9
    static void execute(Pos data, size_t size, Pos& res_data, size_t& res_size) {
133
9
        StringRef host = get_url_host(data, size);
134
135
9
        if (host.size == 0) {
136
2
            res_data = data;
137
2
            res_size = 0;
138
7
        } else {
139
7
            if (without_www && host.size > 4 && !strncmp(host.data, "www.", 4))
140
5
                host = {host.data + 4, host.size - 4};
141
142
7
            res_data = host.data;
143
7
            res_size = host.size;
144
7
        }
145
9
    }
146
};
147
148
struct ExtractTopLevelDomain {
149
0
    static size_t get_reserve_length_for_element() { return 5; }
150
151
0
    static void execute(const char* data, size_t size, const char*& res_data, size_t& res_size) {
152
0
        res_data = data;
153
0
        res_size = 0;
154
0
        StringRef host = get_url_host(data, size);
155
156
0
        if (host.size == 0) {
157
0
            return;
158
0
        } else {
159
0
            auto host_view = host.to_string_view();
160
0
            if (host_view[host_view.size() - 1] == '.') {
161
0
                host_view.remove_suffix(1);
162
0
            }
163
164
0
            const auto* host_end = host_view.data() + host_view.size();
165
0
            const char* last_dot = find_last_symbols_or_null<'.'>(host_view.data(), host_end);
166
0
            if (!last_dot) {
167
0
                return;
168
0
            }
169
170
            /// For IPv4 addresses select nothing.
171
            ///
172
            /// NOTE: it is safe to access last_dot[1]
173
            /// since getURLHost() will not return a host if there is symbol after dot.
174
0
            if (is_numeric_ascii(last_dot[1])) {
175
0
                return;
176
0
            }
177
178
0
            res_data = last_dot + 1;
179
0
            res_size = host_end - res_data;
180
0
        }
181
0
    }
182
};
183
184
struct ExtractFirstSignificantSubdomain {
185
0
    static size_t get_reserve_length_for_element() { return 10; }
186
187
    static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size,
188
0
                        Pos* out_domain_end = nullptr) {
189
0
        res_data = data;
190
0
        res_size = 0;
191
192
0
        Pos tmp;
193
0
        size_t domain_length = 0;
194
0
        ExtractDomain<true>::execute(data, size, tmp, domain_length);
195
196
0
        if (domain_length == 0) {
197
0
            return;
198
0
        }
199
0
        if (out_domain_end) {
200
0
            *out_domain_end = tmp + domain_length;
201
0
        }
202
203
        /// cut useless dot
204
0
        if (tmp[domain_length - 1] == '.') {
205
0
            --domain_length;
206
0
        }
207
208
0
        res_data = tmp;
209
0
        res_size = domain_length;
210
211
0
        const auto* begin = tmp;
212
0
        const auto* end = begin + domain_length;
213
0
        std::array<const char*, 3> last_periods {};
214
215
0
        const auto* pos = find_first_symbols<'.'>(begin, end);
216
0
        while (pos < end) {
217
0
            last_periods[2] = last_periods[1];
218
0
            last_periods[1] = last_periods[0];
219
0
            last_periods[0] = pos;
220
0
            pos = find_first_symbols<'.'>(pos + 1, end);
221
0
        }
222
223
0
        if (!last_periods[0]) {
224
0
            return;
225
0
        }
226
227
0
        if (!last_periods[1]) {
228
0
            res_size = last_periods[0] - begin;
229
0
            return;
230
0
        }
231
232
0
        if (!last_periods[2]) {
233
0
            last_periods[2] = begin - 1;
234
0
        }
235
236
0
        const auto* end_of_level_domain = find_first_symbols<'/'>(last_periods[0], end);
237
0
        if (!end_of_level_domain) {
238
0
            end_of_level_domain = end;
239
0
        }
240
241
0
        auto host_len = static_cast<size_t>(end_of_level_domain - last_periods[1] - 1);
242
0
        StringRef host {last_periods[1] + 1, host_len};
243
0
        if (tldLookup::is_valid(host.data, host.size)) {
244
0
            res_data += last_periods[2] + 1 - begin;
245
0
            res_size = last_periods[1] - last_periods[2] - 1;
246
0
        } else {
247
0
            res_data += last_periods[1] + 1 - begin;
248
0
            res_size = last_periods[0] - last_periods[1] - 1;
249
0
        }
250
0
    }
251
};
252
253
struct CutToFirstSignificantSubdomain {
254
0
    static size_t get_reserve_length_for_element() { return 15; }
255
256
0
    static void execute(const Pos data, const size_t size, Pos& res_data, size_t& res_size) {
257
0
        res_data = data;
258
0
        res_size = 0;
259
260
0
        Pos tmp_data = data;
261
0
        size_t tmp_length;
262
0
        Pos domain_end = data;
263
0
        ExtractFirstSignificantSubdomain::execute(data, size, tmp_data, tmp_length, &domain_end);
264
265
0
        if (tmp_length == 0) {
266
0
            return;
267
0
        }
268
0
        res_data = tmp_data;
269
0
        res_size = domain_end - tmp_data;
270
0
    }
271
};
272
} // namespace doris