Coverage Report

Created: 2026-04-14 10:14

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/url/functions_url.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/FunctionsURL.h
19
// and modified by Doris
20
21
#pragma once
22
23
#include "core/column/column_string.h"
24
#include "core/memcpy_small.h"
25
26
namespace doris {
27
/** URL processing functions. See implementation in separate .cpp files.
28
  * None of these functions strictly follows the RFC; instead they are maximally simplified for performance reasons.
29
  *
30
  * Functions for extracting parts of a URL.
31
  * If the URL has no such part, an empty string is returned.
32
  *
33
  *  domain
34
  *  domainWithoutWWW
35
  *  topLevelDomain
36
  *  protocol
37
  *  path
38
  *  queryString
39
  *  fragment
40
  *  queryStringAndFragment
41
  *  netloc
42
  *
43
  * Functions, removing parts from URL.
44
  * If the URL has no such part, it is returned unchanged.
45
  *
46
  *  cutWWW
47
  *  cutFragment
48
  *  cutQueryString
49
  *  cutQueryStringAndFragment
50
  *
51
  * Extract value of parameter in query string or in fragment identifier. Return empty string, if URL has no such parameter.
52
  * If there are many parameters with same name - return value of first one. Value is not %-decoded.
53
  *
54
  *  extractURLParameter(URL, name)
55
  *
56
  * Extract all parameters from URL in form of array of strings name=value.
57
  *  extractURLParameters(URL)
58
  *
59
  * Extract names of all parameters from URL in form of array of strings.
60
  *  extractURLParameterNames(URL)
61
  *
62
  * Remove specified parameter from URL.
63
  *  cutURLParameter(URL, name)
64
  *
65
  * Get array of URL 'hierarchy' as in web-analytics tree-like reports. See the docs.
66
  *  URLHierarchy(URL)
67
  */
68
69
using Pos = const char*;
70
71
/** Select part of string using the Extractor.
72
  */
73
template <typename Extractor>
74
struct ExtractSubstringImpl {
75
    static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
76
84
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
77
84
        size_t size = offsets.size();
78
84
        res_offsets.resize(size);
79
84
        res_data.reserve(size * Extractor::get_reserve_length_for_element());
80
81
84
        size_t prev_offset = 0;
82
84
        size_t res_offset = 0;
83
84
        /// Matched part.
85
84
        Pos start;
86
84
        size_t length;
87
88
262
        for (size_t i = 0; i < size; ++i) {
89
178
            Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
90
178
                               offsets[i] - prev_offset, start, length);
91
178
            res_data.resize(res_data.size() + length);
92
178
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
93
178
            res_offset += length;
94
95
178
            res_offsets[i] = (ColumnString::Offset)res_offset;
96
178
            prev_offset = offsets[i];
97
178
        }
98
84
        return Status::OK();
99
84
    }
_ZN5doris20ExtractSubstringImplINS_13ExtractDomainILb0EEEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERKNS4_IjLm4096ES7_Lm16ELm15EEERS8_RSB_
Line
Count
Source
76
9
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
77
9
        size_t size = offsets.size();
78
9
        res_offsets.resize(size);
79
9
        res_data.reserve(size * Extractor::get_reserve_length_for_element());
80
81
9
        size_t prev_offset = 0;
82
9
        size_t res_offset = 0;
83
84
        /// Matched part.
85
9
        Pos start;
86
9
        size_t length;
87
88
43
        for (size_t i = 0; i < size; ++i) {
89
34
            Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
90
34
                               offsets[i] - prev_offset, start, length);
91
34
            res_data.resize(res_data.size() + length);
92
34
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
93
34
            res_offset += length;
94
95
34
            res_offsets[i] = (ColumnString::Offset)res_offset;
96
34
            prev_offset = offsets[i];
97
34
        }
98
9
        return Status::OK();
99
9
    }
_ZN5doris20ExtractSubstringImplINS_13ExtractDomainILb1EEEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERKNS4_IjLm4096ES7_Lm16ELm15EEERS8_RSB_
Line
Count
Source
76
9
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
77
9
        size_t size = offsets.size();
78
9
        res_offsets.resize(size);
79
9
        res_data.reserve(size * Extractor::get_reserve_length_for_element());
80
81
9
        size_t prev_offset = 0;
82
9
        size_t res_offset = 0;
83
84
        /// Matched part.
85
9
        Pos start;
86
9
        size_t length;
87
88
43
        for (size_t i = 0; i < size; ++i) {
89
34
            Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
90
34
                               offsets[i] - prev_offset, start, length);
91
34
            res_data.resize(res_data.size() + length);
92
34
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
93
34
            res_offset += length;
94
95
34
            res_offsets[i] = (ColumnString::Offset)res_offset;
96
34
            prev_offset = offsets[i];
97
34
        }
98
9
        return Status::OK();
99
9
    }
_ZN5doris20ExtractSubstringImplINS_15ExtractProtocolEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERKNS3_IjLm4096ES6_Lm16ELm15EEERS7_RSA_
Line
Count
Source
76
9
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
77
9
        size_t size = offsets.size();
78
9
        res_offsets.resize(size);
79
9
        res_data.reserve(size * Extractor::get_reserve_length_for_element());
80
81
9
        size_t prev_offset = 0;
82
9
        size_t res_offset = 0;
83
84
        /// Matched part.
85
9
        Pos start;
86
9
        size_t length;
87
88
50
        for (size_t i = 0; i < size; ++i) {
89
41
            Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
90
41
                               offsets[i] - prev_offset, start, length);
91
41
            res_data.resize(res_data.size() + length);
92
41
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
93
41
            res_offset += length;
94
95
41
            res_offsets[i] = (ColumnString::Offset)res_offset;
96
41
            prev_offset = offsets[i];
97
41
        }
98
9
        return Status::OK();
99
9
    }
_ZN5doris20ExtractSubstringImplINS_21ExtractTopLevelDomainEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERKNS3_IjLm4096ES6_Lm16ELm15EEERS7_RSA_
Line
Count
Source
76
19
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
77
19
        size_t size = offsets.size();
78
19
        res_offsets.resize(size);
79
19
        res_data.reserve(size * Extractor::get_reserve_length_for_element());
80
81
19
        size_t prev_offset = 0;
82
19
        size_t res_offset = 0;
83
84
        /// Matched part.
85
19
        Pos start;
86
19
        size_t length;
87
88
42
        for (size_t i = 0; i < size; ++i) {
89
23
            Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
90
23
                               offsets[i] - prev_offset, start, length);
91
23
            res_data.resize(res_data.size() + length);
92
23
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
93
23
            res_offset += length;
94
95
23
            res_offsets[i] = (ColumnString::Offset)res_offset;
96
23
            prev_offset = offsets[i];
97
23
        }
98
19
        return Status::OK();
99
19
    }
_ZN5doris20ExtractSubstringImplINS_32ExtractFirstSignificantSubdomainEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERKNS3_IjLm4096ES6_Lm16ELm15EEERS7_RSA_
Line
Count
Source
76
19
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
77
19
        size_t size = offsets.size();
78
19
        res_offsets.resize(size);
79
19
        res_data.reserve(size * Extractor::get_reserve_length_for_element());
80
81
19
        size_t prev_offset = 0;
82
19
        size_t res_offset = 0;
83
84
        /// Matched part.
85
19
        Pos start;
86
19
        size_t length;
87
88
42
        for (size_t i = 0; i < size; ++i) {
89
23
            Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
90
23
                               offsets[i] - prev_offset, start, length);
91
23
            res_data.resize(res_data.size() + length);
92
23
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
93
23
            res_offset += length;
94
95
23
            res_offsets[i] = (ColumnString::Offset)res_offset;
96
23
            prev_offset = offsets[i];
97
23
        }
98
19
        return Status::OK();
99
19
    }
_ZN5doris20ExtractSubstringImplINS_30CutToFirstSignificantSubdomainEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb1EEELm16ELm15EEERKNS3_IjLm4096ES6_Lm16ELm15EEERS7_RSA_
Line
Count
Source
76
19
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
77
19
        size_t size = offsets.size();
78
19
        res_offsets.resize(size);
79
19
        res_data.reserve(size * Extractor::get_reserve_length_for_element());
80
81
19
        size_t prev_offset = 0;
82
19
        size_t res_offset = 0;
83
84
        /// Matched part.
85
19
        Pos start;
86
19
        size_t length;
87
88
42
        for (size_t i = 0; i < size; ++i) {
89
23
            Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
90
23
                               offsets[i] - prev_offset, start, length);
91
23
            res_data.resize(res_data.size() + length);
92
23
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
93
23
            res_offset += length;
94
95
23
            res_offsets[i] = (ColumnString::Offset)res_offset;
96
23
            prev_offset = offsets[i];
97
23
        }
98
19
        return Status::OK();
99
19
    }
100
101
    static void constant(const std::string& data, std::string& res_data) {
102
        Pos start;
103
        size_t length;
104
        Extractor::execute(data.data(), data.size(), start, length);
105
        res_data.assign(start, length);
106
    }
107
};
108
109
/** Delete part of string using the Extractor.
110
  */
111
template <typename Extractor>
112
struct CutSubstringImpl {
113
    static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
114
                       ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
115
        res_data.reserve(data.size());
116
        size_t size = offsets.size();
117
        res_offsets.resize(size);
118
119
        size_t prev_offset = 0;
120
        size_t res_offset = 0;
121
122
        /// Matched part.
123
        Pos start;
124
        size_t length;
125
126
        for (size_t i = 0; i < size; ++i) {
127
            const char* current = reinterpret_cast<const char*>(&data[prev_offset]);
128
            Extractor::execute(current, offsets[i] - prev_offset, start, length);
129
            size_t start_index = start - reinterpret_cast<const char*>(data.data());
130
131
            res_data.resize(res_data.size() + offsets[i] - prev_offset - length);
132
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], current,
133
                                                     start - current);
134
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset + start - current],
135
                                                     start + length,
136
                                                     offsets[i] - start_index - length);
137
            res_offset += offsets[i] - prev_offset - length;
138
139
            res_offsets[i] = res_offset;
140
            prev_offset = offsets[i];
141
        }
142
    }
143
144
    static void constant(const std::string& data, std::string& res_data) {
145
        Pos start;
146
        size_t length;
147
        Extractor::execute(data.data(), data.size(), start, length);
148
        res_data.reserve(data.size() - length);
149
        res_data.append(data.data(), start);
150
        res_data.append(start + length, data.data() + data.size());
151
    }
152
};
153
} // namespace doris