Coverage Report

Created: 2026-03-13 09:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/url/functions_url.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
// This file is copied from
18
// https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/FunctionsURL.h
19
// and modified by Doris
20
21
#pragma once
22
23
#include "core/column/column_string.h"
24
#include "core/memcpy_small.h"
25
26
namespace doris {
27
#include "common/compile_check_begin.h"
28
/** URL processing functions. See implementation in separate .cpp files.
29
  * These functions do not strictly follow the RFC; instead they are maximally simplified for performance reasons.
30
  *
31
  * Functions for extraction parts of URL.
32
  * If the URL has no such part, an empty string is returned.
33
  *
34
  *  domain
35
  *  domainWithoutWWW
36
  *  topLevelDomain
37
  *  protocol
38
  *  path
39
  *  queryString
40
  *  fragment
41
  *  queryStringAndFragment
42
  *  netloc
43
  *
44
  * Functions, removing parts from URL.
45
  * If the URL has no such part, it is returned unchanged.
46
  *
47
  *  cutWWW
48
  *  cutFragment
49
  *  cutQueryString
50
  *  cutQueryStringAndFragment
51
  *
52
  * Extract value of parameter in query string or in fragment identifier. Return empty string, if URL has no such parameter.
53
  * If there are many parameters with same name - return value of first one. Value is not %-decoded.
54
  *
55
  *  extractURLParameter(URL, name)
56
  *
57
  * Extract all parameters from URL in form of array of strings name=value.
58
  *  extractURLParameters(URL)
59
  *
60
  * Extract names of all parameters from URL in form of array of strings.
61
  *  extractURLParameterNames(URL)
62
  *
63
  * Remove specified parameter from URL.
64
  *  cutURLParameter(URL, name)
65
  *
66
  * Get array of URL 'hierarchy' as in web-analytics tree-like reports. See the docs.
67
  *  URLHierarchy(URL)
68
  */
69
70
/// Position inside a character buffer. The Impl structs in this file pass a Pos to
/// Extractor::execute as the out-parameter that receives the start of the matched part.
using Pos = const char*;
71
72
/** Select part of string using the Extractor.
73
  */
74
template <typename Extractor>
75
struct ExtractSubstringImpl {
76
    /// Runs Extractor::execute on every row of a string column and appends the sub-range
    /// it reports (start pointer + length) to the result column.
    ///
    /// `data` / `offsets` describe the input rows: row i occupies
    /// [offsets[i - 1], offsets[i]) in `data`, with the first row starting at 0.
    /// `res_data` / `res_offsets` receive the output in the same layout.
    /// Always returns Status::OK().
    static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
        const size_t rows = offsets.size();
        res_offsets.resize(rows);
        res_data.reserve(rows * Extractor::get_reserve_length_for_element());

        /// Matched part, filled in by the extractor for each row.
        Pos match_begin;
        size_t match_size;

        size_t src_begin = 0; // start of the current row inside `data`
        size_t dst_end = 0;   // number of result bytes written so far
        for (size_t row = 0; row < rows; ++row) {
            const size_t src_end = offsets[row];
            const auto* row_chars = reinterpret_cast<const char*>(&data[src_begin]);
            Extractor::execute(row_chars, src_end - src_begin, match_begin, match_size);

            // Grow first, then copy. The copy helper may touch up to 15 bytes past the
            // requested range on both sides (per its name).
            // NOTE(review): presumably safe thanks to the column buffers' padding — confirm.
            res_data.resize(res_data.size() + match_size);
            memcpy_small_allow_read_write_overflow15(&res_data[dst_end], match_begin, match_size);
            dst_end += match_size;

            res_offsets[row] = static_cast<ColumnString::Offset>(dst_end);
            src_begin = src_end;
        }
        return Status::OK();
    }
_ZN5doris20ExtractSubstringImplINS_13ExtractDomainILb0EEEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERKNS4_IjLm4096ES7_Lm16ELm15EEERS8_RSB_
Line
Count
Source
77
8
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
78
8
        size_t size = offsets.size();
79
8
        res_offsets.resize(size);
80
8
        res_data.reserve(size * Extractor::get_reserve_length_for_element());
81
82
8
        size_t prev_offset = 0;
83
8
        size_t res_offset = 0;
84
85
        /// Matched part.
86
8
        Pos start;
87
8
        size_t length;
88
89
42
        for (size_t i = 0; i < size; ++i) {
90
34
            Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
91
34
                               offsets[i] - prev_offset, start, length);
92
34
            res_data.resize(res_data.size() + length);
93
34
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
94
34
            res_offset += length;
95
96
34
            res_offsets[i] = (ColumnString::Offset)res_offset;
97
34
            prev_offset = offsets[i];
98
34
        }
99
8
        return Status::OK();
100
8
    }
_ZN5doris20ExtractSubstringImplINS_13ExtractDomainILb1EEEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERKNS4_IjLm4096ES7_Lm16ELm15EEERS8_RSB_
Line
Count
Source
77
9
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
78
9
        size_t size = offsets.size();
79
9
        res_offsets.resize(size);
80
9
        res_data.reserve(size * Extractor::get_reserve_length_for_element());
81
82
9
        size_t prev_offset = 0;
83
9
        size_t res_offset = 0;
84
85
        /// Matched part.
86
9
        Pos start;
87
9
        size_t length;
88
89
43
        for (size_t i = 0; i < size; ++i) {
90
34
            Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
91
34
                               offsets[i] - prev_offset, start, length);
92
34
            res_data.resize(res_data.size() + length);
93
34
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
94
34
            res_offset += length;
95
96
34
            res_offsets[i] = (ColumnString::Offset)res_offset;
97
34
            prev_offset = offsets[i];
98
34
        }
99
9
        return Status::OK();
100
9
    }
_ZN5doris20ExtractSubstringImplINS_15ExtractProtocolEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERKNS3_IjLm4096ES6_Lm16ELm15EEERS7_RSA_
Line
Count
Source
77
9
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
78
9
        size_t size = offsets.size();
79
9
        res_offsets.resize(size);
80
9
        res_data.reserve(size * Extractor::get_reserve_length_for_element());
81
82
9
        size_t prev_offset = 0;
83
9
        size_t res_offset = 0;
84
85
        /// Matched part.
86
9
        Pos start;
87
9
        size_t length;
88
89
50
        for (size_t i = 0; i < size; ++i) {
90
41
            Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
91
41
                               offsets[i] - prev_offset, start, length);
92
41
            res_data.resize(res_data.size() + length);
93
41
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
94
41
            res_offset += length;
95
96
41
            res_offsets[i] = (ColumnString::Offset)res_offset;
97
41
            prev_offset = offsets[i];
98
41
        }
99
9
        return Status::OK();
100
9
    }
_ZN5doris20ExtractSubstringImplINS_21ExtractTopLevelDomainEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERKNS3_IjLm4096ES6_Lm16ELm15EEERS7_RSA_
Line
Count
Source
77
19
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
78
19
        size_t size = offsets.size();
79
19
        res_offsets.resize(size);
80
19
        res_data.reserve(size * Extractor::get_reserve_length_for_element());
81
82
19
        size_t prev_offset = 0;
83
19
        size_t res_offset = 0;
84
85
        /// Matched part.
86
19
        Pos start;
87
19
        size_t length;
88
89
42
        for (size_t i = 0; i < size; ++i) {
90
23
            Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
91
23
                               offsets[i] - prev_offset, start, length);
92
23
            res_data.resize(res_data.size() + length);
93
23
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
94
23
            res_offset += length;
95
96
23
            res_offsets[i] = (ColumnString::Offset)res_offset;
97
23
            prev_offset = offsets[i];
98
23
        }
99
19
        return Status::OK();
100
19
    }
_ZN5doris20ExtractSubstringImplINS_32ExtractFirstSignificantSubdomainEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERKNS3_IjLm4096ES6_Lm16ELm15EEERS7_RSA_
Line
Count
Source
77
19
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
78
19
        size_t size = offsets.size();
79
19
        res_offsets.resize(size);
80
19
        res_data.reserve(size * Extractor::get_reserve_length_for_element());
81
82
19
        size_t prev_offset = 0;
83
19
        size_t res_offset = 0;
84
85
        /// Matched part.
86
19
        Pos start;
87
19
        size_t length;
88
89
42
        for (size_t i = 0; i < size; ++i) {
90
23
            Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
91
23
                               offsets[i] - prev_offset, start, length);
92
23
            res_data.resize(res_data.size() + length);
93
23
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
94
23
            res_offset += length;
95
96
23
            res_offsets[i] = (ColumnString::Offset)res_offset;
97
23
            prev_offset = offsets[i];
98
23
        }
99
19
        return Status::OK();
100
19
    }
_ZN5doris20ExtractSubstringImplINS_30CutToFirstSignificantSubdomainEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERKNS3_IjLm4096ES6_Lm16ELm15EEERS7_RSA_
Line
Count
Source
77
19
                         ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
78
19
        size_t size = offsets.size();
79
19
        res_offsets.resize(size);
80
19
        res_data.reserve(size * Extractor::get_reserve_length_for_element());
81
82
19
        size_t prev_offset = 0;
83
19
        size_t res_offset = 0;
84
85
        /// Matched part.
86
19
        Pos start;
87
19
        size_t length;
88
89
42
        for (size_t i = 0; i < size; ++i) {
90
23
            Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]),
91
23
                               offsets[i] - prev_offset, start, length);
92
23
            res_data.resize(res_data.size() + length);
93
23
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length);
94
23
            res_offset += length;
95
96
23
            res_offsets[i] = (ColumnString::Offset)res_offset;
97
23
            prev_offset = offsets[i];
98
23
        }
99
19
        return Status::OK();
100
19
    }
101
102
    static void constant(const std::string& data, std::string& res_data) {
103
        Pos start;
104
        size_t length;
105
        Extractor::execute(data.data(), data.size(), start, length);
106
        res_data.assign(start, length);
107
    }
108
};
109
110
/** Delete part of string using the Extractor.
111
  */
112
template <typename Extractor>
113
struct CutSubstringImpl {
114
    static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets,
115
                       ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) {
116
        res_data.reserve(data.size());
117
        size_t size = offsets.size();
118
        res_offsets.resize(size);
119
120
        size_t prev_offset = 0;
121
        size_t res_offset = 0;
122
123
        /// Matched part.
124
        Pos start;
125
        size_t length;
126
127
        for (size_t i = 0; i < size; ++i) {
128
            const char* current = reinterpret_cast<const char*>(&data[prev_offset]);
129
            Extractor::execute(current, offsets[i] - prev_offset, start, length);
130
            size_t start_index = start - reinterpret_cast<const char*>(data.data());
131
132
            res_data.resize(res_data.size() + offsets[i] - prev_offset - length);
133
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset], current,
134
                                                     start - current);
135
            memcpy_small_allow_read_write_overflow15(&res_data[res_offset + start - current],
136
                                                     start + length,
137
                                                     offsets[i] - start_index - length);
138
            res_offset += offsets[i] - prev_offset - length;
139
140
            res_offsets[i] = res_offset;
141
            prev_offset = offsets[i];
142
        }
143
    }
144
145
    static void constant(const std::string& data, std::string& res_data) {
146
        Pos start;
147
        size_t length;
148
        Extractor::execute(data.data(), data.size(), start, length);
149
        res_data.reserve(data.size() - length);
150
        res_data.append(data.data(), start);
151
        res_data.append(start + length, data.data() + data.size());
152
    }
153
};
154
#include "common/compile_check_end.h"
155
} // namespace doris