be/src/exprs/function/url/functions_url.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | // This file is copied from |
18 | | // https://github.com/ClickHouse/ClickHouse/blob/master/src/Functions/URL/FunctionsURL.h |
19 | | // and modified by Doris |
20 | | |
21 | | #pragma once |
22 | | |
23 | | #include "core/column/column_string.h" |
24 | | #include "core/memcpy_small.h" |
25 | | |
26 | | namespace doris { |
27 | | #include "common/compile_check_begin.h" |
28 | | /** URL processing functions. See implementation in separate .cpp files. |
29 | | * None of these functions strictly follows the RFC; instead they are maximally simplified for performance reasons. |
30 | | * |
31 | | * Functions for extracting parts of a URL. |
32 | | * If the URL has no such part, an empty string is returned. |
33 | | * |
34 | | * domain |
35 | | * domainWithoutWWW |
36 | | * topLevelDomain |
37 | | * protocol |
38 | | * path |
39 | | * queryString |
40 | | * fragment |
41 | | * queryStringAndFragment |
42 | | * netloc |
43 | | * |
44 | | * Functions, removing parts from URL. |
45 | | * If the URL has no such part, it is returned unchanged. |
46 | | * |
47 | | * cutWWW |
48 | | * cutFragment |
49 | | * cutQueryString |
50 | | * cutQueryStringAndFragment |
51 | | * |
52 | | * Extract value of parameter in query string or in fragment identifier. Return empty string, if URL has no such parameter. |
53 | | * If there are many parameters with same name - return value of first one. Value is not %-decoded. |
54 | | * |
55 | | * extractURLParameter(URL, name) |
56 | | * |
57 | | * Extract all parameters from URL in form of array of strings name=value. |
58 | | * extractURLParameters(URL) |
59 | | * |
60 | | * Extract names of all parameters from URL in form of array of strings. |
61 | | * extractURLParameterNames(URL) |
62 | | * |
63 | | * Remove specified parameter from URL. |
64 | | * cutURLParameter(URL, name) |
65 | | * |
66 | | * Get array of URL 'hierarchy' as in web-analytics tree-like reports. See the docs. |
67 | | * URLHierarchy(URL) |
68 | | */ |
69 | | |
70 | | using Pos = const char*; |
71 | | |
72 | | /** Select part of string using the Extractor. |
73 | | */ |
74 | | template <typename Extractor> |
75 | | struct ExtractSubstringImpl { |
76 | | static Status vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, |
77 | 83 | ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { |
78 | 83 | size_t size = offsets.size(); |
79 | 83 | res_offsets.resize(size); |
80 | 83 | res_data.reserve(size * Extractor::get_reserve_length_for_element()); |
81 | | |
82 | 83 | size_t prev_offset = 0; |
83 | 83 | size_t res_offset = 0; |
84 | | |
85 | | /// Matched part. |
86 | 83 | Pos start; |
87 | 83 | size_t length; |
88 | | |
89 | 261 | for (size_t i = 0; i < size; ++i) { |
90 | 178 | Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]), |
91 | 178 | offsets[i] - prev_offset, start, length); |
92 | 178 | res_data.resize(res_data.size() + length); |
93 | 178 | memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length); |
94 | 178 | res_offset += length; |
95 | | |
96 | 178 | res_offsets[i] = (ColumnString::Offset)res_offset; |
97 | 178 | prev_offset = offsets[i]; |
98 | 178 | } |
99 | 83 | return Status::OK(); |
100 | 83 | } _ZN5doris20ExtractSubstringImplINS_13ExtractDomainILb0EEEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERKNS4_IjLm4096ES7_Lm16ELm15EEERS8_RSB_ Line | Count | Source | 77 | 8 | ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { | 78 | 8 | size_t size = offsets.size(); | 79 | 8 | res_offsets.resize(size); | 80 | 8 | res_data.reserve(size * Extractor::get_reserve_length_for_element()); | 81 | | | 82 | 8 | size_t prev_offset = 0; | 83 | 8 | size_t res_offset = 0; | 84 | | | 85 | | /// Matched part. | 86 | 8 | Pos start; | 87 | 8 | size_t length; | 88 | | | 89 | 42 | for (size_t i = 0; i < size; ++i) { | 90 | 34 | Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]), | 91 | 34 | offsets[i] - prev_offset, start, length); | 92 | 34 | res_data.resize(res_data.size() + length); | 93 | 34 | memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length); | 94 | 34 | res_offset += length; | 95 | | | 96 | 34 | res_offsets[i] = (ColumnString::Offset)res_offset; | 97 | 34 | prev_offset = offsets[i]; | 98 | 34 | } | 99 | 8 | return Status::OK(); | 100 | 8 | } |
_ZN5doris20ExtractSubstringImplINS_13ExtractDomainILb1EEEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERKNS4_IjLm4096ES7_Lm16ELm15EEERS8_RSB_ Line | Count | Source | 77 | 9 | ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { | 78 | 9 | size_t size = offsets.size(); | 79 | 9 | res_offsets.resize(size); | 80 | 9 | res_data.reserve(size * Extractor::get_reserve_length_for_element()); | 81 | | | 82 | 9 | size_t prev_offset = 0; | 83 | 9 | size_t res_offset = 0; | 84 | | | 85 | | /// Matched part. | 86 | 9 | Pos start; | 87 | 9 | size_t length; | 88 | | | 89 | 43 | for (size_t i = 0; i < size; ++i) { | 90 | 34 | Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]), | 91 | 34 | offsets[i] - prev_offset, start, length); | 92 | 34 | res_data.resize(res_data.size() + length); | 93 | 34 | memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length); | 94 | 34 | res_offset += length; | 95 | | | 96 | 34 | res_offsets[i] = (ColumnString::Offset)res_offset; | 97 | 34 | prev_offset = offsets[i]; | 98 | 34 | } | 99 | 9 | return Status::OK(); | 100 | 9 | } |
_ZN5doris20ExtractSubstringImplINS_15ExtractProtocolEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERKNS3_IjLm4096ES6_Lm16ELm15EEERS7_RSA_ Line | Count | Source | 77 | 9 | ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { | 78 | 9 | size_t size = offsets.size(); | 79 | 9 | res_offsets.resize(size); | 80 | 9 | res_data.reserve(size * Extractor::get_reserve_length_for_element()); | 81 | | | 82 | 9 | size_t prev_offset = 0; | 83 | 9 | size_t res_offset = 0; | 84 | | | 85 | | /// Matched part. | 86 | 9 | Pos start; | 87 | 9 | size_t length; | 88 | | | 89 | 50 | for (size_t i = 0; i < size; ++i) { | 90 | 41 | Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]), | 91 | 41 | offsets[i] - prev_offset, start, length); | 92 | 41 | res_data.resize(res_data.size() + length); | 93 | 41 | memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length); | 94 | 41 | res_offset += length; | 95 | | | 96 | 41 | res_offsets[i] = (ColumnString::Offset)res_offset; | 97 | 41 | prev_offset = offsets[i]; | 98 | 41 | } | 99 | 9 | return Status::OK(); | 100 | 9 | } |
_ZN5doris20ExtractSubstringImplINS_21ExtractTopLevelDomainEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERKNS3_IjLm4096ES6_Lm16ELm15EEERS7_RSA_ Line | Count | Source | 77 | 19 | ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { | 78 | 19 | size_t size = offsets.size(); | 79 | 19 | res_offsets.resize(size); | 80 | 19 | res_data.reserve(size * Extractor::get_reserve_length_for_element()); | 81 | | | 82 | 19 | size_t prev_offset = 0; | 83 | 19 | size_t res_offset = 0; | 84 | | | 85 | | /// Matched part. | 86 | 19 | Pos start; | 87 | 19 | size_t length; | 88 | | | 89 | 42 | for (size_t i = 0; i < size; ++i) { | 90 | 23 | Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]), | 91 | 23 | offsets[i] - prev_offset, start, length); | 92 | 23 | res_data.resize(res_data.size() + length); | 93 | 23 | memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length); | 94 | 23 | res_offset += length; | 95 | | | 96 | 23 | res_offsets[i] = (ColumnString::Offset)res_offset; | 97 | 23 | prev_offset = offsets[i]; | 98 | 23 | } | 99 | 19 | return Status::OK(); | 100 | 19 | } |
_ZN5doris20ExtractSubstringImplINS_32ExtractFirstSignificantSubdomainEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERKNS3_IjLm4096ES6_Lm16ELm15EEERS7_RSA_ Line | Count | Source | 77 | 19 | ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { | 78 | 19 | size_t size = offsets.size(); | 79 | 19 | res_offsets.resize(size); | 80 | 19 | res_data.reserve(size * Extractor::get_reserve_length_for_element()); | 81 | | | 82 | 19 | size_t prev_offset = 0; | 83 | 19 | size_t res_offset = 0; | 84 | | | 85 | | /// Matched part. | 86 | 19 | Pos start; | 87 | 19 | size_t length; | 88 | | | 89 | 42 | for (size_t i = 0; i < size; ++i) { | 90 | 23 | Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]), | 91 | 23 | offsets[i] - prev_offset, start, length); | 92 | 23 | res_data.resize(res_data.size() + length); | 93 | 23 | memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length); | 94 | 23 | res_offset += length; | 95 | | | 96 | 23 | res_offsets[i] = (ColumnString::Offset)res_offset; | 97 | 23 | prev_offset = offsets[i]; | 98 | 23 | } | 99 | 19 | return Status::OK(); | 100 | 19 | } |
_ZN5doris20ExtractSubstringImplINS_30CutToFirstSignificantSubdomainEE6vectorERKNS_8PODArrayIhLm4096ENS_9AllocatorILb0ELb0ELb0ENS_22DefaultMemoryAllocatorELb0EEELm16ELm15EEERKNS3_IjLm4096ES6_Lm16ELm15EEERS7_RSA_ Line | Count | Source | 77 | 19 | ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { | 78 | 19 | size_t size = offsets.size(); | 79 | 19 | res_offsets.resize(size); | 80 | 19 | res_data.reserve(size * Extractor::get_reserve_length_for_element()); | 81 | | | 82 | 19 | size_t prev_offset = 0; | 83 | 19 | size_t res_offset = 0; | 84 | | | 85 | | /// Matched part. | 86 | 19 | Pos start; | 87 | 19 | size_t length; | 88 | | | 89 | 42 | for (size_t i = 0; i < size; ++i) { | 90 | 23 | Extractor::execute(reinterpret_cast<const char*>(&data[prev_offset]), | 91 | 23 | offsets[i] - prev_offset, start, length); | 92 | 23 | res_data.resize(res_data.size() + length); | 93 | 23 | memcpy_small_allow_read_write_overflow15(&res_data[res_offset], start, length); | 94 | 23 | res_offset += length; | 95 | | | 96 | 23 | res_offsets[i] = (ColumnString::Offset)res_offset; | 97 | 23 | prev_offset = offsets[i]; | 98 | 23 | } | 99 | 19 | return Status::OK(); | 100 | 19 | } |
|
101 | | |
102 | | static void constant(const std::string& data, std::string& res_data) { |
103 | | Pos start; |
104 | | size_t length; |
105 | | Extractor::execute(data.data(), data.size(), start, length); |
106 | | res_data.assign(start, length); |
107 | | } |
108 | | }; |
109 | | |
110 | | /** Delete part of string using the Extractor. |
111 | | */ |
112 | | template <typename Extractor> |
113 | | struct CutSubstringImpl { |
114 | | static void vector(const ColumnString::Chars& data, const ColumnString::Offsets& offsets, |
115 | | ColumnString::Chars& res_data, ColumnString::Offsets& res_offsets) { |
116 | | res_data.reserve(data.size()); |
117 | | size_t size = offsets.size(); |
118 | | res_offsets.resize(size); |
119 | | |
120 | | size_t prev_offset = 0; |
121 | | size_t res_offset = 0; |
122 | | |
123 | | /// Matched part. |
124 | | Pos start; |
125 | | size_t length; |
126 | | |
127 | | for (size_t i = 0; i < size; ++i) { |
128 | | const char* current = reinterpret_cast<const char*>(&data[prev_offset]); |
129 | | Extractor::execute(current, offsets[i] - prev_offset, start, length); |
130 | | size_t start_index = start - reinterpret_cast<const char*>(data.data()); |
131 | | |
132 | | res_data.resize(res_data.size() + offsets[i] - prev_offset - length); |
133 | | memcpy_small_allow_read_write_overflow15(&res_data[res_offset], current, |
134 | | start - current); |
135 | | memcpy_small_allow_read_write_overflow15(&res_data[res_offset + start - current], |
136 | | start + length, |
137 | | offsets[i] - start_index - length); |
138 | | res_offset += offsets[i] - prev_offset - length; |
139 | | |
140 | | res_offsets[i] = res_offset; |
141 | | prev_offset = offsets[i]; |
142 | | } |
143 | | } |
144 | | |
145 | | static void constant(const std::string& data, std::string& res_data) { |
146 | | Pos start; |
147 | | size_t length; |
148 | | Extractor::execute(data.data(), data.size(), start, length); |
149 | | res_data.reserve(data.size() - length); |
150 | | res_data.append(data.data(), start); |
151 | | res_data.append(start + length, data.data() + data.size()); |
152 | | } |
153 | | }; |
154 | | #include "common/compile_check_end.h" |
155 | | } // namespace doris |