/root/doris/be/src/util/faststring.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <butil/macros.h> |
21 | | |
22 | | #include <cstdint> |
23 | | #include <cstring> |
24 | | #include <string> |
25 | | |
26 | | #include "gutil/dynamic_annotations.h" |
27 | | #include "gutil/port.h" |
28 | | #include "util/memcpy_inlined.h" |
29 | | #include "util/slice.h" |
30 | | #include "vec/common/allocator.h" |
31 | | |
32 | | namespace doris { |
33 | | |
34 | | // A faststring is similar to a std::string, except that it is faster for many |
35 | | // common use cases (in particular, resize() will fill with uninitialized data |
36 | | // instead of memsetting to \0) |
37 | | // only build() can transfer data to the outside. |
38 | | class faststring : private Allocator<false, false, false, DefaultMemoryAllocator> { |
39 | | public: |
40 | | enum { kInitialCapacity = 32 }; |
41 | | |
42 | 118k | faststring() : data_(initial_data_), len_(0), capacity_(kInitialCapacity) {} |
43 | | |
44 | | // Construct a string with the given capacity, in bytes. |
45 | | explicit faststring(size_t capacity) |
46 | 1.09k | : data_(initial_data_), len_(0), capacity_(kInitialCapacity) { |
47 | 1.09k | if (capacity > capacity_) { |
48 | 640 | data_ = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity)); |
49 | 640 | capacity_ = capacity; |
50 | 640 | } |
51 | 1.09k | ASAN_POISON_MEMORY_REGION(data_, capacity_); |
52 | 1.09k | } |
53 | | |
54 | 119k | ~faststring() { |
55 | 119k | ASAN_UNPOISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); |
56 | 119k | if (data_ != initial_data_) { |
57 | 28.2k | Allocator::free(data_, capacity_); |
58 | 28.2k | } |
59 | 119k | } |
60 | | |
61 | | // Reset the valid length of the string to 0. |
62 | | // |
63 | | // This does not free up any memory. The capacity of the string remains unchanged. |
64 | 339k | void clear() { |
65 | 339k | resize(0); |
66 | 339k | ASAN_POISON_MEMORY_REGION(data_, capacity_); |
67 | 339k | } |
68 | | |
69 | | // Resize the string to the given length. |
70 | | // If the new length is larger than the old length, the capacity is expanded as necessary. |
71 | | // |
72 | | // NOTE: in contrast to std::string's implementation, Any newly "exposed" bytes of data are |
73 | | // not cleared. |
74 | 1.48M | void resize(size_t newsize) { |
75 | 1.48M | if (newsize > capacity_) { |
76 | 22.5k | reserve(newsize); |
77 | 22.5k | } |
78 | 1.48M | len_ = newsize; |
79 | 1.48M | ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); |
80 | 1.48M | ASAN_UNPOISON_MEMORY_REGION(data_, len_); |
81 | 1.48M | } |
82 | | |
83 | | // Return the buffer built so far and reset `this` to the initial status (size() == 0). |
84 | | // NOTE: the returned data pointer is not necessarily the pointer returned by data() |
85 | 26.4k | OwnedSlice build() { |
86 | 26.4k | uint8_t* ret = data_; |
87 | 26.4k | if (ret == initial_data_) { |
88 | 31 | ret = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity_)); |
89 | 31 | DCHECK(len_ <= capacity_); |
90 | 31 | memcpy(ret, data_, len_); |
91 | 31 | } |
92 | 26.4k | OwnedSlice result(ret, len_, capacity_); |
93 | 26.4k | len_ = 0; |
94 | 26.4k | capacity_ = kInitialCapacity; |
95 | 26.4k | data_ = initial_data_; |
96 | 26.4k | ASAN_POISON_MEMORY_REGION(data_, capacity_); |
97 | 26.4k | return result; |
98 | 26.4k | } |
99 | | |
100 | | // Reserve space for the given total amount of data. If the current capacity is already |
101 | | // larger than the newly requested capacity, this is a no-op (i.e. it does not ever free memory). |
102 | | // |
103 | | // NOTE: even though the new capacity is reserved, it is illegal to begin writing into that memory |
104 | | // directly using pointers. If ASAN is enabled, this is ensured using manual memory poisoning. |
105 | 358k | void reserve(size_t newcapacity) { |
106 | 358k | if (PREDICT_TRUE(newcapacity <= capacity_)) return; |
107 | 95.7k | GrowArray(newcapacity); |
108 | 95.7k | } |
109 | | |
110 | | // Append the given data to the string, resizing capacity as necessary. |
111 | 1.15M | void append(const void* src_v, size_t count) { |
112 | 1.15M | const uint8_t* src = reinterpret_cast<const uint8_t*>(src_v); |
113 | 1.15M | EnsureRoomForAppend(count); |
114 | 1.15M | ASAN_UNPOISON_MEMORY_REGION(data_ + len_, count); |
115 | | |
116 | | // appending short values is common enough that this |
117 | | // actually helps, according to benchmarks. In theory |
118 | | // memcpy_inlined should already be just as good, but this |
119 | | // was ~20% faster for reading a large prefix-coded string file |
120 | | // where each string was only a few chars different |
121 | 1.15M | if (count <= 4) { |
122 | 519k | uint8_t* p = &data_[len_]; |
123 | 1.46M | for (int i = 0; i < count; i++) { |
124 | 947k | *p++ = *src++; |
125 | 947k | } |
126 | 630k | } else { |
127 | 630k | memcpy_inlined(&data_[len_], src, count); |
128 | 630k | } |
129 | 1.15M | len_ += count; |
130 | 1.15M | } |
131 | | |
132 | | // Append the given string to this string. |
133 | 8 | void append(const std::string& str) { append(str.data(), str.size()); } |
134 | | |
135 | | // Append the given character to this string. |
136 | 10.8k | void push_back(const char byte) { |
137 | 10.8k | EnsureRoomForAppend(1); |
138 | 10.8k | ASAN_UNPOISON_MEMORY_REGION(data_ + len_, 1); |
139 | 10.8k | data_[len_] = byte; |
140 | 10.8k | len_++; |
141 | 10.8k | } |
142 | | |
143 | | // Return the valid length of this string. |
144 | 14 | size_t length() const { return len_; } |
145 | | |
146 | | // Return the valid length of this string (identical to length()) |
147 | 1.54M | size_t size() const { return len_; } |
148 | | |
149 | | // Return the allocated capacity of this string. |
150 | 293k | size_t capacity() const { return capacity_; } |
151 | | |
152 | | // Return a pointer to the data in this string. Note that this pointer |
153 | | // may be invalidated by any later non-const operation. |
154 | 259k | const uint8_t* data() const { return &data_[0]; } |
155 | | |
156 | | // Return a pointer to the data in this string. Note that this pointer |
157 | | // may be invalidated by any later non-const operation. |
158 | 551k | uint8_t* data() { return &data_[0]; } |
159 | | |
160 | | // Return the given element of this string. Note that this does not perform |
161 | | // any bounds checking. |
162 | 0 | const uint8_t& at(size_t i) const { return data_[i]; } |
163 | | |
164 | | // Return the given element of this string. Note that this does not perform |
165 | | // any bounds checking. |
166 | 42.4k | const uint8_t& operator[](size_t i) const { return data_[i]; } |
167 | | |
168 | | // Return the given element of this string. Note that this does not perform |
169 | | // any bounds checking. |
170 | 797k | uint8_t& operator[](size_t i) { return data_[i]; } |
171 | | |
172 | | // Reset the contents of this string by copying 'len' bytes from 'src'. |
173 | 31.6k | void assign_copy(const uint8_t* src, size_t len) { |
174 | | // Reset length so that the first resize doesn't need to copy the current |
175 | | // contents of the array. |
176 | 31.6k | len_ = 0; |
177 | 31.6k | resize(len); |
178 | 31.6k | memcpy(data(), src, len); |
179 | 31.6k | } |
180 | | |
181 | | // Reset the contents of this string by copying from the given std::string. |
182 | 0 | void assign_copy(const std::string& str) { |
183 | 0 | assign_copy(reinterpret_cast<const uint8_t*>(str.c_str()), str.size()); |
184 | 0 | } |
185 | | |
186 | | // Reallocates the internal storage to fit only the current data. |
187 | | // |
188 | | // This may revert to using internal storage if the current length is shorter than |
189 | | // kInitialCapacity. In that case, after this call, capacity() will go down to |
190 | | // kInitialCapacity. |
191 | | // |
192 | | // Any pointers within this instance may be invalidated. |
193 | 102 | void shrink_to_fit() { |
194 | 102 | if (data_ == initial_data_ || capacity_ == len_) return; |
195 | 32 | ShrinkToFitInternal(); |
196 | 32 | } |
197 | | |
198 | | // Return a copy of this string as a std::string. |
199 | 0 | std::string ToString() const { |
200 | 0 | return std::string(reinterpret_cast<const char*>(data()), len_); |
201 | 0 | } |
202 | | |
203 | | private: |
204 | | DISALLOW_COPY_AND_ASSIGN(faststring); |
205 | | |
206 | | // If necessary, expand the buffer to fit at least 'count' more bytes. |
207 | | // If the array has to be grown, it is grown by at least 50%. |
208 | 1.16M | void EnsureRoomForAppend(size_t count) { |
209 | 1.16M | if (PREDICT_TRUE(len_ + count <= capacity_)) { |
210 | 1.15M | return; |
211 | 1.15M | } |
212 | | |
213 | | // Call the non-inline slow path - this reduces the number of instructions |
214 | | // on the hot path. |
215 | 2.12k | GrowToAtLeast(len_ + count); |
216 | 2.12k | } |
217 | | |
218 | | // The slow path of EnsureRoomForAppend. Grows the buffer by either |
219 | | // 'count' bytes, or 50%, whichever is more. |
220 | | void GrowToAtLeast(size_t newcapacity); |
221 | | |
222 | | // Grow the array to the given capacity, which must be more than |
223 | | // the current capacity. |
224 | | void GrowArray(size_t newcapacity); |
225 | | |
226 | | void ShrinkToFitInternal(); |
227 | | |
228 | | uint8_t* data_ = nullptr; |
229 | | uint8_t initial_data_[kInitialCapacity]; |
230 | | size_t len_; |
231 | | // NOTE: we will make a initial buffer as part of the object, so the smallest |
232 | | // possible value of capacity_ is kInitialCapacity. |
233 | | size_t capacity_; |
234 | | }; |
235 | | |
236 | | } // namespace doris |