/root/doris/be/src/util/faststring.h
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <butil/macros.h> |
21 | | #include <sanitizer/asan_interface.h> |
22 | | |
23 | | #include <cstdint> |
24 | | #include <cstring> |
25 | | #include <string> |
26 | | |
27 | | #include "util/memcpy_inlined.h" |
28 | | #include "util/slice.h" |
29 | | #include "vec/common/allocator.h" |
30 | | #include "vec/common/allocator_fwd.h" |
31 | | |
32 | | namespace doris { |
33 | | |
34 | | // A faststring is similar to a std::string, except that it is faster for many |
35 | | // common use cases (in particular, resize() will fill with uninitialized data |
36 | | // instead of memsetting to \0) |
37 | | // only build() can transfer data to the outside. |
38 | | class faststring : private Allocator<false, false, false, DefaultMemoryAllocator> { |
39 | | public: |
40 | | enum { kInitialCapacity = 32 }; |
41 | | |
42 | 169k | faststring() : data_(initial_data_), len_(0), capacity_(kInitialCapacity) {} |
43 | | |
44 | | // Construct a string with the given capacity, in bytes. |
45 | | explicit faststring(size_t capacity) |
46 | 36.3k | : data_(initial_data_), len_(0), capacity_(kInitialCapacity) { |
47 | 36.3k | if (capacity > capacity_) { |
48 | 3.21k | data_ = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity)); |
49 | 3.21k | capacity_ = capacity; |
50 | 3.21k | } |
51 | 36.3k | ASAN_POISON_MEMORY_REGION(data_, capacity_); |
52 | 36.3k | } |
53 | | |
54 | 206k | ~faststring() { |
55 | 206k | ASAN_UNPOISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); |
56 | 206k | if (data_ != initial_data_) { |
57 | 85.9k | Allocator::free(data_, capacity_); |
58 | 85.9k | } |
59 | 206k | } |
60 | | |
61 | | // Reset the valid length of the string to 0. |
62 | | // |
63 | | // This does not free up any memory. The capacity of the string remains unchanged. |
64 | 378k | void clear() { |
65 | 378k | resize(0); |
66 | 378k | ASAN_POISON_MEMORY_REGION(data_, capacity_); |
67 | 378k | } |
68 | | |
69 | | // Resize the string to the given length. |
70 | | // If the new length is larger than the old length, the capacity is expanded as necessary. |
71 | | // |
72 | | // NOTE: in contrast to std::string's implementation, Any newly "exposed" bytes of data are |
73 | | // not cleared. |
74 | 1.83M | void resize(size_t newsize) { |
75 | 1.83M | if (newsize > capacity_) { |
76 | 45.0k | reserve(newsize); |
77 | 45.0k | } |
78 | 1.83M | len_ = newsize; |
79 | 1.83M | ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); |
80 | 1.83M | ASAN_UNPOISON_MEMORY_REGION(data_, len_); |
81 | 1.83M | } |
82 | | |
83 | | // Return the buffer built so far and reset `this` to the initial status (size() == 0). |
84 | | // NOTE: the returned data pointer is not necessarily the pointer returned by data() |
85 | 50.6k | OwnedSlice build() { |
86 | 50.6k | uint8_t* ret = data_; |
87 | 50.6k | if (ret == initial_data_) { |
88 | 1.11k | ret = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity_)); |
89 | 1.11k | DCHECK(len_ <= capacity_); |
90 | 1.11k | memcpy(ret, data_, len_); |
91 | 1.11k | } |
92 | 50.6k | OwnedSlice result(ret, len_, capacity_); |
93 | 50.6k | len_ = 0; |
94 | 50.6k | capacity_ = kInitialCapacity; |
95 | 50.6k | data_ = initial_data_; |
96 | 50.6k | ASAN_POISON_MEMORY_REGION(data_, capacity_); |
97 | 50.6k | return result; |
98 | 50.6k | } |
99 | | |
100 | | // Reserve space for the given total amount of data. If the current capacity is already |
101 | | // larger than the newly requested capacity, this is a no-op (i.e. it does not ever free memory). |
102 | | // |
103 | | // NOTE: even though the new capacity is reserved, it is illegal to begin writing into that memory |
104 | | // directly using pointers. If ASAN is enabled, this is ensured using manual memory poisoning. |
105 | 456k | void reserve(size_t newcapacity) { |
106 | 456k | if (newcapacity <= capacity_) [[likely]] { |
107 | 274k | return; |
108 | 274k | } |
109 | 182k | GrowArray(newcapacity); |
110 | 182k | } |
111 | | |
112 | | // Append the given data to the string, resizing capacity as necessary. |
113 | 1.59M | void append(const void* src_v, size_t count) { |
114 | 1.59M | const uint8_t* src = reinterpret_cast<const uint8_t*>(src_v); |
115 | 1.59M | EnsureRoomForAppend(count); |
116 | 1.59M | ASAN_UNPOISON_MEMORY_REGION(data_ + len_, count); |
117 | | |
118 | | // appending short values is common enough that this |
119 | | // actually helps, according to benchmarks. In theory |
120 | | // memcpy_inlined should already be just as good, but this |
121 | | // was ~20% faster for reading a large prefix-coded string file |
122 | | // where each string was only a few chars different |
123 | 1.59M | if (count <= 4) { |
124 | 821k | uint8_t* p = &data_[len_]; |
125 | 2.54M | for (int i = 0; i < count; i++) { |
126 | 1.72M | *p++ = *src++; |
127 | 1.72M | } |
128 | 821k | } else { |
129 | 771k | memcpy_inlined(&data_[len_], src, count); |
130 | 771k | } |
131 | 1.59M | len_ += count; |
132 | 1.59M | } |
133 | | |
134 | | // Append the given string to this string. |
135 | 8 | void append(const std::string& str) { append(str.data(), str.size()); } |
136 | | |
137 | | // Append the given character to this string. |
138 | 43.8k | void push_back(const char byte) { |
139 | 43.8k | EnsureRoomForAppend(1); |
140 | 43.8k | ASAN_UNPOISON_MEMORY_REGION(data_ + len_, 1); |
141 | 43.8k | data_[len_] = byte; |
142 | 43.8k | len_++; |
143 | 43.8k | } |
144 | | |
145 | | // Return the valid length of this string. |
146 | 32.6k | size_t length() const { return len_; } |
147 | | |
148 | | // Return the valid length of this string (identical to length()) |
149 | 1.97M | size_t size() const { return len_; } |
150 | | |
151 | | // Return the allocated capacity of this string. |
152 | 293k | size_t capacity() const { return capacity_; } |
153 | | |
154 | | // Return a pointer to the data in this string. Note that this pointer |
155 | | // may be invalidated by any later non-const operation. |
156 | 261k | const uint8_t* data() const { return &data_[0]; } |
157 | | |
158 | | // Return a pointer to the data in this string. Note that this pointer |
159 | | // may be invalidated by any later non-const operation. |
160 | 684k | uint8_t* data() { return &data_[0]; } |
161 | | |
162 | | // Return the given element of this string. Note that this does not perform |
163 | | // any bounds checking. |
164 | 0 | const uint8_t& at(size_t i) const { return data_[i]; } |
165 | | |
166 | | // Return the given element of this string. Note that this does not perform |
167 | | // any bounds checking. |
168 | 65.1k | const uint8_t& operator[](size_t i) const { return data_[i]; } |
169 | | |
170 | | // Return the given element of this string. Note that this does not perform |
171 | | // any bounds checking. |
172 | 1.00M | uint8_t& operator[](size_t i) { return data_[i]; } |
173 | | |
174 | | // Reset the contents of this string by copying 'len' bytes from 'src'. |
175 | 59.9k | void assign_copy(const uint8_t* src, size_t len) { |
176 | | // Reset length so that the first resize doesn't need to copy the current |
177 | | // contents of the array. |
178 | 59.9k | len_ = 0; |
179 | 59.9k | resize(len); |
180 | 59.9k | memcpy(data(), src, len); |
181 | 59.9k | } |
182 | | |
183 | | // Reset the contents of this string by copying from the given std::string. |
184 | 0 | void assign_copy(const std::string& str) { |
185 | 0 | assign_copy(reinterpret_cast<const uint8_t*>(str.c_str()), str.size()); |
186 | 0 | } |
187 | | |
188 | | // Reallocates the internal storage to fit only the current data. |
189 | | // |
190 | | // This may revert to using internal storage if the current length is shorter than |
191 | | // kInitialCapacity. In that case, after this call, capacity() will go down to |
192 | | // kInitialCapacity. |
193 | | // |
194 | | // Any pointers within this instance may be invalidated. |
195 | 102 | void shrink_to_fit() { |
196 | 102 | if (data_ == initial_data_ || capacity_ == len_) return; |
197 | 38 | ShrinkToFitInternal(); |
198 | 38 | } |
199 | | |
200 | | // Return a copy of this string as a std::string. |
201 | 0 | std::string ToString() const { |
202 | 0 | return std::string(reinterpret_cast<const char*>(data()), len_); |
203 | 0 | } |
204 | | |
205 | | private: |
206 | | DISALLOW_COPY_AND_ASSIGN(faststring); |
207 | | |
208 | | // If necessary, expand the buffer to fit at least 'count' more bytes. |
209 | | // If the array has to be grown, it is grown by at least 50%. |
210 | 1.63M | void EnsureRoomForAppend(size_t count) { |
211 | 1.63M | if (len_ + count <= capacity_) [[likely]] { |
212 | 1.58M | return; |
213 | 1.58M | } |
214 | | |
215 | | // Call the non-inline slow path - this reduces the number of instructions |
216 | | // on the hot path. |
217 | 49.8k | GrowToAtLeast(len_ + count); |
218 | 49.8k | } |
219 | | |
220 | | // The slow path of EnsureRoomForAppend. Grows the buffer by either |
221 | | // 'count' bytes, or 50%, whichever is more. |
222 | | void GrowToAtLeast(size_t newcapacity); |
223 | | |
224 | | // Grow the array to the given capacity, which must be more than |
225 | | // the current capacity. |
226 | | void GrowArray(size_t newcapacity); |
227 | | |
228 | | void ShrinkToFitInternal(); |
229 | | |
230 | | uint8_t* data_ = nullptr; |
231 | | uint8_t initial_data_[kInitialCapacity]; |
232 | | size_t len_; |
233 | | // NOTE: we will make a initial buffer as part of the object, so the smallest |
234 | | // possible value of capacity_ is kInitialCapacity. |
235 | | size_t capacity_; |
236 | | }; |
237 | | |
238 | | } // namespace doris |