/root/doris/be/src/util/faststring.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #pragma once |
19 | | |
20 | | #include <butil/macros.h> |
21 | | |
22 | | #include <cstdint> |
23 | | #include <cstring> |
24 | | #include <string> |
25 | | |
26 | | #include "gutil/dynamic_annotations.h" |
27 | | #include "util/memcpy_inlined.h" |
28 | | #include "util/slice.h" |
29 | | #include "vec/common/allocator.h" |
30 | | |
31 | | namespace doris { |
32 | | |
33 | | // A faststring is similar to a std::string, except that it is faster for many |
34 | | // common use cases (in particular, resize() will fill with uninitialized data |
35 | | // instead of memsetting to \0) |
36 | | // only build() can transfer data to the outside. |
37 | | class faststring : private Allocator<false, false, false, DefaultMemoryAllocator> { |
38 | | public: |
39 | | enum { kInitialCapacity = 32 }; |
40 | | |
41 | 156k | faststring() : data_(initial_data_), len_(0), capacity_(kInitialCapacity) {} |
42 | | |
43 | | // Construct a string with the given capacity, in bytes. |
44 | | explicit faststring(size_t capacity) |
45 | 3.43k | : data_(initial_data_), len_(0), capacity_(kInitialCapacity) { |
46 | 3.43k | if (capacity > capacity_) { |
47 | 2.98k | data_ = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity)); |
48 | 2.98k | capacity_ = capacity; |
49 | 2.98k | } |
50 | 3.43k | ASAN_POISON_MEMORY_REGION(data_, capacity_); |
51 | 3.43k | } |
52 | | |
53 | 159k | ~faststring() { |
54 | 159k | ASAN_UNPOISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); |
55 | 159k | if (data_ != initial_data_) { |
56 | 49.4k | Allocator::free(data_, capacity_); |
57 | 49.4k | } |
58 | 159k | } |
59 | | |
60 | | // Reset the valid length of the string to 0. |
61 | | // |
62 | | // This does not free up any memory. The capacity of the string remains unchanged. |
63 | 369k | void clear() { |
64 | 369k | resize(0); |
65 | 369k | ASAN_POISON_MEMORY_REGION(data_, capacity_); |
66 | 369k | } |
67 | | |
68 | | // Resize the string to the given length. |
69 | | // If the new length is larger than the old length, the capacity is expanded as necessary. |
70 | | // |
71 | | // NOTE: in contrast to std::string's implementation, Any newly "exposed" bytes of data are |
72 | | // not cleared. |
73 | 1.76M | void resize(size_t newsize) { |
74 | 1.76M | if (newsize > capacity_) { |
75 | 38.7k | reserve(newsize); |
76 | 38.7k | } |
77 | 1.76M | len_ = newsize; |
78 | 1.76M | ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); |
79 | 1.76M | ASAN_UNPOISON_MEMORY_REGION(data_, len_); |
80 | 1.76M | } |
81 | | |
82 | | // Return the buffer built so far and reset `this` to the initial status (size() == 0). |
83 | | // NOTE: the returned data pointer is not necessarily the pointer returned by data() |
84 | 37.5k | OwnedSlice build() { |
85 | 37.5k | uint8_t* ret = data_; |
86 | 37.5k | if (ret == initial_data_) { |
87 | 31 | ret = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity_)); |
88 | 31 | DCHECK(len_ <= capacity_); |
89 | 31 | memcpy(ret, data_, len_); |
90 | 31 | } |
91 | 37.5k | OwnedSlice result(ret, len_, capacity_); |
92 | 37.5k | len_ = 0; |
93 | 37.5k | capacity_ = kInitialCapacity; |
94 | 37.5k | data_ = initial_data_; |
95 | 37.5k | ASAN_POISON_MEMORY_REGION(data_, capacity_); |
96 | 37.5k | return result; |
97 | 37.5k | } |
98 | | |
99 | | // Reserve space for the given total amount of data. If the current capacity is already |
100 | | // larger than the newly requested capacity, this is a no-op (i.e. it does not ever free memory). |
101 | | // |
102 | | // NOTE: even though the new capacity is reserved, it is illegal to begin writing into that memory |
103 | | // directly using pointers. If ASAN is enabled, this is ensured using manual memory poisoning. |
104 | 392k | void reserve(size_t newcapacity) { |
105 | 392k | if (newcapacity <= capacity_) [[likely]] { |
106 | 266k | return; |
107 | 266k | } |
108 | 125k | GrowArray(newcapacity); |
109 | 125k | } |
110 | | |
111 | | // Append the given data to the string, resizing capacity as necessary. |
112 | 1.30M | void append(const void* src_v, size_t count) { |
113 | 1.30M | const uint8_t* src = reinterpret_cast<const uint8_t*>(src_v); |
114 | 1.30M | EnsureRoomForAppend(count); |
115 | 1.30M | ASAN_UNPOISON_MEMORY_REGION(data_ + len_, count); |
116 | | |
117 | | // appending short values is common enough that this |
118 | | // actually helps, according to benchmarks. In theory |
119 | | // memcpy_inlined should already be just as good, but this |
120 | | // was ~20% faster for reading a large prefix-coded string file |
121 | | // where each string was only a few chars different |
122 | 1.30M | if (count <= 4) { |
123 | 598k | uint8_t* p = &data_[len_]; |
124 | 1.83M | for (int i = 0; i < count; i++) { |
125 | 1.23M | *p++ = *src++; |
126 | 1.23M | } |
127 | 703k | } else { |
128 | 703k | memcpy_inlined(&data_[len_], src, count); |
129 | 703k | } |
130 | 1.30M | len_ += count; |
131 | 1.30M | } |
132 | | |
133 | | // Append the given string to this string. |
134 | 8 | void append(const std::string& str) { append(str.data(), str.size()); } |
135 | | |
136 | | // Append the given character to this string. |
137 | 52.7k | void push_back(const char byte) { |
138 | 52.7k | EnsureRoomForAppend(1); |
139 | 52.7k | ASAN_UNPOISON_MEMORY_REGION(data_ + len_, 1); |
140 | 52.7k | data_[len_] = byte; |
141 | 52.7k | len_++; |
142 | 52.7k | } |
143 | | |
144 | | // Return the valid length of this string. |
145 | 14 | size_t length() const { return len_; } |
146 | | |
147 | | // Return the valid length of this string (identical to length()) |
148 | 1.82M | size_t size() const { return len_; } |
149 | | |
150 | | // Return the allocated capacity of this string. |
151 | 293k | size_t capacity() const { return capacity_; } |
152 | | |
153 | | // Return a pointer to the data in this string. Note that this pointer |
154 | | // may be invalidated by any later non-const operation. |
155 | 261k | const uint8_t* data() const { return &data_[0]; } |
156 | | |
157 | | // Return a pointer to the data in this string. Note that this pointer |
158 | | // may be invalidated by any later non-const operation. |
159 | 586k | uint8_t* data() { return &data_[0]; } |
160 | | |
161 | | // Return the given element of this string. Note that this does not perform |
162 | | // any bounds checking. |
163 | 0 | const uint8_t& at(size_t i) const { return data_[i]; } |
164 | | |
165 | | // Return the given element of this string. Note that this does not perform |
166 | | // any bounds checking. |
167 | 56.4k | const uint8_t& operator[](size_t i) const { return data_[i]; } |
168 | | |
169 | | // Return the given element of this string. Note that this does not perform |
170 | | // any bounds checking. |
171 | 1.02M | uint8_t& operator[](size_t i) { return data_[i]; } |
172 | | |
173 | | // Reset the contents of this string by copying 'len' bytes from 'src'. |
174 | 47.7k | void assign_copy(const uint8_t* src, size_t len) { |
175 | | // Reset length so that the first resize doesn't need to copy the current |
176 | | // contents of the array. |
177 | 47.7k | len_ = 0; |
178 | 47.7k | resize(len); |
179 | 47.7k | memcpy(data(), src, len); |
180 | 47.7k | } |
181 | | |
182 | | // Reset the contents of this string by copying from the given std::string. |
183 | 0 | void assign_copy(const std::string& str) { |
184 | 0 | assign_copy(reinterpret_cast<const uint8_t*>(str.c_str()), str.size()); |
185 | 0 | } |
186 | | |
187 | | // Reallocates the internal storage to fit only the current data. |
188 | | // |
189 | | // This may revert to using internal storage if the current length is shorter than |
190 | | // kInitialCapacity. In that case, after this call, capacity() will go down to |
191 | | // kInitialCapacity. |
192 | | // |
193 | | // Any pointers within this instance may be invalidated. |
194 | 102 | void shrink_to_fit() { |
195 | 102 | if (data_ == initial_data_ || capacity_ == len_) return; |
196 | 39 | ShrinkToFitInternal(); |
197 | 39 | } |
198 | | |
199 | | // Return a copy of this string as a std::string. |
200 | 0 | std::string ToString() const { |
201 | 0 | return std::string(reinterpret_cast<const char*>(data()), len_); |
202 | 0 | } |
203 | | |
204 | | private: |
205 | | DISALLOW_COPY_AND_ASSIGN(faststring); |
206 | | |
207 | | // If necessary, expand the buffer to fit at least 'count' more bytes. |
208 | | // If the array has to be grown, it is grown by at least 50%. |
209 | 1.35M | void EnsureRoomForAppend(size_t count) { |
210 | 1.35M | if (len_ + count <= capacity_) [[likely]] { |
211 | 1.34M | return; |
212 | 1.34M | } |
213 | | |
214 | | // Call the non-inline slow path - this reduces the number of instructions |
215 | | // on the hot path. |
216 | 9.90k | GrowToAtLeast(len_ + count); |
217 | 9.90k | } |
218 | | |
219 | | // The slow path of EnsureRoomForAppend. Grows the buffer by either |
220 | | // 'count' bytes, or 50%, whichever is more. |
221 | | void GrowToAtLeast(size_t newcapacity); |
222 | | |
223 | | // Grow the array to the given capacity, which must be more than |
224 | | // the current capacity. |
225 | | void GrowArray(size_t newcapacity); |
226 | | |
227 | | void ShrinkToFitInternal(); |
228 | | |
229 | | uint8_t* data_ = nullptr; |
230 | | uint8_t initial_data_[kInitialCapacity]; |
231 | | size_t len_; |
232 | | // NOTE: we will make a initial buffer as part of the object, so the smallest |
233 | | // possible value of capacity_ is kInitialCapacity. |
234 | | size_t capacity_; |
235 | | }; |
236 | | |
237 | | } // namespace doris |