/root/doris/be/src/util/faststring.h
| Line | Count | Source | 
| 1 |  | // Licensed to the Apache Software Foundation (ASF) under one | 
| 2 |  | // or more contributor license agreements.  See the NOTICE file | 
| 3 |  | // distributed with this work for additional information | 
| 4 |  | // regarding copyright ownership.  The ASF licenses this file | 
| 5 |  | // to you under the Apache License, Version 2.0 (the | 
| 6 |  | // "License"); you may not use this file except in compliance | 
| 7 |  | // with the License.  You may obtain a copy of the License at | 
| 8 |  | // | 
| 9 |  | //   http://www.apache.org/licenses/LICENSE-2.0 | 
| 10 |  | // | 
| 11 |  | // Unless required by applicable law or agreed to in writing, | 
| 12 |  | // software distributed under the License is distributed on an | 
| 13 |  | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | 
| 14 |  | // KIND, either express or implied.  See the License for the | 
| 15 |  | // specific language governing permissions and limitations | 
| 16 |  | // under the License. | 
| 17 |  |  | 
| 18 |  | #pragma once | 
| 19 |  |  | 
| 20 |  | #include <butil/macros.h> | 
| 21 |  | #include <sanitizer/asan_interface.h> | 
| 22 |  |  | 
| 23 |  | #include <cstdint> | 
| 24 |  | #include <cstring> | 
| 25 |  | #include <string> | 
| 26 |  |  | 
| 27 |  | #include "util/memcpy_inlined.h" | 
| 28 |  | #include "util/slice.h" | 
| 29 |  | #include "vec/common/allocator.h" | 
| 30 |  | #include "vec/common/allocator_fwd.h" | 
| 31 |  |  | 
| 32 |  | namespace doris { | 
| 33 |  |  | 
| 34 |  | // A faststring is similar to a std::string, except that it is faster for many | 
| 35 |  | // common use cases (in particular, resize() will fill with uninitialized data | 
| 36 |  | // instead of memsetting to \0) | 
| 37 |  | // only build() can transfer data to the outside. | 
| 38 |  | class faststring : private Allocator<false, false, false, DefaultMemoryAllocator> { | 
| 39 |  | public: | 
| 40 |  |     enum { kInitialCapacity = 32 }; | 
| 41 |  |  | 
| 42 | 190k |     faststring() : data_(initial_data_), len_(0), capacity_(kInitialCapacity) {} | 
| 43 |  |  | 
| 44 |  |     // Construct a string with the given capacity, in bytes. | 
| 45 |  |     explicit faststring(size_t capacity) | 
| 46 | 37.1k |             : data_(initial_data_), len_(0), capacity_(kInitialCapacity) { | 
| 47 | 37.1k |         if (capacity > capacity_) { | 
| 48 | 4.01k |             data_ = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity)); | 
| 49 | 4.01k |             capacity_ = capacity; | 
| 50 | 4.01k |         } | 
| 51 | 37.1k |         ASAN_POISON_MEMORY_REGION(data_, capacity_); | 
| 52 | 37.1k |     } | 
| 53 |  |  | 
| 54 | 227k |     ~faststring() { | 
| 55 | 227k |         ASAN_UNPOISON_MEMORY_REGION(initial_data_, arraysize(initial_data_)); | 
| 56 | 227k |         if (data_ != initial_data_) { | 
| 57 | 93.1k |             Allocator::free(data_, capacity_); | 
| 58 | 93.1k |         } | 
| 59 | 227k |     } | 
| 60 |  |  | 
| 61 |  |     // Reset the valid length of the string to 0. | 
| 62 |  |     // | 
| 63 |  |     // This does not free up any memory. The capacity of the string remains unchanged. | 
| 64 | 395k |     void clear() { | 
| 65 | 395k |         resize(0); | 
| 66 | 395k |         ASAN_POISON_MEMORY_REGION(data_, capacity_); | 
| 67 | 395k |     } | 
| 68 |  |  | 
| 69 |  |     // Resize the string to the given length. | 
| 70 |  |     // If the new length is larger than the old length, the capacity is expanded as necessary. | 
| 71 |  |     // | 
| 72 |  |     // NOTE: in contrast to std::string's implementation, Any newly "exposed" bytes of data are | 
| 73 |  |     // not cleared. | 
| 74 | 2.54M |     void resize(size_t newsize) { | 
| 75 | 2.54M |         if (newsize > capacity_) { | 
| 76 | 50.0k |             reserve(newsize); | 
| 77 | 50.0k |         } | 
| 78 | 2.54M |         len_ = newsize; | 
| 79 | 2.54M |         ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_); | 
| 80 | 2.54M |         ASAN_UNPOISON_MEMORY_REGION(data_, len_); | 
| 81 | 2.54M |     } | 
| 82 |  |  | 
| 83 |  |     // Return the buffer built so far and reset `this` to the initial status (size() == 0). | 
| 84 |  |     // NOTE: the returned data pointer is not necessarily the pointer returned by data() | 
| 85 | 56.7k |     OwnedSlice build() { | 
| 86 | 56.7k |         uint8_t* ret = data_; | 
| 87 | 56.7k |         if (ret == initial_data_) { | 
| 88 | 1.11k |             ret = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity_)); | 
| 89 | 1.11k |             DCHECK(len_ <= capacity_); | 
| 90 | 1.11k |             memcpy(ret, data_, len_); | 
| 91 | 1.11k |         } | 
| 92 | 56.7k |         OwnedSlice result(ret, len_, capacity_); | 
| 93 | 56.7k |         len_ = 0; | 
| 94 | 56.7k |         capacity_ = kInitialCapacity; | 
| 95 | 56.7k |         data_ = initial_data_; | 
| 96 | 56.7k |         ASAN_POISON_MEMORY_REGION(data_, capacity_); | 
| 97 | 56.7k |         return result; | 
| 98 | 56.7k |     } | 
| 99 |  |  | 
| 100 |  |     // Reserve space for the given total amount of data. If the current capacity is already | 
| 101 |  |     // larger than the newly requested capacity, this is a no-op (i.e. it does not ever free memory). | 
| 102 |  |     // | 
| 103 |  |     // NOTE: even though the new capacity is reserved, it is illegal to begin writing into that memory | 
| 104 |  |     // directly using pointers. If ASAN is enabled, this is ensured using manual memory poisoning. | 
| 105 | 497k |     void reserve(size_t newcapacity) { | 
| 106 | 497k |         if (newcapacity <= capacity_) [[likely]] { | 
| 107 | 302k |             return; | 
| 108 | 302k |         } | 
| 109 | 194k |         GrowArray(newcapacity); | 
| 110 | 194k |     } | 
| 111 |  |  | 
| 112 |  |     // Append the given data to the string, resizing capacity as necessary. | 
| 113 | 1.94M |     void append(const void* src_v, size_t count) { | 
| 114 | 1.94M |         const uint8_t* src = reinterpret_cast<const uint8_t*>(src_v); | 
| 115 | 1.94M |         EnsureRoomForAppend(count); | 
| 116 | 1.94M |         ASAN_UNPOISON_MEMORY_REGION(data_ + len_, count); | 
| 117 |  |  | 
| 118 |  |         // appending short values is common enough that this | 
| 119 |  |         // actually helps, according to benchmarks. In theory | 
| 120 |  |         // memcpy_inlined should already be just as good, but this | 
| 121 |  |         // was ~20% faster for reading a large prefix-coded string file | 
| 122 |  |         // where each string was only a few chars different | 
| 123 | 1.94M |         if (count <= 4) { | 
| 124 | 1.02M |             uint8_t* p = &data_[len_]; | 
| 125 | 3.49M |             for (int i = 0; i < count; i++) { | 
| 126 | 2.47M |                 *p++ = *src++; | 
| 127 | 2.47M |             } | 
| 128 | 1.02M |         } else { | 
| 129 | 918k |             memcpy_inlined(&data_[len_], src, count); | 
| 130 | 918k |         } | 
| 131 | 1.94M |         len_ += count; | 
| 132 | 1.94M |     } | 
| 133 |  |  | 
| 134 |  |     // Append the given string to this string. | 
| 135 | 8 |     void append(const std::string& str) { append(str.data(), str.size()); } | 
| 136 |  |  | 
| 137 |  |     // Append the given character to this string. | 
| 138 | 51.9k |     void push_back(const char byte) { | 
| 139 | 51.9k |         EnsureRoomForAppend(1); | 
| 140 | 51.9k |         ASAN_UNPOISON_MEMORY_REGION(data_ + len_, 1); | 
| 141 | 51.9k |         data_[len_] = byte; | 
| 142 | 51.9k |         len_++; | 
| 143 | 51.9k |     } | 
| 144 |  |  | 
| 145 |  |     // Return the valid length of this string. | 
| 146 | 32.6k |     size_t length() const { return len_; } | 
| 147 |  |  | 
| 148 |  |     // Return the valid length of this string (identical to length()) | 
| 149 | 2.79M |     size_t size() const { return len_; } | 
| 150 |  |  | 
| 151 |  |     // Return the allocated capacity of this string. | 
| 152 | 318k |     size_t capacity() const { return capacity_; } | 
| 153 |  |  | 
| 154 |  |     // Return a pointer to the data in this string. Note that this pointer | 
| 155 |  |     // may be invalidated by any later non-const operation. | 
| 156 | 262k |     const uint8_t* data() const { return &data_[0]; } | 
| 157 |  |  | 
| 158 |  |     // Return a pointer to the data in this string. Note that this pointer | 
| 159 |  |     // may be invalidated by any later non-const operation. | 
| 160 | 736k |     uint8_t* data() { return &data_[0]; } | 
| 161 |  |  | 
| 162 |  |     // Return the given element of this string. Note that this does not perform | 
| 163 |  |     // any bounds checking. | 
| 164 | 0 |     const uint8_t& at(size_t i) const { return data_[i]; } | 
| 165 |  |  | 
| 166 |  |     // Return the given element of this string. Note that this does not perform | 
| 167 |  |     // any bounds checking. | 
| 168 | 73.5k |     const uint8_t& operator[](size_t i) const { return data_[i]; } | 
| 169 |  |  | 
| 170 |  |     // Return the given element of this string. Note that this does not perform | 
| 171 |  |     // any bounds checking. | 
| 172 | 1.66M |     uint8_t& operator[](size_t i) { return data_[i]; } | 
| 173 |  |  | 
| 174 |  |     // Reset the contents of this string by copying 'len' bytes from 'src'. | 
| 175 | 68.4k |     void assign_copy(const uint8_t* src, size_t len) { | 
| 176 |  |         // Reset length so that the first resize doesn't need to copy the current | 
| 177 |  |         // contents of the array. | 
| 178 | 68.4k |         len_ = 0; | 
| 179 | 68.4k |         resize(len); | 
| 180 | 68.4k |         memcpy(data(), src, len); | 
| 181 | 68.4k |     } | 
| 182 |  |  | 
| 183 |  |     // Reset the contents of this string by copying from the given std::string. | 
| 184 | 0 |     void assign_copy(const std::string& str) { | 
| 185 | 0 |         assign_copy(reinterpret_cast<const uint8_t*>(str.c_str()), str.size()); | 
| 186 | 0 |     } | 
| 187 |  |  | 
| 188 |  |     // Reallocates the internal storage to fit only the current data. | 
| 189 |  |     // | 
| 190 |  |     // This may revert to using internal storage if the current length is shorter than | 
| 191 |  |     // kInitialCapacity. In that case, after this call, capacity() will go down to | 
| 192 |  |     // kInitialCapacity. | 
| 193 |  |     // | 
| 194 |  |     // Any pointers within this instance may be invalidated. | 
| 195 | 102 |     void shrink_to_fit() { | 
| 196 | 102 |         if (data_ == initial_data_ || capacity_ == len_) return; | 
| 197 | 39 |         ShrinkToFitInternal(); | 
| 198 | 39 |     } | 
| 199 |  |  | 
| 200 |  |     // Return a copy of this string as a std::string. | 
| 201 | 0 |     std::string ToString() const { | 
| 202 | 0 |         return std::string(reinterpret_cast<const char*>(data()), len_); | 
| 203 | 0 |     } | 
| 204 |  |  | 
| 205 |  | private: | 
| 206 |  |     DISALLOW_COPY_AND_ASSIGN(faststring); | 
| 207 |  |  | 
| 208 |  |     // If necessary, expand the buffer to fit at least 'count' more bytes. | 
| 209 |  |     // If the array has to be grown, it is grown by at least 50%. | 
| 210 | 1.99M |     void EnsureRoomForAppend(size_t count) { | 
| 211 | 1.99M |         if (len_ + count <= capacity_) [[likely]] { | 
| 212 | 1.94M |             return; | 
| 213 | 1.94M |         } | 
| 214 |  |  | 
| 215 |  |         // Call the non-inline slow path - this reduces the number of instructions | 
| 216 |  |         // on the hot path. | 
| 217 | 49.9k |         GrowToAtLeast(len_ + count); | 
| 218 | 49.9k |     } | 
| 219 |  |  | 
| 220 |  |     // The slow path of EnsureRoomForAppend. Grows the buffer by either | 
| 221 |  |     // 'count' bytes, or 50%, whichever is more. | 
| 222 |  |     void GrowToAtLeast(size_t newcapacity); | 
| 223 |  |  | 
| 224 |  |     // Grow the array to the given capacity, which must be more than | 
| 225 |  |     // the current capacity. | 
| 226 |  |     void GrowArray(size_t newcapacity); | 
| 227 |  |  | 
| 228 |  |     void ShrinkToFitInternal(); | 
| 229 |  |  | 
| 230 |  |     uint8_t* data_ = nullptr; | 
| 231 |  |     uint8_t initial_data_[kInitialCapacity]; | 
| 232 |  |     size_t len_; | 
| 233 |  |     // NOTE: we will make a initial buffer as part of the object, so the smallest | 
| 234 |  |     // possible value of capacity_ is kInitialCapacity. | 
| 235 |  |     size_t capacity_; | 
| 236 |  | }; | 
| 237 |  |  | 
| 238 |  | } // namespace doris |