Coverage Report

Created: 2025-05-30 14:54

/root/doris/be/src/util/faststring.h
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <butil/macros.h>
21
#include <sanitizer/asan_interface.h>
22
23
#include <cstdint>
24
#include <cstring>
25
#include <string>
26
27
#include "util/memcpy_inlined.h"
28
#include "util/slice.h"
29
#include "vec/common/allocator.h"
30
31
namespace doris {
32
33
// A faststring is similar to a std::string, except that it is faster for many
34
// common use cases (in particular, resize() will fill with uninitialized data
35
// instead of memsetting to \0)
36
// only build() can transfer data to the outside.
37
class faststring : private Allocator<false, false, false, DefaultMemoryAllocator> {
38
public:
39
    enum { kInitialCapacity = 32 };
40
41
13.8M
    faststring() : data_(initial_data_), len_(0), capacity_(kInitialCapacity) {}
42
43
    // Construct a string with the given capacity, in bytes.
44
    explicit faststring(size_t capacity)
45
529k
            : data_(initial_data_), len_(0), capacity_(kInitialCapacity) {
46
529k
        if (capacity > capacity_) {
47
528k
            data_ = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity));
48
528k
            capacity_ = capacity;
49
528k
        }
50
529k
        ASAN_POISON_MEMORY_REGION(data_, capacity_);
51
529k
    }
52
53
14.4M
    ~faststring() {
54
14.4M
        ASAN_UNPOISON_MEMORY_REGION(initial_data_, arraysize(initial_data_));
55
14.4M
        if (data_ != initial_data_) {
56
3.68M
            Allocator::free(data_, capacity_);
57
3.68M
        }
58
14.4M
    }
59
60
    // Reset the valid length of the string to 0.
61
    //
62
    // This does not free up any memory. The capacity of the string remains unchanged.
63
40.8M
    void clear() {
64
40.8M
        resize(0);
65
40.8M
        ASAN_POISON_MEMORY_REGION(data_, capacity_);
66
40.8M
    }
67
68
    // Resize the string to the given length.
69
    // If the new length is larger than the old length, the capacity is expanded as necessary.
70
    //
71
    // NOTE: in contrast to std::string's implementation, Any newly "exposed" bytes of data are
72
    // not cleared.
73
322M
    void resize(size_t newsize) {
74
322M
        if (newsize > capacity_) {
75
1.77M
            reserve(newsize);
76
1.77M
        }
77
322M
        len_ = newsize;
78
322M
        ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_);
79
322M
        ASAN_UNPOISON_MEMORY_REGION(data_, len_);
80
322M
    }
81
82
    // Return the buffer built so far and reset `this` to the initial status (size() == 0).
83
    // NOTE: the returned data pointer is not necessarily the pointer returned by data()
84
2.74M
    OwnedSlice build() {
85
2.74M
        uint8_t* ret = data_;
86
2.74M
        if (ret == initial_data_) {
87
59.5k
            ret = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity_));
88
59.5k
            DCHECK(len_ <= capacity_);
89
59.5k
            memcpy(ret, data_, len_);
90
59.5k
        }
91
2.74M
        OwnedSlice result(ret, len_, capacity_);
92
2.74M
        len_ = 0;
93
2.74M
        capacity_ = kInitialCapacity;
94
2.74M
        data_ = initial_data_;
95
2.74M
        ASAN_POISON_MEMORY_REGION(data_, capacity_);
96
2.74M
        return result;
97
2.74M
    }
98
99
    // Reserve space for the given total amount of data. If the current capacity is already
100
    // larger than the newly requested capacity, this is a no-op (i.e. it does not ever free memory).
101
    //
102
    // NOTE: even though the new capacity is reserved, it is illegal to begin writing into that memory
103
    // directly using pointers. If ASAN is enabled, this is ensured using manual memory poisoning.
104
14.5M
    void reserve(size_t newcapacity) {
105
14.5M
        if (newcapacity <= capacity_) [[likely]] {
106
8.92M
            return;
107
8.92M
        }
108
5.59M
        GrowArray(newcapacity);
109
5.59M
    }
110
111
    // Append the given data to the string, resizing capacity as necessary.
112
247M
    void append(const void* src_v, size_t count) {
113
247M
        const uint8_t* src = reinterpret_cast<const uint8_t*>(src_v);
114
247M
        EnsureRoomForAppend(count);
115
247M
        ASAN_UNPOISON_MEMORY_REGION(data_ + len_, count);
116
117
        // appending short values is common enough that this
118
        // actually helps, according to benchmarks. In theory
119
        // memcpy_inlined should already be just as good, but this
120
        // was ~20% faster for reading a large prefix-coded string file
121
        // where each string was only a few chars different
122
247M
        if (count <= 4) {
123
79.0M
            uint8_t* p = &data_[len_];
124
287M
            for (int i = 0; i < count; i++) {
125
208M
                *p++ = *src++;
126
208M
            }
127
168M
        } else {
128
168M
            memcpy_inlined(&data_[len_], src, count);
129
168M
        }
130
247M
        len_ += count;
131
247M
    }
132
133
    // Append the given string to this string.
134
    void append(const std::string& str) { append(str.data(), str.size()); }
135
136
    // Append the given character to this string.
137
27.2M
    void push_back(const char byte) {
138
27.2M
        EnsureRoomForAppend(1);
139
27.2M
        ASAN_UNPOISON_MEMORY_REGION(data_ + len_, 1);
140
27.2M
        data_[len_] = byte;
141
27.2M
        len_++;
142
27.2M
    }
143
144
    // Return the valid length of this string.
145
    size_t length() const { return len_; }
146
147
    // Return the valid length of this string (identical to length())
148
375M
    size_t size() const { return len_; }
149
150
    // Return the allocated capacity of this string.
151
7.87M
    size_t capacity() const { return capacity_; }
152
153
    // Return a pointer to the data in this string. Note that this pointer
154
    // may be invalidated by any later non-const operation.
155
84.2M
    const uint8_t* data() const { return &data_[0]; }
156
157
    // Return a pointer to the data in this string. Note that this pointer
158
    // may be invalidated by any later non-const operation.
159
26.4M
    uint8_t* data() { return &data_[0]; }
160
161
    // Return the given element of this string. Note that this does not perform
162
    // any bounds checking.
163
0
    const uint8_t& at(size_t i) const { return data_[i]; }
164
165
    // Return the given element of this string. Note that this does not perform
166
    // any bounds checking.
167
4.16M
    const uint8_t& operator[](size_t i) const { return data_[i]; }
168
169
    // Return the given element of this string. Note that this does not perform
170
    // any bounds checking.
171
224M
    uint8_t& operator[](size_t i) { return data_[i]; }
172
173
    // Reset the contents of this string by copying 'len' bytes from 'src'.
174
2.98M
    void assign_copy(const uint8_t* src, size_t len) {
175
        // Reset length so that the first resize doesn't need to copy the current
176
        // contents of the array.
177
2.98M
        len_ = 0;
178
2.98M
        resize(len);
179
2.98M
        memcpy(data(), src, len);
180
2.98M
    }
181
182
    // Reset the contents of this string by copying from the given std::string.
183
0
    void assign_copy(const std::string& str) {
184
0
        assign_copy(reinterpret_cast<const uint8_t*>(str.c_str()), str.size());
185
0
    }
186
187
    // Reallocates the internal storage to fit only the current data.
188
    //
189
    // This may revert to using internal storage if the current length is shorter than
190
    // kInitialCapacity. In that case, after this call, capacity() will go down to
191
    // kInitialCapacity.
192
    //
193
    // Any pointers within this instance may be invalidated.
194
    void shrink_to_fit() {
195
        if (data_ == initial_data_ || capacity_ == len_) return;
196
        ShrinkToFitInternal();
197
    }
198
199
    // Return a copy of this string as a std::string.
200
0
    std::string ToString() const {
201
0
        return std::string(reinterpret_cast<const char*>(data()), len_);
202
0
    }
203
204
private:
205
    DISALLOW_COPY_AND_ASSIGN(faststring);
206
207
    // If necessary, expand the buffer to fit at least 'count' more bytes.
208
    // If the array has to be grown, it is grown by at least 50%.
209
274M
    void EnsureRoomForAppend(size_t count) {
210
274M
        if (len_ + count <= capacity_) [[likely]] {
211
274M
            return;
212
274M
        }
213
214
        // Call the non-inline slow path - this reduces the number of instructions
215
        // on the hot path.
216
833k
        GrowToAtLeast(len_ + count);
217
833k
    }
218
219
    // The slow path of EnsureRoomForAppend. Grows the buffer by either
220
    // 'count' bytes, or 50%, whichever is more.
221
    void GrowToAtLeast(size_t newcapacity);
222
223
    // Grow the array to the given capacity, which must be more than
224
    // the current capacity.
225
    void GrowArray(size_t newcapacity);
226
227
    void ShrinkToFitInternal();
228
229
    uint8_t* data_ = nullptr;
230
    uint8_t initial_data_[kInitialCapacity];
231
    size_t len_;
232
    // NOTE: we will make a initial buffer as part of the object, so the smallest
233
    // possible value of capacity_ is kInitialCapacity.
234
    size_t capacity_;
235
};
236
237
} // namespace doris