Coverage Report

Created: 2025-05-09 19:09

/root/doris/be/src/util/faststring.h
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <butil/macros.h>
21
22
#include <cstdint>
23
#include <cstring>
24
#include <string>
25
26
#include "gutil/dynamic_annotations.h"
27
#include "util/memcpy_inlined.h"
28
#include "util/slice.h"
29
#include "vec/common/allocator.h"
30
31
namespace doris {
32
33
// A faststring is similar to a std::string, except that it is faster for many
34
// common use cases (in particular, resize() will fill with uninitialized data
35
// instead of memsetting to \0)
36
// only build() can transfer data to the outside.
37
class faststring : private Allocator<false, false, false, DefaultMemoryAllocator> {
38
public:
39
    enum { kInitialCapacity = 32 };
40
41
156k
    faststring() : data_(initial_data_), len_(0), capacity_(kInitialCapacity) {}
42
43
    // Construct a string with the given capacity, in bytes.
44
    explicit faststring(size_t capacity)
45
3.43k
            : data_(initial_data_), len_(0), capacity_(kInitialCapacity) {
46
3.43k
        if (capacity > capacity_) {
47
2.98k
            data_ = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity));
48
2.98k
            capacity_ = capacity;
49
2.98k
        }
50
3.43k
        ASAN_POISON_MEMORY_REGION(data_, capacity_);
51
3.43k
    }
52
53
159k
    ~faststring() {
54
159k
        ASAN_UNPOISON_MEMORY_REGION(initial_data_, arraysize(initial_data_));
55
159k
        if (data_ != initial_data_) {
56
49.4k
            Allocator::free(data_, capacity_);
57
49.4k
        }
58
159k
    }
59
60
    // Reset the valid length of the string to 0.
61
    //
62
    // This does not free up any memory. The capacity of the string remains unchanged.
63
369k
    void clear() {
64
369k
        resize(0);
65
369k
        ASAN_POISON_MEMORY_REGION(data_, capacity_);
66
369k
    }
67
68
    // Resize the string to the given length.
69
    // If the new length is larger than the old length, the capacity is expanded as necessary.
70
    //
71
    // NOTE: in contrast to std::string's implementation, Any newly "exposed" bytes of data are
72
    // not cleared.
73
1.76M
    void resize(size_t newsize) {
74
1.76M
        if (newsize > capacity_) {
75
38.7k
            reserve(newsize);
76
38.7k
        }
77
1.76M
        len_ = newsize;
78
1.76M
        ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_);
79
1.76M
        ASAN_UNPOISON_MEMORY_REGION(data_, len_);
80
1.76M
    }
81
82
    // Return the buffer built so far and reset `this` to the initial status (size() == 0).
83
    // NOTE: the returned data pointer is not necessarily the pointer returned by data()
84
37.5k
    OwnedSlice build() {
85
37.5k
        uint8_t* ret = data_;
86
37.5k
        if (ret == initial_data_) {
87
31
            ret = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity_));
88
31
            DCHECK(len_ <= capacity_);
89
31
            memcpy(ret, data_, len_);
90
31
        }
91
37.5k
        OwnedSlice result(ret, len_, capacity_);
92
37.5k
        len_ = 0;
93
37.5k
        capacity_ = kInitialCapacity;
94
37.5k
        data_ = initial_data_;
95
37.5k
        ASAN_POISON_MEMORY_REGION(data_, capacity_);
96
37.5k
        return result;
97
37.5k
    }
98
99
    // Reserve space for the given total amount of data. If the current capacity is already
100
    // larger than the newly requested capacity, this is a no-op (i.e. it does not ever free memory).
101
    //
102
    // NOTE: even though the new capacity is reserved, it is illegal to begin writing into that memory
103
    // directly using pointers. If ASAN is enabled, this is ensured using manual memory poisoning.
104
392k
    void reserve(size_t newcapacity) {
105
392k
        if (newcapacity <= capacity_) [[likely]] {
106
266k
            return;
107
266k
        }
108
125k
        GrowArray(newcapacity);
109
125k
    }
110
111
    // Append the given data to the string, resizing capacity as necessary.
112
1.30M
    void append(const void* src_v, size_t count) {
113
1.30M
        const uint8_t* src = reinterpret_cast<const uint8_t*>(src_v);
114
1.30M
        EnsureRoomForAppend(count);
115
1.30M
        ASAN_UNPOISON_MEMORY_REGION(data_ + len_, count);
116
117
        // appending short values is common enough that this
118
        // actually helps, according to benchmarks. In theory
119
        // memcpy_inlined should already be just as good, but this
120
        // was ~20% faster for reading a large prefix-coded string file
121
        // where each string was only a few chars different
122
1.30M
        if (count <= 4) {
123
598k
            uint8_t* p = &data_[len_];
124
1.83M
            for (int i = 0; i < count; i++) {
125
1.23M
                *p++ = *src++;
126
1.23M
            }
127
703k
        } else {
128
703k
            memcpy_inlined(&data_[len_], src, count);
129
703k
        }
130
1.30M
        len_ += count;
131
1.30M
    }
132
133
    // Append the given string to this string.
134
8
    void append(const std::string& str) { append(str.data(), str.size()); }
135
136
    // Append the given character to this string.
137
52.7k
    void push_back(const char byte) {
138
52.7k
        EnsureRoomForAppend(1);
139
52.7k
        ASAN_UNPOISON_MEMORY_REGION(data_ + len_, 1);
140
52.7k
        data_[len_] = byte;
141
52.7k
        len_++;
142
52.7k
    }
143
144
    // Return the valid length of this string.
145
14
    size_t length() const { return len_; }
146
147
    // Return the valid length of this string (identical to length())
148
1.82M
    size_t size() const { return len_; }
149
150
    // Return the allocated capacity of this string.
151
293k
    size_t capacity() const { return capacity_; }
152
153
    // Return a pointer to the data in this string. Note that this pointer
154
    // may be invalidated by any later non-const operation.
155
261k
    const uint8_t* data() const { return &data_[0]; }
156
157
    // Return a pointer to the data in this string. Note that this pointer
158
    // may be invalidated by any later non-const operation.
159
586k
    uint8_t* data() { return &data_[0]; }
160
161
    // Return the given element of this string. Note that this does not perform
162
    // any bounds checking.
163
0
    const uint8_t& at(size_t i) const { return data_[i]; }
164
165
    // Return the given element of this string. Note that this does not perform
166
    // any bounds checking.
167
56.4k
    const uint8_t& operator[](size_t i) const { return data_[i]; }
168
169
    // Return the given element of this string. Note that this does not perform
170
    // any bounds checking.
171
1.02M
    uint8_t& operator[](size_t i) { return data_[i]; }
172
173
    // Reset the contents of this string by copying 'len' bytes from 'src'.
174
47.7k
    void assign_copy(const uint8_t* src, size_t len) {
175
        // Reset length so that the first resize doesn't need to copy the current
176
        // contents of the array.
177
47.7k
        len_ = 0;
178
47.7k
        resize(len);
179
47.7k
        memcpy(data(), src, len);
180
47.7k
    }
181
182
    // Reset the contents of this string by copying from the given std::string.
183
0
    void assign_copy(const std::string& str) {
184
0
        assign_copy(reinterpret_cast<const uint8_t*>(str.c_str()), str.size());
185
0
    }
186
187
    // Reallocates the internal storage to fit only the current data.
188
    //
189
    // This may revert to using internal storage if the current length is shorter than
190
    // kInitialCapacity. In that case, after this call, capacity() will go down to
191
    // kInitialCapacity.
192
    //
193
    // Any pointers within this instance may be invalidated.
194
102
    void shrink_to_fit() {
195
102
        if (data_ == initial_data_ || capacity_ == len_) return;
196
34
        ShrinkToFitInternal();
197
34
    }
198
199
    // Return a copy of this string as a std::string.
200
0
    std::string ToString() const {
201
0
        return std::string(reinterpret_cast<const char*>(data()), len_);
202
0
    }
203
204
private:
205
    DISALLOW_COPY_AND_ASSIGN(faststring);
206
207
    // If necessary, expand the buffer to fit at least 'count' more bytes.
208
    // If the array has to be grown, it is grown by at least 50%.
209
1.35M
    void EnsureRoomForAppend(size_t count) {
210
1.35M
        if (len_ + count <= capacity_) [[likely]] {
211
1.34M
            return;
212
1.34M
        }
213
214
        // Call the non-inline slow path - this reduces the number of instructions
215
        // on the hot path.
216
9.90k
        GrowToAtLeast(len_ + count);
217
9.90k
    }
218
219
    // The slow path of EnsureRoomForAppend. Grows the buffer by either
220
    // 'count' bytes, or 50%, whichever is more.
221
    void GrowToAtLeast(size_t newcapacity);
222
223
    // Grow the array to the given capacity, which must be more than
224
    // the current capacity.
225
    void GrowArray(size_t newcapacity);
226
227
    void ShrinkToFitInternal();
228
229
    uint8_t* data_ = nullptr;
230
    uint8_t initial_data_[kInitialCapacity];
231
    size_t len_;
232
    // NOTE: we will make a initial buffer as part of the object, so the smallest
233
    // possible value of capacity_ is kInitialCapacity.
234
    size_t capacity_;
235
};
236
237
} // namespace doris