Coverage Report

Created: 2025-07-23 18:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/be/src/util/faststring.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <butil/macros.h>
21
#include <sanitizer/asan_interface.h>
22
23
#include <cstdint>
24
#include <cstring>
25
#include <string>
26
27
#include "util/memcpy_inlined.h"
28
#include "util/slice.h"
29
#include "vec/common/allocator.h"
30
#include "vec/common/allocator_fwd.h"
31
32
namespace doris {
33
34
// A faststring is similar to a std::string, except that it is faster for many
35
// common use cases (in particular, resize() will fill with uninitialized data
36
// instead of memsetting to \0)
37
// only build() can transfer data to the outside.
38
class faststring : private Allocator<false, false, false, DefaultMemoryAllocator> {
39
public:
40
    enum { kInitialCapacity = 32 };
41
42
169k
    faststring() : data_(initial_data_), len_(0), capacity_(kInitialCapacity) {}
43
44
    // Construct a string with the given capacity, in bytes.
45
    explicit faststring(size_t capacity)
46
36.3k
            : data_(initial_data_), len_(0), capacity_(kInitialCapacity) {
47
36.3k
        if (capacity > capacity_) {
48
3.21k
            data_ = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity));
49
3.21k
            capacity_ = capacity;
50
3.21k
        }
51
36.3k
        ASAN_POISON_MEMORY_REGION(data_, capacity_);
52
36.3k
    }
53
54
206k
    ~faststring() {
55
206k
        ASAN_UNPOISON_MEMORY_REGION(initial_data_, arraysize(initial_data_));
56
206k
        if (data_ != initial_data_) {
57
85.9k
            Allocator::free(data_, capacity_);
58
85.9k
        }
59
206k
    }
60
61
    // Reset the valid length of the string to 0.
62
    //
63
    // This does not free up any memory. The capacity of the string remains unchanged.
64
378k
    void clear() {
65
378k
        resize(0);
66
378k
        ASAN_POISON_MEMORY_REGION(data_, capacity_);
67
378k
    }
68
69
    // Resize the string to the given length.
70
    // If the new length is larger than the old length, the capacity is expanded as necessary.
71
    //
72
    // NOTE: in contrast to std::string's implementation, Any newly "exposed" bytes of data are
73
    // not cleared.
74
1.83M
    void resize(size_t newsize) {
75
1.83M
        if (newsize > capacity_) {
76
45.0k
            reserve(newsize);
77
45.0k
        }
78
1.83M
        len_ = newsize;
79
1.83M
        ASAN_POISON_MEMORY_REGION(data_ + len_, capacity_ - len_);
80
1.83M
        ASAN_UNPOISON_MEMORY_REGION(data_, len_);
81
1.83M
    }
82
83
    // Return the buffer built so far and reset `this` to the initial status (size() == 0).
84
    // NOTE: the returned data pointer is not necessarily the pointer returned by data()
85
50.6k
    OwnedSlice build() {
86
50.6k
        uint8_t* ret = data_;
87
50.6k
        if (ret == initial_data_) {
88
1.11k
            ret = reinterpret_cast<uint8_t*>(Allocator::alloc(capacity_));
89
1.11k
            DCHECK(len_ <= capacity_);
90
1.11k
            memcpy(ret, data_, len_);
91
1.11k
        }
92
50.6k
        OwnedSlice result(ret, len_, capacity_);
93
50.6k
        len_ = 0;
94
50.6k
        capacity_ = kInitialCapacity;
95
50.6k
        data_ = initial_data_;
96
50.6k
        ASAN_POISON_MEMORY_REGION(data_, capacity_);
97
50.6k
        return result;
98
50.6k
    }
99
100
    // Reserve space for the given total amount of data. If the current capacity is already
101
    // larger than the newly requested capacity, this is a no-op (i.e. it does not ever free memory).
102
    //
103
    // NOTE: even though the new capacity is reserved, it is illegal to begin writing into that memory
104
    // directly using pointers. If ASAN is enabled, this is ensured using manual memory poisoning.
105
456k
    void reserve(size_t newcapacity) {
106
456k
        if (newcapacity <= capacity_) [[likely]] {
107
274k
            return;
108
274k
        }
109
182k
        GrowArray(newcapacity);
110
182k
    }
111
112
    // Append the given data to the string, resizing capacity as necessary.
113
1.59M
    void append(const void* src_v, size_t count) {
114
1.59M
        const uint8_t* src = reinterpret_cast<const uint8_t*>(src_v);
115
1.59M
        EnsureRoomForAppend(count);
116
1.59M
        ASAN_UNPOISON_MEMORY_REGION(data_ + len_, count);
117
118
        // appending short values is common enough that this
119
        // actually helps, according to benchmarks. In theory
120
        // memcpy_inlined should already be just as good, but this
121
        // was ~20% faster for reading a large prefix-coded string file
122
        // where each string was only a few chars different
123
1.59M
        if (count <= 4) {
124
821k
            uint8_t* p = &data_[len_];
125
2.54M
            for (int i = 0; i < count; i++) {
126
1.72M
                *p++ = *src++;
127
1.72M
            }
128
821k
        } else {
129
771k
            memcpy_inlined(&data_[len_], src, count);
130
771k
        }
131
1.59M
        len_ += count;
132
1.59M
    }
133
134
    // Append the given string to this string.
135
8
    void append(const std::string& str) { append(str.data(), str.size()); }
136
137
    // Append the given character to this string.
138
43.8k
    void push_back(const char byte) {
139
43.8k
        EnsureRoomForAppend(1);
140
43.8k
        ASAN_UNPOISON_MEMORY_REGION(data_ + len_, 1);
141
43.8k
        data_[len_] = byte;
142
43.8k
        len_++;
143
43.8k
    }
144
145
    // Return the valid length of this string.
146
32.6k
    size_t length() const { return len_; }
147
148
    // Return the valid length of this string (identical to length())
149
1.97M
    size_t size() const { return len_; }
150
151
    // Return the allocated capacity of this string.
152
293k
    size_t capacity() const { return capacity_; }
153
154
    // Return a pointer to the data in this string. Note that this pointer
155
    // may be invalidated by any later non-const operation.
156
261k
    const uint8_t* data() const { return &data_[0]; }
157
158
    // Return a pointer to the data in this string. Note that this pointer
159
    // may be invalidated by any later non-const operation.
160
684k
    uint8_t* data() { return &data_[0]; }
161
162
    // Return the given element of this string. Note that this does not perform
163
    // any bounds checking.
164
0
    const uint8_t& at(size_t i) const { return data_[i]; }
165
166
    // Return the given element of this string. Note that this does not perform
167
    // any bounds checking.
168
65.1k
    const uint8_t& operator[](size_t i) const { return data_[i]; }
169
170
    // Return the given element of this string. Note that this does not perform
171
    // any bounds checking.
172
1.00M
    uint8_t& operator[](size_t i) { return data_[i]; }
173
174
    // Reset the contents of this string by copying 'len' bytes from 'src'.
175
59.9k
    void assign_copy(const uint8_t* src, size_t len) {
176
        // Reset length so that the first resize doesn't need to copy the current
177
        // contents of the array.
178
59.9k
        len_ = 0;
179
59.9k
        resize(len);
180
59.9k
        memcpy(data(), src, len);
181
59.9k
    }
182
183
    // Reset the contents of this string by copying from the given std::string.
184
0
    void assign_copy(const std::string& str) {
185
0
        assign_copy(reinterpret_cast<const uint8_t*>(str.c_str()), str.size());
186
0
    }
187
188
    // Reallocates the internal storage to fit only the current data.
189
    //
190
    // This may revert to using internal storage if the current length is shorter than
191
    // kInitialCapacity. In that case, after this call, capacity() will go down to
192
    // kInitialCapacity.
193
    //
194
    // Any pointers within this instance may be invalidated.
195
102
    void shrink_to_fit() {
196
102
        if (data_ == initial_data_ || capacity_ == len_) return;
197
38
        ShrinkToFitInternal();
198
38
    }
199
200
    // Return a copy of this string as a std::string.
201
0
    std::string ToString() const {
202
0
        return std::string(reinterpret_cast<const char*>(data()), len_);
203
0
    }
204
205
private:
206
    DISALLOW_COPY_AND_ASSIGN(faststring);
207
208
    // If necessary, expand the buffer to fit at least 'count' more bytes.
209
    // If the array has to be grown, it is grown by at least 50%.
210
1.63M
    void EnsureRoomForAppend(size_t count) {
211
1.63M
        if (len_ + count <= capacity_) [[likely]] {
212
1.58M
            return;
213
1.58M
        }
214
215
        // Call the non-inline slow path - this reduces the number of instructions
216
        // on the hot path.
217
49.8k
        GrowToAtLeast(len_ + count);
218
49.8k
    }
219
220
    // The slow path of EnsureRoomForAppend. Grows the buffer by either
221
    // 'count' bytes, or 50%, whichever is more.
222
    void GrowToAtLeast(size_t newcapacity);
223
224
    // Grow the array to the given capacity, which must be more than
225
    // the current capacity.
226
    void GrowArray(size_t newcapacity);
227
228
    void ShrinkToFitInternal();
229
230
    uint8_t* data_ = nullptr;
231
    uint8_t initial_data_[kInitialCapacity];
232
    size_t len_;
233
    // NOTE: we will make a initial buffer as part of the object, so the smallest
234
    // possible value of capacity_ is kInitialCapacity.
235
    size_t capacity_;
236
};
237
238
} // namespace doris