Coverage Report

Created: 2025-04-22 23:04

/root/doris/be/src/gutil/strings/util.h
Line
Count
Source (jump to first uncovered line)
1
//
2
// Copyright 1999-2006 and onwards Google, Inc.
3
//
4
// Useful string functions and so forth.  This is a grab-bag file.
5
//
6
// You might also want to look at memutil.h, which holds mem*()
7
// equivalents of a lot of the str*() functions in string.h,
8
// eg memstr, mempbrk, etc.
9
//
10
// These functions work fine for UTF-8 strings as long as you can
11
// consider them to be just byte strings.  For example, due to the
12
// design of UTF-8 you do not need to worry about accidental matches,
13
// as long as all your inputs are valid UTF-8 (use \uHHHH, not \xHH or \oOOO).
14
//
15
// Caveats:
16
// * all the lengths in these routines refer to byte counts,
17
//   not character counts.
18
// * case-insensitivity in these routines assumes that all the letters
19
//   in question are in the range A-Z or a-z.
20
//
21
// If you need Unicode specific processing (for example being aware of
22
// Unicode character boundaries, or knowledge of Unicode casing rules,
23
// or various forms of equivalence and normalization), take a look at
24
// files in i18n/utf8.
25
26
#pragma once
27
28
#include <stddef.h>
29
#include <stdio.h>
30
#include <string.h>
31
#ifndef _MSC_VER
32
#include <strings.h> // for strcasecmp, but msvc does not have this header
33
#endif
34
35
#include <functional>
36
37
using std::less;
38
#include <string>
39
40
using std::string;
41
#include <vector>
42
43
using std::vector;
44
45
#include "gutil/port.h"
46
#include "gutil/strings/stringpiece.h"
47
48
// Newer functions.
49
50
namespace strings {
51
52
// Finds the next end-of-line sequence.
53
// An end-of-line sequence is one of:
54
//   \n    common on unix, including mac os x
55
//   \r    common on macos 9 and before
56
//   \r\n  common on windows
57
//
58
// Returns a StringPiece that contains the end-of-line sequence (a pointer into
59
// the input, 1 or 2 characters long).
60
//
61
// If the input does not contain an end-of-line sequence, returns an empty
62
// StringPiece located at the end of the input:
63
//    StringPiece(sp.data() + sp.length(), 0).
64
65
StringPiece FindEol(StringPiece sp);
66
67
} // namespace strings
68
69
// Older functions.
70
71
// Duplicates a non-null, non-empty char* string. Returns a pointer to the new
72
// string, or NULL if the input is null or empty.
73
0
// Returns a strdup()'d copy of src when src is non-null and has at least one
// character before its terminator; otherwise returns NULL.
inline char* strdup_nonempty(const char* src) {
    const bool has_content = (src != NULL) && (src[0] != '\0');
    return has_content ? strdup(src) : NULL;
}
77
78
// Finds the first occurrence of a character in at most a given number of bytes
79
// of a char* string. Returns a pointer to the first occurrence, or NULL if no
80
// occurrence found in the first sz bytes.
81
// Never searches past the first null character in the string; therefore, only
82
// suitable for null-terminated strings.
83
// WARNING: Removes const-ness of string argument!
84
0
// Scans buf for c, looking at no more than sz bytes and never reading past the
// first null character. Returns a (non-const) pointer to the match, or NULL
// when c is not found or when sz is zero or negative.
inline char* strnchr(const char* buf, char c, int sz) {
    // A non-positive byte budget means there is nothing to search. Without this
    // guard a negative sz would place `end` before `buf`, so the loop below
    // would only be bounded by the terminating null instead of by sz.
    if (sz <= 0) return NULL;

    const char* const end = buf + sz;
    for (const char* p = buf; p != end && *p != '\0'; ++p) {
        if (*p == c) return const_cast<char*>(p);
    }
    return NULL;
}
92
93
// Finds the first occurrence of the null-terminated needle in at most the first
94
// haystack_len bytes of haystack. Returns NULL if needle is not found. Returns
95
// haystack if needle is empty.
96
// WARNING: Removes const-ness of string argument!
97
char* strnstr(const char* haystack, const char* needle, size_t haystack_len);
98
99
// Matches a prefix (which must be a char* literal!) against the beginning of
100
// str. Returns a pointer past the prefix, or NULL if the prefix wasn't matched.
101
// (Like the standard strncmp(), but for efficiency doesn't call strlen() on
102
// prefix, and returns a pointer rather than an int.)
103
//
104
// The ""'s catch people who don't pass in a literal for "prefix"
105
#ifndef strprefix
// strprefix(str, "literal"): yields a pointer just past the prefix when str
// starts with it, otherwise NULL. The `"" prefix ""` concatenation only
// compiles when prefix is a string literal. `str` is parenthesized so that
// expression arguments (e.g. a ?: expression) bind correctly; note that it is
// still evaluated twice, so avoid arguments with side effects.
#define strprefix(str, prefix)                                \
    (strncmp((str), (prefix), sizeof("" prefix "") - 1) == 0  \
             ? (str) + (sizeof("" prefix "") - 1)             \
             : NULL)
#endif
109
110
// Same as strprefix() (immediately above), but matches a case-insensitive
111
// prefix.
112
#ifndef strcaseprefix
// strcaseprefix(str, "literal"): case-insensitive variant of strprefix().
// Yields a pointer just past the prefix when str starts with it (ignoring
// ASCII case as strncasecmp() does), otherwise NULL. The `"" prefix ""`
// concatenation only compiles when prefix is a string literal. `str` is
// parenthesized so expression arguments bind correctly; it is still evaluated
// twice, so avoid arguments with side effects.
#define strcaseprefix(str, prefix)                                \
    (strncasecmp((str), (prefix), sizeof("" prefix "") - 1) == 0  \
             ? (str) + (sizeof("" prefix "") - 1)                 \
             : NULL)
#endif
116
117
// Matches a prefix (up to the first needle_size bytes of needle) in the first
118
// haystack_size bytes of haystack. Returns a pointer past the prefix, or NULL if
119
// the prefix wasn't matched. (Unlike strprefix(), prefix doesn't need to be a
120
// char* literal. Like the standard strncmp(), but also takes a haystack_size,
121
// and returns a pointer rather than an int.)
122
//
123
// Always returns either NULL or haystack + needle_size.
124
//
125
// Some windows header sometimes #defines strnprefix to something we
126
// don't want.
127
#ifdef strnprefix
128
#undef strnprefix
129
#endif
130
const char* strnprefix(const char* haystack, int haystack_size, const char* needle,
131
                       int needle_size);
132
133
// Matches a case-insensitive prefix (up to the first needle_size bytes of
134
// needle) in the first haystack_size bytes of haystack. Returns a pointer past
135
// the prefix, or NULL if the prefix wasn't matched.
136
//
137
// Always returns either NULL or haystack + needle_size.
138
const char* strncaseprefix(const char* haystack, int haystack_size, const char* needle,
139
                           int needle_size);
140
141
// Matches a prefix; returns a pointer past the prefix, or NULL if not found.
142
// (Like strprefix() and strcaseprefix() but not restricted to searching for
143
// char* literals). Templated so searching a const char* returns a const char*,
144
// and searching a non-const char* returns a non-const char*.
145
// CharStar is expected to be `char*` or `const char*`; the result has the same
// constness as the argument. Returns str advanced past prefix when str begins
// with prefix, otherwise NULL.
template <class CharStar>
CharStar var_strprefix(CharStar str, const char* prefix) {
    // Keep the length as size_t: strlen() returns size_t and strncmp() takes
    // size_t, so there is no reason to round-trip through a narrower int.
    const size_t len = strlen(prefix);
    return strncmp(str, prefix, len) == 0 ? str + len : NULL;
}
150
151
// Same as var_strprefix() (immediately above), but matches a case-insensitive
152
// prefix.
153
// CharStar is expected to be `char*` or `const char*`; the result has the same
// constness as the argument. Returns str advanced past prefix when str begins
// with prefix under strncasecmp() comparison, otherwise NULL.
template <class CharStar>
CharStar var_strcaseprefix(CharStar str, const char* prefix) {
    // Keep the length as size_t: strlen() returns size_t and strncasecmp()
    // takes size_t, so there is no reason to round-trip through a narrower int.
    const size_t len = strlen(prefix);
    return strncasecmp(str, prefix, len) == 0 ? str + len : NULL;
}
158
159
// Returns input, or "(null)" if NULL. (Useful for logging.)
160
0
// Passes a non-null string through untouched; substitutes the literal "(null)"
// for a null pointer.
inline const char* GetPrintableString(const char* const in) {
    if (in != NULL) {
        return in;
    }
    return "(null)";
}
163
164
// Returns whether str begins with prefix.
165
0
inline bool HasPrefixString(const StringPiece& str, const StringPiece& prefix) {
166
0
    return str.starts_with(prefix);
167
0
}
168
169
// Returns whether str ends with suffix.
170
0
inline bool HasSuffixString(const StringPiece& str, const StringPiece& suffix) {
171
0
    return str.ends_with(suffix);
172
0
}
173
174
// Returns true if the string passed in matches the pattern. The pattern
175
// string can contain wildcards like * and ?
176
// The backslash character (\) is an escape character for * and ?
177
// We limit the patterns to having a max of 16 * or ? characters.
178
// ? matches 0 or 1 character, while * matches 0 or more characters.
179
bool MatchPattern(const StringPiece& string, const StringPiece& pattern);
180
181
// Returns where suffix begins in str, or NULL if str doesn't end with suffix.
182
0
// Returns a pointer to the position in str where suffix begins, or NULL when
// str does not end with suffix. Both arguments must be null-terminated. An
// empty suffix matches at the terminating null of str.
inline char* strsuffix(char* str, const char* suffix) {
    const size_t lenstr = strlen(str);
    const size_t lensuffix = strlen(suffix);

    // Check the lengths before doing any pointer arithmetic: when the suffix is
    // longer than str, `str + lenstr - lensuffix` would point before the start
    // of the buffer, which is not a valid pointer to form.
    if (lenstr < lensuffix) return NULL;

    char* const tail = str + (lenstr - lensuffix);
    return strcmp(tail, suffix) == 0 ? tail : NULL;
}
// Const-preserving overload: same search, but the result keeps the constness of
// the input string.
inline const char* strsuffix(const char* str, const char* suffix) {
    return strsuffix(const_cast<char*>(str), suffix);
}
196
197
// Same as strsuffix() (immediately above), but matches a case-insensitive
198
// suffix.
199
char* strcasesuffix(char* str, const char* suffix);
// Const-preserving overload: delegates to the char* version declared above and
// hands the result back as a pointer-to-const.
inline const char* strcasesuffix(const char* str, const char* suffix) {
    char* const writable_view = const_cast<char*>(str);
    return strcasesuffix(writable_view, suffix);
}
203
204
const char* strnsuffix(const char* haystack, int haystack_size, const char* needle,
205
                       int needle_size);
206
const char* strncasesuffix(const char* haystack, int haystack_size, const char* needle,
207
                           int needle_size);
208
209
// Returns the number of times a character occurs in a string for a null
210
// terminated string.
211
0
// Counts how many bytes equal to c appear in buf before its terminating null.
// A NULL buf yields 0. The terminator itself is never counted.
inline ptrdiff_t strcount(const char* buf, char c) {
    ptrdiff_t hits = 0;
    if (buf != NULL) {
        const char* cursor = buf;
        while (*cursor != '\0') {
            hits += (*cursor == c) ? 1 : 0;
            ++cursor;
        }
    }
    return hits;
}
219
// Returns the number of times a character occurs in a string for a string
220
// defined by a pointer to the first character and a pointer just past the last
221
// character.
222
0
// Counts the bytes equal to c in the half-open range [buf_begin, buf_end).
// Returns 0 when buf_begin is NULL or the range is empty or reversed.
inline ptrdiff_t strcount(const char* buf_begin, const char* buf_end, char c) {
    if (buf_begin == NULL) return 0;
    if (buf_end <= buf_begin) return 0;
    ptrdiff_t num = 0;
    for (const char* bp = buf_begin; bp != buf_end; ++bp) {
        if (*bp == c) ++num;
    }
    return num;
}
// Returns the number of times a character occurs in a string for a string
// defined by a pointer to the first char and a length. Returns 0 when buf is
// NULL.
inline ptrdiff_t strcount(const char* buf, size_t len, char c) {
    // Reject NULL here, before forming `buf + len`: arithmetic on a null
    // pointer with a non-zero offset is not valid.
    if (buf == NULL) return 0;
    return strcount(buf, buf + len, c);
}
// Returns the number of times a character occurs in a string for a C++ string.
// The whole size() of the string is scanned, so embedded null bytes do not
// stop the count.
inline ptrdiff_t strcount(const std::string& buf, char c) {
    return strcount(buf.c_str(), buf.size(), c);
}
240
241
// Returns a pointer to the nth occurrence of a character in a null-terminated
242
// string.
243
// WARNING: Removes const-ness of string argument!
244
char* strchrnth(const char* str, const char& c, int n);
245
246
// Returns a pointer to the nth occurrence of a character in a null-terminated
247
// string, or the last occurrence if it occurs fewer than n times.
248
// WARNING: Removes const-ness of string argument!
249
char* AdjustedLastPos(const char* str, char separator, int n);
250
251
// STL-compatible function objects for char* string keys:
252
253
// Compares two char* strings for equality. (Works with NULL, which compares
254
// equal only to another NULL). Useful in hash tables:
255
//    hash_map<const char*, Value, hash<const char*>, streq> ht;
256
struct streq {
    // Equality on C strings that tolerates NULL: two NULLs are equal, a NULL
    // never equals a non-NULL string, and otherwise the contents decide.
    bool operator()(const char* s1, const char* s2) const {
        if (s1 == NULL || s2 == NULL) {
            return s1 == s2;
        }
        // Compare the first byte up front so most mismatches skip strcmp().
        if (*s1 != *s2) {
            return false;
        }
        return strcmp(s1, s2) == 0;
    }
};
261
262
// Compares two char* strings. (Works with NULL, which compares greater than any
263
// non-NULL). Useful in maps:
264
//    map<const char*, Value, strlt> m;
265
struct strlt {
    // Strict ordering on C strings in which NULL sorts after every non-NULL
    // string; non-NULL strings are ordered by strcmp().
    bool operator()(const char* s1, const char* s2) const {
        if (s1 == s2) {
            return false;  // Same pointer (including both NULL): not less.
        }
        if (s2 == NULL) {
            return true;  // s1 is non-NULL here, and NULL is the maximum.
        }
        if (s1 == NULL) {
            return false;  // NULL is never less than a real string.
        }
        return strcmp(s1, s2) < 0;
    }
};
270
271
// Returns whether str has only Ascii characters (as defined by ascii_isascii()
272
// in strings/ascii_ctype.h).
273
bool IsAscii(const char* str, int len);
274
0
inline bool IsAscii(const StringPiece& str) {
275
0
    return IsAscii(str.data(), str.size());
276
0
}
277
278
// Returns the smallest lexicographically larger string of equal or smaller
279
// length. Returns an empty string if there is no such successor (if the input
280
// is empty or consists entirely of 0xff bytes).
281
// Useful for calculating the smallest lexicographically larger string
282
// that will not be prefixed by the input string.
283
//
284
// Examples:
285
// "a" -> "b", "aaa" -> "aab", "aa\xff" -> "ab", "\xff" -> "", "" -> ""
286
string PrefixSuccessor(const StringPiece& prefix);
287
288
// Returns the immediate lexicographically-following string. This is useful to
289
// turn an inclusive range into something that can be used with Bigtable's
290
// SetLimitRow():
291
//
292
//     // Inclusive range [min_element, max_element].
293
//     string min_element = ...;
294
//     string max_element = ...;
295
//
296
//     // Equivalent range [range_start, range_end).
297
//     string range_start = min_element;
298
//     string range_end = ImmediateSuccessor(max_element);
299
//
300
// WARNING: Returns the input string with a '\0' appended; if you call c_str()
301
// on the result, it will compare equal to s.
302
//
303
// WARNING: Transforms "" -> "\0"; this doesn't account for Bigtable's special
304
// treatment of "" as infinity.
305
string ImmediateSuccessor(const StringPiece& s);
306
307
// Fills in *separator with a short string less than limit but greater than or
308
// equal to start. If limit is greater than start, *separator is the common
309
// prefix of start and limit, followed by the successor to the next character in
310
// start. Examples:
311
// FindShortestSeparator("foobar", "foxhunt", &sep) => sep == "fop"
312
// FindShortestSeparator("abracadabra", "bacradabra", &sep) => sep == "b"
313
// If limit is less than or equal to start, fills in *separator with start.
314
void FindShortestSeparator(const StringPiece& start, const StringPiece& limit, string* separator);
315
316
// Copies at most n-1 bytes from src to dest, and returns dest. If n >=1, null
317
// terminates dest; otherwise, returns dest unchanged. Unlike strncpy(), only
318
// puts one null character at the end of dest.
319
0
// Copies src into dest, writing at most n bytes including the terminator.
// When n is 0 nothing is written. Otherwise dest is always null-terminated, and
// bytes of dest after the terminator are left untouched (unlike strncpy()).
// Returns dest.
inline char* safestrncpy(char* dest, const char* src, size_t n) {
    if (n < 1) return dest;

    // Avoid using non-ANSI memccpy(), which is also deprecated in MSVC.
    // Only the first n-1 bytes of src are ever read: the last slot of dest is
    // reserved for the terminator, so reading src[n-1] would be wasted work and
    // could run past the end of a source buffer that holds exactly n-1 bytes.
    const size_t max_payload = n - 1;
    for (size_t i = 0; i < max_payload; ++i) {
        if ((dest[i] = src[i]) == '\0') return dest;
    }

    dest[max_payload] = '\0';
    return dest;
}
330
331
namespace strings {
332
333
// BSD-style safe and consistent string copy functions.
334
// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|.
335
// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as
336
// long as |dst_size| is not 0.  Returns the length of |src| in characters.
337
// If the return value is >= dst_size, then the output was truncated.
338
// NOTE: All sizes are in number of characters, NOT in bytes.
339
size_t strlcpy(char* dst, const char* src, size_t dst_size);
340
341
} // namespace strings
342
343
// Replaces the first occurrence (if replace_all is false) or all occurrences
344
// (if replace_all is true) of oldsub in s with newsub. In the second version,
345
// *res must be distinct from all the other arguments.
346
string StringReplace(const StringPiece& s, const StringPiece& oldsub, const StringPiece& newsub,
347
                     bool replace_all);
348
void StringReplace(const StringPiece& s, const StringPiece& oldsub, const StringPiece& newsub,
349
                   bool replace_all, string* res);
350
351
// Replaces all occurrences of substring in s with replacement. Returns the
352
// number of instances replaced. s must be distinct from the other arguments.
353
//
354
// Less flexible, but faster, than RE::GlobalReplace().
355
int GlobalReplaceSubstring(const StringPiece& substring, const StringPiece& replacement, string* s);
356
357
// Removes v[i] for every element i in indices. Does *not* preserve the order of
358
// v. indices must be sorted in strict increasing order (no duplicates). Runs in
359
// O(indices.size()).
360
void RemoveStrings(vector<string>* v, const vector<int>& indices);
361
362
// Case-insensitive strstr(); use system strcasestr() instead.
363
// WARNING: Removes const-ness of string argument!
364
char* gstrcasestr(const char* haystack, const char* needle);
365
366
// Finds (case insensitively) the first occurrence of (null terminated) needle
367
// in at most the first len bytes of haystack. Returns a pointer into haystack,
368
// or NULL if needle wasn't found.
369
// WARNING: Removes const-ness of haystack!
370
const char* gstrncasestr(const char* haystack, const char* needle, size_t len);
371
char* gstrncasestr(char* haystack, const char* needle, size_t len);
372
373
// Finds (case insensitively), in str (which is a list of tokens separated by
374
// non_alpha), a token prefix and a token suffix. Returns a pointer into str of
375
// the position of prefix, or NULL if not found.
376
// WARNING: Removes const-ness of string argument!
377
char* gstrncasestr_split(const char* str, const char* prefix, char non_alpha, const char* suffix,
378
                         size_t n);
379
380
// Finds (case insensitively) needle in haystack, paying attention only to
381
// alphanumerics in either string. Returns a pointer into haystack, or NULL if
382
// not found.
383
// Example: strcasestr_alnum("This is a longer test string", "IS-A-LONGER")
384
// returns a pointer to "is a longer".
385
// WARNING: Removes const-ness of string argument!
386
char* strcasestr_alnum(const char* haystack, const char* needle);
387
388
// Returns the number of times substring appears in text.
389
// Note: Runs in O(text.length() * substring.length()). Do *not* use on long
390
// strings.
391
int CountSubstring(StringPiece text, StringPiece substring);
392
393
// Finds, in haystack (which is a list of tokens separated by delim), a token
394
// equal to needle. Returns a pointer into haystack, or NULL if not found (or
395
// either needle or haystack is empty).
396
const char* strstr_delimited(const char* haystack, const char* needle, char delim);
397
398
// Gets the next token from string *stringp, where tokens are strings separated
399
// by characters from delim.
400
char* gstrsep(char** stringp, const char* delim);
401
402
// Appends StringPiece(data, len) to *s.
403
void FastStringAppend(string* s, const char* data, int len);
404
405
// Returns a duplicate of the_string, with memory allocated by new[].
406
char* strdup_with_new(const char* the_string);
407
408
// Returns a duplicate of up to the first max_length bytes of the_string, with
409
// memory allocated by new[].
410
char* strndup_with_new(const char* the_string, int max_length);
411
412
// Finds, in the_string, the first "word" (consecutive !ascii_isspace()
413
// characters). Returns pointer to the beginning of the word, and sets *end_ptr
414
// to the character after the word (which may be space or '\0'); returns NULL
415
// (and *end_ptr is undefined) if no next word found.
416
// end_ptr must not be NULL.
417
const char* ScanForFirstWord(const char* the_string, const char** end_ptr);
// Non-const overload: delegates to the const version declared above and strips
// constness from the result so callers holding a mutable buffer get a mutable
// pointer back.
inline char* ScanForFirstWord(char* the_string, char** end_ptr) {
    // implicit_cast<> would be the more fitting tool for adding const, but
    // const_cast<> is used so that "base/casts.h" need not be included here.
    const char* const input = the_string;
    const char** const end_as_const = const_cast<const char**>(end_ptr);
    return const_cast<char*>(ScanForFirstWord(input, end_as_const));
}
424
425
// For the following functions, an "identifier" is a letter or underscore,
426
// followed by letters, underscores, or digits.
427
428
// Returns a pointer past the end of the "identifier" (see above) beginning at
429
// str, or NULL if str doesn't start with an identifier.
430
const char* AdvanceIdentifier(const char* str);
// Non-const overload: delegates to the const version declared above and strips
// constness from the result.
inline char* AdvanceIdentifier(char* str) {
    // implicit_cast<> would be the more fitting tool for adding const, but
    // a plain const-qualified local avoids including "base/casts.h" here.
    const char* const as_const = str;
    return const_cast<char*>(AdvanceIdentifier(as_const));
}
436
437
// Returns whether str is an "identifier" (see above).
438
bool IsIdentifier(const char* str);
439
440
// Finds the first tag and value in a string of tag/value pairs.
441
//
442
// The first pair begins after the first occurrence of attribute_separator (or
443
// string_terminal, if not '\0'); tag_value_separator separates the tag and
444
// value; and the value ends before the following occurrence of
445
// attribute_separator (or string_terminal, if not '\0').
446
//
447
// Returns true (and populates tag, tag_len, value, and value_len) if a
448
// tag/value pair is found; returns false otherwise.
449
bool FindTagValuePair(const char* in_str, char tag_value_separator, char attribute_separator,
450
                      char string_terminal, char** tag, int* tag_len, char** value, int* value_len);
451
452
// Inserts separator after every interval characters in *s (but never appends to
453
// the end of the original *s).
454
void UniformInsertString(string* s, int interval, const char* separator);
455
456
// Inserts separator into s at each specified index. indices must be sorted in
457
// ascending order.
458
void InsertString(string* s, const vector<uint32>& indices, char const* separator);
459
460
// Finds the nth occurrence of c in s; returns the index in s of that
461
// occurrence, or string::npos if fewer than n occurrences.
462
int FindNth(StringPiece s, char c, int n);
463
464
// Finds the nth-to-last occurrence of c in s; returns the index in s of that
465
// occurrence, or string::npos if fewer than n occurrences.
466
int ReverseFindNth(StringPiece s, char c, int n);
467
468
// Returns whether s contains only whitespace characters (including the case
469
// where s is empty).
470
bool OnlyWhitespace(const StringPiece& s);
471
472
// Formats a string in the same fashion as snprintf(), but returns either the
473
// number of characters written, or zero if not enough space was available.
474
// (snprintf() returns the number of characters that would have been written if
475
// enough space had been available.)
476
//
477
// A drop-in replacement for the safe_snprintf() macro.
478
int SafeSnprintf(char* str, size_t size, const char* format, ...) PRINTF_ATTRIBUTE(3, 4);
479
480
// Reads a line (terminated by delim) from file into *str. Reads delim from
481
// file, but doesn't copy it into *str. Returns true if read a delim-terminated
482
// line, or false on end-of-file or error.
483
bool GetlineFromStdioFile(FILE* file, string* str, char delim);