Coverage Report

Created: 2024-11-21 18:14

/root/doris/be/src/gutil/strings/util.h
Line
Count
Source (jump to first uncovered line)
1
//
2
// Copyright 1999-2006 and onwards Google, Inc.
3
//
4
// Useful string functions and so forth.  This is a grab-bag file.
5
//
6
// You might also want to look at memutil.h, which holds mem*()
7
// equivalents of a lot of the str*() functions in string.h,
8
// eg memstr, mempbrk, etc.
9
//
10
// These functions work fine for UTF-8 strings as long as you can
11
// consider them to be just byte strings.  For example, due to the
12
// design of UTF-8 you do not need to worry about accidental matches,
13
// as long as all your inputs are valid UTF-8 (use \uHHHH, not \xHH or \oOOO).
14
//
15
// Caveats:
16
// * all the lengths in these routines refer to byte counts,
17
//   not character counts.
18
// * case-insensitivity in these routines assumes that all the letters
19
//   in question are in the range A-Z or a-z.
20
//
21
// If you need Unicode specific processing (for example being aware of
22
// Unicode character boundaries, or knowledge of Unicode casing rules,
23
// or various forms of equivalence and normalization), take a look at
24
// files in i18n/utf8.
25
26
#pragma once
27
28
#include <stddef.h>
29
#include <stdio.h>
30
#include <string.h>
31
#ifndef _MSC_VER
32
#include <strings.h> // for strcasecmp, but msvc does not have this header
33
#endif
34
35
#include <functional>
36
37
using std::less;
38
#include <string>
39
40
using std::string;
41
#include <vector>
42
43
using std::vector;
44
45
#include "gutil/integral_types.h"
46
#include "gutil/port.h"
47
#include "gutil/strings/stringpiece.h"
48
49
// Newer functions.
50
51
namespace strings {
52
53
// Finds the next end-of-line sequence.
54
// An end-of-line sequence is one of:
55
//   \n    common on unix, including mac os x
56
//   \r    common on macos 9 and before
57
//   \r\n  common on windows
58
//
59
// Returns a StringPiece that contains the end-of-line sequence (a pointer into
60
// the input, 1 or 2 characters long).
61
//
62
// If the input does not contain an end-of-line sequence, returns an empty
63
// StringPiece located at the end of the input:
64
//    StringPiece(sp.data() + sp.length(), 0).
65
66
StringPiece FindEol(StringPiece sp);
67
68
} // namespace strings
69
70
// Older functions.
71
72
// Returns a strdup()-allocated copy of src when src is non-null and holds at
// least one character. Returns NULL when src is null or is the empty string,
// in which case nothing is allocated.
inline char* strdup_nonempty(const char* src) {
    const bool has_content = (src != NULL) && (src[0] != '\0');
    return has_content ? strdup(src) : NULL;
}
78
79
// Finds the first occurrence of c within at most the first sz bytes of the
// null-terminated string buf. Returns a pointer to that occurrence, or NULL
// if c does not occur in the searched range.
// The search never continues past the first null character, so this is only
// suitable for null-terminated strings. The terminator itself is never
// matched, even when c is '\0'.
// A zero or negative sz searches nothing and returns NULL.
// WARNING: Removes const-ness of string argument!
inline char* strnchr(const char* buf, char c, int sz) {
    // Reject non-positive budgets up front: computing buf + sz for a negative
    // sz would form a pointer before the buffer, and the loop below would
    // then only stop at the terminator instead of honouring the limit.
    if (sz <= 0) return NULL;

    for (const char* const end = buf + sz; buf != end && *buf != '\0'; ++buf) {
        if (*buf == c) return const_cast<char*>(buf);
    }
    return NULL;
}
93
94
// Finds the first occurrence of the null-terminated needle in at most the first
95
// haystack_len bytes of haystack. Returns NULL if needle is not found. Returns
96
// haystack if needle is empty.
97
// WARNING: Removes const-ness of string argument!
98
char* strnstr(const char* haystack, const char* needle, size_t haystack_len);
99
100
// Matches a prefix (which must be a char* literal!) against the beginning of
// str. Returns a pointer past the prefix, or NULL if the prefix wasn't matched.
// (Like the standard strncmp(), but for efficiency doesn't call strlen() on
// prefix, and returns a pointer rather than an int.)
//
// The ""'s catch people who don't pass in a literal for "prefix": adjacent
// string literals concatenate, anything else fails to compile.
//
// str is parenthesized in the expansion so that arbitrary expressions (for
// example a conditional expression) may be passed. Note that str is still
// evaluated twice when the prefix matches, so it must not have side effects.
#ifndef strprefix
#define strprefix(str, prefix)                             \
    (strncmp((str), prefix, sizeof("" prefix "") - 1) == 0 \
             ? (str) + (sizeof("" prefix "") - 1)          \
             : NULL)
#endif
110
111
// Same as strprefix() (immediately above), but matches a case-insensitive
// prefix using strncasecmp(). prefix must be a string literal.
//
// str is parenthesized in the expansion so that arbitrary expressions (for
// example a conditional expression) may be passed. Note that str is still
// evaluated twice when the prefix matches, so it must not have side effects.
#ifndef strcaseprefix
#define strcaseprefix(str, prefix)                             \
    (strncasecmp((str), prefix, sizeof("" prefix "") - 1) == 0 \
             ? (str) + (sizeof("" prefix "") - 1)              \
             : NULL)
#endif
117
118
// Matches a prefix (up to the first needle_size bytes of needle) in the first
119
// haystack_size bytes of haystack. Returns a pointer past the prefix, or NULL if
120
// the prefix wasn't matched. (Unlike strprefix(), prefix doesn't need to be a
121
// char* literal. Like the standard strncmp(), but also takes a haystack_size,
122
// and returns a pointer rather than an int.)
123
//
124
// Always returns either NULL or haystack + needle_size.
125
//
126
// Some windows header sometimes #defines strnprefix to something we
127
// don't want.
128
#ifdef strnprefix
129
#undef strnprefix
130
#endif
131
const char* strnprefix(const char* haystack, int haystack_size, const char* needle,
132
                       int needle_size);
133
134
// Matches a case-insensitive prefix (up to the first needle_size bytes of
135
// needle) in the first haystack_size bytes of haystack. Returns a pointer past
136
// the prefix, or NULL if the prefix wasn't matched.
137
//
138
// Always returns either NULL or haystack + needle_size.
139
const char* strncaseprefix(const char* haystack, int haystack_size, const char* needle,
140
                           int needle_size);
141
142
// Matches a prefix; returns a pointer past the prefix, or NULL if not found.
// (Like strprefix() and strcaseprefix() but not restricted to searching for
// char* literals). Templated so searching a const char* returns a const char*,
// and searching a non-const char* returns a non-const char*.
//
// Both str and prefix must be null-terminated. An empty prefix always
// matches and yields str itself.
template <class CharStar>
CharStar var_strprefix(CharStar str, const char* prefix) {
    // Keep the length as size_t: strlen() returns size_t and strncmp() takes
    // size_t, so going through int would only add a narrowing conversion.
    const size_t len = strlen(prefix);
    return strncmp(str, prefix, len) == 0 ? str + len : NULL;
}
151
152
// Same as var_strprefix() (immediately above), but matches a case-insensitive
// prefix using strncasecmp(). Returns a pointer past the prefix, or NULL if
// str does not start with prefix.
//
// Both str and prefix must be null-terminated. An empty prefix always
// matches and yields str itself.
template <class CharStar>
CharStar var_strcaseprefix(CharStar str, const char* prefix) {
    // Keep the length as size_t: strlen() returns size_t and strncasecmp()
    // takes size_t, so going through int would only add a narrowing conversion.
    const size_t len = strlen(prefix);
    return strncasecmp(str, prefix, len) == 0 ? str + len : NULL;
}
159
160
// Substitutes the literal "(null)" for a NULL pointer so the result can be
// handed to logging or printf-style code; any non-NULL input is returned as is.
inline const char* GetPrintableString(const char* const in) {
    if (in != NULL) {
        return in;
    }
    return "(null)";
}
164
165
// Returns whether str begins with prefix.
166
0
inline bool HasPrefixString(const StringPiece& str, const StringPiece& prefix) {
167
0
    return str.starts_with(prefix);
168
0
}
169
170
// Returns whether str ends with suffix.
171
0
inline bool HasSuffixString(const StringPiece& str, const StringPiece& suffix) {
172
0
    return str.ends_with(suffix);
173
0
}
174
175
// Returns true if the string passed in matches the pattern. The pattern
176
// string can contain wildcards like * and ?
177
// The backslash character (\) is an escape character for * and ?
178
// We limit the patterns to having a max of 16 * or ? characters.
179
// ? matches 0 or 1 character, while * matches 0 or more characters.
180
bool MatchPattern(const StringPiece& string, const StringPiece& pattern);
181
182
// Returns where suffix begins in str, or NULL if str doesn't end with suffix.
// Both arguments must be null-terminated. An empty suffix matches at the
// terminating null of str.
inline char* strsuffix(char* str, const char* suffix) {
    const size_t lenstr = strlen(str);
    const size_t lensuffix = strlen(suffix);

    // Check the lengths before doing any pointer arithmetic: a suffix longer
    // than the string can never match, and computing
    // str + lenstr - lensuffix in that case would form a pointer before the
    // start of the buffer.
    if (lensuffix > lenstr) {
        return NULL;
    }

    char* const tail = str + (lenstr - lensuffix);
    return strcmp(tail, suffix) == 0 ? tail : NULL;
}

// Const-preserving overload: searching a const string yields a const pointer.
// The const_cast only bridges to the implementation above, which does not
// write through str.
inline const char* strsuffix(const char* str, const char* suffix) {
    return strsuffix(const_cast<char*>(str), suffix);
}
197
198
// Case-insensitive counterpart of strsuffix() (immediately above).
char* strcasesuffix(char* str, const char* suffix);

// Const-preserving overload: forwards to the non-const version and hands the
// result back as a pointer to const.
inline const char* strcasesuffix(const char* str, const char* suffix) {
    char* const mutable_str = const_cast<char*>(str);
    const char* const match = strcasesuffix(mutable_str, suffix);
    return match;
}
204
205
const char* strnsuffix(const char* haystack, int haystack_size, const char* needle,
206
                       int needle_size);
207
const char* strncasesuffix(const char* haystack, int haystack_size, const char* needle,
208
                           int needle_size);
209
210
// Counts how many bytes equal to c appear in the null-terminated string buf,
// not counting the terminator. A NULL buf is treated as empty and yields 0.
inline ptrdiff_t strcount(const char* buf, char c) {
    ptrdiff_t matches = 0;
    if (buf != NULL) {
        while (*buf != '\0') {
            if (*buf == c) ++matches;
            ++buf;
        }
    }
    return matches;
}
220
// Counts how many bytes equal to c appear in the half-open range
// [buf_begin, buf_end). Embedded null bytes do not stop the scan.
// Yields 0 when buf_begin is NULL or when the range is empty or reversed.
inline ptrdiff_t strcount(const char* buf_begin, const char* buf_end, char c) {
    if (buf_begin == NULL || buf_end <= buf_begin) return 0;

    ptrdiff_t matches = 0;
    const char* cursor = buf_begin;
    while (cursor != buf_end) {
        if (*cursor == c) ++matches;
        ++cursor;
    }
    return matches;
}

// Counts how many bytes equal to c appear in the len bytes starting at buf.
// Forwards to the pointer-range overload above.
inline ptrdiff_t strcount(const char* buf, size_t len, char c) {
    const char* const buf_end = buf + len;
    return strcount(buf, buf_end, c);
}

// Counts how many bytes equal to c appear in a C++ string, over its full
// size() (so embedded null bytes are scanned too).
inline ptrdiff_t strcount(const std::string& buf, char c) {
    const char* const first = buf.c_str();
    return strcount(first, buf.size(), c);
}
241
242
// Returns a pointer to the nth occurrence of a character in a null-terminated
243
// string.
244
// WARNING: Removes const-ness of string argument!
245
char* strchrnth(const char* str, const char& c, int n);
246
247
// Returns a pointer to the nth occurrence of a character in a null-terminated
248
// string, or the last occurrence if occurs fewer than n times.
249
// WARNING: Removes const-ness of string argument!
250
char* AdjustedLastPos(const char* str, char separator, int n);
251
252
// STL-compatible function objects for char* string keys:
253
254
// Equality predicate over char* strings that tolerates NULL: two NULLs are
// equal, and NULL never equals a non-NULL string. Useful in hash tables:
//    hash_map<const char*, Value, hash<const char*>, streq> ht;
struct streq {
    bool operator()(const char* s1, const char* s2) const {
        // With at least one NULL operand the strings are equal only if both
        // are NULL, which is exactly pointer equality.
        if (s1 == 0 || s2 == 0) {
            return s1 == s2;
        }
        // Compare the first byte before falling back to a full strcmp().
        if (*s1 != *s2) {
            return false;
        }
        return strcmp(s1, s2) == 0;
    }
};
262
263
// Strict "less than" ordering over char* strings in which NULL sorts after
// every non-NULL string. Useful in maps:
//    map<const char*, Value, strlt> m;
struct strlt {
    bool operator()(const char* s1, const char* s2) const {
        // Identical pointers (including two NULLs) are never less than each other.
        if (s1 == s2) {
            return false;
        }
        // s2 is NULL and s1 is not (they differ), so s1 orders first.
        if (s2 == 0) {
            return true;
        }
        // s1 is NULL and s2 is not: NULL is greater than any string.
        if (s1 == 0) {
            return false;
        }
        return strcmp(s1, s2) < 0;
    }
};
271
272
// Returns whether str has only Ascii characters (as defined by ascii_isascii()
273
// in strings/ascii_ctype.h).
274
bool IsAscii(const char* str, int len);
275
0
inline bool IsAscii(const StringPiece& str) {
276
0
    return IsAscii(str.data(), str.size());
277
0
}
278
279
// Returns the smallest lexicographically larger string of equal or smaller
280
// length. Returns an empty string if there is no such successor (if the input
281
// is empty or consists entirely of 0xff bytes).
282
// Useful for calculating the smallest lexicographically larger string
283
// that will not be prefixed by the input string.
284
//
285
// Examples:
286
// "a" -> "b", "aaa" -> "aab", "aa\xff" -> "ab", "\xff" -> "", "" -> ""
287
string PrefixSuccessor(const StringPiece& prefix);
288
289
// Returns the immediate lexicographically-following string. This is useful to
290
// turn an inclusive range into something that can be used with Bigtable's
291
// SetLimitRow():
292
//
293
//     // Inclusive range [min_element, max_element].
294
//     string min_element = ...;
295
//     string max_element = ...;
296
//
297
//     // Equivalent range [range_start, range_end).
298
//     string range_start = min_element;
299
//     string range_end = ImmediateSuccessor(max_element);
300
//
301
// WARNING: Returns the input string with a '\0' appended; if you call c_str()
302
// on the result, it will compare equal to s.
303
//
304
// WARNING: Transforms "" -> "\0"; this doesn't account for Bigtable's special
305
// treatment of "" as infinity.
306
string ImmediateSuccessor(const StringPiece& s);
307
308
// Fills in *separator with a short string less than limit but greater than or
309
// equal to start. If limit is greater than start, *separator is the common
310
// prefix of start and limit, followed by the successor to the next character in
311
// start. Examples:
312
// FindShortestSeparator("foobar", "foxhunt", &sep) => sep == "fop"
313
// FindShortestSeparator("abracadabra", "bacradabra", &sep) => sep == "b"
314
// If limit is less than or equal to start, fills in *separator with start.
315
void FindShortestSeparator(const StringPiece& start, const StringPiece& limit, string* separator);
316
317
// Copies at most n-1 bytes from src to dest, and returns dest. If n >= 1, null
// terminates dest; if n == 0, returns dest unchanged. Unlike strncpy(), only
// puts one null character at the end of dest (the rest of dest is untouched).
//
// dest must have room for n bytes. src is read only up to its terminator or
// up to n-1 bytes, whichever comes first, so an unterminated source that is
// exactly n-1 bytes long is safe to pass.
inline char* safestrncpy(char* dest, const char* src, size_t n) {
    if (n < 1) return dest;

    // Avoid using non-ANSI memccpy(), which is also deprecated in MSVC.
    // Stop one short of n so that src[n - 1] is never read: that slot of dest
    // is reserved for the terminator anyway.
    const size_t max_copy = n - 1;
    size_t copied = 0;
    while (copied < max_copy && src[copied] != '\0') {
        dest[copied] = src[copied];
        ++copied;
    }

    dest[copied] = '\0';
    return dest;
}
331
332
namespace strings {
333
334
// BSD-style safe and consistent string copy functions.
335
// Copies |src| to |dst|, where |dst_size| is the total allocated size of |dst|.
336
// Copies at most |dst_size|-1 characters, and always NULL terminates |dst|, as
337
// long as |dst_size| is not 0.  Returns the length of |src| in characters.
338
// If the return value is >= dst_size, then the output was truncated.
339
// NOTE: All sizes are in number of characters, NOT in bytes.
340
size_t strlcpy(char* dst, const char* src, size_t dst_size);
341
342
} // namespace strings
343
344
// Replaces the first occurrence (if replace_all is false) or all occurrences
345
// (if replace_all is true) of oldsub in s with newsub. In the second version,
346
// *res must be distinct from all the other arguments.
347
string StringReplace(const StringPiece& s, const StringPiece& oldsub, const StringPiece& newsub,
348
                     bool replace_all);
349
void StringReplace(const StringPiece& s, const StringPiece& oldsub, const StringPiece& newsub,
350
                   bool replace_all, string* res);
351
352
// Replaces all occurrences of substring in s with replacement. Returns the
353
// number of instances replaced. s must be distinct from the other arguments.
354
//
355
// Less flexible, but faster, than RE::GlobalReplace().
356
int GlobalReplaceSubstring(const StringPiece& substring, const StringPiece& replacement, string* s);
357
358
// Removes v[i] for every element i in indices. Does *not* preserve the order of
359
// v. indices must be sorted in strict increasing order (no duplicates). Runs in
360
// O(indices.size()).
361
void RemoveStrings(vector<string>* v, const vector<int>& indices);
362
363
// Case-insensitive strstr(); use system strcasestr() instead.
364
// WARNING: Removes const-ness of string argument!
365
char* gstrcasestr(const char* haystack, const char* needle);
366
367
// Finds (case insensitively) the first occurrence of (null terminated) needle
368
// in at most the first len bytes of haystack. Returns a pointer into haystack,
369
// or NULL if needle wasn't found.
370
// WARNING: Removes const-ness of haystack!
371
const char* gstrncasestr(const char* haystack, const char* needle, size_t len);
372
char* gstrncasestr(char* haystack, const char* needle, size_t len);
373
374
// Finds (case insensitively), in str (which is a list of tokens separated by
375
// non_alpha), a token prefix and a token suffix. Returns a pointer into str of
376
// the position of prefix, or NULL if not found.
377
// WARNING: Removes const-ness of string argument!
378
char* gstrncasestr_split(const char* str, const char* prefix, char non_alpha, const char* suffix,
379
                         size_t n);
380
381
// Finds (case insensitively) needle in haystack, paying attention only to
382
// alphanumerics in either string. Returns a pointer into haystack, or NULL if
383
// not found.
384
// Example: strcasestr_alnum("This is a longer test string", "IS-A-LONGER")
385
// returns a pointer to "is a longer".
386
// WARNING: Removes const-ness of string argument!
387
char* strcasestr_alnum(const char* haystack, const char* needle);
388
389
// Returns the number of times substring appears in text.
390
// Note: Runs in O(text.length() * substring.length()). Do *not* use on long
391
// strings.
392
int CountSubstring(StringPiece text, StringPiece substring);
393
394
// Finds, in haystack (which is a list of tokens separated by delim), a token
395
// equal to needle. Returns a pointer into haystack, or NULL if not found (or
396
// either needle or haystack is empty).
397
const char* strstr_delimited(const char* haystack, const char* needle, char delim);
398
399
// Gets the next token from string *stringp, where tokens are strings separated
400
// by characters from delim.
401
char* gstrsep(char** stringp, const char* delim);
402
403
// Appends StringPiece(data, len) to *s.
404
void FastStringAppend(string* s, const char* data, int len);
405
406
// Returns a duplicate of the_string, with memory allocated by new[].
407
char* strdup_with_new(const char* the_string);
408
409
// Returns a duplicate of up to the first max_length bytes of the_string, with
410
// memory allocated by new[].
411
char* strndup_with_new(const char* the_string, int max_length);
412
413
// Finds, in the_string, the first "word" (consecutive !ascii_isspace()
// characters). Returns pointer to the beginning of the word, and sets *end_ptr
// to the character after the word (which may be space or '\0'); returns NULL
// (and *end_ptr is undefined) if no next word found.
// end_ptr must not be NULL.
const char* ScanForFirstWord(const char* the_string, const char** end_ptr);

// Non-const overload: forwards to the const version above and strips the
// const from the result, so scanning a mutable buffer yields mutable pointers.
inline char* ScanForFirstWord(char* the_string, char** end_ptr) {
    // implicit_cast<> would be more appropriate for casting to const,
    // but we save the inclusion of "base/casts.h" here by using const_cast<>.
    const char* const const_string = const_cast<const char*>(the_string);
    const char** const const_end_ptr = const_cast<const char**>(end_ptr);
    const char* const word = ScanForFirstWord(const_string, const_end_ptr);
    return const_cast<char*>(word);
}
425
426
// For the following functions, an "identifier" is a letter or underscore,
427
// followed by letters, underscores, or digits.
428
429
// Returns a pointer past the end of the "identifier" (a letter or underscore,
// followed by letters, underscores, or digits) beginning at str, or NULL if
// str doesn't start with an identifier.
const char* AdvanceIdentifier(const char* str);

// Non-const overload: forwards to the const version above and strips the
// const from the result, so advancing over a mutable buffer yields a mutable
// pointer.
inline char* AdvanceIdentifier(char* str) {
    // implicit_cast<> would be more appropriate for casting to const,
    // but we save the inclusion of "base/casts.h" here by using const_cast<>.
    const char* const const_str = const_cast<const char*>(str);
    const char* const past_identifier = AdvanceIdentifier(const_str);
    return const_cast<char*>(past_identifier);
}
437
438
// Returns whether str is an "identifier" (see above).
439
bool IsIdentifier(const char* str);
440
441
// Finds the first tag and value in a string of tag/value pairs.
442
//
443
// The first pair begins after the first occurrence of attribute_separator (or
444
// string_terminal, if not '\0'); tag_value_separator separates the tag and
445
// value; and the value ends before the following occurrence of
446
// attribute_separator (or string_terminal, if not '\0').
447
//
448
// Returns true (and populates tag, tag_len, value, and value_len) if a
449
// tag/value pair is found; returns false otherwise.
450
bool FindTagValuePair(const char* in_str, char tag_value_separator, char attribute_separator,
451
                      char string_terminal, char** tag, int* tag_len, char** value, int* value_len);
452
453
// Inserts separator after every interval characters in *s (but never appends to
454
// the end of the original *s).
455
void UniformInsertString(string* s, int interval, const char* separator);
456
457
// Inserts separator into s at each specified index. indices must be sorted in
458
// ascending order.
459
void InsertString(string* s, const vector<uint32>& indices, char const* separator);
460
461
// Finds the nth occurrence of c in s; returns the index in s of that
462
// occurrence, or string::npos if fewer than n occurrences.
463
int FindNth(StringPiece s, char c, int n);
464
465
// Finds the nth-to-last occurrence of c in s; returns the index in s of that
466
// occurrence, or string::npos if fewer than n occurrences.
467
int ReverseFindNth(StringPiece s, char c, int n);
468
469
// Returns whether s contains only whitespace characters (including the case
470
// where s is empty).
471
bool OnlyWhitespace(const StringPiece& s);
472
473
// Formats a string in the same fashion as snprintf(), but returns either the
474
// number of characters written, or zero if not enough space was available.
475
// (snprintf() returns the number of characters that would have been written if
476
// enough space had been available.)
477
//
478
// A drop-in replacement for the safe_snprintf() macro.
479
int SafeSnprintf(char* str, size_t size, const char* format, ...) PRINTF_ATTRIBUTE(3, 4);
480
481
// Reads a line (terminated by delim) from file into *str. Reads delim from
482
// file, but doesn't copy it into *str. Returns true if read a delim-terminated
483
// line, or false on end-of-file or error.
484
bool GetlineFromStdioFile(FILE* file, string* str, char delim);