/root/doris/be/src/gutil/strings/split.h
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2008 and onwards Google, Inc. |
2 | | // |
3 | | // #status: RECOMMENDED |
4 | | // #category: operations on strings |
5 | | // #summary: Functions for splitting strings into substrings. |
6 | | // |
7 | | // This file contains functions for splitting strings. The new and recommended |
8 | | // API for string splitting is the strings::Split() function. The old API is a |
9 | | // large collection of standalone functions declared at the bottom of this file |
10 | | // in the global scope. |
11 | | // |
12 | | // TODO(user): Rough migration plan from old API to new API |
13 | | // (1) Add comments to old Split*() functions showing how to do the same things |
14 | | // with the new API. |
15 | | // (2) Reimplement some of the old Split*() functions in terms of the new |
16 | | // Split() API. This will allow deletion of code in split.cc. |
17 | | // (3) (Optional) Replace old Split*() API calls at call sites with calls to new |
18 | | // Split() API. |
19 | | // |
20 | | #pragma once |
21 | | |
22 | | // IWYU pragma: no_include <pstl/glue_algorithm_defs.h> |
23 | | |
24 | | #include <stddef.h> |
25 | | #include <algorithm> |
26 | | |
27 | | using std::copy; |
28 | | using std::max; |
29 | | using std::min; |
30 | | using std::reverse; |
31 | | using std::sort; |
32 | | using std::swap; |
33 | | #include <iterator> |
34 | | |
35 | | using std::back_insert_iterator; |
36 | | using std::iterator_traits; |
37 | | #include <map> |
38 | | |
39 | | using std::map; |
40 | | using std::multimap; |
41 | | #include <set> |
42 | | |
43 | | using std::multiset; |
44 | | using std::set; |
45 | | #include <string> |
46 | | |
47 | | using std::string; |
48 | | #include <utility> |
49 | | |
50 | | using std::make_pair; |
51 | | using std::pair; |
52 | | #include <vector> |
53 | | |
54 | | using std::vector; |
55 | | #include "common/logging.h" |
56 | | #include <unordered_map> |
57 | | #include <unordered_set> |
58 | | |
59 | | #include "gutil/integral_types.h" |
60 | | #include "gutil/strings/charset.h" |
61 | | #include "gutil/strings/stringpiece.h" |
62 | | #include "gutil/strings/strip.h" |
63 | | #include "gutil/strings/split_internal.h" // IWYU pragma: keep |
64 | | |
65 | | namespace strings { |
66 | | |
67 | | // The new Split API |
68 | | // aka Split2 |
69 | | // aka strings::Split() |
70 | | // |
71 | | // This string splitting API consists of a Split() function in the ::strings |
72 | | // namespace and a handful of delimiter objects in the ::strings::delimiter |
73 | | // namespace (more on delimiter objects below). The Split() function always |
74 | | // takes two arguments: the text to be split and the delimiter on which to split |
75 | | // the text. An optional third argument may also be given, which is a Predicate |
76 | | // functor that will be used to filter the results, e.g., to skip empty strings |
77 | | // (more on predicates below). The Split() function adapts the returned |
78 | | // collection to the type specified by the caller. |
79 | | // |
80 | | // Example 1: |
81 | | // // Splits the given string on commas. Returns the results in a |
82 | | // // vector of strings. |
83 | | // vector<string> v = strings::Split("a,b,c", ","); |
84 | | // assert(v.size() == 3); |
85 | | // |
86 | | // Example 2: |
87 | | // // By default, empty strings are *included* in the output. See the |
88 | | // // strings::SkipEmpty predicate below to omit them. |
89 | | // vector<string> v = strings::Split("a,b,,c", ","); |
90 | | // assert(v.size() == 4); // "a", "b", "", "c" |
91 | | // v = strings::Split("", ","); |
92 | | // assert(v.size() == 1); // v contains a single "" |
93 | | // |
94 | | // Example 3: |
95 | | // // Splits the string as in the previous example, except that the results |
96 | | // // are returned as StringPiece objects. Note that because we are storing |
97 | | // // the results within StringPiece objects, we have to ensure that the input |
98 | | // // string outlives any results. |
99 | | // vector<StringPiece> v = strings::Split("a,b,c", ","); |
100 | | // assert(v.size() == 3); |
101 | | // |
102 | | // Example 4: |
103 | | // // Stores results in a set<string>. |
104 | | // set<string> a = strings::Split("a,b,c,a,b,c", ","); |
105 | | // assert(a.size() == 3); |
106 | | // |
107 | | // Example 5: |
108 | | // // Stores results in a map. The map implementation assumes that the input |
109 | | // // is provided as a series of key/value pairs. For example, the 0th element |
110 | | // // resulting from the split will be stored as a key to the 1st element. If |
111 | | // // an odd number of elements are resolved, the last element is paired with |
112 | | // // a default-constructed value (e.g., empty string). |
113 | | // map<string, string> m = strings::Split("a,b,c", ","); |
114 | | // assert(m.size() == 2); |
115 | | // assert(m["a"] == "b"); |
116 | | // assert(m["c"] == ""); // last component value equals "" |
117 | | // |
118 | | // Example 6: |
119 | | // // Splits on the empty string, which results in each character of the input |
120 | | // // string becoming one element in the output collection. |
121 | | // vector<string> v = strings::Split("abc", ""); |
122 | | // assert(v.size() == 3); |
123 | | // |
124 | | // Example 7: |
125 | | // // Stores first two split strings as the members in an std::pair. |
126 | | // std::pair<string, string> p = strings::Split("a,b,c", ","); |
127 | | // EXPECT_EQ("a", p.first); |
128 | | // EXPECT_EQ("b", p.second); |
129 | | // // "c" is omitted because std::pair can hold only two elements. |
130 | | // |
131 | | // As illustrated above, the Split() function adapts the returned collection to |
132 | | // the type specified by the caller. The returned collections may contain |
133 | | // string, StringPiece, Cord, or any object that has a constructor (explicit or |
134 | | // not) that takes a single StringPiece argument. This pattern works for all |
135 | | // standard STL containers including vector, list, deque, set, multiset, map, |
136 | | // multimap, unordered_set and unordered_map, and even std::pair which is not |
137 | | // actually a container. |
138 | | // |
139 | | // Splitting to std::pair is an interesting case because it can hold only two |
140 | | // elements and is not a collection type. When splitting to an std::pair the |
141 | | // first two split strings become the std::pair's .first and .second members |
142 | | // respectively. The remaining split substrings are discarded. If there are fewer |
143 | | // than two split substrings, the empty string is used for the corresponding |
144 | | // std::pair member. |
145 | | // |
146 | | // The strings::Split() function can be used multiple times to perform more |
147 | | // complicated splitting logic, such as intelligently parsing key-value pairs. |
148 | | // For example |
149 | | // |
150 | | // // The input string "a=b=c,d=e,f=,g" becomes |
151 | | // // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" } |
152 | | // map<string, string> m; |
153 | | // for (StringPiece sp : strings::Split("a=b=c,d=e,f=,g", ",")) { |
154 | | // m.insert(strings::Split(sp, strings::delimiter::Limit("=", 1))); |
155 | | // } |
156 | | // EXPECT_EQ("b=c", m.find("a")->second); |
157 | | // EXPECT_EQ("e", m.find("d")->second); |
158 | | // EXPECT_EQ("", m.find("f")->second); |
159 | | // EXPECT_EQ("", m.find("g")->second); |
160 | | // |
161 | | // The above example stores the results in an std::map. But depending on your |
162 | | // data requirements, you can just as easily store the results in an |
163 | | // std::multimap or even a vector<std::pair<>>. |
164 | | // |
165 | | // |
166 | | // Delimiters |
167 | | // |
168 | | // The Split() function also takes a second argument that is a delimiter. This |
169 | | // delimiter is actually an object that defines the boundaries between elements |
170 | | // in the provided input. If a string (const char*, ::string, or StringPiece) is |
171 | | // passed in place of an explicit Delimiter object, the argument is implicitly |
172 | | // converted to a ::strings::delimiter::Literal. |
173 | | // |
174 | | // With this split API comes the formal concept of a Delimiter (big D). A |
175 | | // Delimiter is an object with a Find() function that knows how to find the first |
176 | | // occurrence of itself in a given StringPiece. Models of the Delimiter concept |
177 | | // represent specific kinds of delimiters, such as single characters, |
178 | | // substrings, or even regular expressions. |
179 | | // |
180 | | // The following Delimiter objects are provided as part of the Split() API: |
181 | | // |
182 | | // - Literal (default) |
183 | | // - AnyOf |
184 | | // - Limit |
185 | | // |
186 | | // The following are examples of using some provided Delimiter objects: |
187 | | // |
188 | | // Example 1: |
189 | | // // Because a string literal is converted to a strings::delimiter::Literal, |
190 | | // // the following two splits are equivalent. |
191 | | // vector<string> v1 = strings::Split("a,b,c", ","); // (1) |
192 | | // using ::strings::delimiter::Literal; |
193 | | // vector<string> v2 = strings::Split("a,b,c", Literal(",")); // (2) |
194 | | // |
195 | | // Example 2: |
196 | | // // Splits on any of the characters specified in the delimiter string. |
197 | | // using ::strings::delimiter::AnyOf; |
198 | | // vector<string> v = strings::Split("a,b;c-d", AnyOf(",;-")); |
199 | | // assert(v.size() == 4); |
200 | | // |
201 | | // Example 3: |
202 | | // // Uses the Limit meta-delimiter to limit the number of matches a delimiter |
203 | | // // can have. In this case, the delimiter of a Literal comma is limited to |
204 | | // // matching at most one time. The last element in the returned |
205 | | // // collection will contain all unsplit pieces, which may contain instances |
206 | | // // of the delimiter. |
207 | | // using ::strings::delimiter::Limit; |
208 | | // vector<string> v = strings::Split("a,b,c", Limit(",", 1)); |
209 | | // assert(v.size() == 2); // Limited to 1 delimiter; so two elements found |
210 | | // assert(v[0] == "a"); |
211 | | // assert(v[1] == "b,c"); |
212 | | // |
213 | | // |
214 | | // Predicates |
215 | | // |
216 | | // Predicates can filter the results of a Split() operation by determining |
217 | | // whether or not a resultant element is included in the result set. A predicate |
218 | | // may be passed as an *optional* third argument to the Split() function. |
219 | | // |
220 | | // Predicates are unary functions (or functors) that take a single StringPiece |
221 | | // argument and return bool indicating whether the argument should be included |
222 | | // (true) or excluded (false). |
223 | | // |
224 | | // One example where this is useful is when filtering out empty substrings. By |
225 | | // default, empty substrings may be returned by strings::Split(), which is |
226 | | // similar to the way split functions work in other programming languages. For |
227 | | // example: |
228 | | // |
229 | | // // Empty strings *are* included in the returned collection. |
230 | | // vector<string> v = strings::Split(",a,,b,", ","); |
231 | | // assert(v.size() == 5); // v[0] == "", v[1] == "a", v[2] == "", ... |
232 | | // |
233 | | // These empty strings can be filtered out of the results by simply passing the |
234 | | // provided SkipEmpty predicate as the third argument to the Split() function. |
235 | | // SkipEmpty does not consider a string containing all whitespace to be empty. |
236 | | // For that behavior use the SkipWhitespace predicate. For example: |
237 | | // |
238 | | // Example 1: |
239 | | // // Uses SkipEmpty to omit empty strings. Strings containing whitespace are |
240 | | // // not empty and are therefore not skipped. |
241 | | // using strings::SkipEmpty; |
242 | | // vector<string> v = strings::Split(",a, ,b,", ",", SkipEmpty()); |
243 | | // assert(v.size() == 3); |
244 | | // assert(v[0] == "a"); |
245 | | // assert(v[1] == " "); // <-- The whitespace makes the string not empty. |
246 | | // assert(v[2] == "b"); |
247 | | // |
248 | | // Example 2: |
249 | | // // Uses SkipWhitespace to skip all strings that are either empty or contain |
250 | | // // only whitespace. |
251 | | // using strings::SkipWhitespace; |
252 | | // vector<string> v = strings::Split(",a, ,b,", ",", SkipWhitespace()); |
253 | | // assert(v.size() == 2); |
254 | | // assert(v[0] == "a"); |
255 | | // assert(v[1] == "b"); |
256 | | // |
257 | | // |
258 | | // Differences between Split1 and Split2 |
259 | | // |
260 | | // Split2 is the strings::Split() API described above. Split1 is a name for the |
261 | | // collection of legacy Split*() functions declared later in this file. Most of |
262 | | // the Split1 functions follow a set of conventions that don't necessarily match |
263 | | // the conventions used in Split2. The following are some of the important |
264 | | // differences between Split1 and Split2: |
265 | | // |
266 | | // Split1 -> Split2 |
267 | | // ---------------- |
268 | | // Append -> Assign: |
269 | | // The Split1 functions all returned their output collections via a pointer to |
270 | | // an out parameter as is typical in Google code. In some cases the comments |
271 | | // explicitly stated that results would be *appended* to the output |
272 | | // collection. In some cases it was ambiguous whether results were appended. |
273 | | // This ambiguity is gone in the Split2 API as results are always assigned to |
274 | | // the output collection, never appended. |
275 | | // |
276 | | // AnyOf -> Literal: |
277 | | // Most Split1 functions treated their delimiter argument as a string of |
278 | | // individual byte delimiters. For example, a delimiter of ",;" would split on |
279 | | // "," and ";", not the substring ",;". This behavior is equivalent to the |
280 | | // Split2 delimiter strings::delimiter::AnyOf, which is *not* the default. By |
281 | | // default, strings::Split() splits using strings::delimiter::Literal() which |
282 | | // would treat the whole string ",;" as a single delimiter string. |
283 | | // |
284 | | // SkipEmpty -> allow empty: |
285 | | // Most Split1 functions omitted empty substrings in the results. To keep |
286 | | // empty substrings one would have to use an explicitly named |
287 | | // Split*AllowEmpty() function. This behavior is reversed in Split2. By |
288 | | // default, strings::Split() *allows* empty substrings in the output. To skip |
289 | | // them, use the strings::SkipEmpty predicate. |
290 | | // |
291 | | // string -> user's choice: |
292 | | // Most Split1 functions return collections of string objects. Some return |
293 | | // char*, but the type returned is dictated by each Split1 function. With |
294 | | // Split2 the caller can choose which string-like object to return. (Note: |
295 | | // char* C-strings are not supported in Split2--use StringPiece instead). |
296 | | // |
297 | | |
298 | | // Definitions of the main Split() function. |
299 | | template <typename Delimiter> |
300 | 24 | internal::Splitter<Delimiter> Split(StringPiece text, Delimiter d) { |
301 | 24 | return internal::Splitter<Delimiter>(text, d); |
302 | 24 | } _ZN7strings5SplitINS_9delimiter9LimitImplINS1_7LiteralEEEEENS_8internal8SplitterIT_NS5_8NoFilterEEE11StringPieceS7_ Line | Count | Source | 300 | 24 | internal::Splitter<Delimiter> Split(StringPiece text, Delimiter d) { | 301 | 24 | return internal::Splitter<Delimiter>(text, d); | 302 | 24 | } |
Unexecuted instantiation: _ZN7strings5SplitINS_9delimiter5AnyOfEEENS_8internal8SplitterIT_NS3_8NoFilterEEE11StringPieceS5_ Unexecuted instantiation: _ZN7strings5SplitINS_9delimiter9LimitImplINS1_5AnyOfEEEEENS_8internal8SplitterIT_NS5_8NoFilterEEE11StringPieceS7_ |
303 | | |
304 | | template <typename Delimiter, typename Predicate> |
305 | 0 | internal::Splitter<Delimiter, Predicate> Split(StringPiece text, Delimiter d, Predicate p) { |
306 | 0 | return internal::Splitter<Delimiter, Predicate>(text, d, p); |
307 | 0 | } |
308 | | |
309 | | namespace delimiter { |
310 | | // A Delimiter object represents a single separator, such as a character, |
311 | | // literal string, or regular expression. A Delimiter object must have the |
312 | | // following member: |
313 | | // |
314 | | // StringPiece Find(StringPiece text); |
315 | | // |
316 | | // This Find() member function should return a StringPiece referring to the next |
317 | | // occurrence of the represented delimiter within the given string text. If no |
318 | | // delimiter is found in the given text, a zero-length StringPiece referring to |
319 | | // text.end() should be returned (e.g., StringPiece(text.end(), 0)). It is |
320 | | // important that the returned StringPiece always be within the bounds of the |
321 | | // StringPiece given as an argument--it must not refer to a string that is |
322 | | // physically located outside of the given string. The following example is a |
323 | | // simple Delimiter object that is created with a single char and will look for |
324 | | // that char in the text given to the Find() function: |
325 | | // |
326 | | // struct SimpleDelimiter { |
327 | | // const char c_; |
328 | | // explicit SimpleDelimiter(char c) : c_(c) {} |
329 | | // StringPiece Find(StringPiece text) { |
330 | | // int pos = text.find(c_); |
331 | | // if (pos == StringPiece::npos) return StringPiece(text.end(), 0); |
332 | | // return StringPiece(text, pos, 1); |
333 | | // } |
334 | | // }; |
335 | | |
336 | | // Represents a literal string delimiter. Examples: |
337 | | // |
338 | | // using ::strings::delimiter::Literal; |
339 | | // vector<string> v = strings::Split("a=>b=>c", Literal("=>")); |
340 | | // assert(v.size() == 3); |
341 | | // assert(v[0] == "a"); |
342 | | // assert(v[1] == "b"); |
343 | | // assert(v[2] == "c"); |
344 | | // |
345 | | // The next example uses the empty string as a delimiter. |
346 | | // |
347 | | // using ::strings::delimiter::Literal; |
348 | | // vector<string> v = strings::Split("abc", Literal("")); |
349 | | // assert(v.size() == 3); |
350 | | // assert(v[0] == "a"); |
351 | | // assert(v[1] == "b"); |
352 | | // assert(v[2] == "c"); |
353 | | // |
354 | | class Literal { |
355 | | public: |
356 | | explicit Literal(StringPiece sp); |
357 | | StringPiece Find(StringPiece text) const; |
358 | | |
359 | | private: |
360 | | const string delimiter_; |
361 | | }; |
362 | | |
363 | | // Represents a delimiter that will match any of the given byte-sized |
364 | | // characters. AnyOf is similar to Literal, except that AnyOf uses |
365 | | // StringPiece::find_first_of() and Literal uses StringPiece::find(). AnyOf |
366 | | // examples: |
367 | | // |
368 | | // using ::strings::delimiter::AnyOf; |
369 | | // vector<string> v = strings::Split("a,b=c", AnyOf(",=")); |
370 | | // |
371 | | // assert(v.size() == 3); |
372 | | // assert(v[0] == "a"); |
373 | | // assert(v[1] == "b"); |
374 | | // assert(v[2] == "c"); |
375 | | // |
376 | | // If AnyOf is given the empty string, it behaves exactly like Literal and |
377 | | // matches each individual character in the input string. |
378 | | // |
379 | | // Note: The string passed to AnyOf is assumed to be a string of single-byte |
380 | | // ASCII characters. AnyOf does not work with multi-byte characters. |
381 | | class AnyOf { |
382 | | public: |
383 | | explicit AnyOf(StringPiece sp); |
384 | | StringPiece Find(StringPiece text) const; |
385 | | |
386 | | private: |
387 | | const string delimiters_; |
388 | | }; |
389 | | |
390 | | // Wraps another delimiter and sets a max number of matches for that delimiter. |
391 | | // Create LimitImpls using the Limit() function. Example: |
392 | | // |
393 | | // using ::strings::delimiter::Limit; |
394 | | // vector<string> v = strings::Split("a,b,c,d", Limit(",", 2)); |
395 | | // |
396 | | // assert(v.size() == 3); // Split on 2 commas, giving a vector with 3 items |
397 | | // assert(v[0] == "a"); |
398 | | // assert(v[1] == "b"); |
399 | | // assert(v[2] == "c,d"); |
400 | | // |
401 | | template <typename Delimiter> |
402 | | class LimitImpl { |
403 | | public: |
404 | | LimitImpl(Delimiter delimiter, int limit) |
405 | 24 | : delimiter_(std::move(delimiter)), limit_(limit), count_(0) {} _ZN7strings9delimiter9LimitImplINS0_7LiteralEEC2ES2_i Line | Count | Source | 405 | 24 | : delimiter_(std::move(delimiter)), limit_(limit), count_(0) {} |
Unexecuted instantiation: _ZN7strings9delimiter9LimitImplINS0_5AnyOfEEC2ES2_i |
406 | 47 | StringPiece Find(StringPiece text) { |
407 | 47 | if (count_++ == limit_) { |
408 | 18 | return StringPiece(text.end(), 0); // No more matches. |
409 | 18 | } |
410 | 29 | return delimiter_.Find(text); |
411 | 47 | } _ZN7strings9delimiter9LimitImplINS0_7LiteralEE4FindE11StringPiece Line | Count | Source | 406 | 47 | StringPiece Find(StringPiece text) { | 407 | 47 | if (count_++ == limit_) { | 408 | 18 | return StringPiece(text.end(), 0); // No more matches. | 409 | 18 | } | 410 | 29 | return delimiter_.Find(text); | 411 | 47 | } |
Unexecuted instantiation: _ZN7strings9delimiter9LimitImplINS0_5AnyOfEE4FindE11StringPiece |
412 | | |
413 | | private: |
414 | | Delimiter delimiter_; |
415 | | const int limit_; |
416 | | int count_; |
417 | | }; |
418 | | |
419 | | // Overloaded Limit() function to create LimitImpl<> objects. Uses the Delimiter |
420 | | // Literal as the default if string-like objects are passed as the delimiter |
421 | | // parameter. This is similar to the overloads for Split() below. |
422 | | template <typename Delimiter> |
423 | 0 | LimitImpl<Delimiter> Limit(Delimiter delim, int limit) { |
424 | 0 | return LimitImpl<Delimiter>(delim, limit); |
425 | 0 | } |
426 | | |
427 | 14 | inline LimitImpl<Literal> Limit(const char* s, int limit) { |
428 | 14 | return LimitImpl<Literal>(Literal(s), limit); |
429 | 14 | } |
430 | | |
431 | 10 | inline LimitImpl<Literal> Limit(const string& s, int limit) { |
432 | 10 | return LimitImpl<Literal>(Literal(s), limit); |
433 | 10 | } |
434 | | |
435 | 0 | inline LimitImpl<Literal> Limit(StringPiece s, int limit) { |
436 | 0 | return LimitImpl<Literal>(Literal(s), limit); |
437 | 0 | } |
438 | | |
439 | | } // namespace delimiter |
440 | | |
441 | | // |
442 | | // Predicates are functors that return bool indicating whether the given |
443 | | // StringPiece should be included in the split output. If the predicate returns |
444 | | // false then the string will be excluded from the output from strings::Split(). |
445 | | // |
446 | | |
447 | | // Always returns true, indicating that all strings--including empty |
448 | | // strings--should be included in the split output. This predicate is not |
449 | | // strictly needed because this is the default behavior of the strings::Split() |
450 | | // function. But it might be useful at some call sites to make the intent |
451 | | // explicit. |
452 | | // |
453 | | // vector<string> v = Split(" a , ,,b,", ",", AllowEmpty()); |
454 | | // EXPECT_THAT(v, ElementsAre(" a ", " ", "", "b", "")); |
455 | | struct AllowEmpty { |
456 | 0 | bool operator()(StringPiece sp) const { return true; } |
457 | | }; |
458 | | |
459 | | // Returns false if the given StringPiece is empty, indicating that the |
460 | | // strings::Split() API should omit the empty string. |
461 | | // |
462 | | // vector<string> v = Split(" a , ,,b,", ",", SkipEmpty()); |
463 | | // EXPECT_THAT(v, ElementsAre(" a ", " ", "b")); |
464 | | struct SkipEmpty { |
465 | 0 | bool operator()(StringPiece sp) const { return !sp.empty(); } |
466 | | }; |
467 | | |
468 | | // Returns false if the given StringPiece is empty or contains only whitespace, |
469 | | // indicating that the strings::Split() API should omit the string. |
470 | | // |
471 | | // vector<string> v = Split(" a , ,,b,", ",", SkipWhitespace()); |
472 | | // EXPECT_THAT(v, ElementsAre(" a ", "b")); |
473 | | struct SkipWhitespace { |
474 | 1.96k | bool operator()(StringPiece sp) const { |
475 | 1.96k | StripWhiteSpace(&sp); |
476 | 1.96k | return !sp.empty(); |
477 | 1.96k | } |
478 | | }; |
479 | | |
480 | | // Split() function overloads to effectively give Split() a default Delimiter |
481 | | // type of Literal. If Split() is called and a string is passed as the delimiter |
482 | | // instead of an actual Delimiter object, then one of these overloads will be |
483 | | // invoked and will create a Splitter<Literal> with the delimiter string. |
484 | | // |
485 | | // Since Split() is a function template above, these overload signatures need to |
486 | | // be explicit about the string type so they match better than the templated |
487 | | // version. These functions are overloaded for: |
488 | | // |
489 | | // - const char* |
490 | | // - const string& |
491 | | // - StringPiece |
492 | | |
493 | 166 | inline internal::Splitter<delimiter::Literal> Split(StringPiece text, const char* delimiter) { |
494 | 166 | return internal::Splitter<delimiter::Literal>(text, delimiter::Literal(delimiter)); |
495 | 166 | } |
496 | | |
497 | 89 | inline internal::Splitter<delimiter::Literal> Split(StringPiece text, const string& delimiter) { |
498 | 89 | return internal::Splitter<delimiter::Literal>(text, delimiter::Literal(delimiter)); |
499 | 89 | } |
500 | | |
501 | 0 | inline internal::Splitter<delimiter::Literal> Split(StringPiece text, StringPiece delimiter) { |
502 | 0 | return internal::Splitter<delimiter::Literal>(text, delimiter::Literal(delimiter)); |
503 | 0 | } |
504 | | |
505 | | // Same overloads as above, but also including a Predicate argument. |
506 | | template <typename Predicate> |
507 | | internal::Splitter<delimiter::Literal, Predicate> Split(StringPiece text, const char* delimiter, |
508 | 388 | Predicate p) { |
509 | 388 | return internal::Splitter<delimiter::Literal, Predicate>(text, delimiter::Literal(delimiter), |
510 | 388 | p); |
511 | 388 | } _ZN7strings5SplitINS_14SkipWhitespaceEEENS_8internal8SplitterINS_9delimiter7LiteralET_EE11StringPiecePKcS6_ Line | Count | Source | 508 | 388 | Predicate p) { | 509 | 388 | return internal::Splitter<delimiter::Literal, Predicate>(text, delimiter::Literal(delimiter), | 510 | 388 | p); | 511 | 388 | } |
Unexecuted instantiation: _ZN7strings5SplitINS_9SkipEmptyEEENS_8internal8SplitterINS_9delimiter7LiteralET_EE11StringPiecePKcS6_ |
512 | | |
513 | | template <typename Predicate> |
514 | | internal::Splitter<delimiter::Literal, Predicate> Split(StringPiece text, const string& delimiter, |
515 | | Predicate p) { |
516 | | return internal::Splitter<delimiter::Literal, Predicate>(text, delimiter::Literal(delimiter), |
517 | | p); |
518 | | } |
519 | | |
520 | | template <typename Predicate> |
521 | | internal::Splitter<delimiter::Literal, Predicate> Split(StringPiece text, StringPiece delimiter, |
522 | | Predicate p) { |
523 | | return internal::Splitter<delimiter::Literal, Predicate>(text, delimiter::Literal(delimiter), |
524 | | p); |
525 | | } |
526 | | |
527 | | } // namespace strings |
528 | | |
529 | | // |
530 | | // ==================== LEGACY SPLIT FUNCTIONS ==================== |
531 | | // |
532 | | |
533 | | // NOTE: The instruction below creates a Module titled |
534 | | // GlobalSplitFunctions within the auto-generated Doxygen documentation. |
535 | | // This instruction is needed to expose global functions that are not |
536 | | // within a namespace. |
537 | | // |
538 | | // START DOXYGEN SplitFunctions grouping |
539 | | /* @defgroup SplitFunctions |
540 | | * @{ */ |
541 | | |
542 | | // ---------------------------------------------------------------------- |
543 | | // ClipString |
544 | | // Clip a string to a max length. We try to clip on a word boundary |
545 | | // if this is possible. If the string is clipped, we append an |
546 | | // ellipsis. |
547 | | // |
548 | | // ***NOTE*** |
549 | | // ClipString counts length with strlen. If you have non-ASCII |
550 | | // strings like UTF-8, this is wrong. If you are displaying the |
551 | | // clipped strings to users in a frontend, consider using |
552 | | // ClipStringOnWordBoundary in |
553 | | // webserver/util/snippets/rewriteboldtags, which considers the width |
554 | | // of the string, not just the number of bytes. |
555 | | // |
556 | | // TODO(user) Move ClipString back to strutil. The problem with this is |
557 | | // that ClipStringHelper is used behind the scenes by SplitStringToLines, but |
558 | | // probably shouldn't be exposed in the .h files. |
559 | | // ---------------------------------------------------------------------- |
560 | | void ClipString(char* str, int max_len); |
561 | | |
562 | | // ---------------------------------------------------------------------- |
563 | | // ClipString |
564 | | // Version of ClipString() that uses string instead of char*. |
565 | | // NOTE: See comment above. |
566 | | // ---------------------------------------------------------------------- |
567 | | void ClipString(string* full_str, int max_len); |
568 | | |
569 | | // ---------------------------------------------------------------------- |
570 | | // SplitStringToLines() Split a string into lines of maximum length |
571 | | // 'max_len'. Append the resulting lines to 'result'. Will attempt |
572 | | // to split on word boundaries. If 'num_lines' |
573 | | // is zero it splits up the whole string regardless of length. If |
574 | | // 'num_lines' is positive, it returns at most num_lines lines, and |
575 | | // appends a "..." to the end of the last line if the string is too |
576 | | // long to fit completely into 'num_lines' lines. |
577 | | // ---------------------------------------------------------------------- |
578 | | void SplitStringToLines(const char* full, int max_len, int num_lines, vector<string>* result); |
579 | | |
580 | | // ---------------------------------------------------------------------- |
581 | | // SplitOneStringToken() |
582 | | // Returns the first "delim" delimited string from "*source" and modifies |
583 | | // *source to point after the delimiter that was found. If no delimiter is |
584 | | // found, *source is set to NULL. |
585 | | // |
586 | | // If the start of *source is a delimiter, an empty string is returned. |
587 | | // If *source is NULL, an empty string is returned. |
588 | | // |
589 | | // "delim" is treated as a sequence of 1 or more character delimiters. Any one |
590 | | // of the characters present in "delim" is considered to be a single |
591 | | // delimiter; The delimiter is not "delim" as a whole. For example: |
592 | | // |
593 | | // const char* s = "abc=;de"; |
594 | | // string r = SplitOneStringToken(&s, ";="); |
595 | | // // r = "abc" |
596 | | // // s points to ";de" |
597 | | // ---------------------------------------------------------------------- |
598 | | string SplitOneStringToken(const char** source, const char* delim); |
599 | | |
600 | | // ---------------------------------------------------------------------- |
601 | | // SplitUsing() |
602 | | // Split a string into substrings based on the nul-terminated list |
603 | | // of bytes at delimiters (uses strsep) and return a vector of |
604 | | // those strings. Modifies 'full'. We allocate the return vector, |
605 | | // and you should free it. Note that empty fields are ignored. |
606 | | // Use SplitToVector with last argument 'false' if you want the |
607 | | // empty fields. |
608 | | // ---------------------------------------------------------------------- |
609 | | vector<char*>* SplitUsing(char* full, const char* delimiters); |
610 | | |
611 | | // ---------------------------------------------------------------------- |
612 | | // SplitToVector() |
613 | | // Split a string into substrings based on the nul-terminated list |
614 | | // of bytes at delim (uses strsep) and appends the split |
615 | | // strings to 'vec'. Modifies "full". If 'omit_empty_strings' is |
616 | | // true, empty strings are omitted from the resulting vector. |
617 | | // ---------------------------------------------------------------------- |
618 | | void SplitToVector(char* full, const char* delimiters, vector<char*>* vec, bool omit_empty_strings); |
619 | | void SplitToVector(char* full, const char* delimiters, vector<const char*>* vec, |
620 | | bool omit_empty_strings); |
621 | | |
622 | | // ---------------------------------------------------------------------- |
623 | | // SplitStringPieceToVector |
624 | | // Split a StringPiece into sub-StringPieces based on the |
625 | | // nul-terminated list of bytes at delim and appends the |
626 | | // pieces to 'vec'. If omit empty strings is true, empty strings |
627 | | // are omitted from the resulting vector. |
628 | | // Expects the original string (from which 'full' is derived) to exist |
629 | | // for the full lifespan of 'vec'. |
630 | | // ---------------------------------------------------------------------- |
631 | | void SplitStringPieceToVector(const StringPiece& full, const char* delim, vector<StringPiece>* vec, |
632 | | bool omit_empty_strings); |
633 | | |
634 | | // ---------------------------------------------------------------------- |
635 | | // SplitStringUsing() |
636 | | // SplitStringToHashsetUsing() |
637 | | // SplitStringToSetUsing() |
638 | | // SplitStringToMapUsing() |
639 | | // SplitStringToHashmapUsing() |
640 | | |
641 | | // Splits a string using one or more byte delimiters, presented as a |
642 | | // nul-terminated c string. Append the components to 'result'. If there are |
643 | | // consecutive delimiters, this function skips over all of them: in other words, |
644 | | // empty components are dropped. If you want to keep empty components, try |
645 | | // SplitStringAllowEmpty(). |
646 | | // |
647 | | // NOTE: Do not use this for multi-byte delimiters such as UTF-8 strings. Use |
648 | | // strings::Split() with strings::delimiter::Literal as the delimiter. |
649 | | // |
650 | | // ==> NEW API: Consider using the new Split API defined above. <== |
651 | | // Example: |
652 | | // |
653 | | // using strings::SkipEmpty; |
654 | | // using strings::Split; |
655 | | // using strings::delimiter::AnyOf; |
656 | | // |
657 | | // vector<string> v = Split(full, AnyOf(delimiter), SkipEmpty()); |
658 | | // |
659 | | // For even better performance, store the result in a vector<StringPiece> |
660 | | // to avoid string copies. |
661 | | // ---------------------------------------------------------------------- |
662 | | void SplitStringUsing(const string& full, const char* delimiters, vector<string>* result); |
663 | | void SplitStringToHashsetUsing(const string& full, const char* delimiters, |
664 | | std::unordered_set<string>* result); |
665 | | void SplitStringToSetUsing(const string& full, const char* delimiters, set<string>* result); |
666 | | // The even-positioned (0-based) components become the keys for the |
667 | | // odd-positioned components that follow them. When there is an odd |
668 | | // number of components, the value for the last key will be unchanged |
669 | | // if the key was already present in the hash table, or will be the |
670 | | // empty string if the key is a newly inserted key. |
671 | | void SplitStringToMapUsing(const string& full, const char* delim, map<string, string>* result); |
672 | | void SplitStringToHashmapUsing(const string& full, const char* delim, |
673 | | std::unordered_map<string, string>* result); |
674 | | |
675 | | // ---------------------------------------------------------------------- |
676 | | // SplitStringAllowEmpty() |
677 | | // |
678 | | // Split a string using one or more byte delimiters, presented as a |
679 | | // nul-terminated c string. Append the components to 'result'. If there are |
680 | | // consecutive delimiters, this function will return corresponding empty |
681 | | // strings. If you want to drop the empty strings, try SplitStringUsing(). |
682 | | // |
683 | | // If "full" is the empty string, yields an empty string as the only value. |
684 | | // |
685 | | // ==> NEW API: Consider using the new Split API defined above. <== |
686 | | // |
687 | | // using strings::Split; |
688 | | // using strings::delimiter::AnyOf; |
689 | | // |
690 | | // vector<string> v = Split(full, AnyOf(delimiter)); |
691 | | // |
692 | | // For even better performance, store the result in a vector<StringPiece> to |
693 | | // avoid string copies. |
694 | | // ---------------------------------------------------------------------- |
695 | | void SplitStringAllowEmpty(const string& full, const char* delim, vector<string>* result); |
696 | | |
697 | | // ---------------------------------------------------------------------- |
698 | | // SplitStringWithEscaping() |
699 | | // SplitStringWithEscapingAllowEmpty() |
700 | | // SplitStringWithEscapingToSet() |
701 | | // SplitStringWithEscapingToHashset() |
702 | | |
703 | | // Split the string using the specified delimiters, taking escaping into |
704 | | // account. '\' is not allowed as a delimiter. |
705 | | // |
706 | | // Within the string, preserve a delimiter preceded by a backslash as a |
707 | | // literal delimiter. In addition, preserve two consecutive backslashes as |
708 | | // a single literal backslash. Do not unescape any other backslash-character |
709 | | // sequence. |
710 | | // |
711 | | // Eg. 'foo\=bar=baz\\qu\ux' split on '=' becomes ('foo=bar', 'baz\qu\ux') |
712 | | // |
713 | | // All versions other than "AllowEmpty" discard any empty substrings. |
714 | | // ---------------------------------------------------------------------- |
715 | | void SplitStringWithEscaping(const string& full, const strings::CharSet& delimiters, |
716 | | vector<string>* result); |
717 | | void SplitStringWithEscapingAllowEmpty(const string& full, const strings::CharSet& delimiters, |
718 | | vector<string>* result); |
719 | | void SplitStringWithEscapingToSet(const string& full, const strings::CharSet& delimiters, |
720 | | set<string>* result); |
721 | | void SplitStringWithEscapingToHashset(const string& full, const strings::CharSet& delimiters, |
722 | | std::unordered_set<string>* result); |
723 | | |
724 | | // ---------------------------------------------------------------------- |
725 | | // SplitStringIntoNPiecesAllowEmpty() |
726 | | |
727 | | // Split a string using a nul-terminated list of byte |
728 | | // delimiters. Append the components to 'result'. If there are |
729 | | // consecutive delimiters, this function will return corresponding |
730 | | // empty strings. The string is split into at most the specified |
731 | | // number of pieces greedily. This means that the last piece may |
732 | | // possibly be split further. To split into as many pieces as |
733 | | // possible, specify 0 as the number of pieces. |
734 | | // |
735 | | // If "full" is the empty string, yields an empty string as the only value. |
736 | | // ---------------------------------------------------------------------- |
737 | | void SplitStringIntoNPiecesAllowEmpty(const string& full, const char* delimiters, int pieces, |
738 | | vector<string>* result); |
739 | | |
740 | | // ---------------------------------------------------------------------- |
741 | | // SplitStringAndParse() |
742 | | // SplitStringAndParseToContainer() |
743 | | // SplitStringAndParseToList() |
744 | | // Split a string using a nul-terminated list of character |
745 | | // delimiters. For each component, parse using the provided |
746 | | // parsing function and if successful, append it to 'result'. |
747 | | // Return true if and only if all components parse successfully. |
748 | | // If there are consecutive delimiters, this function skips over |
749 | | // all of them. This function will correctly handle parsing |
750 | | // strings that have embedded \0s. |
751 | | // |
752 | | // SplitStringAndParse fills into a vector. |
753 | | // SplitStringAndParseToContainer fills into any container that implements |
754 | | // a single-argument insert function. (i.e. insert(const value_type& x) ). |
755 | | // SplitStringAndParseToList fills into any container that implements a single- |
756 | | // argument push_back function (i.e. push_back(const value_type& x) ). |
757 | | // NOTE: The implementation in this file parses each component into a local |
758 | | // value_type and then hands it to insert()/push_back() only if parsing |
759 | | // succeeded; back() and pop_back() are not used. |
760 | | // |
761 | | // Example Usage: |
762 | | // vector<double> values; |
763 | | // CHECK(SplitStringAndParse("1.0,2.0,3.0", ",", &safe_strtod, &values)); |
764 | | // CHECK_EQ(3, values.size()); |
765 | | // |
766 | | // vector<int64> values; |
767 | | // CHECK(SplitStringAndParse("1M,2M,3M", ",", |
768 | | // &HumanReadableNumBytes::ToInt64, &values)); |
769 | | // CHECK_EQ(3, values.size()); |
770 | | // |
771 | | // set<int64> values; |
772 | | // CHECK(SplitStringAndParseToContainer("3,1,1,2", ",", |
773 | | // &safe_strto64, &values)); |
774 | | // CHECK_EQ(3, values.size()); // the duplicate "1" is stored only once |
775 | | // |
776 | | // deque<int64> values; |
777 | | // CHECK(SplitStringAndParseToList("3,1,1,2", ",", &safe_strto64, &values)); |
778 | | // CHECK_EQ(4, values.size()); |
779 | | // ---------------------------------------------------------------------- |
780 | | template <class T> |
781 | | bool SplitStringAndParse(StringPiece source, StringPiece delim, |
782 | | bool (*parse)(const string& str, T* value), vector<T>* result); |
783 | | template <class Container> |
784 | | bool SplitStringAndParseToContainer(StringPiece source, StringPiece delim, |
785 | | bool (*parse)(const string& str, |
786 | | typename Container::value_type* value), |
787 | | Container* result); |
788 | | |
789 | | template <class List> |
790 | | bool SplitStringAndParseToList(StringPiece source, StringPiece delim, |
791 | | bool (*parse)(const string& str, typename List::value_type* value), |
792 | | List* result); |
793 | | // ---------------------------------------------------------------------- |
794 | | // SplitRange() |
795 | | // Splits a string of the form "<from>-<to>". Either or both can be |
796 | | // missing. A raw number (<to>) is interpreted as "<to>-". Modifies |
797 | | // parameters insofar as they're specified by the string. RETURNS |
798 | | // true iff the input is a well-formed range. If it RETURNS false, |
799 | | // from and to remain unchanged. The range in rangestr should be |
800 | | // terminated either by "\0" or by whitespace. |
801 | | // ---------------------------------------------------------------------- |
802 | | bool SplitRange(const char* rangestr, int* from, int* to); |
803 | | |
804 | | // ---------------------------------------------------------------------- |
805 | | // SplitCSVLineWithDelimiter() |
806 | | // CSV lines come in many guises. There's the Comma Separated Values |
807 | | // variety, in which fields are separated by (surprise!) commas. There's |
808 | | // also the tab-separated values variant, in which tabs separate the |
809 | | // fields. This routine handles both, which makes it almost like |
810 | | // SplitUsing(line, delimiter), but for some special processing. For both |
811 | | // delimiters, whitespace is trimmed from either side of the field value. |
812 | | // If the delimiter is ',', we play additional games with quotes. A |
813 | | // field value surrounded by double quotes is allowed to contain commas, |
814 | | // which are not treated as field separators. Within a double-quoted |
815 | | // string, a series of two double quotes signals an escaped single double |
816 | | // quote. It'll be clearer in the examples. |
817 | | // Example: |
818 | | // Google , x , "Buchheit, Paul", "string with "" quote in it" |
819 | | // --> [Google], [x], [Buchheit, Paul], [string with " quote in it] |
820 | | // |
821 | | // SplitCSVLine() |
822 | | // A convenience wrapper around SplitCSVLineWithDelimiter which uses |
823 | | // ',' as the delimiter. |
824 | | // |
825 | | // The following variants of SplitCSVLine() are not recommended for new code. |
826 | | // Please consider the CSV parser in //util/csv as an alternative. Examples: |
827 | | // To parse a single line: |
828 | | // #include "util/csv/parser.h" |
829 | | // vector<string> fields = util::csv::ParseLine(line).fields(); |
830 | | // |
831 | | // To parse an entire file: |
832 | | // #include "util/csv/parser.h" |
833 | | // for (Record rec : Parser(source)) { |
834 | | // vector<string> fields = rec.fields(); |
835 | | // } |
836 | | // |
837 | | // See //util/csv/parser.h for more complete documentation. |
838 | | // |
839 | | // ---------------------------------------------------------------------- |
840 | | void SplitCSVLine(char* line, vector<char*>* cols); |
841 | | void SplitCSVLineWithDelimiter(char* line, char delimiter, vector<char*>* cols); |
842 | | // SplitCSVLine string wrapper that internally makes a copy of string line. |
843 | | void SplitCSVLineWithDelimiterForStrings(const string& line, char delimiter, vector<string>* cols); |
844 | | |
845 | | // ---------------------------------------------------------------------- |
846 | | // SplitStructuredLine() |
847 | | // Splits a line using the given delimiter, and places the columns |
848 | | // into 'cols'. This is unlike 'SplitUsing(line, ",")' because you can |
849 | | // define pairs of opening and closing symbols inside which the delimiter should |
850 | | // be ignored. If the symbol_pair string has an odd number of characters, |
851 | | // the last character (which cannot be paired) will be assumed to be both an |
852 | | // opening and closing symbol. |
853 | | // WARNING : The input string 'line' is destroyed in the process. |
854 | | // The function returns 0 if the line was parsed correctly (i.e. all the |
855 | | // opened braces had their closing braces) otherwise, it returns the position |
856 | | // of the error. |
857 | | // Example: |
858 | | // SplitStructuredLine("item1,item2,{subitem1,subitem2},item4,[5,{6,7}]", |
859 | | // ',', |
860 | | // "{}[]", &output) |
861 | | // --> output = { "item1", "item2", "{subitem1,subitem2}", "item4", |
862 | | // "[5,{6,7}]" } |
863 | | // Example2: trying to split "item1,[item2,{4,5],5}" will fail and the |
864 | | // function will return the position of the problem : ] |
865 | | // |
866 | | // ---------------------------------------------------------------------- |
867 | | char* SplitStructuredLine(char* line, char delimiter, const char* symbol_pairs, |
868 | | vector<char*>* cols); |
869 | | |
870 | | // Similar to the function with the same name above, but splits a StringPiece |
871 | | // into StringPiece parts. Returns true if successful. |
872 | | bool SplitStructuredLine(StringPiece line, char delimiter, const char* symbol_pairs, |
873 | | vector<StringPiece>* cols); |
874 | | |
875 | | // ---------------------------------------------------------------------- |
876 | | // SplitStructuredLineWithEscapes() |
877 | | // Like SplitStructuredLine but also allows characters to be escaped. |
878 | | // |
879 | | // WARNING: the escape characters will be replicated in the output |
880 | | // columns rather than being consumed, i.e. if {} were the opening and |
881 | | // closing symbols, using \{ to quote a curly brace in the middle of |
882 | | // an option would pass this unchanged. |
883 | | // |
884 | | // Example: |
885 | | // SplitStructuredLineWithEscapes( |
886 | | // "\{item1\},it\\em2,{\{subitem1\},sub\\item2},item4\,item5,[5,{6,7}]", |
887 | | // ',', |
888 | | // "{}[]", |
889 | | // &output) |
890 | | // --> output = { "\{item1\}", "it\\em2", "{\{subitem1\},sub\\item2}", |
891 | | // "item4\,item5", "[5,{6,7}]" } |
892 | | // |
893 | | // ---------------------------------------------------------------------- |
894 | | char* SplitStructuredLineWithEscapes(char* line, char delimiter, const char* symbol_pairs, |
895 | | vector<char*>* cols); |
896 | | |
897 | | // Similar to the function with the same name above, but splits a StringPiece |
898 | | // into StringPiece parts. Returns true if successful. |
899 | | bool SplitStructuredLineWithEscapes(StringPiece line, char delimiter, const char* symbol_pairs, |
900 | | vector<StringPiece>* cols); |
901 | | |
902 | | // ---------------------------------------------------------------------- |
903 | | // DEPRECATED(jgm): See the "NEW API" comment about this function below for |
904 | | // example code showing an alternative. |
905 | | // |
906 | | // SplitStringIntoKeyValues() |
907 | | // Split a line into a key string and a vector of value strings. The line has |
908 | | // the following format: |
909 | | // |
910 | | // <key><kvsep>+<vvsep>*<value1><vvsep>+<value2><vvsep>+<value3>...<vvsep>* |
911 | | // |
912 | | // where key and value are strings; */+ means zero/one or more; <kvsep> is |
913 | | // a delimiter character to separate key and value; and <vvsep> is a delimiter |
914 | | // character to separate between values. The user can specify a bunch of |
915 | | // delimiter characters using a string. For example, if the user specifies |
916 | | // the separator string as "\t ", then either ' ' or '\t' or any combination |
917 | | // of them will be treated as a separator. For <vvsep>, the user can specify an |
918 | | // empty string to indicate there is only one value. |
919 | | // |
920 | | // Note: this function assumes the input string begins exactly with a |
921 | | // key. Therefore, if you use whitespaces to separate key and value, you |
922 | | // should not let whitespace precede the key in the input. Otherwise, you |
923 | | // will get an empty string as the key. |
924 | | // |
925 | | // A line with no <kvsep> will return an empty string as the key, even if |
926 | | // <key> is non-empty! |
927 | | // |
928 | | // The syntax makes it impossible for a value to be the empty string. |
929 | | // It is possible for the number of values to be zero. |
930 | | // |
931 | | // Returns false if the line has no <kvsep> or if the number of values is |
932 | | // zero. |
933 | | // |
934 | | // ==> NEW API: Consider using the new Split API defined above. <== |
935 | | // |
936 | | // The SplitStringIntoKeyValues() function has some subtle and surprising |
937 | | // semantics in various corner cases. To avoid this the strings::Split API is |
938 | | // recommended. The following example shows how to split a string of delimited |
939 | | // key-value pairs into a vector of pairs using the strings::Split API. |
940 | | // |
941 | | // using strings::Split; |
942 | | // using strings::delimiter::AnyOf; |
943 | | // using strings::delimiter::Limit; |
944 | | // |
945 | | // pair<string, StringPiece> key_values = |
946 | | // Split(line, Limit(AnyOf(kv_delim), 1)); |
947 | | // string key = key_values.first; |
948 | | // vector<string> values = Split(key_values.second, AnyOf(vv_delim)); |
949 | | // |
950 | | // ---------------------------------------------------------------------- |
951 | | bool SplitStringIntoKeyValues(const string& line, const string& key_value_delimiters, |
952 | | const string& value_value_delimiters, string* key, |
953 | | vector<string>* values); |
954 | | |
955 | | // ---------------------------------------------------------------------- |
956 | | // SplitStringIntoKeyValuePairs() |
957 | | // Split a line into a vector of <key, value> pairs. The line has |
958 | | // the following format: |
959 | | // |
960 | | // <kvpsep>*<key1><kvsep>+<value1><kvpsep>+<key2><kvsep>+<value2>...<kvpsep>* |
961 | | // |
962 | | // Where key and value are strings; */+ means zero/one or more. <kvsep> is |
963 | | // a delimiter character to separate key and value and <kvpsep> is a delimiter |
964 | | // character to separate key value pairs. The user can specify a bunch of |
965 | | // delimiter characters using a string. |
966 | | // |
967 | | // Note: this function assumes each key-value pair begins exactly with a |
968 | | // key. Therefore, if you use whitespaces to separate key and value, you |
969 | | // should not let whitespace precede the key in the pair. Otherwise, you |
970 | | // will get an empty string as the key. |
971 | | // |
972 | | // A pair with no <kvsep> will return empty strings as the key and value, |
973 | | // even if <key> is non-empty! |
974 | | // |
975 | | // Returns false for pairs with no <kvsep> specified and for pairs with |
976 | | // empty strings as values. |
977 | | // |
978 | | // ==> NEW API: Consider using the new Split API defined above. <== |
979 | | // |
980 | | // The SplitStringIntoKeyValuePairs() function has some subtle and surprising |
981 | | // semantics in various corner cases. To avoid this the strings::Split API is |
982 | | // recommended. The following example shows how to split a string of delimited |
983 | | // key-value pairs into a vector of pairs using the strings::Split API. |
984 | | // |
985 | | // using strings::SkipEmpty; |
986 | | // using strings::Split; |
987 | | // using strings::delimiter::AnyOf; |
988 | | // using strings::delimiter::Limit; |
989 | | // |
990 | | // vector<pair<string, string>> pairs; // or even map<string, string> |
991 | | // for (StringPiece sp : Split(line, AnyOf(pair_delim), SkipEmpty())) { |
992 | | // pairs.push_back(Split(sp, Limit(AnyOf(kv_delim), 1), SkipEmpty())); |
993 | | // } |
994 | | // |
995 | | // ---------------------------------------------------------------------- |
996 | | bool SplitStringIntoKeyValuePairs(const string& line, const string& key_value_delimiters, |
997 | | const string& key_value_pair_delimiters, |
998 | | vector<pair<string, string>>* kv_pairs); |
999 | | |
1000 | | // ---------------------------------------------------------------------- |
1001 | | // SplitLeadingDec32Values() |
1002 | | // SplitLeadingDec64Values() |
1003 | | // A simple parser for space-separated decimal int32/int64 values. |
1004 | | // Appends parsed integers to the end of the result vector, stopping |
1005 | | // at the first unparsable spot. Skips past leading and repeated |
1006 | | // whitespace (does not consume trailing whitespace), and returns |
1007 | | // a pointer beyond the last character parsed. |
1008 | | // -------------------------------------------------------------------- |
1009 | | const char* SplitLeadingDec32Values(const char* next, vector<int32>* result); |
1010 | | const char* SplitLeadingDec64Values(const char* next, vector<int64>* result); |
1011 | | |
1012 | | // ---------------------------------------------------------------------- |
1013 | | // SplitOneIntToken() |
1014 | | // SplitOneInt32Token() |
1015 | | // SplitOneUint32Token() |
1016 | | // SplitOneInt64Token() |
1017 | | // SplitOneUint64Token() |
1018 | | // SplitOneDoubleToken() |
1019 | | // SplitOneFloatToken() |
1020 | | // Parse a single "delim" delimited number from "*source" into "*value". |
1021 | | // Modify *source to point after the delimiter. |
1022 | | // If no delimiter is present after the number, set *source to NULL. |
1023 | | // |
1024 | | // If the start of *source is not a number, return false. |
1025 | | // If the int is followed by the null character, return true. |
1026 | | // If the int is not followed by a character from delim, return false. |
1027 | | // If *source is NULL, return false. |
1028 | | // |
1029 | | // They cannot handle decimal numbers with leading 0s, since they will be |
1030 | | // treated as octal. |
1031 | | // ---------------------------------------------------------------------- |
1032 | | bool SplitOneIntToken(const char** source, const char* delim, int* value); |
1033 | | bool SplitOneInt32Token(const char** source, const char* delim, int32* value); |
1034 | | bool SplitOneUint32Token(const char** source, const char* delim, uint32* value); |
1035 | | bool SplitOneInt64Token(const char** source, const char* delim, int64* value); |
1036 | | bool SplitOneUint64Token(const char** source, const char* delim, uint64* value); |
1037 | | bool SplitOneDoubleToken(const char** source, const char* delim, double* value); |
1038 | | bool SplitOneFloatToken(const char** source, const char* delim, float* value); |
1039 | | |
1040 | | // Aliases spelled "UInt" (capital I) so that the function names are standardized |
1041 | | // against the names of the reflection setters/getters in proto2. This makes it easier |
1042 | | // to use certain macros with reflection when creating custom text formats for protos. |
1043 | | // SplitOneUInt32Token(): forwards its arguments unchanged to SplitOneUint32Token(). |
1044 | 0 | inline bool SplitOneUInt32Token(const char** source, const char* delim, uint32* value) { |
1045 | 0 | return SplitOneUint32Token(source, delim, value); |
1046 | 0 | } |
1047 | | // SplitOneUInt64Token(): "UInt"-spelled alias that forwards to SplitOneUint64Token(). |
1048 | 0 | inline bool SplitOneUInt64Token(const char** source, const char* delim, uint64* value) { |
1049 | 0 | return SplitOneUint64Token(source, delim, value); |
1050 | 0 | } |
1051 | | |
1052 | | // ---------------------------------------------------------------------- |
1053 | | // SplitOneDecimalIntToken() |
1054 | | // SplitOneDecimalInt32Token() |
1055 | | // SplitOneDecimalUint32Token() |
1056 | | // SplitOneDecimalInt64Token() |
1057 | | // SplitOneDecimalUint64Token() |
1058 | | // Parse a single "delim"-delimited number from "*source" into "*value". |
1059 | | // Unlike SplitOneIntToken, etc., this function always interprets |
1060 | | // the numbers as decimal. |
1061 | | bool SplitOneDecimalIntToken(const char** source, const char* delim, int* value); |
1062 | | bool SplitOneDecimalInt32Token(const char** source, const char* delim, int32* value); |
1063 | | bool SplitOneDecimalUint32Token(const char** source, const char* delim, uint32* value); |
1064 | | bool SplitOneDecimalInt64Token(const char** source, const char* delim, int64* value); |
1065 | | bool SplitOneDecimalUint64Token(const char** source, const char* delim, uint64* value); |
1066 | | |
1067 | | // ---------------------------------------------------------------------- |
1068 | | // SplitOneHexUint32Token() |
1069 | | // SplitOneHexUint64Token() |
1070 | | // Once more, for hexadecimal numbers (unsigned only). |
1071 | | bool SplitOneHexUint32Token(const char** source, const char* delim, uint32* value); |
1072 | | bool SplitOneHexUint64Token(const char** source, const char* delim, uint64* value); |
1073 | | |
1074 | | // ###################### TEMPLATE INSTANTIATIONS BELOW ####################### |
1075 | | |
1076 | | // SplitStringAndParse() -- vector<T> form of the API; see description above. |
1077 | | template <class T> |
1078 | | bool SplitStringAndParse(StringPiece source, StringPiece delim, |
1079 | 0 | bool (*parse)(const string& str, T* value), vector<T>* result) { |
1080 | 0 | return SplitStringAndParseToList(source, delim, parse, result); // same work as the list form |
1081 | 0 | } |
1082 | | |
1083 | | namespace strings { |
1084 | | namespace internal { |
1085 | | // Shared worker: splits source, parses every piece, stores successes through insert_policy. |
1086 | | template <class Container, class InsertPolicy> |
1087 | | bool SplitStringAndParseToInserter(StringPiece source, StringPiece delim, |
1088 | | bool (*parse)(const string& str, |
1089 | | typename Container::value_type* value), |
1090 | 0 | Container* result, InsertPolicy insert_policy) { |
1091 | 0 | CHECK(NULL != parse); // precondition: a parse callback is required |
1092 | 0 | CHECK(NULL != result); // precondition: an output container is required |
1093 | 0 | CHECK(NULL != delim.data()); // precondition: delim must refer to real data |
1094 | 0 | CHECK_GT(delim.size(), 0); // precondition: at least one delimiter byte |
1095 | 0 | bool retval = true; // flips to false as soon as any piece fails to parse |
1096 | 0 | vector<StringPiece> pieces = |
1097 | 0 | strings::Split(source, strings::delimiter::AnyOf(delim), strings::SkipEmpty()); // any delim byte separates; empty pieces are dropped |
1098 | 0 | for (const auto& piece : pieces) { |
1099 | 0 | typename Container::value_type t; // scratch value that parse() fills in |
1100 | 0 | if (parse(piece.as_string(), &t)) { // parse() takes a string, hence as_string() |
1101 | 0 | insert_policy(result, t); // only successfully parsed values are stored |
1102 | 0 | } else { |
1103 | 0 | retval = false; // record the failure but keep processing the remaining pieces |
1104 | 0 | } |
1105 | 0 | } |
1106 | 0 | return retval; // true iff every piece parsed |
1107 | 0 | } |
1108 | | |
1109 | | // Insert policy built on single-argument insert(). An output iterator (e.g. |
1110 | | // std::inserter, std::back_inserter) cannot be used here because some callers use |
1111 | | // non-standard containers that have no iterators, only insert() or push_back(). |
1112 | | struct BasicInsertPolicy { |
1113 | | template <class C, class V> |
1114 | | void operator()(C* c, const V& v) const { |
1115 | | c->insert(v); |
1116 | | } |
1117 | | }; |
1118 | | // Insert policy built on push_back(); the counterpart of BasicInsertPolicy. |
1119 | | struct BackInsertPolicy { |
1120 | | template <class C, class V> |
1121 | 0 | void operator()(C* c, const V& v) const { |
1122 | 0 | c->push_back(v); |
1123 | 0 | } |
1124 | | }; |
1125 | | |
1126 | | } // namespace internal |
1127 | | } // namespace strings |
1128 | | |
1129 | | // SplitStringAndParseToContainer() -- see description above. Stores via result->insert(). |
1130 | | template <class Container> |
1131 | | bool SplitStringAndParseToContainer(StringPiece source, StringPiece delim, |
1132 | | bool (*parse)(const string& str, |
1133 | | typename Container::value_type* value), |
1134 | | Container* result) { |
1135 | | return strings::internal::SplitStringAndParseToInserter(source, delim, parse, result, |
1136 | | strings::internal::BasicInsertPolicy()); |
1137 | | } |
1138 | | |
1139 | | // SplitStringAndParseToList() -- see description above. Stores via result->push_back(). |
1140 | | template <class List> |
1141 | | bool SplitStringAndParseToList(StringPiece source, StringPiece delim, |
1142 | | bool (*parse)(const string& str, typename List::value_type* value), |
1143 | 0 | List* result) { |
1144 | 0 | return strings::internal::SplitStringAndParseToInserter(source, delim, parse, result, |
1145 | 0 | strings::internal::BackInsertPolicy()); |
1146 | 0 | } |
1147 | | |
1148 | | // END DOXYGEN SplitFunctions grouping |
1149 | | /* @} */ |