/root/doris/be/src/gutil/strings/split.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2008 and onwards Google Inc. All rights reserved. |
2 | | // |
3 | | // Maintainer: Greg Miller <jgm@google.com> |
4 | | |
5 | | #include "gutil/strings/split.h" |
6 | | |
7 | | #include <assert.h> |
8 | | #include <stdlib.h> |
9 | | #include <string.h> |
10 | | #include <iterator> |
11 | | #include <ostream> |
12 | | |
13 | | using std::back_insert_iterator; |
14 | | using std::iterator_traits; |
15 | | #include <limits> |
16 | | |
17 | | using std::numeric_limits; |
18 | | |
19 | | using std::unordered_map; |
20 | | using std::unordered_set; |
21 | | |
22 | | #include "common/logging.h" |
23 | | |
24 | | #include "gutil/integral_types.h" |
25 | | // IWYU pragma: no_include <butil/macros.h> |
26 | | #include "gutil/macros.h" // IWYU pragma: keep |
27 | | #include "gutil/strings/ascii_ctype.h" |
28 | | #include "gutil/strings/util.h" |
29 | | #include "gutil/strtoint.h" |
30 | | |
31 | | // Implementations for some of the Split2 API. Much of the Split2 API is |
32 | | // templated so it exists in header files, either strings/split.h or |
33 | | // strings/split_internal.h. |
34 | | namespace strings { |
35 | | namespace delimiter { |
36 | | |
37 | | namespace { |
38 | | |
39 | | // This GenericFind() template function encapsulates the finding algorithm |
40 | | // shared between the Literal and AnyOf delimiters. The FindPolicy template |
41 | | // parameter allows each delimiter to customize the actual find function to use |
42 | | // and the length of the found delimiter. For example, the Literal delimiter |
43 | | // will ultimately use StringPiece::find(), and the AnyOf delimiter will use |
44 | | // StringPiece::find_first_of(). |
45 | | template <typename FindPolicy> |
46 | 2.94k | StringPiece GenericFind(StringPiece text, StringPiece delimiter, FindPolicy find_policy) { |
47 | 2.94k | if (delimiter.empty() && text.length() > 0) { |
48 | | // Special case for empty string delimiters: always return a zero-length |
49 | | // StringPiece referring to the item at position 1. |
50 | 0 | return StringPiece(text.begin() + 1, 0); |
51 | 0 | } |
52 | 2.94k | int found_pos = StringPiece::npos; |
53 | 2.94k | StringPiece found(text.end(), 0); // By default, not found |
54 | 2.94k | found_pos = find_policy.Find(text, delimiter); |
55 | 2.94k | if (found_pos != StringPiece::npos) { |
56 | 2.29k | found.set(text.data() + found_pos, find_policy.Length(delimiter)); |
57 | 2.29k | } |
58 | 2.94k | return found; |
59 | 2.94k | } split.cc:_ZN7strings9delimiter12_GLOBAL__N_111GenericFindINS1_13LiteralPolicyEEE11StringPieceS4_S4_T_ Line | Count | Source | 46 | 2.94k | StringPiece GenericFind(StringPiece text, StringPiece delimiter, FindPolicy find_policy) { | 47 | 2.94k | if (delimiter.empty() && text.length() > 0) { | 48 | | // Special case for empty string delimiters: always return a zero-length | 49 | | // StringPiece referring to the item at position 1. | 50 | 0 | return StringPiece(text.begin() + 1, 0); | 51 | 0 | } | 52 | 2.94k | int found_pos = StringPiece::npos; | 53 | 2.94k | StringPiece found(text.end(), 0); // By default, not found | 54 | 2.94k | found_pos = find_policy.Find(text, delimiter); | 55 | 2.94k | if (found_pos != StringPiece::npos) { | 56 | 2.29k | found.set(text.data() + found_pos, find_policy.Length(delimiter)); | 57 | 2.29k | } | 58 | 2.94k | return found; | 59 | 2.94k | } |
Unexecuted instantiation: split.cc:_ZN7strings9delimiter12_GLOBAL__N_111GenericFindINS1_11AnyOfPolicyEEE11StringPieceS4_S4_T_ |
60 | | |
61 | | // Finds using StringPiece::find(), therefore the length of the found delimiter |
62 | | // is delimiter.length(). |
63 | | struct LiteralPolicy { |
64 | 2.94k | int Find(StringPiece text, StringPiece delimiter) { return text.find(delimiter); } |
65 | 2.29k | int Length(StringPiece delimiter) { return delimiter.length(); } |
66 | | }; |
67 | | |
68 | | // Finds using StringPiece::find_first_of(), therefore the length of the found |
69 | | // delimiter is 1. |
70 | | struct AnyOfPolicy { |
71 | 0 | size_t Find(StringPiece text, StringPiece delimiter) { return text.find_first_of(delimiter); } |
72 | 0 | int Length(StringPiece delimiter) { return 1; } |
73 | | }; |
74 | | |
75 | | } // namespace |
76 | | |
77 | | // |
78 | | // Literal |
79 | | // |
80 | | |
81 | 667 | Literal::Literal(StringPiece sp) : delimiter_(sp.ToString()) {} |
82 | | |
83 | 2.94k | StringPiece Literal::Find(StringPiece text) const { |
84 | 2.94k | return GenericFind(text, delimiter_, LiteralPolicy()); |
85 | 2.94k | } |
86 | | |
87 | | // |
88 | | // AnyOf |
89 | | // |
90 | | |
91 | 0 | AnyOf::AnyOf(StringPiece sp) : delimiters_(sp.ToString()) {} |
92 | | |
93 | 0 | StringPiece AnyOf::Find(StringPiece text) const { |
94 | 0 | return GenericFind(text, delimiters_, AnyOfPolicy()); |
95 | 0 | } |
96 | | |
97 | | } // namespace delimiter |
98 | | } // namespace strings |
99 | | |
100 | | // |
101 | | // ==================== LEGACY SPLIT FUNCTIONS ==================== |
102 | | // |
103 | | |
104 | | using ::strings::SkipEmpty; |
105 | | using ::strings::delimiter::AnyOf; |
106 | | using ::strings::delimiter::Limit; |
107 | | |
108 | | namespace { |
109 | | |
110 | | // Appends the results of a split to the specified container. This function has |
111 | | // the following overloads: |
112 | | // - vector<string> - for better performance |
113 | | // - map<string, string> - to change append semantics |
114 | | // - unordered_map<string, string> - to change append semantics |
template <typename Container, typename Splitter>
void AppendToImpl(Container* container, Splitter splitter) {
    // Materialize the pieces in a temporary container of the same type; the
    // initialization goes through the splitter's implicit conversion operator.
    Container pieces = splitter;
    // Feed every piece into the destination, using end() as the insertion hint.
    auto sink = std::inserter(*container, container->end());
    for (auto it = pieces.begin(); it != pieces.end(); ++it) {
        *sink = *it;
        ++sink;
    }
}
120 | | |
121 | | // Overload of AppendToImpl() that is optimized for appending to vector<string>. |
122 | | // This version eliminates a couple string copies by using a vector<StringPiece> |
123 | | // as the intermediate container. |
124 | | template <typename Splitter> |
125 | 0 | void AppendToImpl(vector<string>* container, Splitter splitter) { |
126 | 0 | vector<StringPiece> vsp = splitter; // Calls implicit conversion operator. |
127 | 0 | size_t container_size = container->size(); |
128 | 0 | container->resize(container_size + vsp.size()); |
129 | 0 | for (const auto& sp : vsp) { |
130 | 0 | sp.CopyToString(&(*container)[container_size++]); |
131 | 0 | } |
132 | 0 | } Unexecuted instantiation: split.cc:_ZN12_GLOBAL__N_112AppendToImplIN7strings8internal8SplitterINS1_9delimiter5AnyOfENS2_8NoFilterEEEEEvPSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISE_EET_ Unexecuted instantiation: split.cc:_ZN12_GLOBAL__N_112AppendToImplIN7strings8internal8SplitterINS1_9delimiter9LimitImplINS4_5AnyOfEEENS2_8NoFilterEEEEEvPSt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaISG_EET_ |
133 | | |
134 | | // Here we define two AppendToImpl() overloads for map<> and unordered_map<>. Both of |
135 | | // these overloads call through to this AppendToMap() function. This is needed |
136 | | // because inserting a duplicate key into a map does NOT overwrite the previous |
137 | | // value, which was not the behavior of the split1 Split*() functions. Consider |
138 | | // this example: |
139 | | // |
140 | | // map<string, string> m; |
141 | | // m.insert(std::make_pair("a", "1")); |
142 | | // m.insert(std::make_pair("a", "2")); // <-- doesn't actually insert. |
143 | | // ASSERT_EQ(m["a"], "1"); // <-- "a" has value "1" not "2". |
144 | | // |
145 | | // Due to this behavior of map::insert, we can't rely on a normal std::inserter |
146 | | // for maps. Instead, maps and unordered_maps need to be special cased to implement |
147 | | // the desired append semantic of inserting an existing value overwrites the |
148 | | // previous value. |
149 | | // |
150 | | // This same issue is true with sets as well. However, since sets don't have a |
151 | | // separate key and value, failing to overwrite an existing value in a set is |
152 | | // fine because the value already exists in the set. |
153 | | // |
template <typename Map, typename Splitter>
void AppendToMap(Map* m, Splitter splitter) {
    Map parsed = splitter; // Calls implicit conversion operator.
    // operator[] + assignment (rather than insert) so that a key already
    // present in *m has its value replaced.
    for (const auto& entry : parsed) {
        (*m)[entry.first] = entry.second;
    }
}
161 | | |
162 | | template <typename Splitter> |
163 | 0 | void AppendToImpl(map<string, string>* map_container, Splitter splitter) { |
164 | 0 | AppendToMap(map_container, splitter); |
165 | 0 | } |
166 | | |
167 | | template <typename Splitter> |
168 | 0 | void AppendToImpl(unordered_map<string, string>* map_container, Splitter splitter) { |
169 | 0 | AppendToMap(map_container, splitter); |
170 | 0 | } |
171 | | |
172 | | // Appends the results of a call to strings::Split() to the specified container. |
173 | | // This function is used with the new strings::Split() API to implement the |
174 | | // append semantics of the legacy Split*() functions. |
175 | | // |
176 | | // The "Splitter" template parameter is intended to be a |
177 | | // ::strings::internal::Splitter<>, which is the return value of a call to |
178 | | // strings::Split(). Sample usage: |
179 | | // |
180 | | // vector<string> v; |
181 | | // ... add stuff to "v" ... |
182 | | // AppendTo(&v, strings::Split("a,b,c", ",")); |
183 | | // |
template <typename Container, typename Splitter>
void AppendTo(Container* container, Splitter splitter) {
    if (!container->empty()) {
        // Existing elements must be kept, so take the explicit append path.
        AppendToImpl(container, splitter);
        return;
    }
    // Empty destination (by far the common case): assigning directly is more
    // efficient than appending element by element.
    *container = splitter; // Calls implicit conversion operator.
}
195 | | |
196 | | } // anonymous namespace |
197 | | |
// Tunables for ClipString().
// How far back from max_len ClipStringHelper() searches for whitespace to cut at.
static constexpr int kMaxOverCut = 12;
// Ellipsis appended to strings that were clipped.
static constexpr char kCutStr[] = "...";
// Length of the ellipsis, not counting the terminating NUL.
static constexpr int kCutStrSize = sizeof(kCutStr) - 1;
203 | | |
204 | | // ---------------------------------------------------------------------- |
205 | | // Return the place to clip the string at, or -1 |
206 | | // if the string doesn't need to be clipped. |
207 | | // ---------------------------------------------------------------------- |
208 | 0 | static int ClipStringHelper(const char* str, int max_len, bool use_ellipsis) { |
209 | 0 | if (strlen(str) <= max_len) return -1; |
210 | | |
211 | 0 | int max_substr_len = max_len; |
212 | |
|
213 | 0 | if (use_ellipsis && max_len > kCutStrSize) { |
214 | 0 | max_substr_len -= kCutStrSize; |
215 | 0 | } |
216 | |
|
217 | 0 | const char* cut_by = (max_substr_len < kMaxOverCut ? str : str + max_len - kMaxOverCut); |
218 | 0 | const char* cut_at = str + max_substr_len; |
219 | 0 | while (!ascii_isspace(*cut_at) && cut_at > cut_by) cut_at--; |
220 | |
|
221 | 0 | if (cut_at == cut_by) { |
222 | | // No space was found |
223 | 0 | return max_substr_len; |
224 | 0 | } else { |
225 | 0 | return cut_at - str; |
226 | 0 | } |
227 | 0 | } |
228 | | |
229 | | // ---------------------------------------------------------------------- |
230 | | // ClipString |
231 | | // Clip a string to a max length. We try to clip on a word boundary |
232 | | // if this is possible. If the string is clipped, we append an |
233 | | // ellipsis. |
234 | | // ---------------------------------------------------------------------- |
235 | | |
236 | 0 | void ClipString(char* str, int max_len) { |
237 | 0 | int cut_at = ClipStringHelper(str, max_len, true); |
238 | 0 | if (cut_at != -1) { |
239 | 0 | if (max_len > kCutStrSize) { |
240 | 0 | strcpy(str + cut_at, kCutStr); |
241 | 0 | } else { |
242 | 0 | strcpy(str + cut_at, ""); |
243 | 0 | } |
244 | 0 | } |
245 | 0 | } |
246 | | |
247 | | // ---------------------------------------------------------------------- |
248 | | // ClipString |
249 | | // Version of ClipString() that uses string instead of char*. |
250 | | // ---------------------------------------------------------------------- |
251 | 0 | void ClipString(string* full_str, int max_len) { |
252 | 0 | int cut_at = ClipStringHelper(full_str->c_str(), max_len, true); |
253 | 0 | if (cut_at != -1) { |
254 | 0 | full_str->erase(cut_at); |
255 | 0 | if (max_len > kCutStrSize) { |
256 | 0 | full_str->append(kCutStr); |
257 | 0 | } |
258 | 0 | } |
259 | 0 | } |
260 | | |
261 | | void SplitStringIntoNPiecesAllowEmpty(const string& full, const char* delim, int pieces, |
262 | 0 | vector<string>* result) { |
263 | 0 | if (pieces == 0) { |
264 | | // No limit when pieces is 0. |
265 | 0 | AppendTo(result, strings::Split(full, AnyOf(delim))); |
266 | 0 | } else { |
267 | | // The input argument "pieces" specifies the max size that *result should |
268 | | // be. However, the argument to the Limit() delimiter is the max number of |
269 | | // delimiters, which should be one less than "pieces". Example: "a,b,c" has |
270 | | // 3 pieces and two comma delimiters. |
271 | 0 | int limit = std::max(pieces - 1, 0); |
272 | 0 | AppendTo(result, strings::Split(full, Limit(AnyOf(delim), limit))); |
273 | 0 | } |
274 | 0 | } |
275 | | |
276 | | // ---------------------------------------------------------------------- |
277 | | // SplitStringAllowEmpty |
278 | | // Split a string using a character delimiter. Append the components |
279 | | // to 'result'. If there are consecutive delimiters, this function |
280 | | // will return corresponding empty strings. |
281 | | // ---------------------------------------------------------------------- |
282 | 0 | void SplitStringAllowEmpty(const string& full, const char* delim, vector<string>* result) { |
283 | 0 | AppendTo(result, strings::Split(full, AnyOf(delim))); |
284 | 0 | } |
285 | | |
// Counts the non-empty tokens that splitting `full` on a single-character
// delimiter yields, so the caller can size a vector<string> once instead of
// growing it repeatedly (the original authors report 33-66 % less memory for
// the result and faster microbenchmarks).
//
// Counting is implemented only for a single-character `delim`; for an empty
// or multi-character delimiter the function returns 0.
static int CalculateReserveForVector(const string& full, const char* delim) {
    const bool single_char_delim = delim[0] != '\0' && delim[1] == '\0';
    if (!single_char_delim) return 0;

    const char sep = delim[0];
    int tokens = 0;
    bool inside_token = false;
    for (const char ch : full) {
        const bool is_sep = (ch == sep);
        // A token starts at the first non-delimiter after a delimiter (or at
        // the very beginning of the input).
        if (!is_sep && !inside_token) ++tokens;
        inside_token = !is_sep;
    }
    return tokens;
}
317 | | |
318 | | // ---------------------------------------------------------------------- |
319 | | // SplitStringUsing() |
320 | | // SplitStringToHashsetUsing() |
321 | | // SplitStringToSetUsing() |
322 | | // SplitStringToMapUsing() |
323 | | // SplitStringToHashmapUsing() |
324 | | // Split a string using a character delimiter. Append the components |
325 | | // to 'result'. |
326 | | // |
327 | | // Note: For multi-character delimiters, this routine will split on *ANY* of |
328 | | // the characters in the string, not the entire string as a single delimiter. |
329 | | // ---------------------------------------------------------------------- |
template <typename StringType, typename ITR>
void SplitStringToIteratorUsing(const StringType& full, const char* delim, ITR& result) {
    // Fast path: the delimiter is exactly one character, so scan raw bytes.
    const bool single_char_delim = delim[0] != '\0' && delim[1] == '\0';
    if (single_char_delim) {
        const char sep = delim[0];
        const char* cursor = full.data();
        const char* const end = cursor + full.size();
        while (cursor != end) {
            if (*cursor == sep) {
                ++cursor; // Skip delimiters; empty tokens are never emitted.
                continue;
            }
            const char* const token_begin = cursor;
            // Advance to the next occurrence of the delimiter (or the end).
            do {
                ++cursor;
            } while (cursor != end && *cursor != sep);
            *result++ = StringType(token_begin, cursor - token_begin);
        }
        return;
    }

    // General path: every character of `delim` acts as a separator.
    for (string::size_type token_begin = full.find_first_not_of(delim);
         token_begin != string::npos;) {
        const string::size_type token_end = full.find_first_of(delim, token_begin);
        if (token_end == string::npos) {
            // Last token runs to the end of the input.
            *result++ = full.substr(token_begin);
            return;
        }
        *result++ = full.substr(token_begin, token_end - token_begin);
        token_begin = full.find_first_not_of(delim, token_end);
    }
}
363 | | |
364 | 0 | void SplitStringUsing(const string& full, const char* delim, vector<string>* result) { |
365 | 0 | result->reserve(result->size() + CalculateReserveForVector(full, delim)); |
366 | 0 | std::back_insert_iterator<vector<string>> it(*result); |
367 | 0 | SplitStringToIteratorUsing(full, delim, it); |
368 | 0 | } |
369 | | |
370 | | void SplitStringToHashsetUsing(const string& full, const char* delim, |
371 | 0 | unordered_set<string>* result) { |
372 | 0 | AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); |
373 | 0 | } |
374 | | |
375 | 0 | void SplitStringToSetUsing(const string& full, const char* delim, set<string>* result) { |
376 | 0 | AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); |
377 | 0 | } |
378 | | |
379 | 0 | void SplitStringToMapUsing(const string& full, const char* delim, map<string, string>* result) { |
380 | 0 | AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); |
381 | 0 | } |
382 | | |
383 | | void SplitStringToHashmapUsing(const string& full, const char* delim, |
384 | 0 | unordered_map<string, string>* result) { |
385 | 0 | AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); |
386 | 0 | } |
387 | | |
388 | | // ---------------------------------------------------------------------- |
389 | | // SplitStringPieceToVector() |
390 | | // Split a StringPiece into sub-StringPieces based on delim |
391 | | // and appends the pieces to 'vec'. |
392 | | // If omit empty strings is true, empty strings are omitted |
393 | | // from the resulting vector. |
394 | | // ---------------------------------------------------------------------- |
395 | | void SplitStringPieceToVector(const StringPiece& full, const char* delim, vector<StringPiece>* vec, |
396 | 0 | bool omit_empty_strings) { |
397 | 0 | if (omit_empty_strings) { |
398 | 0 | AppendTo(vec, strings::Split(full, AnyOf(delim), SkipEmpty())); |
399 | 0 | } else { |
400 | 0 | AppendTo(vec, strings::Split(full, AnyOf(delim))); |
401 | 0 | } |
402 | 0 | } |
403 | | |
404 | | // ---------------------------------------------------------------------- |
405 | | // SplitUsing() |
406 | | // Split a string using a string of delimiters, returning vector |
407 | | // of strings. The original string is modified to insert nulls. |
408 | | // ---------------------------------------------------------------------- |
409 | | |
410 | 0 | vector<char*>* SplitUsing(char* full, const char* delim) { |
411 | 0 | auto vec = new vector<char*>; |
412 | 0 | SplitToVector(full, delim, vec, true); // Omit empty strings |
413 | 0 | return vec; |
414 | 0 | } |
415 | | |
416 | 0 | void SplitToVector(char* full, const char* delim, vector<char*>* vec, bool omit_empty_strings) { |
417 | 0 | char* next = full; |
418 | 0 | while ((next = gstrsep(&full, delim)) != nullptr) { |
419 | 0 | if (omit_empty_strings && next[0] == '\0') continue; |
420 | 0 | vec->push_back(next); |
421 | 0 | } |
422 | | // Add last element (or full string if no delimiter found): |
423 | 0 | if (full != nullptr) { |
424 | 0 | vec->push_back(full); |
425 | 0 | } |
426 | 0 | } |
427 | | |
428 | | void SplitToVector(char* full, const char* delim, vector<const char*>* vec, |
429 | 0 | bool omit_empty_strings) { |
430 | 0 | char* next = full; |
431 | 0 | while ((next = gstrsep(&full, delim)) != nullptr) { |
432 | 0 | if (omit_empty_strings && next[0] == '\0') continue; |
433 | 0 | vec->push_back(next); |
434 | 0 | } |
435 | | // Add last element (or full string if no delimiter found): |
436 | 0 | if (full != nullptr) { |
437 | 0 | vec->push_back(full); |
438 | 0 | } |
439 | 0 | } |
440 | | |
441 | | // ---------------------------------------------------------------------- |
442 | | // SplitOneStringToken() |
443 | | // Mainly a stringified wrapper around strpbrk() |
444 | | // ---------------------------------------------------------------------- |
// Returns the text of *source up to (not including) the first character found
// in `delim`, and advances *source to just past that delimiter. When no
// delimiter is left, the rest of the string is returned and *source becomes
// null. A null *source yields an empty string.
string SplitOneStringToken(const char** source, const char* delim) {
    assert(source);
    assert(delim);
    const char* const begin = *source;
    if (begin == nullptr) return string();

    // strchr() handles the common single-character delimiter; strpbrk()
    // handles everything else.
    const bool single_char_delim = delim[0] != '\0' && delim[1] == '\0';
    const char* const hit = single_char_delim ? strchr(begin, delim[0]) : strpbrk(begin, delim);

    if (hit == nullptr) {
        *source = nullptr;
        return string(begin);
    }
    *source = hit + 1;
    return string(begin, hit);
}
464 | | |
465 | | // ---------------------------------------------------------------------- |
466 | | // SplitStringWithEscaping() |
467 | | // SplitStringWithEscapingAllowEmpty() |
468 | | // SplitStringWithEscapingToSet() |
469 | | // SplitStringWithEscapingToHashset() |
470 | | // Split the string using the specified delimiters, taking escaping into |
471 | | // account. '\' is not allowed as a delimiter. |
472 | | // ---------------------------------------------------------------------- |
473 | | template <typename ITR> |
474 | | void SplitStringWithEscapingToIterator(const string& src, const strings::CharSet& delimiters, |
475 | 0 | const bool allow_empty, ITR* result) { |
476 | 0 | CHECK(!delimiters.Test('\\')) << "\\ is not allowed as a delimiter."; |
477 | 0 | CHECK(result); |
478 | 0 | string part; |
479 | |
|
480 | 0 | for (uint32 i = 0; i < src.size(); ++i) { |
481 | 0 | char current_char = src[i]; |
482 | 0 | if (delimiters.Test(current_char)) { |
483 | | // Push substrings when we encounter delimiters. |
484 | 0 | if (allow_empty || !part.empty()) { |
485 | 0 | *(*result)++ = part; |
486 | 0 | part.clear(); |
487 | 0 | } |
488 | 0 | } else if (current_char == '\\' && ++i < src.size()) { |
489 | | // If we see a backslash, the next delimiter or backslash is literal. |
490 | 0 | current_char = src[i]; |
491 | 0 | if (current_char != '\\' && !delimiters.Test(current_char)) { |
492 | | // Don't honour unknown escape sequences: emit \f for \f. |
493 | 0 | part.push_back('\\'); |
494 | 0 | } |
495 | 0 | part.push_back(current_char); |
496 | 0 | } else { |
497 | | // Otherwise, we have a normal character or trailing backslash. |
498 | 0 | part.push_back(current_char); |
499 | 0 | } |
500 | 0 | } |
501 | | |
502 | | // Push the trailing part. |
503 | 0 | if (allow_empty || !part.empty()) { |
504 | 0 | *(*result)++ = part; |
505 | 0 | } |
506 | 0 | } Unexecuted instantiation: _Z33SplitStringWithEscapingToIteratorISt20back_insert_iteratorISt6vectorINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESaIS7_EEEEvRKS7_RKN7strings7CharSetEbPT_ Unexecuted instantiation: _Z33SplitStringWithEscapingToIteratorISt15insert_iteratorISt3setINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESt4lessIS7_ESaIS7_EEEEvRKS7_RKN7strings7CharSetEbPT_ Unexecuted instantiation: _Z33SplitStringWithEscapingToIteratorISt15insert_iteratorISt13unordered_setINSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEESt4hashIS7_ESt8equal_toIS7_ESaIS7_EEEEvRKS7_RKN7strings7CharSetEbPT_ |
507 | | |
508 | | void SplitStringWithEscaping(const string& full, const strings::CharSet& delimiters, |
509 | 0 | vector<string>* result) { |
510 | 0 | std::back_insert_iterator<vector<string>> it(*result); |
511 | 0 | SplitStringWithEscapingToIterator(full, delimiters, false, &it); |
512 | 0 | } |
513 | | |
514 | | void SplitStringWithEscapingAllowEmpty(const string& full, const strings::CharSet& delimiters, |
515 | 0 | vector<string>* result) { |
516 | 0 | std::back_insert_iterator<vector<string>> it(*result); |
517 | 0 | SplitStringWithEscapingToIterator(full, delimiters, true, &it); |
518 | 0 | } |
519 | | |
520 | | void SplitStringWithEscapingToSet(const string& full, const strings::CharSet& delimiters, |
521 | 0 | set<string>* result) { |
522 | 0 | std::insert_iterator<set<string>> it(*result, result->end()); |
523 | 0 | SplitStringWithEscapingToIterator(full, delimiters, false, &it); |
524 | 0 | } |
525 | | |
526 | | void SplitStringWithEscapingToHashset(const string& full, const strings::CharSet& delimiters, |
527 | 0 | unordered_set<string>* result) { |
528 | 0 | std::insert_iterator<unordered_set<string>> it(*result, result->end()); |
529 | 0 | SplitStringWithEscapingToIterator(full, delimiters, false, &it); |
530 | 0 | } |
531 | | |
532 | | // ---------------------------------------------------------------------- |
533 | | // SplitOneIntToken() |
534 | | // SplitOneInt32Token() |
535 | | // SplitOneUint32Token() |
536 | | // SplitOneInt64Token() |
537 | | // SplitOneUint64Token() |
538 | | // SplitOneDoubleToken() |
539 | | // SplitOneFloatToken() |
540 | | // SplitOneDecimalIntToken() |
541 | | // SplitOneDecimalInt32Token() |
542 | | // SplitOneDecimalUint32Token() |
543 | | // SplitOneDecimalInt64Token() |
544 | | // SplitOneDecimalUint64Token() |
545 | | // SplitOneHexUint32Token() |
546 | | // SplitOneHexUint64Token() |
547 | | // Mainly a stringified wrapper around strtol/strtoul/strtod |
548 | | // ---------------------------------------------------------------------- |
549 | | // Curried functions for the macro below |
550 | 0 | static inline long strto32_0(const char* source, char** end) { |
551 | 0 | return strto32(source, end, 0); |
552 | 0 | } |
553 | 0 | static inline unsigned long strtou32_0(const char* source, char** end) { |
554 | 0 | return strtou32(source, end, 0); |
555 | 0 | } |
556 | 0 | static inline int64 strto64_0(const char* source, char** end) { |
557 | 0 | return strto64(source, end, 0); |
558 | 0 | } |
559 | 0 | static inline uint64 strtou64_0(const char* source, char** end) { |
560 | 0 | return strtou64(source, end, 0); |
561 | 0 | } |
562 | 0 | static inline long strto32_10(const char* source, char** end) { |
563 | 0 | return strto32(source, end, 10); |
564 | 0 | } |
565 | 0 | static inline unsigned long strtou32_10(const char* source, char** end) { |
566 | 0 | return strtou32(source, end, 10); |
567 | 0 | } |
568 | 0 | static inline int64 strto64_10(const char* source, char** end) { |
569 | 0 | return strto64(source, end, 10); |
570 | 0 | } |
571 | 0 | static inline uint64 strtou64_10(const char* source, char** end) { |
572 | 0 | return strtou64(source, end, 10); |
573 | 0 | } |
574 | 0 | static inline uint32 strtou32_16(const char* source, char** end) { |
575 | 0 | return strtou32(source, end, 16); |
576 | 0 | } |
577 | 0 | static inline uint64 strtou64_16(const char* source, char** end) { |
578 | 0 | return strtou64(source, end, 16); |
579 | 0 | } |
580 | | |
581 | | #define DEFINE_SPLIT_ONE_NUMBER_TOKEN(name, type, function) \ |
582 | 0 | bool SplitOne##name##Token(const char** source, const char* delim, type* value) { \ |
583 | 0 | assert(source); \ |
584 | 0 | assert(delim); \ |
585 | 0 | assert(value); \ |
586 | 0 | if (!*source) return false; \ |
587 | 0 | /* Parse int */ \ |
588 | 0 | char* end; \ |
589 | 0 | *value = function(*source, &end); \ |
590 | 0 | if (end == *source) return false; /* number not present at start of string */ \ |
591 | 0 | if (end[0] && !strchr(delim, end[0])) return false; /* Garbage characters after int */ \ |
592 | 0 | /* Advance past token */ \ |
593 | 0 | if (*end != '\0') \ |
594 | 0 | *source = const_cast<const char*>(end + 1); \ |
595 | 0 | else \ |
596 | 0 | *source = NULL; \ |
597 | 0 | return true; \ |
598 | 0 | } Unexecuted instantiation: _Z16SplitOneIntTokenPPKcS0_Pi Unexecuted instantiation: _Z18SplitOneInt32TokenPPKcS0_Pi Unexecuted instantiation: _Z19SplitOneUint32TokenPPKcS0_Pj Unexecuted instantiation: _Z18SplitOneInt64TokenPPKcS0_Pl Unexecuted instantiation: _Z19SplitOneUint64TokenPPKcS0_Pm Unexecuted instantiation: _Z19SplitOneDoubleTokenPPKcS0_Pd Unexecuted instantiation: _Z18SplitOneFloatTokenPPKcS0_Pf Unexecuted instantiation: _Z23SplitOneDecimalIntTokenPPKcS0_Pi Unexecuted instantiation: _Z25SplitOneDecimalInt32TokenPPKcS0_Pi Unexecuted instantiation: _Z26SplitOneDecimalUint32TokenPPKcS0_Pj Unexecuted instantiation: _Z25SplitOneDecimalInt64TokenPPKcS0_Pl Unexecuted instantiation: _Z26SplitOneDecimalUint64TokenPPKcS0_Pm Unexecuted instantiation: _Z22SplitOneHexUint32TokenPPKcS0_Pj Unexecuted instantiation: _Z22SplitOneHexUint64TokenPPKcS0_Pm |
599 | | |
600 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int, int, strto32_0) |
601 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int32, int32, strto32_0) |
602 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint32, uint32, strtou32_0) |
603 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int64, int64, strto64_0) |
604 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint64, uint64, strtou64_0) |
605 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Double, double, strtod) |
606 | | #ifdef _MSC_VER // has no strtof() |
607 | | // Note: does an implicit cast to float. |
608 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtod) |
609 | | #else |
610 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtof) |
611 | | #endif |
612 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt, int, strto32_10) |
613 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt32, int32, strto32_10) |
614 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint32, uint32, strtou32_10) |
615 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt64, int64, strto64_10) |
616 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint64, uint64, strtou64_10) |
617 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint32, uint32, strtou32_16) |
618 | | DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint64, uint64, strtou64_16) |
619 | | |
620 | | // ---------------------------------------------------------------------- |
621 | | // SplitRange() |
622 | | // Splits a string of the form "<from>-<to>". Either or both can be |
623 | | // missing. A raw number (<from>) is interpreted as "<from>-". Modifies |
624 | | // parameters insofar as they're specified by the string. RETURNS |
625 | | // true iff the input is a well-formed range. If it RETURNS false, |
626 | | // from and to remain unchanged. The range in rangestr should be |
627 | | // terminated either by "\0" or by whitespace. |
628 | | // ---------------------------------------------------------------------- |
629 | | |
630 | 0 | #define EOS(ch) ((ch) == '\0' || ascii_isspace(ch)) |
631 | 0 | bool SplitRange(const char* rangestr, int* from, int* to) { |
632 | | // We need to do the const-cast because strol takes a char**, not const char** |
633 | 0 | char* val = const_cast<char*>(rangestr); |
634 | 0 | if (val == nullptr || EOS(*val)) return true; // we'll say nothingness is ok |
635 | | |
636 | 0 | if (val[0] == '-' && EOS(val[1])) // CASE 1: - |
637 | 0 | return true; // nothing changes |
638 | | |
639 | 0 | if (val[0] == '-') { // CASE 2: -<i2> |
640 | 0 | const int int2 = strto32(val + 1, &val, 10); |
641 | 0 | if (!EOS(*val)) return false; // not a valid integer |
642 | 0 | *to = int2; // only "to" changes |
643 | 0 | return true; |
644 | |
|
645 | 0 | } else { |
646 | 0 | const int int1 = strto32(val, &val, 10); |
647 | 0 | if (EOS(*val) || (*val == '-' && EOS(*(val + 1)))) { |
648 | 0 | *from = int1; // CASE 3: <i1>, same as <i1>- |
649 | 0 | return true; // only "from" changes |
650 | 0 | } else if (*val != '-') { // not a valid range |
651 | 0 | return false; |
652 | 0 | } |
653 | 0 | const int int2 = strto32(val + 1, &val, 10); |
654 | 0 | if (!EOS(*val)) return false; // not a valid integer |
655 | 0 | *from = int1; // CASE 4: <i1>-<i2> |
656 | 0 | *to = int2; |
657 | 0 | return true; |
658 | 0 | } |
659 | 0 | } |
660 | | |
661 | 0 | void SplitCSVLineWithDelimiter(char* line, char delimiter, vector<char*>* cols) { |
662 | 0 | char* end_of_line = line + strlen(line); |
663 | 0 | char* end; |
664 | 0 | char* start; |
665 | |
|
666 | 0 | for (; line < end_of_line; line++) { |
667 | | // Skip leading whitespace, unless said whitespace is the delimiter. |
668 | 0 | while (ascii_isspace(*line) && *line != delimiter) ++line; |
669 | |
|
670 | 0 | if (*line == '"' && delimiter == ',') { // Quoted value... |
671 | 0 | start = ++line; |
672 | 0 | end = start; |
673 | 0 | for (; *line; line++) { |
674 | 0 | if (*line == '"') { |
675 | 0 | line++; |
676 | 0 | if (*line != '"') // [""] is an escaped ["] |
677 | 0 | break; // but just ["] is end of value |
678 | 0 | } |
679 | 0 | *end++ = *line; |
680 | 0 | } |
681 | | // All characters after the closing quote and before the comma |
682 | | // are ignored. |
683 | 0 | line = strchr(line, delimiter); |
684 | 0 | if (!line) line = end_of_line; |
685 | 0 | } else { |
686 | 0 | start = line; |
687 | 0 | line = strchr(line, delimiter); |
688 | 0 | if (!line) line = end_of_line; |
689 | | // Skip all trailing whitespace, unless said whitespace is the delimiter. |
690 | 0 | for (end = line; end > start; --end) { |
691 | 0 | if (!ascii_isspace(end[-1]) || end[-1] == delimiter) break; |
692 | 0 | } |
693 | 0 | } |
694 | 0 | const bool need_another_column = (*line == delimiter) && (line == end_of_line - 1); |
695 | 0 | *end = '\0'; |
696 | 0 | cols->push_back(start); |
697 | | // If line was something like [paul,] (comma is the last character |
698 | | // and is not proceeded by whitespace or quote) then we are about |
699 | | // to eliminate the last column (which is empty). This would be |
700 | | // incorrect. |
701 | 0 | if (need_another_column) cols->push_back(end); |
702 | |
|
703 | 0 | assert(*line == '\0' || *line == delimiter); |
704 | 0 | } |
705 | 0 | } |
706 | | |
707 | 0 | void SplitCSVLine(char* line, vector<char*>* cols) { |
708 | 0 | SplitCSVLineWithDelimiter(line, ',', cols); |
709 | 0 | } |
710 | | |
711 | 0 | void SplitCSVLineWithDelimiterForStrings(const string& line, char delimiter, vector<string>* cols) { |
712 | | // Unfortunately, the interface requires char* instead of const char* |
713 | | // which requires copying the string. |
714 | 0 | char* cline = strndup_with_new(line.c_str(), line.size()); |
715 | 0 | vector<char*> v; |
716 | 0 | SplitCSVLineWithDelimiter(cline, delimiter, &v); |
717 | 0 | for (vector<char*>::const_iterator ci = v.begin(); ci != v.end(); ++ci) { |
718 | 0 | cols->push_back(*ci); |
719 | 0 | } |
720 | 0 | delete[] cline; |
721 | 0 | } |
722 | | |
723 | | // ---------------------------------------------------------------------- |
724 | | namespace { |
725 | | |
726 | | // Helper class used by SplitStructuredLineInternal. |
727 | | class ClosingSymbolLookup { |
728 | | public: |
729 | 0 | explicit ClosingSymbolLookup(const char* symbol_pairs) : closing_(), valid_closing_() { |
730 | | // Initialize the opening/closing arrays. |
731 | 0 | for (const char* symbol = symbol_pairs; *symbol != 0; ++symbol) { |
732 | 0 | unsigned char opening = *symbol; |
733 | 0 | ++symbol; |
734 | | // If the string ends before the closing character has been found, |
735 | | // use the opening character as the closing character. |
736 | 0 | unsigned char closing = *symbol != 0 ? *symbol : opening; |
737 | 0 | closing_[opening] = closing; |
738 | 0 | valid_closing_[closing] = true; |
739 | 0 | if (*symbol == 0) break; |
740 | 0 | } |
741 | 0 | } |
742 | | |
743 | | // Returns the closing character corresponding to an opening one, |
744 | | // or 0 if the argument is not an opening character. |
745 | 0 | char GetClosingChar(char opening) const { |
746 | 0 | return closing_[static_cast<unsigned char>(opening)]; |
747 | 0 | } |
748 | | |
749 | | // Returns true if the argument is a closing character. |
750 | 0 | bool IsClosing(char c) const { return valid_closing_[static_cast<unsigned char>(c)]; } |
751 | | |
752 | | private: |
753 | | // Maps an opening character to its closing. If the entry contains 0, |
754 | | // the character is not in the opening set. |
755 | | char closing_[256]; |
756 | | // Valid closing characters. |
757 | | bool valid_closing_[256]; |
758 | | |
759 | | DISALLOW_COPY_AND_ASSIGN(ClosingSymbolLookup); |
760 | | }; |
761 | | |
762 | | char* SplitStructuredLineInternal(char* line, char delimiter, const char* symbol_pairs, |
763 | 0 | vector<char*>* cols, bool with_escapes) { |
764 | 0 | ClosingSymbolLookup lookup(symbol_pairs); |
765 | | |
766 | | // Stack of symbols expected to close the current opened expressions. |
767 | 0 | vector<char> expected_to_close; |
768 | 0 | bool in_escape = false; |
769 | |
|
770 | 0 | CHECK(cols); |
771 | 0 | cols->push_back(line); |
772 | 0 | char* current; |
773 | 0 | for (current = line; *current; ++current) { |
774 | 0 | char c = *current; |
775 | 0 | if (in_escape) { |
776 | 0 | in_escape = false; |
777 | 0 | } else if (with_escapes && c == '\\') { |
778 | | // We are escaping the next character. Note the escape still appears |
779 | | // in the output. |
780 | 0 | in_escape = true; |
781 | 0 | } else if (expected_to_close.empty() && c == delimiter) { |
782 | | // We don't have any open expression, this is a valid separator. |
783 | 0 | *current = 0; |
784 | 0 | cols->push_back(current + 1); |
785 | 0 | } else if (!expected_to_close.empty() && c == expected_to_close.back()) { |
786 | | // Can we close the currently open expression? |
787 | 0 | expected_to_close.pop_back(); |
788 | 0 | } else if (lookup.GetClosingChar(c)) { |
789 | | // If this is an opening symbol, we open a new expression and push |
790 | | // the expected closing symbol on the stack. |
791 | 0 | expected_to_close.push_back(lookup.GetClosingChar(c)); |
792 | 0 | } else if (lookup.IsClosing(c)) { |
793 | | // Error: mismatched closing symbol. |
794 | 0 | return current; |
795 | 0 | } |
796 | 0 | } |
797 | 0 | if (!expected_to_close.empty()) { |
798 | 0 | return current; // Missing closing symbol(s) |
799 | 0 | } |
800 | 0 | return nullptr; // Success |
801 | 0 | } |
802 | | |
803 | | bool SplitStructuredLineInternal(StringPiece line, char delimiter, const char* symbol_pairs, |
804 | 0 | vector<StringPiece>* cols, bool with_escapes) { |
805 | 0 | ClosingSymbolLookup lookup(symbol_pairs); |
806 | | |
807 | | // Stack of symbols expected to close the current opened expressions. |
808 | 0 | vector<char> expected_to_close; |
809 | 0 | bool in_escape = false; |
810 | |
|
811 | 0 | CHECK_NOTNULL(cols); |
812 | 0 | cols->push_back(line); |
813 | 0 | for (int i = 0; i < line.size(); ++i) { |
814 | 0 | char c = line[i]; |
815 | 0 | if (in_escape) { |
816 | 0 | in_escape = false; |
817 | 0 | } else if (with_escapes && c == '\\') { |
818 | | // We are escaping the next character. Note the escape still appears |
819 | | // in the output. |
820 | 0 | in_escape = true; |
821 | 0 | } else if (expected_to_close.empty() && c == delimiter) { |
822 | | // We don't have any open expression, this is a valid separator. |
823 | 0 | cols->back().remove_suffix(line.size() - i); |
824 | 0 | cols->push_back(StringPiece(line, i + 1)); |
825 | 0 | } else if (!expected_to_close.empty() && c == expected_to_close.back()) { |
826 | | // Can we close the currently open expression? |
827 | 0 | expected_to_close.pop_back(); |
828 | 0 | } else if (lookup.GetClosingChar(c)) { |
829 | | // If this is an opening symbol, we open a new expression and push |
830 | | // the expected closing symbol on the stack. |
831 | 0 | expected_to_close.push_back(lookup.GetClosingChar(c)); |
832 | 0 | } else if (lookup.IsClosing(c)) { |
833 | | // Error: mismatched closing symbol. |
834 | 0 | return false; |
835 | 0 | } |
836 | 0 | } |
837 | 0 | if (!expected_to_close.empty()) { |
838 | 0 | return false; // Missing closing symbol(s) |
839 | 0 | } |
840 | 0 | return true; // Success |
841 | 0 | } |
842 | | |
843 | | } // anonymous namespace |
844 | | |
845 | | char* SplitStructuredLine(char* line, char delimiter, const char* symbol_pairs, |
846 | 0 | vector<char*>* cols) { |
847 | 0 | return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, false); |
848 | 0 | } |
849 | | |
850 | | bool SplitStructuredLine(StringPiece line, char delimiter, const char* symbol_pairs, |
851 | 0 | vector<StringPiece>* cols) { |
852 | 0 | return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, false); |
853 | 0 | } |
854 | | |
855 | | char* SplitStructuredLineWithEscapes(char* line, char delimiter, const char* symbol_pairs, |
856 | 0 | vector<char*>* cols) { |
857 | 0 | return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, true); |
858 | 0 | } |
859 | | |
860 | | bool SplitStructuredLineWithEscapes(StringPiece line, char delimiter, const char* symbol_pairs, |
861 | 0 | vector<StringPiece>* cols) { |
862 | 0 | return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, true); |
863 | 0 | } |
864 | | |
865 | | // ---------------------------------------------------------------------- |
866 | | // SplitStringIntoKeyValues() |
867 | | // ---------------------------------------------------------------------- |
868 | | bool SplitStringIntoKeyValues(const string& line, const string& key_value_delimiters, |
869 | | const string& value_value_delimiters, string* key, |
870 | 0 | vector<string>* values) { |
871 | 0 | key->clear(); |
872 | 0 | values->clear(); |
873 | | |
874 | | // find the key string |
875 | 0 | size_t end_key_pos = line.find_first_of(key_value_delimiters); |
876 | 0 | if (end_key_pos == string::npos) { |
877 | 0 | VLOG_CRITICAL << "cannot parse key from line: " << line; |
878 | 0 | return false; // no key |
879 | 0 | } |
880 | 0 | key->assign(line, 0, end_key_pos); |
881 | | |
882 | | // find the values string |
883 | 0 | string remains(line, end_key_pos, line.size() - end_key_pos); |
884 | 0 | size_t begin_values_pos = remains.find_first_not_of(key_value_delimiters); |
885 | 0 | if (begin_values_pos == string::npos) { |
886 | 0 | VLOG_CRITICAL << "cannot parse value from line: " << line; |
887 | 0 | return false; // no value |
888 | 0 | } |
889 | 0 | string values_string(remains, begin_values_pos, remains.size() - begin_values_pos); |
890 | | |
891 | | // construct the values vector |
892 | 0 | if (value_value_delimiters.empty()) { // one value |
893 | 0 | values->push_back(values_string); |
894 | 0 | } else { // multiple values |
895 | 0 | SplitStringUsing(values_string, value_value_delimiters.c_str(), values); |
896 | 0 | if (values->size() < 1) { |
897 | 0 | VLOG_CRITICAL << "cannot parse value from line: " << line; |
898 | 0 | return false; // no value |
899 | 0 | } |
900 | 0 | } |
901 | 0 | return true; |
902 | 0 | } |
903 | | |
904 | | bool SplitStringIntoKeyValuePairs(const string& line, const string& key_value_delimiters, |
905 | | const string& key_value_pair_delimiters, |
906 | 0 | vector<pair<string, string>>* kv_pairs) { |
907 | 0 | kv_pairs->clear(); |
908 | |
|
909 | 0 | vector<string> pairs; |
910 | 0 | SplitStringUsing(line, key_value_pair_delimiters.c_str(), &pairs); |
911 | |
|
912 | 0 | bool success = true; |
913 | 0 | for (const auto& pair : pairs) { |
914 | 0 | string key; |
915 | 0 | vector<string> value; |
916 | 0 | if (!SplitStringIntoKeyValues(pair, key_value_delimiters, "", &key, &value)) { |
917 | | // Don't return here, to allow for keys without associated |
918 | | // values; just record that our split failed. |
919 | 0 | success = false; |
920 | 0 | } |
921 | | // we expect at most one value because we passed in an empty vsep to |
922 | | // SplitStringIntoKeyValues |
923 | 0 | DCHECK_LE(value.size(), 1); |
924 | 0 | kv_pairs->push_back(make_pair(key, value.empty() ? "" : value[0])); |
925 | 0 | } |
926 | 0 | return success; |
927 | 0 | } |
928 | | |
929 | | // ---------------------------------------------------------------------- |
930 | | // SplitLeadingDec32Values() |
931 | | // SplitLeadingDec64Values() |
932 | | // A simple parser for space-separated decimal int32/int64 values. |
933 | | // Appends parsed integers to the end of the result vector, stopping |
934 | | // at the first unparsable spot. Skips past leading and repeated |
935 | | // whitespace (does not consume trailing whitespace), and returns |
936 | | // a pointer beyond the last character parsed. |
937 | | // -------------------------------------------------------------------- |
938 | 0 | const char* SplitLeadingDec32Values(const char* str, vector<int32>* result) { |
939 | 0 | for (;;) { |
940 | 0 | char* end = nullptr; |
941 | 0 | long value = strtol(str, &end, 10); |
942 | 0 | if (end == str) break; |
943 | | // Limit long values to int32 min/max. Needed for lp64. |
944 | 0 | if (value > numeric_limits<int32>::max()) { |
945 | 0 | value = numeric_limits<int32>::max(); |
946 | 0 | } else if (value < numeric_limits<int32>::min()) { |
947 | 0 | value = numeric_limits<int32>::min(); |
948 | 0 | } |
949 | 0 | result->push_back(value); |
950 | 0 | str = end; |
951 | 0 | if (!ascii_isspace(*end)) break; |
952 | 0 | } |
953 | 0 | return str; |
954 | 0 | } |
955 | | |
956 | 0 | const char* SplitLeadingDec64Values(const char* str, vector<int64>* result) { |
957 | 0 | for (;;) { |
958 | 0 | char* end = nullptr; |
959 | 0 | const int64 value = strtoll(str, &end, 10); |
960 | 0 | if (end == str) break; |
961 | 0 | result->push_back(value); |
962 | 0 | str = end; |
963 | 0 | if (!ascii_isspace(*end)) break; |
964 | 0 | } |
965 | 0 | return str; |
966 | 0 | } |
967 | | |
968 | 0 | void SplitStringToLines(const char* full, int max_len, int num_lines, vector<string>* result) { |
969 | 0 | if (max_len <= 0) { |
970 | 0 | return; |
971 | 0 | } |
972 | 0 | int pos = 0; |
973 | 0 | for (int i = 0; (i < num_lines || num_lines <= 0); i++) { |
974 | 0 | int cut_at = ClipStringHelper(full + pos, max_len, (i == num_lines - 1)); |
975 | 0 | if (cut_at == -1) { |
976 | 0 | result->push_back(string(full + pos)); |
977 | 0 | return; |
978 | 0 | } |
979 | 0 | result->push_back(string(full + pos, cut_at)); |
980 | 0 | if (i == num_lines - 1 && max_len > kCutStrSize) { |
981 | 0 | result->at(i).append(kCutStr); |
982 | 0 | } |
983 | 0 | pos += cut_at; |
984 | 0 | } |
985 | 0 | } |