Coverage Report

Created: 2025-04-11 23:49

/root/doris/be/src/gutil/strings/escaping.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2008 Google Inc. All Rights Reserved.
2
// Authors: Numerous. See the .h for contact people.
3
4
#include "gutil/strings/escaping.h"
5
6
#include <assert.h>
7
#include <stdio.h>
8
#include <string.h>
9
#include <glog/logging.h>
10
#include <limits>
11
#include <ostream>
12
13
#include "common/exception.h"
14
15
using std::numeric_limits;
16
#include <vector>
17
18
using std::vector;
19
20
#include "gutil/charmap.h"
21
#include "gutil/gscoped_ptr.h"
22
#include "gutil/integral_types.h"
23
#include "gutil/port.h"
24
#include "gutil/stl_util.h"
25
#include "gutil/utf/utf.h" // for runetochar
26
#include "gutil/strings/strcat.h"
27
28
namespace strings {
29
30
// Named values for the leave_nulls_escaped argument of CUnescapeInternal().
// Declared constexpr so that they are true constants and cannot be modified
// by accident.
static constexpr bool kUnescapeNulls = false;
static constexpr bool kLeaveNullsEscaped = true;
33
34
// ----------------------------------------------------------------------
35
// EscapeStrForCSV()
36
//    Escapes the quotes in 'src' by doubling them. This is necessary
37
//    for generating CSV files (see SplitCSVLine).
38
//    Returns the number of characters written into dest (not counting
39
//    the \0) or -1 if there was insufficient space. Dest could end up
40
//    twice as long as src.
41
//
42
//    Example: [some "string" to test] --> [some ""string"" to test]
43
// ----------------------------------------------------------------------
44
0
// Copies the NUL-terminated string 'src' into 'dest', doubling every '"'.
// 'dest_len' is the capacity of 'dest' in bytes, terminator included.
// Returns the number of characters written (terminator excluded), or -1 when
// 'dest' is too small; in that case 'dest' may hold a partial, unterminated
// result.
int EscapeStrForCSV(const char* src, char* dest, int dest_len) {
    int written = 0;

    for (;; ++src) {
        const char current = *src;

        // End of input: terminate the output if the terminator still fits.
        if (current == '\0' && written < dest_len) {
            dest[written] = '\0';
            return written;
        }

        // Two free slots are required before copying anything: either for a
        // doubled quote, or for one character plus the eventual terminator.
        if (dest_len <= written + 1) {
            return -1;
        }

        if (current == '"') {
            dest[written++] = '"';
        }
        dest[written++] = current;
    }
}
61
62
// ----------------------------------------------------------------------
63
// UnescapeCEscapeSequences()
64
//    This does all the unescaping that C does: \ooo, \r, \n, etc
65
//    Returns length of resulting string.
66
//    The implementation of \x parses any positive number of hex digits,
67
//    but it is an error if the value requires more than 8 bits, and the
68
//    result is truncated to 8 bits. The same is true for octals.
69
//
70
//    The second call stores its errors in a supplied string vector.
71
//    If the string vector pointer is NULL, it reports the errors with LOG().
72
//
73
//    *** DEPRECATED: Use CUnescape() in new code ***
74
//
75
//    NOTE: any changes to this function must also be reflected in the newer
76
//    CUnescape().
77
// ----------------------------------------------------------------------
78
79
0
// True when 'c' is one of the characters '0' through '7'. The argument may be
// evaluated twice, so it must not have side effects.
#define IS_OCTAL_DIGIT(c) (('0' <= (c)) && ((c) <= '7'))
80
81
0
int UnescapeCEscapeSequences(const char* source, char* dest) {
82
0
    return UnescapeCEscapeSequences(source, dest, nullptr);
83
0
}
84
85
0
int UnescapeCEscapeSequences(const char* source, char* dest, vector<string>* errors) {
86
0
    char* d = dest;
87
0
    const char* p = source;
88
89
    // Small optimization for case where source = dest and there's no escaping
90
0
    while (p == d && *p != '\0' && *p != '\\') p++, d++;
91
92
0
    while (*p != '\0') {
93
0
        if (*p != '\\') {
94
0
            *d++ = *p++;
95
0
        } else {
96
0
            switch (*++p) { // skip past the '\\'
97
0
            case '\0':
98
0
                LOG_STRING(ERROR, errors) << "String cannot end with \\";
99
0
                *d = '\0';
100
0
                return d - dest; // we're done with p
101
0
            case 'a':
102
0
                *d++ = '\a';
103
0
                break;
104
0
            case 'b':
105
0
                *d++ = '\b';
106
0
                break;
107
0
            case 'f':
108
0
                *d++ = '\f';
109
0
                break;
110
0
            case 'n':
111
0
                *d++ = '\n';
112
0
                break;
113
0
            case 'r':
114
0
                *d++ = '\r';
115
0
                break;
116
0
            case 't':
117
0
                *d++ = '\t';
118
0
                break;
119
0
            case 'v':
120
0
                *d++ = '\v';
121
0
                break;
122
0
            case '\\':
123
0
                *d++ = '\\';
124
0
                break;
125
0
            case '?':
126
0
                *d++ = '\?';
127
0
                break; // \?  Who knew?
128
0
            case '\'':
129
0
                *d++ = '\'';
130
0
                break;
131
0
            case '"':
132
0
                *d++ = '\"';
133
0
                break;
134
0
            case '0':
135
0
            case '1':
136
0
            case '2':
137
0
            case '3': // octal digit: 1 to 3 digits
138
0
            case '4':
139
0
            case '5':
140
0
            case '6':
141
0
            case '7': {
142
0
                const char* octal_start = p;
143
0
                unsigned int ch = *p - '0';
144
0
                if (IS_OCTAL_DIGIT(p[1])) ch = ch * 8 + *++p - '0';
145
0
                if (IS_OCTAL_DIGIT(p[1]))     // safe (and easy) to do this twice
146
0
                    ch = ch * 8 + *++p - '0'; // now points at last digit
147
0
                if (ch > 0xFF)
148
0
                    LOG_STRING(ERROR, errors) << "Value of "
149
0
                                              << "\\" << string(octal_start, p + 1 - octal_start)
150
0
                                              << " exceeds 8 bits";
151
0
                *d++ = ch;
152
0
                break;
153
0
            }
154
0
            case 'x':
155
0
            case 'X': {
156
0
                if (!ascii_isxdigit(p[1])) {
157
0
                    if (p[1] == '\0') {
158
0
                        LOG_STRING(ERROR, errors) << "String cannot end with \\x";
159
0
                    } else {
160
0
                        LOG_STRING(ERROR, errors)
161
0
                                << "\\x cannot be followed by a non-hex digit: \\" << *p << p[1];
162
0
                    }
163
0
                    break;
164
0
                }
165
0
                unsigned int ch = 0;
166
0
                const char* hex_start = p;
167
0
                while (ascii_isxdigit(p[1])) // arbitrarily many hex digits
168
0
                    ch = (ch << 4) + hex_digit_to_int(*++p);
169
0
                if (ch > 0xFF)
170
0
                    LOG_STRING(ERROR, errors)
171
0
                            << "Value of "
172
0
                            << "\\" << string(hex_start, p + 1 - hex_start) << " exceeds 8 bits";
173
0
                *d++ = ch;
174
0
                break;
175
0
            }
176
0
            case 'u': {
177
                // \uhhhh => convert 4 hex digits to UTF-8
178
0
                char32 rune = 0;
179
0
                const char* hex_start = p;
180
0
                for (int i = 0; i < 4; ++i) {
181
0
                    if (ascii_isxdigit(p[1])) {                      // Look one char ahead.
182
0
                        rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
183
0
                    } else {
184
0
                        LOG_STRING(ERROR, errors) << "\\u must be followed by 4 hex digits: \\"
185
0
                                                  << string(hex_start, p + 1 - hex_start);
186
0
                        break;
187
0
                    }
188
0
                }
189
0
                d += runetochar(d, &rune);
190
0
                break;
191
0
            }
192
0
            case 'U': {
193
                // \Uhhhhhhhh => convert 8 hex digits to UTF-8
194
0
                char32 rune = 0;
195
0
                const char* hex_start = p;
196
0
                for (int i = 0; i < 8; ++i) {
197
0
                    if (ascii_isxdigit(p[1])) { // Look one char ahead.
198
                        // Don't change rune until we're sure this
199
                        // is within the Unicode limit, but do advance p.
200
0
                        char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
201
0
                        if (newrune > 0x10FFFF) {
202
0
                            LOG_STRING(ERROR, errors)
203
0
                                    << "Value of \\" << string(hex_start, p + 1 - hex_start)
204
0
                                    << " exceeds Unicode limit (0x10FFFF)";
205
0
                            break;
206
0
                        } else {
207
0
                            rune = newrune;
208
0
                        }
209
0
                    } else {
210
0
                        LOG_STRING(ERROR, errors) << "\\U must be followed by 8 hex digits: \\"
211
0
                                                  << string(hex_start, p + 1 - hex_start);
212
0
                        break;
213
0
                    }
214
0
                }
215
0
                d += runetochar(d, &rune);
216
0
                break;
217
0
            }
218
0
            default:
219
0
                LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
220
0
            }
221
0
            p++; // read past letter we escaped
222
0
        }
223
0
    }
224
0
    *d = '\0';
225
0
    return d - dest;
226
0
}
227
228
// ----------------------------------------------------------------------
229
// UnescapeCEscapeString()
230
//    This does the same thing as UnescapeCEscapeSequences, but creates
231
//    a new string. The caller does not need to worry about allocating
232
//    a dest buffer. This should be used for non performance critical
233
//    tasks such as printing debug messages. It is safe for src and dest
234
//    to be the same.
235
//
236
//    The second call stores its errors in a supplied string vector.
237
//    If the string vector pointer is NULL, it reports the errors with LOG().
238
//
239
//    In the first and second calls, the length of dest is returned. In the
240
//    third call, the new string is returned.
241
//
242
//    *** DEPRECATED: Use CUnescape() in new code ***
243
//
244
// ----------------------------------------------------------------------
245
0
int UnescapeCEscapeString(const string& src, string* dest) {
246
0
    return UnescapeCEscapeString(src, dest, nullptr);
247
0
}
248
249
0
int UnescapeCEscapeString(const string& src, string* dest, vector<string>* errors) {
250
0
    CHECK(dest);
251
0
    dest->resize(src.size() + 1);
252
0
    int len = UnescapeCEscapeSequences(src.c_str(), const_cast<char*>(dest->data()), errors);
253
0
    dest->resize(len);
254
0
    return len;
255
0
}
256
257
0
string UnescapeCEscapeString(const string& src) {
258
0
    gscoped_array<char> unescaped(new char[src.size() + 1]);
259
0
    int len = UnescapeCEscapeSequences(src.c_str(), unescaped.get(), nullptr);
260
0
    return string(unescaped.get(), len);
261
0
}
262
263
// ----------------------------------------------------------------------
264
// CUnescapeInternal()
265
//    Implements both CUnescape() and CUnescapeForNullTerminatedString().
266
//
267
//    Unescapes C escape sequences and is the reverse of CEscape().
268
//
269
//    If 'source' is valid, stores the unescaped string and its size in
270
//    'dest' and 'dest_len' respectively, and returns true. Otherwise
271
//    returns false and optionally stores the error description in
272
//    'error'. Set 'error' to NULL to disable error reporting.
273
//
274
//    'dest' should point to a buffer that is at least as big as 'source'.
275
//    'source' and 'dest' may be the same.
276
//
277
//     NOTE: any changes to this function must also be reflected in the older
278
//     UnescapeCEscapeSequences().
279
// ----------------------------------------------------------------------
280
static bool CUnescapeInternal(const StringPiece& source, bool leave_nulls_escaped, char* dest,
281
22
                              int* dest_len, string* error) {
282
22
    char* d = dest;
283
22
    const char* p = source.data();
284
22
    const char* end = source.end();
285
22
    const char* last_byte = end - 1;
286
287
    // Small optimization for case where source = dest and there's no escaping
288
22
    while (p == d && p < end && *p != '\\') p++, d++;
289
290
1.05k
    while (p < end) {
291
1.03k
        if (*p != '\\') {
292
1.03k
            *d++ = *p++;
293
1.03k
        } else {
294
0
            if (++p > last_byte) { // skip past the '\\'
295
0
                if (error) *error = "String cannot end with \\";
296
0
                return false;
297
0
            }
298
0
            switch (*p) {
299
0
            case 'a':
300
0
                *d++ = '\a';
301
0
                break;
302
0
            case 'b':
303
0
                *d++ = '\b';
304
0
                break;
305
0
            case 'f':
306
0
                *d++ = '\f';
307
0
                break;
308
0
            case 'n':
309
0
                *d++ = '\n';
310
0
                break;
311
0
            case 'r':
312
0
                *d++ = '\r';
313
0
                break;
314
0
            case 't':
315
0
                *d++ = '\t';
316
0
                break;
317
0
            case 'v':
318
0
                *d++ = '\v';
319
0
                break;
320
0
            case '\\':
321
0
                *d++ = '\\';
322
0
                break;
323
0
            case '?':
324
0
                *d++ = '\?';
325
0
                break; // \?  Who knew?
326
0
            case '\'':
327
0
                *d++ = '\'';
328
0
                break;
329
0
            case '"':
330
0
                *d++ = '\"';
331
0
                break;
332
0
            case '0':
333
0
            case '1':
334
0
            case '2':
335
0
            case '3': // octal digit: 1 to 3 digits
336
0
            case '4':
337
0
            case '5':
338
0
            case '6':
339
0
            case '7': {
340
0
                const char* octal_start = p;
341
0
                unsigned int ch = *p - '0';
342
0
                if (p < last_byte && IS_OCTAL_DIGIT(p[1])) ch = ch * 8 + *++p - '0';
343
0
                if (p < last_byte && IS_OCTAL_DIGIT(p[1]))
344
0
                    ch = ch * 8 + *++p - '0'; // now points at last digit
345
0
                if (ch > 0xff) {
346
0
                    if (error) {
347
0
                        *error = "Value of \\" + string(octal_start, p + 1 - octal_start) +
348
0
                                 " exceeds 0xff";
349
0
                    }
350
0
                    return false;
351
0
                }
352
0
                if ((ch == 0) && leave_nulls_escaped) {
353
                    // Copy the escape sequence for the null character
354
0
                    const int octal_size = p + 1 - octal_start;
355
0
                    *d++ = '\\';
356
0
                    memcpy(d, octal_start, octal_size);
357
0
                    d += octal_size;
358
0
                    break;
359
0
                }
360
0
                *d++ = ch;
361
0
                break;
362
0
            }
363
0
            case 'x':
364
0
            case 'X': {
365
0
                if (p >= last_byte) {
366
0
                    if (error) *error = "String cannot end with \\x";
367
0
                    return false;
368
0
                } else if (!ascii_isxdigit(p[1])) {
369
0
                    if (error) *error = "\\x cannot be followed by a non-hex digit";
370
0
                    return false;
371
0
                }
372
0
                unsigned int ch = 0;
373
0
                const char* hex_start = p;
374
0
                while (p < last_byte && ascii_isxdigit(p[1]))
375
                    // Arbitrarily many hex digits
376
0
                    ch = (ch << 4) + hex_digit_to_int(*++p);
377
0
                if (ch > 0xFF) {
378
0
                    if (error) {
379
0
                        *error = "Value of \\" + string(hex_start, p + 1 - hex_start) +
380
0
                                 " exceeds 0xff";
381
0
                    }
382
0
                    return false;
383
0
                }
384
0
                if ((ch == 0) && leave_nulls_escaped) {
385
                    // Copy the escape sequence for the null character
386
0
                    const int hex_size = p + 1 - hex_start;
387
0
                    *d++ = '\\';
388
0
                    memcpy(d, hex_start, hex_size);
389
0
                    d += hex_size;
390
0
                    break;
391
0
                }
392
0
                *d++ = ch;
393
0
                break;
394
0
            }
395
0
            case 'u': {
396
                // \uhhhh => convert 4 hex digits to UTF-8
397
0
                char32 rune = 0;
398
0
                const char* hex_start = p;
399
0
                if (p + 4 >= end) {
400
0
                    if (error) {
401
0
                        *error = "\\u must be followed by 4 hex digits: \\" +
402
0
                                 string(hex_start, p + 1 - hex_start);
403
0
                    }
404
0
                    return false;
405
0
                }
406
0
                for (int i = 0; i < 4; ++i) {
407
                    // Look one char ahead.
408
0
                    if (ascii_isxdigit(p[1])) {
409
0
                        rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
410
0
                    } else {
411
0
                        if (error) {
412
0
                            *error = "\\u must be followed by 4 hex digits: \\" +
413
0
                                     string(hex_start, p + 1 - hex_start);
414
0
                        }
415
0
                        return false;
416
0
                    }
417
0
                }
418
0
                if ((rune == 0) && leave_nulls_escaped) {
419
                    // Copy the escape sequence for the null character
420
0
                    *d++ = '\\';
421
0
                    memcpy(d, hex_start, 5); // u0000
422
0
                    d += 5;
423
0
                    break;
424
0
                }
425
0
                d += runetochar(d, &rune);
426
0
                break;
427
0
            }
428
0
            case 'U': {
429
                // \Uhhhhhhhh => convert 8 hex digits to UTF-8
430
0
                char32 rune = 0;
431
0
                const char* hex_start = p;
432
0
                if (p + 8 >= end) {
433
0
                    if (error) {
434
0
                        *error = "\\U must be followed by 8 hex digits: \\" +
435
0
                                 string(hex_start, p + 1 - hex_start);
436
0
                    }
437
0
                    return false;
438
0
                }
439
0
                for (int i = 0; i < 8; ++i) {
440
                    // Look one char ahead.
441
0
                    if (ascii_isxdigit(p[1])) {
442
                        // Don't change rune until we're sure this
443
                        // is within the Unicode limit, but do advance p.
444
0
                        char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
445
0
                        if (newrune > 0x10FFFF) {
446
0
                            if (error) {
447
0
                                *error = "Value of \\" + string(hex_start, p + 1 - hex_start) +
448
0
                                         " exceeds Unicode limit (0x10FFFF)";
449
0
                            }
450
0
                            return false;
451
0
                        } else {
452
0
                            rune = newrune;
453
0
                        }
454
0
                    } else {
455
0
                        if (error) {
456
0
                            *error = "\\U must be followed by 8 hex digits: \\" +
457
0
                                     string(hex_start, p + 1 - hex_start);
458
0
                        }
459
0
                        return false;
460
0
                    }
461
0
                }
462
0
                if ((rune == 0) && leave_nulls_escaped) {
463
                    // Copy the escape sequence for the null character
464
0
                    *d++ = '\\';
465
0
                    memcpy(d, hex_start, 9); // U00000000
466
0
                    d += 9;
467
0
                    break;
468
0
                }
469
0
                d += runetochar(d, &rune);
470
0
                break;
471
0
            }
472
0
            default: {
473
0
                if (error) *error = string("Unknown escape sequence: \\") + *p;
474
0
                return false;
475
0
            }
476
0
            }
477
0
            p++; // read past letter we escaped
478
0
        }
479
1.03k
    }
480
22
    *dest_len = d - dest;
481
22
    return true;
482
22
}
483
484
// ----------------------------------------------------------------------
485
// CUnescapeInternal()
486
//
487
//    Same as above but uses a C++ string for output. 'source' and 'dest'
488
//    may be the same.
489
// ----------------------------------------------------------------------
490
bool CUnescapeInternal(const StringPiece& source, bool leave_nulls_escaped, string* dest,
491
22
                       string* error) {
492
22
    dest->resize(source.size());
493
22
    int dest_size;
494
22
    if (!CUnescapeInternal(source, leave_nulls_escaped, const_cast<char*>(dest->data()), &dest_size,
495
22
                           error)) {
496
0
        return false;
497
0
    }
498
22
    dest->resize(dest_size);
499
22
    return true;
500
22
}
501
502
// ----------------------------------------------------------------------
503
// CUnescape()
504
//
505
// See CUnescapeInternal() for implementation details.
506
// ----------------------------------------------------------------------
507
0
bool CUnescape(const StringPiece& source, char* dest, int* dest_len, string* error) {
508
0
    return CUnescapeInternal(source, kUnescapeNulls, dest, dest_len, error);
509
0
}
510
511
22
bool CUnescape(const StringPiece& source, string* dest, string* error) {
512
22
    return CUnescapeInternal(source, kUnescapeNulls, dest, error);
513
22
}
514
515
// ----------------------------------------------------------------------
516
// CUnescapeForNullTerminatedString()
517
//
518
// See CUnescapeInternal() for implementation details.
519
// ----------------------------------------------------------------------
520
bool CUnescapeForNullTerminatedString(const StringPiece& source, char* dest, int* dest_len,
521
0
                                      string* error) {
522
0
    return CUnescapeInternal(source, kLeaveNullsEscaped, dest, dest_len, error);
523
0
}
524
525
0
bool CUnescapeForNullTerminatedString(const StringPiece& source, string* dest, string* error) {
526
0
    return CUnescapeInternal(source, kLeaveNullsEscaped, dest, error);
527
0
}
528
529
// ----------------------------------------------------------------------
530
// CEscapeString()
531
// CHexEscapeString()
532
// Utf8SafeCEscapeString()
533
// Utf8SafeCHexEscapeString()
534
//    Copies 'src' to 'dest', escaping dangerous characters using
535
//    C-style escape sequences. This is very useful for preparing query
536
//    flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
537
//    hexadecimal rather than octal sequences. The 'Utf8Safe' version doesn't
538
//    touch UTF-8 bytes.
539
//    Returns the number of bytes written to 'dest' (not including the \0)
540
//    or -1 if there was insufficient space.
541
//
542
//    Currently only \n, \r, \t, ", ', \ and !ascii_isprint() chars are escaped.
543
// ----------------------------------------------------------------------
544
int CEscapeInternal(const char* src, int src_len, char* dest, int dest_len, bool use_hex,
545
0
                    bool utf8_safe) {
546
0
    const char* src_end = src + src_len;
547
0
    int used = 0;
548
0
    bool last_hex_escape = false; // true if last output char was \xNN
549
550
0
    for (; src < src_end; src++) {
551
0
        if (dest_len - used < 2) // Need space for two letter escape
552
0
            return -1;
553
554
0
        bool is_hex_escape = false;
555
0
        switch (*src) {
556
0
        case '\n':
557
0
            dest[used++] = '\\';
558
0
            dest[used++] = 'n';
559
0
            break;
560
0
        case '\r':
561
0
            dest[used++] = '\\';
562
0
            dest[used++] = 'r';
563
0
            break;
564
0
        case '\t':
565
0
            dest[used++] = '\\';
566
0
            dest[used++] = 't';
567
0
            break;
568
0
        case '\"':
569
0
            dest[used++] = '\\';
570
0
            dest[used++] = '\"';
571
0
            break;
572
0
        case '\'':
573
0
            dest[used++] = '\\';
574
0
            dest[used++] = '\'';
575
0
            break;
576
0
        case '\\':
577
0
            dest[used++] = '\\';
578
0
            dest[used++] = '\\';
579
0
            break;
580
0
        default:
581
            // Note that if we emit \xNN and the src character after that is a hex
582
            // digit then that digit must be escaped too to prevent it being
583
            // interpreted as part of the character code by C.
584
0
            if ((!utf8_safe || *src < 0x80) &&
585
0
                (!ascii_isprint(*src) || (last_hex_escape && ascii_isxdigit(*src)))) {
586
0
                if (dest_len - used < 4) // need space for 4 letter escape
587
0
                    return -1;
588
0
                sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"), *src);
589
0
                is_hex_escape = use_hex;
590
0
                used += 4;
591
0
            } else {
592
0
                dest[used++] = *src;
593
0
                break;
594
0
            }
595
0
        }
596
0
        last_hex_escape = is_hex_escape;
597
0
    }
598
599
0
    if (dest_len - used < 1) // make sure that there is room for \0
600
0
        return -1;
601
602
0
    dest[used] = '\0'; // doesn't count towards return value though
603
0
    return used;
604
0
}
605
606
0
int CEscapeString(const char* src, int src_len, char* dest, int dest_len) {
607
0
    return CEscapeInternal(src, src_len, dest, dest_len, false, false);
608
0
}
609
610
0
int CHexEscapeString(const char* src, int src_len, char* dest, int dest_len) {
611
0
    return CEscapeInternal(src, src_len, dest, dest_len, true, false);
612
0
}
613
614
0
int Utf8SafeCEscapeString(const char* src, int src_len, char* dest, int dest_len) {
615
0
    return CEscapeInternal(src, src_len, dest, dest_len, false, true);
616
0
}
617
618
0
int Utf8SafeCHexEscapeString(const char* src, int src_len, char* dest, int dest_len) {
619
0
    return CEscapeInternal(src, src_len, dest, dest_len, true, true);
620
0
}
621
622
// ----------------------------------------------------------------------
623
// CEscape()
624
// CHexEscape()
625
// Utf8SafeCEscape()
626
// Utf8SafeCHexEscape()
627
//    Copies 'src' to result, escaping dangerous characters using
628
//    C-style escape sequences. This is very useful for preparing query
629
//    flags. 'src' and 'dest' should not overlap. The 'Hex' version
630
//    uses hexadecimal rather than octal sequences. The 'Utf8Safe' version
631
//    doesn't touch UTF-8 bytes.
632
//
633
//    Currently only \n, \r, \t, ", ', \ and !ascii_isprint() chars are escaped.
634
// ----------------------------------------------------------------------
635
0
string CEscape(const StringPiece& src) {
636
0
    const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
637
0
    gscoped_array<char> dest(new char[dest_length]);
638
0
    const int len = CEscapeInternal(src.data(), src.size(), dest.get(), dest_length, false, false);
639
0
    DCHECK_GE(len, 0);
640
0
    return string(dest.get(), len);
641
0
}
642
643
0
string CHexEscape(const StringPiece& src) {
644
0
    const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
645
0
    gscoped_array<char> dest(new char[dest_length]);
646
0
    const int len = CEscapeInternal(src.data(), src.size(), dest.get(), dest_length, true, false);
647
0
    DCHECK_GE(len, 0);
648
0
    return string(dest.get(), len);
649
0
}
650
651
0
string Utf8SafeCEscape(const StringPiece& src) {
652
0
    const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
653
0
    gscoped_array<char> dest(new char[dest_length]);
654
0
    const int len = CEscapeInternal(src.data(), src.size(), dest.get(), dest_length, false, true);
655
0
    DCHECK_GE(len, 0);
656
0
    return string(dest.get(), len);
657
0
}
658
659
0
string Utf8SafeCHexEscape(const StringPiece& src) {
660
0
    const int dest_length = src.size() * 4 + 1; // Maximum possible expansion
661
0
    gscoped_array<char> dest(new char[dest_length]);
662
0
    const int len = CEscapeInternal(src.data(), src.size(), dest.get(), dest_length, true, true);
663
0
    DCHECK_GE(len, 0);
664
0
    return string(dest.get(), len);
665
0
}
666
667
// ----------------------------------------------------------------------
668
// BackslashEscape and BackslashUnescape
669
// ----------------------------------------------------------------------
670
0
// Appends 'src' to '*dest', inserting a backslash in front of every character
// for which to_escape.Test() is true. Existing contents of '*dest' are kept.
void BackslashEscape(const StringPiece& src, const strings::CharSet& to_escape, string* dest) {
    dest->reserve(dest->size() + src.size());

    const char* const stop = src.data() + src.size();
    const char* run_begin = src.data();
    while (run_begin != stop) {
        // Find the end of the current run of characters that need no escaping.
        const char* cursor = run_begin;
        for (; cursor < stop; ++cursor) {
            if (to_escape.Test(*cursor)) break;
        }

        // Copy that run in one go.
        dest->append(run_begin, cursor - run_begin);
        if (cursor == stop) break;

        // *cursor has to be escaped: emit a backslash followed by it.
        dest->push_back('\\');
        dest->push_back(*cursor);
        run_begin = cursor + 1;
    }
}
689
690
0
// Appends 'src' to '*dest', dropping a backslash when it precedes a character
// for which to_unescape.Test() is true. A backslash in front of any other
// character is kept. A backslash that is the last character of 'src' produces
// no output.
void BackslashUnescape(const StringPiece& src, const strings::CharSet& to_unescape, string* dest) {
    dest->reserve(dest->size() + src.size());

    const char* const stop = src.data() + src.size();
    bool pending_backslash = false; // previous character was an unconsumed '\\'
    for (const char* cursor = src.data(); cursor != stop; ++cursor) {
        const char ch = *cursor;

        if (!pending_backslash) {
            if (ch == '\\') {
                pending_backslash = true;
            } else {
                dest->push_back(ch);
            }
            continue;
        }

        // 'ch' follows a backslash: keep the backslash unless 'ch' is one of
        // the characters to unescape.
        if (!to_unescape.Test(ch)) {
            dest->push_back('\\');
        }
        dest->push_back(ch);
        pending_backslash = false;
    }
}
708
709
// ----------------------------------------------------------------------
710
// int QuotedPrintableUnescape()
711
//
712
// Check out http://www.cis.ohio-state.edu/htbin/rfc/rfc2045.html for
713
// more details, only briefly implemented. But from the web...
714
// Quoted-printable is an encoding method defined in the MIME
715
// standard. It is used primarily to encode 8-bit text (such as text
716
// that includes foreign characters) into 7-bit US ASCII, creating a
717
// document that is mostly readable by humans, even in its encoded
718
// form. All MIME compliant applications can decode quoted-printable
719
// text, though they may not necessarily be able to properly display the
720
// document as it was originally intended. As quoted-printable encoding
721
// is implemented most commonly, printable ASCII characters (values 33
722
// through 126, excluding 61), tabs and spaces that do not appear at the
723
// end of lines, and end-of-line characters are not encoded. Other
724
// characters are represented by an equal sign (=) immediately followed
725
// by that character's hexadecimal value. Lines that are longer than 76
726
// characters are shortened by line breaks, with the equal sign marking
727
// where the breaks occurred.
728
//
729
// Note that QuotedPrintableUnescape is different from 'Q'-encoding as
730
// defined in rfc2047. In particular, this does not treat '_'s as spaces.
731
// See QEncodingUnescape().
732
// ----------------------------------------------------------------------
733
734
0
int QuotedPrintableUnescape(const char* source, int slen, char* dest, int szdest) {
735
0
    char* d = dest;
736
0
    const char* p = source;
737
738
0
    while (p < source + slen && *p != '\0' && d < dest + szdest) {
739
0
        switch (*p) {
740
0
        case '=':
741
            // If it's valid, convert to hex and insert or remove line-wrap.
742
            // In the case of line-wrap removal, we allow LF as well as CRLF.
743
0
            if (p < source + slen - 1) {
744
0
                if (p[1] == '\n') {
745
0
                    p++;
746
0
                } else if (p < source + slen - 2) {
747
0
                    if (ascii_isxdigit(p[1]) && ascii_isxdigit(p[2])) {
748
0
                        *d++ = hex_digit_to_int(p[1]) * 16 + hex_digit_to_int(p[2]);
749
0
                        p += 2;
750
0
                    } else if (p[1] == '\r' && p[2] == '\n') {
751
0
                        p += 2;
752
0
                    }
753
0
                }
754
0
            }
755
0
            p++;
756
0
            break;
757
0
        default:
758
0
            *d++ = *p++;
759
0
            break;
760
0
        }
761
0
    }
762
0
    return (d - dest);
763
0
}
764
765
// ----------------------------------------------------------------------
766
// int QEncodingUnescape()
767
//
768
// This is very similar to QuotedPrintableUnescape except that we convert
769
// '_'s into spaces. (See RFC 2047)
770
// ----------------------------------------------------------------------
771
0
int QEncodingUnescape(const char* source, int slen, char* dest, int szdest) {
772
0
    char* d = dest;
773
0
    const char* p = source;
774
775
0
    while (p < source + slen && *p != '\0' && d < dest + szdest) {
776
0
        switch (*p) {
777
0
        case '=':
778
            // If it's valid, convert to hex and insert or remove line-wrap.
779
            // In the case of line-wrap removal, the assumption is that this
780
            // is an RFC-compliant message with lines terminated by CRLF.
781
0
            if (p < source + slen - 2) {
782
0
                if (ascii_isxdigit(p[1]) && ascii_isxdigit(p[2])) {
783
0
                    *d++ = hex_digit_to_int(p[1]) * 16 + hex_digit_to_int(p[2]);
784
0
                    p += 2;
785
0
                } else if (p[1] == '\r' && p[2] == '\n') {
786
0
                    p += 2;
787
0
                }
788
0
            }
789
0
            p++;
790
0
            break;
791
0
        case '_': // According to rfc2047, _'s are to be treated as spaces
792
0
            *d++ = ' ';
793
0
            p++;
794
0
            break;
795
0
        default:
796
0
            *d++ = *p++;
797
0
            break;
798
0
        }
799
0
    }
800
0
    return (d - dest);
801
0
}
802
803
0
// Returns the number of characters needed to base64-encode 'input_len' bytes.
//
// Every complete group of three input bytes becomes four output characters.
// The trailing partial group (see http://www.ietf.org/rfc/rfc3548.txt) adds:
//   - nothing, when input_len is a multiple of 3;
//   - two characters for a single leftover byte, plus two '=' if do_padding;
//   - three characters for two leftover bytes, plus one '=' if do_padding.
//
// The length is computed in 64-bit arithmetic so that a result that does not
// fit in an int is caught by the assert below instead of overflowing a signed
// int (undefined behaviour) before the check runs.
int CalculateBase64EscapedLen(int input_len, bool do_padding) {
    long long len = (static_cast<long long>(input_len) / 3) * 4;

    switch (input_len % 3) {
    case 0:
        // Final quantum is a whole number of 24-bit groups: no tail.
        break;
    case 1:
        // Final quantum is exactly 8 bits: two characters (+ "==").
        len += do_padding ? 4 : 2;
        break;
    default:
        // Final quantum is exactly 16 bits: three characters (+ "=").
        len += do_padding ? 4 : 3;
        break;
    }

    // make sure we didn't overflow
    assert(len >= input_len && len <= std::numeric_limits<int>::max());
    return static_cast<int>(len);
}
848
849
// Base64Escape does padding, so this calculation includes padding.
850
0
int CalculateBase64EscapedLen(int input_len) {
851
0
    return CalculateBase64EscapedLen(input_len, true);
852
0
}
853
854
// ----------------------------------------------------------------------
855
// int Base64Unescape() - base64 decoder
856
// int Base64Escape() - base64 encoder
857
// int WebSafeBase64Unescape() - Google's variation of base64 decoder
858
// int WebSafeBase64Escape() - Google's variation of base64 encoder
859
//
860
// Check out
861
// http://www.cis.ohio-state.edu/htbin/rfc/rfc2045.html for formal
862
// description, but what we care about is that...
863
//   Take the encoded stuff in groups of 4 characters and turn each
864
//   character into a code 0 to 63 thus:
865
//           A-Z map to 0 to 25
866
//           a-z map to 26 to 51
867
//           0-9 map to 52 to 61
868
//           +(- for WebSafe) maps to 62
869
//           /(_ for WebSafe) maps to 63
870
//   There will be four numbers, all less than 64 which can be represented
871
//   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
872
//   Arrange the 6 digit binary numbers into three bytes as such:
873
//   aaaaaabb bbbbcccc ccdddddd
874
//   Equals signs (one or two) are used at the end of the encoded block to
875
//   indicate that the text was not an integer multiple of three bytes long.
876
// In the sorted variation, we instead use the mapping
877
//           .   maps to 0
878
//           0-9 map to 1-10
879
//           A-Z map to 11-37
880
//           _   maps to 38
881
//           a-z map to 39-63
882
// This mapping has the property that the output will be sorted in the same
883
// order as the input, i.e. a < b iff map(a) < map(b). It is web-safe and
884
// filename-safe.
885
// ----------------------------------------------------------------------
886
887
int Base64UnescapeInternal(const char* src, int szsrc, char* dest, int szdest,
888
0
                           const signed char* unbase64) {
889
0
    static const char kPad64 = '=';
890
891
0
    int decode = 0;
892
0
    int destidx = 0;
893
0
    int state = 0;
894
0
    unsigned int ch = 0;
895
0
    unsigned int temp = 0;
896
897
    // The GET_INPUT macro gets the next input character, skipping
898
    // over any whitespace, and stopping when we reach the end of the
899
    // string or when we read any non-data character.  The arguments are
900
    // an arbitrary identifier (used as a label for goto) and the number
901
    // of data bytes that must remain in the input to avoid aborting the
902
    // loop.
903
0
#define GET_INPUT(label, remain)                              \
904
0
    label:                                                    \
905
0
    --szsrc;                                                  \
906
0
    ch = *src++;                                              \
907
0
    decode = unbase64[ch];                                    \
908
0
    if (decode < 0) {                                         \
909
0
        if (ascii_isspace(ch) && szsrc >= remain) goto label; \
910
0
        state = 4 - remain;                                   \
911
0
        break;                                                \
912
0
    }
913
914
    // if dest is null, we're just checking to see if it's legal input
915
    // rather than producing output.  (I suspect this could just be done
916
    // with a regexp...).  We duplicate the loop so this test can be
917
    // outside it instead of in every iteration.
918
919
0
    if (dest) {
920
        // This loop consumes 4 input bytes and produces 3 output bytes
921
        // per iteration.  We can't know at the start that there is enough
922
        // data left in the string for a full iteration, so the loop may
923
        // break out in the middle; if so 'state' will be set to the
924
        // number of input bytes read.
925
926
0
        while (szsrc >= 4) {
927
            // We'll start by optimistically assuming that the next four
928
            // bytes of the string (src[0..3]) are four good data bytes
929
            // (that is, no nulls, whitespace, padding chars, or illegal
930
            // chars).  We need to test src[0..2] for nulls individually
931
            // before constructing temp to preserve the property that we
932
            // never read past a null in the string (no matter how long
933
            // szsrc claims the string is).
934
935
0
            if (!src[0] || !src[1] || !src[2] ||
936
0
                (temp = ((unbase64[src[0]] << 18) | (unbase64[src[1]] << 12) |
937
0
                         (unbase64[src[2]] << 6) | (unbase64[src[3]]))) &
938
0
                        0x80000000) {
939
                // Iff any of those four characters was bad (null, illegal,
940
                // whitespace, padding), then temp's high bit will be set
941
                // (because unbase64[] is -1 for all bad characters).
942
                //
943
                // We'll back up and resort to the slower decoder, which knows
944
                // how to handle those cases.
945
946
0
                GET_INPUT(first, 4);
947
0
                temp = decode;
948
0
                GET_INPUT(second, 3);
949
0
                temp = (temp << 6) | decode;
950
0
                GET_INPUT(third, 2);
951
0
                temp = (temp << 6) | decode;
952
0
                GET_INPUT(fourth, 1);
953
0
                temp = (temp << 6) | decode;
954
0
            } else {
955
                // We really did have four good data bytes, so advance four
956
                // characters in the string.
957
958
0
                szsrc -= 4;
959
0
                src += 4;
960
0
                decode = -1;
961
0
                ch = '\0';
962
0
            }
963
964
            // temp has 24 bits of input, so write that out as three bytes.
965
966
0
            if (destidx + 3 > szdest) return -1;
967
0
            dest[destidx + 2] = temp;
968
0
            temp >>= 8;
969
0
            dest[destidx + 1] = temp;
970
0
            temp >>= 8;
971
0
            dest[destidx] = temp;
972
0
            destidx += 3;
973
0
        }
974
0
    } else {
975
0
        while (szsrc >= 4) {
976
0
            if (!src[0] || !src[1] || !src[2] ||
977
0
                (temp = ((unbase64[src[0]] << 18) | (unbase64[src[1]] << 12) |
978
0
                         (unbase64[src[2]] << 6) | (unbase64[src[3]]))) &
979
0
                        0x80000000) {
980
0
                GET_INPUT(first_no_dest, 4);
981
0
                GET_INPUT(second_no_dest, 3);
982
0
                GET_INPUT(third_no_dest, 2);
983
0
                GET_INPUT(fourth_no_dest, 1);
984
0
            } else {
985
0
                szsrc -= 4;
986
0
                src += 4;
987
0
                decode = -1;
988
0
                ch = '\0';
989
0
            }
990
0
            destidx += 3;
991
0
        }
992
0
    }
993
994
0
#undef GET_INPUT
995
996
    // if the loop terminated because we read a bad character, return
997
    // now.
998
0
    if (decode < 0 && ch != '\0' && ch != kPad64 && !ascii_isspace(ch)) return -1;
999
1000
0
    if (ch == kPad64) {
1001
        // if we stopped by hitting an '=', un-read that character -- we'll
1002
        // look at it again when we count to check for the proper number of
1003
        // equals signs at the end.
1004
0
        ++szsrc;
1005
0
        --src;
1006
0
    } else {
1007
        // This loop consumes 1 input byte per iteration.  It's used to
1008
        // clean up the 0-3 input bytes remaining when the first, faster
1009
        // loop finishes.  'temp' contains the data from 'state' input
1010
        // characters read by the first loop.
1011
0
        while (szsrc > 0) {
1012
0
            --szsrc;
1013
0
            ch = *src++;
1014
0
            decode = unbase64[ch];
1015
0
            if (decode < 0) {
1016
0
                if (ascii_isspace(ch)) {
1017
0
                    continue;
1018
0
                } else if (ch == '\0') {
1019
0
                    break;
1020
0
                } else if (ch == kPad64) {
1021
                    // back up one character; we'll read it again when we check
1022
                    // for the correct number of equals signs at the end.
1023
0
                    ++szsrc;
1024
0
                    --src;
1025
0
                    break;
1026
0
                } else {
1027
0
                    return -1;
1028
0
                }
1029
0
            }
1030
1031
            // Each input character gives us six bits of output.
1032
0
            temp = (temp << 6) | decode;
1033
0
            ++state;
1034
0
            if (state == 4) {
1035
                // If we've accumulated 24 bits of output, write that out as
1036
                // three bytes.
1037
0
                if (dest) {
1038
0
                    if (destidx + 3 > szdest) return -1;
1039
0
                    dest[destidx + 2] = temp;
1040
0
                    temp >>= 8;
1041
0
                    dest[destidx + 1] = temp;
1042
0
                    temp >>= 8;
1043
0
                    dest[destidx] = temp;
1044
0
                }
1045
0
                destidx += 3;
1046
0
                state = 0;
1047
0
                temp = 0;
1048
0
            }
1049
0
        }
1050
0
    }
1051
1052
    // Process the leftover data contained in 'temp' at the end of the input.
1053
0
    int expected_equals = 0;
1054
0
    switch (state) {
1055
0
    case 0:
1056
        // Nothing left over; output is a multiple of 3 bytes.
1057
0
        break;
1058
1059
0
    case 1:
1060
        // Bad input; we have 6 bits left over.
1061
0
        return -1;
1062
1063
0
    case 2:
1064
        // Produce one more output byte from the 12 input bits we have left.
1065
0
        if (dest) {
1066
0
            if (destidx + 1 > szdest) return -1;
1067
0
            temp >>= 4;
1068
0
            dest[destidx] = temp;
1069
0
        }
1070
0
        ++destidx;
1071
0
        expected_equals = 2;
1072
0
        break;
1073
1074
0
    case 3:
1075
        // Produce two more output bytes from the 18 input bits we have left.
1076
0
        if (dest) {
1077
0
            if (destidx + 2 > szdest) return -1;
1078
0
            temp >>= 2;
1079
0
            dest[destidx + 1] = temp;
1080
0
            temp >>= 8;
1081
0
            dest[destidx] = temp;
1082
0
        }
1083
0
        destidx += 2;
1084
0
        expected_equals = 1;
1085
0
        break;
1086
1087
0
    default:
1088
        // state should have no other values at this point.
1089
0
        throw doris::Exception(
1090
0
                doris::Status::FatalError("This can't happen; base64 decoder state = {}", state));
1091
0
    }
1092
1093
    // The remainder of the string should be all whitespace, mixed with
1094
    // exactly 0 equals signs, or exactly 'expected_equals' equals
1095
    // signs.  (Always accepting 0 equals signs is a google extension
1096
    // not covered in the RFC.)
1097
1098
0
    int equals = 0;
1099
0
    while (szsrc > 0 && *src) {
1100
0
        if (*src == kPad64)
1101
0
            ++equals;
1102
0
        else if (!ascii_isspace(*src))
1103
0
            return -1;
1104
0
        --szsrc;
1105
0
        ++src;
1106
0
    }
1107
1108
0
    return (equals == 0 || equals == expected_equals) ? destidx : -1;
1109
0
}
1110
1111
// The arrays below were generated by the following code
1112
// #include <sys/time.h>
1113
// #include <stdlib.h>
1114
// #include <string.h>
1115
// main()
1116
// {
1117
//   static const char Base64[] =
1118
//     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1119
//   char *pos;
1120
//   int idx, i, j;
1121
//   printf("    ");
1122
//   for (i = 0; i < 255; i += 8) {
1123
//     for (j = i; j < i + 8; j++) {
1124
//       pos = strchr(Base64, j);
1125
//       if ((pos == NULL) || (j == 0))
1126
//         idx = -1;
1127
//       else
1128
//         idx = pos - Base64;
1129
//       if (idx == -1)
1130
//         printf(" %2d,     ", idx);
1131
//       else
1132
//         printf(" %2d/*%c*/,", idx, j);
1133
//     }
1134
//     printf("\n    ");
1135
//   }
1136
// }
1137
//
1138
// where the value of "Base64[]" was replaced by one of the base-64 conversion
1139
// tables from the functions below.
1140
// Reverse lookup table for the standard base64 alphabet (A-Z, a-z, 0-9, '+',
// '/').  Indexed by byte value 0-255: holds the 6-bit value of that
// character, or -1 when the byte is not part of the alphabet.  Sixteen
// entries per row, so the row comment gives the high nibble of the index.
static const signed char kUnBase64[] = {
        // 0x00 - 0x0F
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        // 0x10 - 0x1F
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        // 0x20 - 0x2F: '+' at 0x2B, '/' at 0x2F
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
        // 0x30 - 0x3F: '0'..'9'
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
        // 0x40 - 0x4F: 'A'..'O' start at 0x41
        -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
        // 0x50 - 0x5F: 'P'..'Z'
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
        // 0x60 - 0x6F: 'a'..'o' start at 0x61
        -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
        // 0x70 - 0x7F: 'p'..'z'
        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
        // 0x80 - 0xFF: no valid characters
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
1170
// Reverse lookup table for the web-safe base64 alphabet (A-Z, a-z, 0-9, '-',
// '_').  Indexed by byte value 0-255: holds the 6-bit value of that
// character, or -1 when the byte is not part of the alphabet.  Sixteen
// entries per row, so the row comment gives the high nibble of the index.
static const signed char kUnWebSafeBase64[] = {
        // 0x00 - 0x0F
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        // 0x10 - 0x1F
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        // 0x20 - 0x2F: '-' at 0x2D
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1,
        // 0x30 - 0x3F: '0'..'9'
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
        // 0x40 - 0x4F: 'A'..'O' start at 0x41
        -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
        // 0x50 - 0x5F: 'P'..'Z', '_' at 0x5F
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, 63,
        // 0x60 - 0x6F: 'a'..'o' start at 0x61
        -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
        // 0x70 - 0x7F: 'p'..'z'
        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
        // 0x80 - 0xFF: no valid characters
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
1200
1201
0
int Base64Unescape(const char* src, int szsrc, char* dest, int szdest) {
1202
0
    return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnBase64);
1203
0
}
1204
1205
0
int WebSafeBase64Unescape(const char* src, int szsrc, char* dest, int szdest) {
1206
0
    return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
1207
0
}
1208
1209
static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
1210
0
                                   const signed char* unbase64) {
1211
    // Determine the size of the output string.  Base64 encodes every 3 bytes into
1212
    // 4 characters.  any leftover chars are added directly for good measure.
1213
    // This is documented in the base64 RFC: http://www.ietf.org/rfc/rfc3548.txt
1214
0
    const int dest_len = 3 * (slen / 4) + (slen % 4);
1215
1216
0
    dest->clear();
1217
0
    dest->resize(dest_len);
1218
1219
    // We are getting the destination buffer by getting the beginning of the
1220
    // string and converting it into a char *.
1221
0
    const int len =
1222
0
            Base64UnescapeInternal(src, slen, string_as_array(dest), dest->size(), unbase64);
1223
0
    if (len < 0) {
1224
0
        dest->clear();
1225
0
        return false;
1226
0
    }
1227
1228
    // could be shorter if there was padding
1229
0
    DCHECK_LE(len, dest_len);
1230
0
    dest->resize(len);
1231
1232
0
    return true;
1233
0
}
1234
1235
0
bool Base64Unescape(const char* src, int slen, string* dest) {
1236
0
    return Base64UnescapeInternal(src, slen, dest, kUnBase64);
1237
0
}
1238
1239
0
bool WebSafeBase64Unescape(const char* src, int slen, string* dest) {
1240
0
    return Base64UnescapeInternal(src, slen, dest, kUnWebSafeBase64);
1241
0
}
1242
1243
int Base64EscapeInternal(const unsigned char* src, int szsrc, char* dest, int szdest,
1244
0
                         const char* base64, bool do_padding) {
1245
0
    static const char kPad64 = '=';
1246
1247
0
    if (szsrc <= 0) return 0;
1248
1249
0
    char* cur_dest = dest;
1250
0
    const unsigned char* cur_src = src;
1251
1252
    // Three bytes of data encodes to four characters of cyphertext.
1253
    // So we can pump through three-byte chunks atomically.
1254
0
    while (szsrc > 2) { /* keep going until we have less than 24 bits */
1255
0
        if ((szdest -= 4) < 0) return 0;
1256
0
        cur_dest[0] = base64[cur_src[0] >> 2];
1257
0
        cur_dest[1] = base64[((cur_src[0] & 0x03) << 4) + (cur_src[1] >> 4)];
1258
0
        cur_dest[2] = base64[((cur_src[1] & 0x0f) << 2) + (cur_src[2] >> 6)];
1259
0
        cur_dest[3] = base64[cur_src[2] & 0x3f];
1260
1261
0
        cur_dest += 4;
1262
0
        cur_src += 3;
1263
0
        szsrc -= 3;
1264
0
    }
1265
1266
    /* now deal with the tail (<=2 bytes) */
1267
0
    switch (szsrc) {
1268
0
    case 0:
1269
        // Nothing left; nothing more to do.
1270
0
        break;
1271
0
    case 1:
1272
        // One byte left: this encodes to two characters, and (optionally)
1273
        // two pad characters to round out the four-character cypherblock.
1274
0
        if ((szdest -= 2) < 0) return 0;
1275
0
        cur_dest[0] = base64[cur_src[0] >> 2];
1276
0
        cur_dest[1] = base64[(cur_src[0] & 0x03) << 4];
1277
0
        cur_dest += 2;
1278
0
        if (do_padding) {
1279
0
            if ((szdest -= 2) < 0) return 0;
1280
0
            cur_dest[0] = kPad64;
1281
0
            cur_dest[1] = kPad64;
1282
0
            cur_dest += 2;
1283
0
        }
1284
0
        break;
1285
0
    case 2:
1286
        // Two bytes left: this encodes to three characters, and (optionally)
1287
        // one pad character to round out the four-character cypherblock.
1288
0
        if ((szdest -= 3) < 0) return 0;
1289
0
        cur_dest[0] = base64[cur_src[0] >> 2];
1290
0
        cur_dest[1] = base64[((cur_src[0] & 0x03) << 4) + (cur_src[1] >> 4)];
1291
0
        cur_dest[2] = base64[(cur_src[1] & 0x0f) << 2];
1292
0
        cur_dest += 3;
1293
0
        if (do_padding) {
1294
0
            if ((szdest -= 1) < 0) return 0;
1295
0
            cur_dest[0] = kPad64;
1296
0
            cur_dest += 1;
1297
0
        }
1298
0
        break;
1299
0
    default:
1300
        // Should not be reached: blocks of 3 bytes are handled
1301
        // in the while loop before this switch statement.
1302
0
        LOG_ASSERT(false) << "Logic problem? szsrc = " << szsrc;
1303
0
        break;
1304
0
    }
1305
0
    return (cur_dest - dest);
1306
0
}
1307
1308
// Standard base64 alphabet: the character at index i encodes the value i.
static const char kBase64Chars[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789"
        "+/";
1310
1311
// Web-safe base64 alphabet: same as the standard one except that '-' and '_'
// take the places of '+' and '/'.
static const char kWebSafeBase64Chars[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "abcdefghijklmnopqrstuvwxyz"
        "0123456789"
        "-_";
1313
1314
0
int Base64Escape(const unsigned char* src, int szsrc, char* dest, int szdest) {
1315
0
    return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
1316
0
}
1317
int WebSafeBase64Escape(const unsigned char* src, int szsrc, char* dest, int szdest,
1318
0
                        bool do_padding) {
1319
0
    return Base64EscapeInternal(src, szsrc, dest, szdest, kWebSafeBase64Chars, do_padding);
1320
0
}
1321
1322
void Base64EscapeInternal(const unsigned char* src, int szsrc, string* dest, bool do_padding,
1323
0
                          const char* base64_chars) {
1324
0
    const int calc_escaped_size = CalculateBase64EscapedLen(szsrc, do_padding);
1325
0
    dest->clear();
1326
0
    dest->resize(calc_escaped_size, '\0');
1327
0
    const int escaped_len = Base64EscapeInternal(src, szsrc, string_as_array(dest), dest->size(),
1328
0
                                                 base64_chars, do_padding);
1329
0
    DCHECK_EQ(calc_escaped_size, escaped_len);
1330
0
}
1331
1332
0
void Base64Escape(const unsigned char* src, int szsrc, string* dest, bool do_padding) {
1333
0
    Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
1334
0
}
1335
1336
0
void WebSafeBase64Escape(const unsigned char* src, int szsrc, string* dest, bool do_padding) {
1337
0
    Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
1338
0
}
1339
1340
0
void Base64Escape(const string& src, string* dest) {
1341
0
    Base64Escape(reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest, true);
1342
0
}
1343
1344
0
void WebSafeBase64Escape(const string& src, string* dest) {
1345
0
    WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
1346
0
                        false);
1347
0
}
1348
1349
0
void WebSafeBase64EscapeWithPadding(const string& src, string* dest) {
1350
0
    WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest, true);
1351
0
}
1352
1353
// Returns true iff c may appear in upper-case Base 32 text: a letter 'A'-'Z',
// a digit '2'-'7', or the padding character '='.
bool ValidBase32Byte(char c) {
    if (c == '=') return true;
    if ('A' <= c && c <= 'Z') return true;
    return '2' <= c && c <= '7';
}
1357
1358
// Mapping from number of Base32 escaped characters (0 through 8) to number of
1359
// unescaped bytes.  8 Base32 escaped characters represent 5 unescaped bytes.
1360
// For N < 8, then number of unescaped bytes is less than 5.  Note that in
1361
// valid input, N can only be 0, 2, 4, 5, 7, or 8 (corresponding to 0, 1, 2,
1362
// 3, 4, or 5 unescaped bytes).
1363
//
1364
// We use 5 for invalid values of N to be safe, since this is used to compute
1365
// the length of the buffer to hold unescaped data.
1366
//
1367
// See http://tools.ietf.org/html/rfc4648#section-6 for details.
1368
static const int kBase32NumUnescapedBytes[] = {
        0, // 0 escaped characters
        5, // 1 (invalid count, use the safe maximum)
        1, // 2
        5, // 3 (invalid count, use the safe maximum)
        2, // 4
        3, // 5
        5, // 6 (invalid count, use the safe maximum)
        4, // 7
        5, // 8
};
1369
1370
0
int Base32Unescape(const char* src, int slen, char* dest, int szdest) {
1371
0
    int destidx = 0;
1372
0
    char escaped_bytes[8];
1373
0
    unsigned char unescaped_bytes[5];
1374
0
    while (slen > 0) {
1375
        // Collect the next 8 escaped bytes and convert to upper case.  If there
1376
        // are less than 8 bytes left, pad with '=', but keep track of the number
1377
        // of non-padded bytes for later.
1378
0
        int non_padded_len = 8;
1379
0
        for (int i = 0; i < 8; ++i) {
1380
0
            escaped_bytes[i] = (i < slen) ? ascii_toupper(src[i]) : '=';
1381
0
            if (!ValidBase32Byte(escaped_bytes[i])) {
1382
0
                return -1;
1383
0
            }
1384
            // Stop counting escaped bytes at first '='.
1385
0
            if (escaped_bytes[i] == '=' && non_padded_len == 8) {
1386
0
                non_padded_len = i;
1387
0
            }
1388
0
        }
1389
1390
        // Convert the 8 escaped bytes to 5 unescaped bytes and copy to dest.
1391
0
        EightBase32DigitsToFiveBytes(escaped_bytes, unescaped_bytes);
1392
0
        const int num_unescaped = kBase32NumUnescapedBytes[non_padded_len];
1393
0
        for (int i = 0; i < num_unescaped; ++i) {
1394
0
            if (destidx == szdest) {
1395
                // No more room in dest, so terminate early.
1396
0
                return -1;
1397
0
            }
1398
0
            dest[destidx] = unescaped_bytes[i];
1399
0
            ++destidx;
1400
0
        }
1401
0
        src += 8;
1402
0
        slen -= 8;
1403
0
    }
1404
0
    return destidx;
1405
0
}
1406
1407
0
bool Base32Unescape(const char* src, int slen, string* dest) {
1408
    // Determine the size of the output string.
1409
0
    const int dest_len = 5 * (slen / 8) + kBase32NumUnescapedBytes[slen % 8];
1410
1411
0
    dest->clear();
1412
0
    dest->resize(dest_len);
1413
1414
    // We are getting the destination buffer by getting the beginning of the
1415
    // string and converting it into a char *.
1416
0
    const int len = Base32Unescape(src, slen, string_as_array(dest), dest->size());
1417
0
    if (len < 0) {
1418
0
        dest->clear();
1419
0
        return false;
1420
0
    }
1421
1422
    // Could be shorter if there was padding.
1423
0
    DCHECK_LE(len, dest_len);
1424
0
    dest->resize(len);
1425
1426
0
    return true;
1427
0
}
1428
1429
// Encodes exactly five bytes (40 bits) from in_bytes as eight symbols in out.
//
// The 40-bit block is split, most significant bits first, into eight 5-bit
// words; each word is used as an index into 'alphabet', which therefore must
// provide at least 32 entries. No terminator is written to out.
void GeneralFiveBytesToEightBase32Digits(const unsigned char* in_bytes, char* out,
                                         const char* alphabet) {
    // Concatenate the five bytes into a single 40-bit value, first byte on top.
    unsigned long long block = 0;
    for (int i = 0; i < 5; ++i) {
        block = (block << 8) | in_bytes[i];
    }

    // Peel off the eight 5-bit words from the top of the block downwards.
    for (int i = 0; i < 8; ++i) {
        out[i] = alphabet[(block >> (35 - 5 * i)) & 0x1F];
    }
}
1448
1449
static int GeneralBase32Escape(const unsigned char* src, size_t szsrc, char* dest, size_t szdest,
1450
0
                               const char* alphabet) {
1451
0
    static const char kPad32 = '=';
1452
1453
0
    if (szsrc == 0) return 0;
1454
1455
0
    char* cur_dest = dest;
1456
0
    const unsigned char* cur_src = src;
1457
1458
    // Five bytes of data encodes to eight characters of cyphertext.
1459
    // So we can pump through three-byte chunks atomically.
1460
0
    while (szsrc > 4) { // keep going until we have less than 40 bits
1461
0
        if (szdest < 8) return 0;
1462
0
        szdest -= 8;
1463
1464
0
        GeneralFiveBytesToEightBase32Digits(cur_src, cur_dest, alphabet);
1465
1466
0
        cur_dest += 8;
1467
0
        cur_src += 5;
1468
0
        szsrc -= 5;
1469
0
    }
1470
1471
    // Now deal with the tail (<=4 bytes).
1472
0
    if (szsrc > 0) {
1473
0
        if (szdest < 8) return 0;
1474
0
        szdest -= 8;
1475
0
        unsigned char last_chunk[5];
1476
0
        memcpy(last_chunk, cur_src, szsrc);
1477
1478
0
        for (size_t i = szsrc; i < 5; ++i) {
1479
0
            last_chunk[i] = '\0';
1480
0
        }
1481
1482
0
        GeneralFiveBytesToEightBase32Digits(last_chunk, cur_dest, alphabet);
1483
0
        int filled = (szsrc * 8) / 5 + 1;
1484
0
        cur_dest += filled;
1485
1486
        // Add on the padding.
1487
0
        for (int i = 0; i < (8 - filled); ++i) {
1488
0
            *(cur_dest++) = kPad32;
1489
0
        }
1490
0
    }
1491
1492
0
    return cur_dest - dest;
1493
0
}
1494
1495
0
static bool GeneralBase32Escape(const string& src, string* dest, const char* alphabet) {
1496
0
    const int max_escaped_size = CalculateBase32EscapedLen(src.length());
1497
0
    dest->clear();
1498
0
    dest->resize(max_escaped_size + 1, '\0');
1499
0
    const int escaped_len =
1500
0
            GeneralBase32Escape(reinterpret_cast<const unsigned char*>(src.c_str()), src.length(),
1501
0
                                &*dest->begin(), dest->size(), alphabet);
1502
1503
0
    DCHECK_LE(max_escaped_size, escaped_len);
1504
1505
0
    if (escaped_len < 0) {
1506
0
        dest->clear();
1507
0
        return false;
1508
0
    }
1509
1510
0
    dest->resize(escaped_len);
1511
0
    return true;
1512
0
}
1513
1514
// Standard base32 alphabet: values 0-25 map to 'A'-'Z', 26-31 to '2'-'7'.
// Brace-initialized, so the array holds exactly 32 chars and no trailing NUL.
static const char Base32Alphabet[] = {
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', //  0 -  7
        'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', //  8 - 15
        'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', // 16 - 23
        'Y', 'Z', '2', '3', '4', '5', '6', '7', // 24 - 31
};
1517
1518
0
int Base32Escape(const unsigned char* src, size_t szsrc, char* dest, size_t szdest) {
1519
0
    return GeneralBase32Escape(src, szsrc, dest, szdest, Base32Alphabet);
1520
0
}
1521
1522
0
bool Base32Escape(const string& src, string* dest) {
1523
0
    return GeneralBase32Escape(src, dest, Base32Alphabet);
1524
0
}
1525
1526
0
void FiveBytesToEightBase32Digits(const unsigned char* in_bytes, char* out) {
1527
0
    GeneralFiveBytesToEightBase32Digits(in_bytes, out, Base32Alphabet);
1528
0
}
1529
1530
// "base32hex" alphabet: values 0-9 map to '0'-'9', 10-31 to 'A'-'V'.
// Brace-initialized, so the array holds exactly 32 chars and no trailing NUL.
static const char Base32HexAlphabet[] = {
        '0', '1', '2', '3', '4', '5', '6', '7', //  0 -  7
        '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', //  8 - 15
        'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', // 16 - 23
        'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', // 24 - 31
};
1534
1535
0
int Base32HexEscape(const unsigned char* src, size_t szsrc, char* dest, size_t szdest) {
1536
0
    return GeneralBase32Escape(src, szsrc, dest, szdest, Base32HexAlphabet);
1537
0
}
1538
1539
0
bool Base32HexEscape(const string& src, string* dest) {
1540
0
    return GeneralBase32Escape(src, dest, Base32HexAlphabet);
1541
0
}
1542
1543
0
int CalculateBase32EscapedLen(size_t input_len) {
1544
0
    DCHECK_LE(input_len, numeric_limits<size_t>::max() / 8);
1545
0
    size_t intermediate_result = 8 * input_len + 4;
1546
0
    size_t len = intermediate_result / 5;
1547
0
    len = (len + 7) & ~7;
1548
0
    return len;
1549
0
}
1550
1551
// ----------------------------------------------------------------------
1552
// EightBase32DigitsToTenHexDigits()
1553
//   Converts an 8-digit base32 string to a 10-digit hex string.
1554
//
1555
//   *in must point to 8 base32 digits.
1556
//   *out must point to 10 bytes.
1557
//
1558
//   Base32 uses A-Z,2-7 to represent the numbers 0-31.
1559
//   See RFC3548 at http://www.ietf.org/rfc/rfc3548.txt
1560
//   for details on base32.
1561
// ----------------------------------------------------------------------
1562
1563
0
void EightBase32DigitsToTenHexDigits(const char* in, char* out) {
1564
0
    unsigned char bytes[5];
1565
0
    EightBase32DigitsToFiveBytes(in, bytes);
1566
0
    b2a_hex(bytes, out, 5);
1567
0
}
1568
1569
0
// Decodes the 8 base32 characters at 'in' (upper-case 'A'-'Z', '2'-'7', or
// '=' which decodes as 0) into the 5 bytes at 'bytes_out'.
//
// Any other character maps to the sentinel 99 and yields meaningless output;
// callers are expected to validate the input first.
void EightBase32DigitsToFiveBytes(const char* in, unsigned char* bytes_out) {
    // Maps a byte value (0..255) to its 5-bit base32 value; 99 marks bytes
    // that are not part of the alphabet.
    static const char Base32InverseAlphabet[] = {
            // 0x00 - 0x2F
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            // '0' '1' then '2'..'7'
            99, 99, 26, 27, 28, 29, 30, 31,
            // '8' '9' ':' ';' '<' '=' '>' '?'   ('=' decodes as 0)
            99, 99, 99, 99, 99, 0, 99, 99,
            // '@' then 'A'..'G'
            99, 0, 1, 2, 3, 4, 5, 6,
            // 'H'..'O'
            7, 8, 9, 10, 11, 12, 13, 14,
            // 'P'..'W'
            15, 16, 17, 18, 19, 20, 21, 22,
            // 'X' 'Y' 'Z' then '[' .. '_'
            23, 24, 25, 99, 99, 99, 99, 99,
            // 0x60 - 0xFF
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99};
    static_assert(sizeof(Base32InverseAlphabet) == 256,
                  "inverse alphabet must cover every byte value");

    // Look up all eight digits first. The cast to unsigned char matters:
    // plain char may be signed, and a byte >= 0x80 would otherwise become a
    // negative index and read outside the table.
    int v[8];
    for (int i = 0; i < 8; ++i) {
        v[i] = Base32InverseAlphabet[static_cast<unsigned char>(in[i])];
    }

    // Reassemble the eight 5-bit values into five bytes. Bits shifted past
    // bit 7 are discarded by the conversion to unsigned char.
    bytes_out[0] = v[0] << 3 | v[1] >> 2;
    bytes_out[1] = v[1] << 6 | v[2] << 1 | v[3] >> 4;
    bytes_out[2] = v[3] << 4 | v[4] >> 1;
    bytes_out[3] = v[4] << 7 | v[5] << 2 | v[6] >> 3;
    bytes_out[4] = v[6] << 5 | v[7];
}
1617
1618
// ----------------------------------------------------------------------
1619
// TenHexDigitsToEightBase32Digits()
1620
//   Converts a 10-digit hex string to an 8-digit base32 string.
1621
//
1622
//   *in must point to 10 hex digits.
1623
//   *out must point to 8 bytes.
1624
//
1625
//   See RFC3548 at http://www.ietf.org/rfc/rfc3548.txt
1626
//   for details on base32.
1627
// ----------------------------------------------------------------------
1628
0
void TenHexDigitsToEightBase32Digits(const char* in, char* out) {
1629
0
    unsigned char bytes[5];
1630
1631
    // Convert hex to raw bytes.
1632
0
    a2b_hex(in, bytes, 5);
1633
0
    FiveBytesToEightBase32Digits(bytes, out);
1634
0
}
1635
1636
// ----------------------------------------------------------------------
1637
// EscapeFileName / UnescapeFileName
1638
// ----------------------------------------------------------------------
1639
// Characters that EscapeFileName() copies through unchanged.
static const Charmap escape_file_name_exceptions(
        "abcdefghijklmnopqrstuvwxyz" // lower-case letters
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ" // upper-case letters
        "0123456789"                 // digits
        "-_.");                      // punctuation
1643
1644
0
void EscapeFileName(const StringPiece& src, string* dst) {
1645
    // Reserve at least src.size() chars
1646
0
    dst->reserve(dst->size() + src.size());
1647
1648
0
    for (char c : src) {
1649
        // We do not use "isalpha" because we want the behavior to be
1650
        // independent of the current locale settings.
1651
0
        if (escape_file_name_exceptions.contains(c)) {
1652
0
            dst->push_back(c);
1653
1654
0
        } else if (c == '/') {
1655
0
            dst->push_back('~');
1656
1657
0
        } else {
1658
0
            char tmp[2];
1659
0
            b2a_hex(reinterpret_cast<const unsigned char*>(&c), tmp, 1);
1660
0
            dst->push_back('%');
1661
0
            dst->append(tmp, 2);
1662
0
        }
1663
0
    }
1664
0
}
1665
1666
0
void UnescapeFileName(const StringPiece& src_piece, string* dst) {
1667
0
    const char* src = src_piece.data();
1668
0
    const int len = src_piece.size();
1669
0
    for (int i = 0; i < len; ++i) {
1670
0
        const char c = src[i];
1671
0
        if (c == '~') {
1672
0
            dst->push_back('/');
1673
1674
0
        } else if ((c == '%') && (i + 2 < len)) {
1675
0
            unsigned char tmp[1];
1676
0
            a2b_hex(src + i + 1, &tmp[0], 1);
1677
0
            dst->push_back(tmp[0]);
1678
0
            i += 2;
1679
1680
0
        } else {
1681
0
            dst->push_back(c);
1682
0
        }
1683
0
    }
1684
0
}
1685
1686
// Maps a byte value to the number its hex digit stands for ('0'-'9' -> 0-9,
// 'A'-'F' and 'a'-'f' -> 10-15). Every other byte maps to 0.
static char hex_value[256] = {
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20
        0, 1,  2,  3,  4,  5,  6,  7, 8, 9, 0, 0, 0, 0, 0, 0, // 0x30: '0'..'9'
        0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40: 'A'..'F'
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50
        0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60: 'a'..'f'
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xF0
};
1699
1700
// Lower-case hex digit for each nibble value 0..15; b2a_hex_t() indexes it
// with the high and low nibble of every input byte.
static char hex_char[] = "0123456789abcdef";
1701
1702
// This is a templated function so that T can be either a char*
1703
// or a string.  This works because we use the [] operator to access
1704
// individual characters at a time.
1705
template <typename T>
1706
0
void a2b_hex_t(const char* a, T b, int num) {
1707
0
    for (int i = 0; i < num; i++) {
1708
0
        b[i] = (hex_value[a[i * 2] & 0xFF] << 4) + (hex_value[a[i * 2 + 1] & 0xFF]);
1709
0
    }
1710
0
}
Unexecuted instantiation: _ZN7strings9a2b_hex_tIPhEEvPKcT_i
Unexecuted instantiation: _ZN7strings9a2b_hex_tIPcEEvPKcT_i
Unexecuted instantiation: _ZN7strings9a2b_hex_tIRNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEEvPKcT_i
1711
1712
0
// Packs a textual bit string into bytes, 8 characters per output byte.
//
// A '0' character is a clear bit and any other character is a set bit. With
// byte_order_msb the first character of each group lands in bit 7, otherwise
// in bit 0. A short final group leaves its remaining bits clear. Reading
// stops at the first NUL character; later output bytes are then zero.
string a2b_bin(const string& a, bool byte_order_msb) {
    const int num_bytes = (a.size() + 7) / 8;
    string result;
    const char* cursor = a.c_str();

    for (int n = 0; n < num_bytes; ++n) {
        unsigned char packed = 0;
        for (int bit = 0; bit < 8 && *cursor != '\0'; ++bit) {
            const bool is_set = (*cursor != '0');
            ++cursor;
            if (is_set) {
                const int shift = byte_order_msb ? 7 - bit : bit;
                packed |= (1 << shift);
            }
        }
        result.push_back(packed);
    }
    return result;
}
1729
1730
// This is a templated function so that T can be either a char*
1731
// or a string.  This works because we use the [] operator to access
1732
// individual characters at a time.
1733
template <typename T>
1734
0
void b2a_hex_t(const unsigned char* b, T a, int num) {
1735
0
    for (int i = 0; i < num; i++) {
1736
0
        a[i * 2 + 0] = hex_char[b[i] >> 4];
1737
0
        a[i * 2 + 1] = hex_char[b[i] & 0xf];
1738
0
    }
1739
0
}
Unexecuted instantiation: _ZN7strings9b2a_hex_tIPcEEvPKhT_i
Unexecuted instantiation: _ZN7strings9b2a_hex_tIRNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEEvPKhT_i
1740
1741
0
// Expands every byte of 'b' into eight '0'/'1' characters. With
// byte_order_msb the most significant bit is written first, otherwise the
// least significant bit is.
string b2a_bin(const string& b, bool byte_order_msb) {
    string bits;
    for (size_t i = 0; i < b.size(); ++i) {
        const char byte = b[i];
        for (int k = 0; k < 8; ++k) {
            const int shift = byte_order_msb ? 7 - k : k;
            const bool is_set = (byte & (1 << shift)) != 0;
            bits.push_back(is_set ? '1' : '0');
        }
    }
    return bits;
}
1751
1752
0
void b2a_hex(const unsigned char* b, char* a, int num) {
1753
0
    b2a_hex_t<char*>(b, a, num);
1754
0
}
1755
1756
0
void a2b_hex(const char* a, unsigned char* b, int num) {
1757
0
    a2b_hex_t<unsigned char*>(a, b, num);
1758
0
}
1759
1760
0
void a2b_hex(const char* a, char* b, int num) {
1761
0
    a2b_hex_t<char*>(a, b, num);
1762
0
}
1763
1764
0
// Returns the lower-case hex form of the 'len' bytes at 'b'; the result is
// twice as long as the input.
string b2a_hex(const char* b, int len) {
    string hex;
    hex.resize(len << 1);
    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(b);
    b2a_hex_t<string&>(bytes, hex, len);
    return hex;
}

// Returns the lower-case hex form of the bytes viewed by 'b'.
string b2a_hex(const StringPiece& b) {
    const char* data = b.data();
    return b2a_hex(data, b.size());
}
1774
1775
0
// Returns the bytes encoded by the hex string 'a'. Two characters form one
// byte; a trailing unpaired character is ignored.
string a2b_hex(const string& a) {
    const int num_bytes = a.size() / 2;
    string decoded;
    a2b_hex(a.c_str(), &decoded, num_bytes);
    return decoded;
}
1781
1782
0
void b2a_hex(const unsigned char* from, string* to, int num) {
1783
0
    to->resize(num << 1);
1784
0
    b2a_hex_t<string&>(from, *to, num);
1785
0
}
1786
1787
0
void a2b_hex(const char* from, string* to, int num) {
1788
0
    to->resize(num);
1789
0
    a2b_hex_t<string&>(from, *to, num);
1790
0
}
1791
1792
// Characters that ShellEscape() treats as safe to pass to a shell unquoted.
const char* kDontNeedShellEscapeChars =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ" // upper-case letters
        "abcdefghijklmnopqrstuvwxyz" // lower-case letters
        "0123456789"                 // digits
        "-_.=/:,@";                  // punctuation
1794
1795
0
// Returns 'src' quoted so that a shell reads it back as a single word:
//   - a non-empty string made only of kDontNeedShellEscapeChars is returned
//     unchanged,
//   - otherwise, if it contains no single quote, it is wrapped in '...',
//   - otherwise it is wrapped in "..." with \ $ " and ` backslash-escaped.
string ShellEscape(StringPiece src) {
    // The empty string always needs quotes, so it never counts as safe.
    const bool all_safe =
            !src.empty() && src.find_first_not_of(kDontNeedShellEscapeChars) == StringPiece::npos;
    if (all_safe) {
        return src.ToString();
    }

    if (src.find('\'') == StringPiece::npos) {
        // No single quote inside, so single quotes can enclose it verbatim.
        return StrCat("'", src, "'");
    }

    // Fall back to double quotes and escape the characters that stay special
    // inside them.
    string quoted(1, '"');
    for (const char ch : src) {
        if (ch == '\\' || ch == '$' || ch == '"' || ch == '`') {
            quoted.push_back('\\');
        }
        quoted.push_back(ch);
    }
    quoted.push_back('"');
    return quoted;
}
1820
1821
// Two lower-case hex digits for every byte value: the digits of byte b are
// kHexTable[2 * b] and kHexTable[2 * b + 1]. The 513th char is the literal's
// terminating NUL.
static const char kHexTable[513] =
        "0001020304050607" "08090a0b0c0d0e0f"
        "1011121314151617" "18191a1b1c1d1e1f"
        "2021222324252627" "28292a2b2c2d2e2f"
        "3031323334353637" "38393a3b3c3d3e3f"
        "4041424344454647" "48494a4b4c4d4e4f"
        "5051525354555657" "58595a5b5c5d5e5f"
        "6061626364656667" "68696a6b6c6d6e6f"
        "7071727374757677" "78797a7b7c7d7e7f"
        "8081828384858687" "88898a8b8c8d8e8f"
        "9091929394959697" "98999a9b9c9d9e9f"
        "a0a1a2a3a4a5a6a7" "a8a9aaabacadaeaf"
        "b0b1b2b3b4b5b6b7" "b8b9babbbcbdbebf"
        "c0c1c2c3c4c5c6c7" "c8c9cacbcccdcecf"
        "d0d1d2d3d4d5d6d7" "d8d9dadbdcdddedf"
        "e0e1e2e3e4e5e6e7" "e8e9eaebecedeeef"
        "f0f1f2f3f4f5f6f7" "f8f9fafbfcfdfeff";
1838
1839
//------------------------------------------------------------------------
1840
// ByteStringToAscii
1841
//  Reads at most bytes_to_read from binary_string and prints it to
1842
//  ascii_string in downcased hex.
1843
//------------------------------------------------------------------------
1844
0
// Writes the lower-case hex form of the first bytes_to_read bytes of
// binary_string into *ascii_string, replacing its contents.
//
// bytes_to_read is clamped to binary_string.size(). The clamp compares in
// size_t, so a negative bytes_to_read also selects the whole string.
void ByteStringToAscii(string const& binary_string, int bytes_to_read, string* ascii_string) {
    if (binary_string.size() < static_cast<size_t>(bytes_to_read)) {
        bytes_to_read = binary_string.size();
    }

    CHECK_GE(bytes_to_read, 0);
    ascii_string->resize(bytes_to_read * 2);

    string::const_iterator in = binary_string.begin();
    string::iterator out = ascii_string->begin();

    for (int i = 0; i < bytes_to_read; i++) {
        // Index through unsigned char: plain char may be signed, and a byte
        // >= 0x80 would otherwise produce a negative offset into kHexTable.
        const unsigned int byte = static_cast<unsigned char>(*in);
        *out++ = kHexTable[byte * 2];
        *out++ = kHexTable[byte * 2 + 1];
        ++in;
    }
}
1861
1862
//------------------------------------------------------------------------
1863
// ByteStringFromAscii
1864
//  Converts the hex from ascii_string into binary data and
1865
//  writes the binary data into binary_string.
1866
//  Empty input successfully converts to empty output.
1867
//  Returns false and may modify output if it is
1868
//  unable to parse the hex string.
1869
//------------------------------------------------------------------------
1870
0
// Parses hex_string (upper- or lower-case digits) into raw bytes stored in
// *binary_string, which is cleared first. Returns false if the length is odd
// or a non-hex character is found; bytes decoded before the offending
// character are left in *binary_string.
bool ByteStringFromAscii(string const& hex_string, string* binary_string) {
    binary_string->clear();

    if (hex_string.size() % 2 != 0) {
        return false;
    }

    int high_nibble = 0; // first digit of the current pair, already shifted
    for (int pos = 0; pos < hex_string.size(); ++pos) {
        const char ch = hex_string[pos];
        if (!ascii_isxdigit(ch)) {
            return false;
        }

        int nibble;
        if (ascii_isdigit(ch)) {
            nibble = ch - '0';
        } else if (ascii_islower(ch)) {
            nibble = 10 + ch - 'a';
        } else {
            nibble = 10 + ch - 'A';
        }

        if ((pos & 1) == 0) {
            // First digit of a pair: hold it until its partner arrives.
            high_nibble = nibble << 4;
        } else {
            binary_string->push_back(high_nibble + nibble);
            high_nibble = 0;
        }
    }

    return true;
}
1903
1904
// ----------------------------------------------------------------------
1905
// CleanStringLineEndings()
1906
//   Clean up a multi-line string to conform to Unix line endings.
1907
//   Reads from src and appends to dst, so usually dst should be empty.
1908
//
1909
//   If there is no line ending at the end of a non-empty string, it can
1910
//   be added automatically.
1911
//
1912
//   Four different types of input are correctly handled:
1913
//
1914
//     - Unix/Linux files: line ending is LF, pass through unchanged
1915
//
1916
//     - DOS/Windows files: line ending is CRLF: convert to LF
1917
//
1918
//     - Legacy Mac files: line ending is CR: convert to LF
1919
//
1920
//     - Garbled files: random line endings, convert gracefully
1921
//                      lonely CR, lonely LF, CRLF: convert to LF
1922
//
1923
//   @param src The multi-line string to convert
1924
//   @param dst The converted string is appended to this string
1925
//   @param auto_end_last_line Automatically terminate the last line
1926
//
1927
//   Limitations:
1928
//
1929
//     This does not do the right thing for CRCRLF files created by
1930
//     broken programs that do another Unix->DOS conversion on files
1931
//     that are already in CRLF format.  For this, a two-pass approach
1932
//     brute-force would be needed that
1933
//
1934
//       (1) determines the presence of LF (first one is ok)
1935
//       (2) if yes, removes any CR, else convert every CR to LF
1936
1937
0
void CleanStringLineEndings(const string& src, string* dst, bool auto_end_last_line) {
1938
0
    if (dst->empty()) {
1939
0
        dst->append(src);
1940
0
        CleanStringLineEndings(dst, auto_end_last_line);
1941
0
    } else {
1942
0
        string tmp = src;
1943
0
        CleanStringLineEndings(&tmp, auto_end_last_line);
1944
0
        dst->append(tmp);
1945
0
    }
1946
0
}
1947
1948
0
// Rewrites '*str' in place so that every CRLF pair and every lone CR becomes
// a single LF. If auto_end_last_line is set and the non-empty result does not
// end in LF, one is appended.
void CleanStringLineEndings(string* str, bool auto_end_last_line) {
    const int len = str->size();
    char* const buf = string_as_array(str);

    // Constants for the "does this 64-bit word contain a byte < N" test, see
    //   http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
    // N is '\r' + 1, so the test fires for any byte that could be \n (0x0a)
    // or \r (0x0d).
    const unsigned long long kEveryByteOne = ~0ULL / 255;
    const unsigned long long kEveryByteHigh = kEveryByteOne * 128;
    const unsigned long long kEveryByteLimit = kEveryByteOne * ('\r' + 1);

    int write_pos = 0;
    int read_pos = 0;
    bool pending_cr = false; // a CR was read and its LF has not been emitted

    while (read_pos < len) {
        // Fast path: with no CR pending and more than 8 bytes left, move a
        // whole word at once when it cannot contain a line-ending byte.
        if (!pending_cr && read_pos + 8 < len) {
            const uint64 word = UNALIGNED_LOAD64(buf + read_pos);
            const bool may_hold_eol = ((word - kEveryByteLimit) & ~word & kEveryByteHigh) != 0;
            if (!may_hold_eol) {
                if (write_pos != read_pos) UNALIGNED_STORE64(buf + write_pos, word);
                read_pos += 8;
                write_pos += 8;
                continue;
            }
        }

        // Slow path: one byte at a time. Output never runs ahead of input,
        // and is strictly behind it while a CR is pending, so the writes
        // below cannot clobber the byte just read.
        const char ch = buf[read_pos];
        if (ch == '\r') {
            // A second CR in a row closes the line started by the first one.
            if (pending_cr) buf[write_pos++] = '\n';
            pending_cr = true;
        } else {
            // An ordinary byte after a CR means the CR was a line ending of
            // its own; an LF after a CR simply completes the CRLF pair.
            if (ch != '\n' && pending_cr) buf[write_pos++] = '\n';
            pending_cr = false;
            // Skip the store while nothing has been dropped yet.
            if (read_pos != write_pos) buf[write_pos] = ch;
            ++write_pos;
        }
        ++read_pos;
    }

    const bool needs_final_lf =
            pending_cr || (auto_end_last_line && write_pos > 0 && buf[write_pos - 1] != '\n');
    if (needs_final_lf) {
        str->resize(write_pos + 1);
        (*str)[write_pos] = '\n';
    } else if (write_pos < len) {
        str->resize(write_pos);
    }
}
2004
2005
} // namespace strings