Coverage Report

Created: 2024-11-21 23:52

/root/doris/be/src/gutil/strings/escaping.cc
Line
Count
Source (jump to first uncovered line)
1
// Copyright 2008 Google Inc. All Rights Reserved.
2
// Authors: Numerous. See the .h for contact people.
3
4
#include "gutil/strings/escaping.h"
5
6
#include <assert.h>
7
#include <stdio.h>
8
#include <string.h>
9
#include <glog/logging.h>
10
#include <limits>
11
#include <ostream>
12
13
using std::numeric_limits;
14
#include <vector>
15
16
using std::vector;
17
18
#include "gutil/charmap.h"
19
#include "gutil/gscoped_ptr.h"
20
#include "gutil/integral_types.h"
21
#include "gutil/port.h"
22
#include "gutil/stl_util.h"
23
#include "gutil/utf/utf.h" // for runetochar
24
#include "gutil/strings/strcat.h"
25
26
namespace strings {
27
28
// These are used for the leave_nulls_escaped argument to CUnescapeInternal().
29
// Named values for the leave_nulls_escaped argument of CUnescapeInternal().
// They are never meant to change at runtime, so they are declared const.
static const bool kUnescapeNulls = false;
static const bool kLeaveNullsEscaped = true;
31
32
// ----------------------------------------------------------------------
33
// EscapeStrForCSV()
34
//    Escapes the quotes in 'src' by doubling them. This is necessary
35
//    for generating CSV files (see SplitCSVLine).
36
//    Returns the number of characters written into dest (not counting
37
//    the \0) or -1 if there was insufficient space. Dest could end up
38
//    twice as long as src.
39
//
40
//    Example: [some "string" to test] --> [some ""string"" to test]
41
// ----------------------------------------------------------------------
42
0
int EscapeStrForCSV(const char* src, char* dest, int dest_len) {
    // Copies 'src' into 'dest', doubling every '"'. Returns the number of
    // characters written (terminating NUL excluded), or -1 when 'dest' is
    // too small.
    int written = 0;

    for (;; ++src) {
        const char c = *src;

        // End of input: terminate the output if there is room for the NUL.
        if (c == '\0' && written < dest_len) {
            dest[written] = '\0';
            return written;
        }

        // Always demand room for two characters, since a quote expands to two.
        if (dest_len <= written + 1) {
            return -1;
        }

        if (c == '"') {
            dest[written++] = '"';
        }
        dest[written++] = c;
    }
}
59
60
// ----------------------------------------------------------------------
61
// UnescapeCEscapeSequences()
62
//    This does all the unescaping that C does: \ooo, \r, \n, etc
63
//    Returns length of resulting string.
64
//    The implementation of \x parses any positive number of hex digits,
65
//    but it is an error if the value requires more than 8 bits, and the
66
//    result is truncated to 8 bits. The same is true for octals.
67
//
68
//    The second call stores its errors in a supplied string vector.
69
//    If the string vector pointer is NULL, it reports the errors with LOG().
70
//
71
//    *** DEPRECATED: Use CUnescape() in new code ***
72
//
73
//    NOTE: any changes to this function must also be reflected in the newer
74
//    CUnescape().
75
// ----------------------------------------------------------------------
76
77
0
// True when character 'c' is one of the octal digits '0'..'7'.
#define IS_OCTAL_DIGIT(c) (((c) >= '0') && ((c) <= '7'))
78
79
0
int UnescapeCEscapeSequences(const char* source, char* dest) {
80
0
    return UnescapeCEscapeSequences(source, dest, nullptr);
81
0
}
82
83
0
int UnescapeCEscapeSequences(const char* source, char* dest, vector<string>* errors) {
84
0
    char* d = dest;
85
0
    const char* p = source;
86
87
    // Small optimization for case where source = dest and there's no escaping
88
0
    while (p == d && *p != '\0' && *p != '\\') p++, d++;
89
90
0
    while (*p != '\0') {
91
0
        if (*p != '\\') {
92
0
            *d++ = *p++;
93
0
        } else {
94
0
            switch (*++p) { // skip past the '\\'
95
0
            case '\0':
96
0
                LOG_STRING(ERROR, errors) << "String cannot end with \\";
97
0
                *d = '\0';
98
0
                return d - dest; // we're done with p
99
0
            case 'a':
100
0
                *d++ = '\a';
101
0
                break;
102
0
            case 'b':
103
0
                *d++ = '\b';
104
0
                break;
105
0
            case 'f':
106
0
                *d++ = '\f';
107
0
                break;
108
0
            case 'n':
109
0
                *d++ = '\n';
110
0
                break;
111
0
            case 'r':
112
0
                *d++ = '\r';
113
0
                break;
114
0
            case 't':
115
0
                *d++ = '\t';
116
0
                break;
117
0
            case 'v':
118
0
                *d++ = '\v';
119
0
                break;
120
0
            case '\\':
121
0
                *d++ = '\\';
122
0
                break;
123
0
            case '?':
124
0
                *d++ = '\?';
125
0
                break; // \?  Who knew?
126
0
            case '\'':
127
0
                *d++ = '\'';
128
0
                break;
129
0
            case '"':
130
0
                *d++ = '\"';
131
0
                break;
132
0
            case '0':
133
0
            case '1':
134
0
            case '2':
135
0
            case '3': // octal digit: 1 to 3 digits
136
0
            case '4':
137
0
            case '5':
138
0
            case '6':
139
0
            case '7': {
140
0
                const char* octal_start = p;
141
0
                unsigned int ch = *p - '0';
142
0
                if (IS_OCTAL_DIGIT(p[1])) ch = ch * 8 + *++p - '0';
143
0
                if (IS_OCTAL_DIGIT(p[1]))     // safe (and easy) to do this twice
144
0
                    ch = ch * 8 + *++p - '0'; // now points at last digit
145
0
                if (ch > 0xFF)
146
0
                    LOG_STRING(ERROR, errors) << "Value of "
147
0
                                              << "\\" << string(octal_start, p + 1 - octal_start)
148
0
                                              << " exceeds 8 bits";
149
0
                *d++ = ch;
150
0
                break;
151
0
            }
152
0
            case 'x':
153
0
            case 'X': {
154
0
                if (!ascii_isxdigit(p[1])) {
155
0
                    if (p[1] == '\0') {
156
0
                        LOG_STRING(ERROR, errors) << "String cannot end with \\x";
157
0
                    } else {
158
0
                        LOG_STRING(ERROR, errors)
159
0
                                << "\\x cannot be followed by a non-hex digit: \\" << *p << p[1];
160
0
                    }
161
0
                    break;
162
0
                }
163
0
                unsigned int ch = 0;
164
0
                const char* hex_start = p;
165
0
                while (ascii_isxdigit(p[1])) // arbitrarily many hex digits
166
0
                    ch = (ch << 4) + hex_digit_to_int(*++p);
167
0
                if (ch > 0xFF)
168
0
                    LOG_STRING(ERROR, errors)
169
0
                            << "Value of "
170
0
                            << "\\" << string(hex_start, p + 1 - hex_start) << " exceeds 8 bits";
171
0
                *d++ = ch;
172
0
                break;
173
0
            }
174
0
            case 'u': {
175
                // \uhhhh => convert 4 hex digits to UTF-8
176
0
                char32 rune = 0;
177
0
                const char* hex_start = p;
178
0
                for (int i = 0; i < 4; ++i) {
179
0
                    if (ascii_isxdigit(p[1])) {                      // Look one char ahead.
180
0
                        rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
181
0
                    } else {
182
0
                        LOG_STRING(ERROR, errors) << "\\u must be followed by 4 hex digits: \\"
183
0
                                                  << string(hex_start, p + 1 - hex_start);
184
0
                        break;
185
0
                    }
186
0
                }
187
0
                d += runetochar(d, &rune);
188
0
                break;
189
0
            }
190
0
            case 'U': {
191
                // \Uhhhhhhhh => convert 8 hex digits to UTF-8
192
0
                char32 rune = 0;
193
0
                const char* hex_start = p;
194
0
                for (int i = 0; i < 8; ++i) {
195
0
                    if (ascii_isxdigit(p[1])) { // Look one char ahead.
196
                        // Don't change rune until we're sure this
197
                        // is within the Unicode limit, but do advance p.
198
0
                        char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
199
0
                        if (newrune > 0x10FFFF) {
200
0
                            LOG_STRING(ERROR, errors)
201
0
                                    << "Value of \\" << string(hex_start, p + 1 - hex_start)
202
0
                                    << " exceeds Unicode limit (0x10FFFF)";
203
0
                            break;
204
0
                        } else {
205
0
                            rune = newrune;
206
0
                        }
207
0
                    } else {
208
0
                        LOG_STRING(ERROR, errors) << "\\U must be followed by 8 hex digits: \\"
209
0
                                                  << string(hex_start, p + 1 - hex_start);
210
0
                        break;
211
0
                    }
212
0
                }
213
0
                d += runetochar(d, &rune);
214
0
                break;
215
0
            }
216
0
            default:
217
0
                LOG_STRING(ERROR, errors) << "Unknown escape sequence: \\" << *p;
218
0
            }
219
0
            p++; // read past letter we escaped
220
0
        }
221
0
    }
222
0
    *d = '\0';
223
0
    return d - dest;
224
0
}
225
226
// ----------------------------------------------------------------------
227
// UnescapeCEscapeString()
228
//    This does the same thing as UnescapeCEscapeSequences, but creates
229
//    a new string. The caller does not need to worry about allocating
230
//    a dest buffer. This should be used for non performance critical
231
//    tasks such as printing debug messages. It is safe for src and dest
232
//    to be the same.
233
//
234
//    The second call stores its errors in a supplied string vector.
235
//    If the string vector pointer is NULL, it reports the errors with LOG().
236
//
237
//    In the first and second calls, the length of dest is returned. In the
238
//    third call, the new string is returned.
239
//
240
//    *** DEPRECATED: Use CUnescape() in new code ***
241
//
242
// ----------------------------------------------------------------------
243
0
int UnescapeCEscapeString(const string& src, string* dest) {
244
0
    return UnescapeCEscapeString(src, dest, nullptr);
245
0
}
246
247
0
int UnescapeCEscapeString(const string& src, string* dest, vector<string>* errors) {
248
0
    CHECK(dest);
249
0
    dest->resize(src.size() + 1);
250
0
    int len = UnescapeCEscapeSequences(src.c_str(), const_cast<char*>(dest->data()), errors);
251
0
    dest->resize(len);
252
0
    return len;
253
0
}
254
255
0
// Returns the unescaped form of 'src' as a new string. No error vector is
// passed to UnescapeCEscapeSequences().
string UnescapeCEscapeString(const string& src) {
    // Unescape straight into the result string (sized for the output plus the
    // terminating NUL) rather than into a separately allocated scratch array.
    string unescaped(src.size() + 1, '\0');
    const int len = UnescapeCEscapeSequences(src.c_str(), &unescaped[0], nullptr);
    unescaped.resize(len);
    return unescaped;
}
260
261
// ----------------------------------------------------------------------
262
// CUnescapeInternal()
263
//    Implements both CUnescape() and CUnescapeForNullTerminatedString().
264
//
265
//    Unescapes C escape sequences and is the reverse of CEscape().
266
//
267
//    If 'source' is valid, stores the unescaped string and its size in
268
//    'dest' and 'dest_len' respectively, and returns true. Otherwise
269
//    returns false and optionally stores the error description in
270
//    'error'. Set 'error' to NULL to disable error reporting.
271
//
272
//    'dest' should point to a buffer that is at least as big as 'source'.
273
//    'source' and 'dest' may be the same.
274
//
275
//     NOTE: any changes to this function must also be reflected in the older
276
//     UnescapeCEscapeSequences().
277
// ----------------------------------------------------------------------
278
static bool CUnescapeInternal(const StringPiece& source, bool leave_nulls_escaped, char* dest,
279
14
                              int* dest_len, string* error) {
280
14
    char* d = dest;
281
14
    const char* p = source.data();
282
14
    const char* end = source.end();
283
14
    const char* last_byte = end - 1;
284
285
    // Small optimization for case where source = dest and there's no escaping
286
14
    while (p == d && p < end && *p != '\\') p++, d++;
287
288
665
    while (p < end) {
289
651
        if (*p != '\\') {
290
651
            *d++ = *p++;
291
651
        } else {
292
0
            if (++p > last_byte) { // skip past the '\\'
293
0
                if (error) *error = "String cannot end with \\";
294
0
                return false;
295
0
            }
296
0
            switch (*p) {
297
0
            case 'a':
298
0
                *d++ = '\a';
299
0
                break;
300
0
            case 'b':
301
0
                *d++ = '\b';
302
0
                break;
303
0
            case 'f':
304
0
                *d++ = '\f';
305
0
                break;
306
0
            case 'n':
307
0
                *d++ = '\n';
308
0
                break;
309
0
            case 'r':
310
0
                *d++ = '\r';
311
0
                break;
312
0
            case 't':
313
0
                *d++ = '\t';
314
0
                break;
315
0
            case 'v':
316
0
                *d++ = '\v';
317
0
                break;
318
0
            case '\\':
319
0
                *d++ = '\\';
320
0
                break;
321
0
            case '?':
322
0
                *d++ = '\?';
323
0
                break; // \?  Who knew?
324
0
            case '\'':
325
0
                *d++ = '\'';
326
0
                break;
327
0
            case '"':
328
0
                *d++ = '\"';
329
0
                break;
330
0
            case '0':
331
0
            case '1':
332
0
            case '2':
333
0
            case '3': // octal digit: 1 to 3 digits
334
0
            case '4':
335
0
            case '5':
336
0
            case '6':
337
0
            case '7': {
338
0
                const char* octal_start = p;
339
0
                unsigned int ch = *p - '0';
340
0
                if (p < last_byte && IS_OCTAL_DIGIT(p[1])) ch = ch * 8 + *++p - '0';
341
0
                if (p < last_byte && IS_OCTAL_DIGIT(p[1]))
342
0
                    ch = ch * 8 + *++p - '0'; // now points at last digit
343
0
                if (ch > 0xff) {
344
0
                    if (error) {
345
0
                        *error = "Value of \\" + string(octal_start, p + 1 - octal_start) +
346
0
                                 " exceeds 0xff";
347
0
                    }
348
0
                    return false;
349
0
                }
350
0
                if ((ch == 0) && leave_nulls_escaped) {
351
                    // Copy the escape sequence for the null character
352
0
                    const int octal_size = p + 1 - octal_start;
353
0
                    *d++ = '\\';
354
0
                    memcpy(d, octal_start, octal_size);
355
0
                    d += octal_size;
356
0
                    break;
357
0
                }
358
0
                *d++ = ch;
359
0
                break;
360
0
            }
361
0
            case 'x':
362
0
            case 'X': {
363
0
                if (p >= last_byte) {
364
0
                    if (error) *error = "String cannot end with \\x";
365
0
                    return false;
366
0
                } else if (!ascii_isxdigit(p[1])) {
367
0
                    if (error) *error = "\\x cannot be followed by a non-hex digit";
368
0
                    return false;
369
0
                }
370
0
                unsigned int ch = 0;
371
0
                const char* hex_start = p;
372
0
                while (p < last_byte && ascii_isxdigit(p[1]))
373
                    // Arbitrarily many hex digits
374
0
                    ch = (ch << 4) + hex_digit_to_int(*++p);
375
0
                if (ch > 0xFF) {
376
0
                    if (error) {
377
0
                        *error = "Value of \\" + string(hex_start, p + 1 - hex_start) +
378
0
                                 " exceeds 0xff";
379
0
                    }
380
0
                    return false;
381
0
                }
382
0
                if ((ch == 0) && leave_nulls_escaped) {
383
                    // Copy the escape sequence for the null character
384
0
                    const int hex_size = p + 1 - hex_start;
385
0
                    *d++ = '\\';
386
0
                    memcpy(d, hex_start, hex_size);
387
0
                    d += hex_size;
388
0
                    break;
389
0
                }
390
0
                *d++ = ch;
391
0
                break;
392
0
            }
393
0
            case 'u': {
394
                // \uhhhh => convert 4 hex digits to UTF-8
395
0
                char32 rune = 0;
396
0
                const char* hex_start = p;
397
0
                if (p + 4 >= end) {
398
0
                    if (error) {
399
0
                        *error = "\\u must be followed by 4 hex digits: \\" +
400
0
                                 string(hex_start, p + 1 - hex_start);
401
0
                    }
402
0
                    return false;
403
0
                }
404
0
                for (int i = 0; i < 4; ++i) {
405
                    // Look one char ahead.
406
0
                    if (ascii_isxdigit(p[1])) {
407
0
                        rune = (rune << 4) + hex_digit_to_int(*++p); // Advance p.
408
0
                    } else {
409
0
                        if (error) {
410
0
                            *error = "\\u must be followed by 4 hex digits: \\" +
411
0
                                     string(hex_start, p + 1 - hex_start);
412
0
                        }
413
0
                        return false;
414
0
                    }
415
0
                }
416
0
                if ((rune == 0) && leave_nulls_escaped) {
417
                    // Copy the escape sequence for the null character
418
0
                    *d++ = '\\';
419
0
                    memcpy(d, hex_start, 5); // u0000
420
0
                    d += 5;
421
0
                    break;
422
0
                }
423
0
                d += runetochar(d, &rune);
424
0
                break;
425
0
            }
426
0
            case 'U': {
427
                // \Uhhhhhhhh => convert 8 hex digits to UTF-8
428
0
                char32 rune = 0;
429
0
                const char* hex_start = p;
430
0
                if (p + 8 >= end) {
431
0
                    if (error) {
432
0
                        *error = "\\U must be followed by 8 hex digits: \\" +
433
0
                                 string(hex_start, p + 1 - hex_start);
434
0
                    }
435
0
                    return false;
436
0
                }
437
0
                for (int i = 0; i < 8; ++i) {
438
                    // Look one char ahead.
439
0
                    if (ascii_isxdigit(p[1])) {
440
                        // Don't change rune until we're sure this
441
                        // is within the Unicode limit, but do advance p.
442
0
                        char32 newrune = (rune << 4) + hex_digit_to_int(*++p);
443
0
                        if (newrune > 0x10FFFF) {
444
0
                            if (error) {
445
0
                                *error = "Value of \\" + string(hex_start, p + 1 - hex_start) +
446
0
                                         " exceeds Unicode limit (0x10FFFF)";
447
0
                            }
448
0
                            return false;
449
0
                        } else {
450
0
                            rune = newrune;
451
0
                        }
452
0
                    } else {
453
0
                        if (error) {
454
0
                            *error = "\\U must be followed by 8 hex digits: \\" +
455
0
                                     string(hex_start, p + 1 - hex_start);
456
0
                        }
457
0
                        return false;
458
0
                    }
459
0
                }
460
0
                if ((rune == 0) && leave_nulls_escaped) {
461
                    // Copy the escape sequence for the null character
462
0
                    *d++ = '\\';
463
0
                    memcpy(d, hex_start, 9); // U00000000
464
0
                    d += 9;
465
0
                    break;
466
0
                }
467
0
                d += runetochar(d, &rune);
468
0
                break;
469
0
            }
470
0
            default: {
471
0
                if (error) *error = string("Unknown escape sequence: \\") + *p;
472
0
                return false;
473
0
            }
474
0
            }
475
0
            p++; // read past letter we escaped
476
0
        }
477
651
    }
478
14
    *dest_len = d - dest;
479
14
    return true;
480
14
}
481
482
// ----------------------------------------------------------------------
483
// CUnescapeInternal()
484
//
485
//    Same as above but uses a C++ string for output. 'source' and 'dest'
486
//    may be the same.
487
// ----------------------------------------------------------------------
488
bool CUnescapeInternal(const StringPiece& source, bool leave_nulls_escaped, string* dest,
489
14
                       string* error) {
490
14
    dest->resize(source.size());
491
14
    int dest_size;
492
14
    if (!CUnescapeInternal(source, leave_nulls_escaped, const_cast<char*>(dest->data()), &dest_size,
493
14
                           error)) {
494
0
        return false;
495
0
    }
496
14
    dest->resize(dest_size);
497
14
    return true;
498
14
}
499
500
// ----------------------------------------------------------------------
501
// CUnescape()
502
//
503
// See CUnescapeInternal() for implementation details.
504
// ----------------------------------------------------------------------
505
0
bool CUnescape(const StringPiece& source, char* dest, int* dest_len, string* error) {
506
0
    return CUnescapeInternal(source, kUnescapeNulls, dest, dest_len, error);
507
0
}
508
509
14
bool CUnescape(const StringPiece& source, string* dest, string* error) {
510
14
    return CUnescapeInternal(source, kUnescapeNulls, dest, error);
511
14
}
512
513
// ----------------------------------------------------------------------
514
// CUnescapeForNullTerminatedString()
515
//
516
// See CUnescapeInternal() for implementation details.
517
// ----------------------------------------------------------------------
518
bool CUnescapeForNullTerminatedString(const StringPiece& source, char* dest, int* dest_len,
519
0
                                      string* error) {
520
0
    return CUnescapeInternal(source, kLeaveNullsEscaped, dest, dest_len, error);
521
0
}
522
523
0
bool CUnescapeForNullTerminatedString(const StringPiece& source, string* dest, string* error) {
524
0
    return CUnescapeInternal(source, kLeaveNullsEscaped, dest, error);
525
0
}
526
527
// ----------------------------------------------------------------------
528
// CEscapeString()
529
// CHexEscapeString()
530
// Utf8SafeCEscapeString()
531
// Utf8SafeCHexEscapeString()
532
//    Copies 'src' to 'dest', escaping dangerous characters using
533
//    C-style escape sequences. This is very useful for preparing query
534
//    flags. 'src' and 'dest' should not overlap. The 'Hex' version uses
535
//    hexadecimal rather than octal sequences. The 'Utf8Safe' version doesn't
536
//    touch UTF-8 bytes.
537
//    Returns the number of bytes written to 'dest' (not including the \0)
538
//    or -1 if there was insufficient space.
539
//
540
//    Currently only \n, \r, \t, ", ', \ and !ascii_isprint() chars are escaped.
541
// ----------------------------------------------------------------------
542
int CEscapeInternal(const char* src, int src_len, char* dest, int dest_len, bool use_hex,
543
0
                    bool utf8_safe) {
544
0
    const char* src_end = src + src_len;
545
0
    int used = 0;
546
0
    bool last_hex_escape = false; // true if last output char was \xNN
547
548
0
    for (; src < src_end; src++) {
549
0
        if (dest_len - used < 2) // Need space for two letter escape
550
0
            return -1;
551
552
0
        bool is_hex_escape = false;
553
0
        switch (*src) {
554
0
        case '\n':
555
0
            dest[used++] = '\\';
556
0
            dest[used++] = 'n';
557
0
            break;
558
0
        case '\r':
559
0
            dest[used++] = '\\';
560
0
            dest[used++] = 'r';
561
0
            break;
562
0
        case '\t':
563
0
            dest[used++] = '\\';
564
0
            dest[used++] = 't';
565
0
            break;
566
0
        case '\"':
567
0
            dest[used++] = '\\';
568
0
            dest[used++] = '\"';
569
0
            break;
570
0
        case '\'':
571
0
            dest[used++] = '\\';
572
0
            dest[used++] = '\'';
573
0
            break;
574
0
        case '\\':
575
0
            dest[used++] = '\\';
576
0
            dest[used++] = '\\';
577
0
            break;
578
0
        default:
579
            // Note that if we emit \xNN and the src character after that is a hex
580
            // digit then that digit must be escaped too to prevent it being
581
            // interpreted as part of the character code by C.
582
0
            if ((!utf8_safe || *src < 0x80) &&
583
0
                (!ascii_isprint(*src) || (last_hex_escape && ascii_isxdigit(*src)))) {
584
0
                if (dest_len - used < 4) // need space for 4 letter escape
585
0
                    return -1;
586
0
                sprintf(dest + used, (use_hex ? "\\x%02x" : "\\%03o"), *src);
587
0
                is_hex_escape = use_hex;
588
0
                used += 4;
589
0
            } else {
590
0
                dest[used++] = *src;
591
0
                break;
592
0
            }
593
0
        }
594
0
        last_hex_escape = is_hex_escape;
595
0
    }
596
597
0
    if (dest_len - used < 1) // make sure that there is room for \0
598
0
        return -1;
599
600
0
    dest[used] = '\0'; // doesn't count towards return value though
601
0
    return used;
602
0
}
603
604
0
int CEscapeString(const char* src, int src_len, char* dest, int dest_len) {
605
0
    return CEscapeInternal(src, src_len, dest, dest_len, false, false);
606
0
}
607
608
0
int CHexEscapeString(const char* src, int src_len, char* dest, int dest_len) {
609
0
    return CEscapeInternal(src, src_len, dest, dest_len, true, false);
610
0
}
611
612
0
int Utf8SafeCEscapeString(const char* src, int src_len, char* dest, int dest_len) {
613
0
    return CEscapeInternal(src, src_len, dest, dest_len, false, true);
614
0
}
615
616
0
int Utf8SafeCHexEscapeString(const char* src, int src_len, char* dest, int dest_len) {
617
0
    return CEscapeInternal(src, src_len, dest, dest_len, true, true);
618
0
}
619
620
// ----------------------------------------------------------------------
621
// CEscape()
622
// CHexEscape()
623
// Utf8SafeCEscape()
624
// Utf8SafeCHexEscape()
625
//    Copies 'src' to result, escaping dangerous characters using
626
//    C-style escape sequences. This is very useful for preparing query
627
//    flags. 'src' and 'dest' should not overlap. The 'Hex' version
628
//    uses hexadecimal rather than octal sequences. The 'Utf8Safe' version
629
//    doesn't touch UTF-8 bytes.
630
//
631
//    Currently only \n, \r, \t, ", ', \ and !ascii_isprint() chars are escaped.
632
// ----------------------------------------------------------------------
633
0
string CEscape(const StringPiece& src) {
    // Worst case every input byte becomes a 4-character escape, plus one byte
    // for the terminating NUL.
    const int capacity = src.size() * 4 + 1;
    gscoped_array<char> buffer(new char[capacity]);
    const int escaped_len = CEscapeInternal(src.data(), src.size(), buffer.get(), capacity,
                                            /*use_hex=*/false, /*utf8_safe=*/false);
    DCHECK_GE(escaped_len, 0);
    return string(buffer.get(), escaped_len);
}
640
641
0
string CHexEscape(const StringPiece& src) {
    // Worst case every input byte becomes a 4-character escape, plus one byte
    // for the terminating NUL.
    const int capacity = src.size() * 4 + 1;
    gscoped_array<char> buffer(new char[capacity]);
    const int escaped_len = CEscapeInternal(src.data(), src.size(), buffer.get(), capacity,
                                            /*use_hex=*/true, /*utf8_safe=*/false);
    DCHECK_GE(escaped_len, 0);
    return string(buffer.get(), escaped_len);
}
648
649
0
string Utf8SafeCEscape(const StringPiece& src) {
    // Worst case every input byte becomes a 4-character escape, plus one byte
    // for the terminating NUL.
    const int capacity = src.size() * 4 + 1;
    gscoped_array<char> buffer(new char[capacity]);
    const int escaped_len = CEscapeInternal(src.data(), src.size(), buffer.get(), capacity,
                                            /*use_hex=*/false, /*utf8_safe=*/true);
    DCHECK_GE(escaped_len, 0);
    return string(buffer.get(), escaped_len);
}
656
657
0
string Utf8SafeCHexEscape(const StringPiece& src) {
    // Worst case every input byte becomes a 4-character escape, plus one byte
    // for the terminating NUL.
    const int capacity = src.size() * 4 + 1;
    gscoped_array<char> buffer(new char[capacity]);
    const int escaped_len = CEscapeInternal(src.data(), src.size(), buffer.get(), capacity,
                                            /*use_hex=*/true, /*utf8_safe=*/true);
    DCHECK_GE(escaped_len, 0);
    return string(buffer.get(), escaped_len);
}
664
665
// ----------------------------------------------------------------------
666
// BackslashEscape and BackslashUnescape
667
// ----------------------------------------------------------------------
668
0
void BackslashEscape(const StringPiece& src, const strings::CharSet& to_escape, string* dest) {
    // Appends 'src' to '*dest', inserting a backslash before every character
    // for which to_escape.Test() is true.
    dest->reserve(dest->size() + src.size());

    const char* const end = src.data() + src.size();
    const char* run_start = src.data();
    while (run_start != end) {
        // Find the end of the current run of characters that need no escaping.
        const char* stop = run_start;
        while (stop < end && !to_escape.Test(*stop)) {
            ++stop;
        }
        // Emit the run in one append.
        dest->append(run_start, stop - run_start);
        if (stop == end) {
            return;
        }
        // *stop must be escaped: emit a backslash followed by the character.
        dest->push_back('\\');
        dest->push_back(*stop);
        run_start = stop + 1;
    }
}
687
688
0
void BackslashUnescape(const StringPiece& src, const strings::CharSet& to_unescape, string* dest) {
    // Appends 'src' to '*dest', removing the backslash in front of every
    // character for which to_unescape.Test() is true. A backslash before any
    // other character is kept. A lone backslash at the very end of 'src' is
    // not emitted.
    dest->reserve(dest->size() + src.size());

    bool pending_backslash = false;
    const char* const end = src.data() + src.size();
    for (const char* cur = src.data(); cur != end; ++cur) {
        const char ch = *cur;
        if (!pending_backslash) {
            if (ch == '\\') {
                pending_backslash = true;
            } else {
                dest->push_back(ch);
            }
            continue;
        }
        // Previous character was a backslash: keep it unless 'ch' is in the set.
        if (!to_unescape.Test(ch)) {
            dest->push_back('\\');
        }
        dest->push_back(ch);
        pending_backslash = false;
    }
}
706
707
// ----------------------------------------------------------------------
708
// int QuotedPrintableUnescape()
709
//
710
// Check out http://www.cis.ohio-state.edu/htbin/rfc/rfc2045.html for
711
// more details, only briefly implemented. But from the web...
712
// Quoted-printable is an encoding method defined in the MIME
713
// standard. It is used primarily to encode 8-bit text (such as text
714
// that includes foreign characters) into 7-bit US ASCII, creating a
715
// document that is mostly readable by humans, even in its encoded
716
// form. All MIME compliant applications can decode quoted-printable
717
// text, though they may not necessarily be able to properly display the
718
// document as it was originally intended. As quoted-printable encoding
719
// is implemented most commonly, printable ASCII characters (values 33
720
// through 126, excluding 61), tabs and spaces that do not appear at the
721
// end of lines, and end-of-line characters are not encoded. Other
722
// characters are represented by an equal sign (=) immediately followed
723
// by that character's hexadecimal value. Lines that are longer than 76
724
// characters are shortened by line breaks, with the equal sign marking
725
// where the breaks occurred.
726
//
727
// Note that QuotedPrintableUnescape is different from 'Q'-encoding as
728
// defined in rfc2047. In particular, This does not treat '_'s as spaces.
729
// See QEncodingUnescape().
730
// ----------------------------------------------------------------------
731
732
0
int QuotedPrintableUnescape(const char* source, int slen, char* dest, int szdest) {
733
0
    char* d = dest;
734
0
    const char* p = source;
735
736
0
    while (p < source + slen && *p != '\0' && d < dest + szdest) {
737
0
        switch (*p) {
738
0
        case '=':
739
            // If it's valid, convert to hex and insert or remove line-wrap.
740
            // In the case of line-wrap removal, we allow LF as well as CRLF.
741
0
            if (p < source + slen - 1) {
742
0
                if (p[1] == '\n') {
743
0
                    p++;
744
0
                } else if (p < source + slen - 2) {
745
0
                    if (ascii_isxdigit(p[1]) && ascii_isxdigit(p[2])) {
746
0
                        *d++ = hex_digit_to_int(p[1]) * 16 + hex_digit_to_int(p[2]);
747
0
                        p += 2;
748
0
                    } else if (p[1] == '\r' && p[2] == '\n') {
749
0
                        p += 2;
750
0
                    }
751
0
                }
752
0
            }
753
0
            p++;
754
0
            break;
755
0
        default:
756
0
            *d++ = *p++;
757
0
            break;
758
0
        }
759
0
    }
760
0
    return (d - dest);
761
0
}
762
763
// ----------------------------------------------------------------------
764
// int QEncodingUnescape()
765
//
766
// This is very similar to QuotedPrintableUnescape except that we convert
767
// '_'s into spaces. (See RFC 2047)
768
// ----------------------------------------------------------------------
769
0
int QEncodingUnescape(const char* source, int slen, char* dest, int szdest) {
770
0
    char* d = dest;
771
0
    const char* p = source;
772
773
0
    while (p < source + slen && *p != '\0' && d < dest + szdest) {
774
0
        switch (*p) {
775
0
        case '=':
776
            // If it's valid, convert to hex and insert or remove line-wrap.
777
            // In the case of line-wrap removal, the assumption is that this
778
            // is an RFC-compliant message with lines terminated by CRLF.
779
0
            if (p < source + slen - 2) {
780
0
                if (ascii_isxdigit(p[1]) && ascii_isxdigit(p[2])) {
781
0
                    *d++ = hex_digit_to_int(p[1]) * 16 + hex_digit_to_int(p[2]);
782
0
                    p += 2;
783
0
                } else if (p[1] == '\r' && p[2] == '\n') {
784
0
                    p += 2;
785
0
                }
786
0
            }
787
0
            p++;
788
0
            break;
789
0
        case '_': // According to rfc2047, _'s are to be treated as spaces
790
0
            *d++ = ' ';
791
0
            p++;
792
0
            break;
793
0
        default:
794
0
            *d++ = *p++;
795
0
            break;
796
0
        }
797
0
    }
798
0
    return (d - dest);
799
0
}
800
801
0
// Returns the number of characters produced by base64-encoding 'input_len'
// bytes.  Every complete group of three input bytes becomes four output
// characters.  A trailing partial group (RFC 3548) becomes:
//   1 leftover byte  -> 2 characters, plus "==" when 'do_padding' is set
//   2 leftover bytes -> 3 characters, plus "="  when 'do_padding' is set
int CalculateBase64EscapedLen(int input_len, bool do_padding) {
    const int full_groups = input_len / 3;
    const int leftover = input_len % 3;

    int len = full_groups * 4;
    if (leftover == 1) {
        len += do_padding ? 4 : 2;
    } else if (leftover != 0) {
        len += do_padding ? 4 : 3;
    }

    assert(len >= input_len); // make sure we didn't overflow
    return len;
}

// Base64Escape does padding, so this calculation includes padding.
int CalculateBase64EscapedLen(int input_len) {
    const bool kWithPadding = true;
    return CalculateBase64EscapedLen(input_len, kWithPadding);
}
851
852
// ----------------------------------------------------------------------
853
// int Base64Unescape() - base64 decoder
854
// int Base64Escape() - base64 encoder
855
// int WebSafeBase64Unescape() - Google's variation of base64 decoder
856
// int WebSafeBase64Escape() - Google's variation of base64 encoder
857
//
858
// Check out
859
// http://www.cis.ohio-state.edu/htbin/rfc/rfc2045.html for formal
860
// description, but what we care about is that...
861
//   Take the encoded stuff in groups of 4 characters and turn each
862
//   character into a code 0 to 63 thus:
863
//           A-Z map to 0 to 25
864
//           a-z map to 26 to 51
865
//           0-9 map to 52 to 61
866
//           +(- for WebSafe) maps to 62
867
//           /(_ for WebSafe) maps to 63
868
//   There will be four numbers, all less than 64 which can be represented
869
//   by a 6 digit binary number (aaaaaa, bbbbbb, cccccc, dddddd respectively).
870
//   Arrange the 6 digit binary numbers into three bytes as such:
871
//   aaaaaabb bbbbcccc ccdddddd
872
//   Equals signs (one or two) are used at the end of the encoded block to
873
//   indicate that the text was not an integer multiple of three bytes long.
874
// In the sorted variation, we instead use the mapping
875
//           .   maps to 0
876
//           0-9 map to 1-10
877
//           A-Z map to 11-37
878
//           _   maps to 38
879
//           a-z map to 39-63
880
// This mapping has the property that the output will be sorted in the same
881
// order as the input, i.e. a < b iff map(a) < map(b). It is web-safe and
882
// filename-safe.
883
// ----------------------------------------------------------------------
884
885
int Base64UnescapeInternal(const char* src, int szsrc, char* dest, int szdest,
886
0
                           const signed char* unbase64) {
887
0
    static const char kPad64 = '=';
888
889
0
    int decode = 0;
890
0
    int destidx = 0;
891
0
    int state = 0;
892
0
    unsigned int ch = 0;
893
0
    unsigned int temp = 0;
894
895
    // The GET_INPUT macro gets the next input character, skipping
896
    // over any whitespace, and stopping when we reach the end of the
897
    // string or when we read any non-data character.  The arguments are
898
    // an arbitrary identifier (used as a label for goto) and the number
899
    // of data bytes that must remain in the input to avoid aborting the
900
    // loop.
901
0
#define GET_INPUT(label, remain)                              \
902
0
    label:                                                    \
903
0
    --szsrc;                                                  \
904
0
    ch = *src++;                                              \
905
0
    decode = unbase64[ch];                                    \
906
0
    if (decode < 0) {                                         \
907
0
        if (ascii_isspace(ch) && szsrc >= remain) goto label; \
908
0
        state = 4 - remain;                                   \
909
0
        break;                                                \
910
0
    }
911
912
    // if dest is null, we're just checking to see if it's legal input
913
    // rather than producing output.  (I suspect this could just be done
914
    // with a regexp...).  We duplicate the loop so this test can be
915
    // outside it instead of in every iteration.
916
917
0
    if (dest) {
918
        // This loop consumes 4 input bytes and produces 3 output bytes
919
        // per iteration.  We can't know at the start that there is enough
920
        // data left in the string for a full iteration, so the loop may
921
        // break out in the middle; if so 'state' will be set to the
922
        // number of input bytes read.
923
924
0
        while (szsrc >= 4) {
925
            // We'll start by optimistically assuming that the next four
926
            // bytes of the string (src[0..3]) are four good data bytes
927
            // (that is, no nulls, whitespace, padding chars, or illegal
928
            // chars).  We need to test src[0..2] for nulls individually
929
            // before constructing temp to preserve the property that we
930
            // never read past a null in the string (no matter how long
931
            // szsrc claims the string is).
932
933
0
            if (!src[0] || !src[1] || !src[2] ||
934
0
                (temp = ((unbase64[src[0]] << 18) | (unbase64[src[1]] << 12) |
935
0
                         (unbase64[src[2]] << 6) | (unbase64[src[3]]))) &
936
0
                        0x80000000) {
937
                // Iff any of those four characters was bad (null, illegal,
938
                // whitespace, padding), then temp's high bit will be set
939
                // (because unbase64[] is -1 for all bad characters).
940
                //
941
                // We'll back up and resort to the slower decoder, which knows
942
                // how to handle those cases.
943
944
0
                GET_INPUT(first, 4);
945
0
                temp = decode;
946
0
                GET_INPUT(second, 3);
947
0
                temp = (temp << 6) | decode;
948
0
                GET_INPUT(third, 2);
949
0
                temp = (temp << 6) | decode;
950
0
                GET_INPUT(fourth, 1);
951
0
                temp = (temp << 6) | decode;
952
0
            } else {
953
                // We really did have four good data bytes, so advance four
954
                // characters in the string.
955
956
0
                szsrc -= 4;
957
0
                src += 4;
958
0
                decode = -1;
959
0
                ch = '\0';
960
0
            }
961
962
            // temp has 24 bits of input, so write that out as three bytes.
963
964
0
            if (destidx + 3 > szdest) return -1;
965
0
            dest[destidx + 2] = temp;
966
0
            temp >>= 8;
967
0
            dest[destidx + 1] = temp;
968
0
            temp >>= 8;
969
0
            dest[destidx] = temp;
970
0
            destidx += 3;
971
0
        }
972
0
    } else {
973
0
        while (szsrc >= 4) {
974
0
            if (!src[0] || !src[1] || !src[2] ||
975
0
                (temp = ((unbase64[src[0]] << 18) | (unbase64[src[1]] << 12) |
976
0
                         (unbase64[src[2]] << 6) | (unbase64[src[3]]))) &
977
0
                        0x80000000) {
978
0
                GET_INPUT(first_no_dest, 4);
979
0
                GET_INPUT(second_no_dest, 3);
980
0
                GET_INPUT(third_no_dest, 2);
981
0
                GET_INPUT(fourth_no_dest, 1);
982
0
            } else {
983
0
                szsrc -= 4;
984
0
                src += 4;
985
0
                decode = -1;
986
0
                ch = '\0';
987
0
            }
988
0
            destidx += 3;
989
0
        }
990
0
    }
991
992
0
#undef GET_INPUT
993
994
    // if the loop terminated because we read a bad character, return
995
    // now.
996
0
    if (decode < 0 && ch != '\0' && ch != kPad64 && !ascii_isspace(ch)) return -1;
997
998
0
    if (ch == kPad64) {
999
        // if we stopped by hitting an '=', un-read that character -- we'll
1000
        // look at it again when we count to check for the proper number of
1001
        // equals signs at the end.
1002
0
        ++szsrc;
1003
0
        --src;
1004
0
    } else {
1005
        // This loop consumes 1 input byte per iteration.  It's used to
1006
        // clean up the 0-3 input bytes remaining when the first, faster
1007
        // loop finishes.  'temp' contains the data from 'state' input
1008
        // characters read by the first loop.
1009
0
        while (szsrc > 0) {
1010
0
            --szsrc;
1011
0
            ch = *src++;
1012
0
            decode = unbase64[ch];
1013
0
            if (decode < 0) {
1014
0
                if (ascii_isspace(ch)) {
1015
0
                    continue;
1016
0
                } else if (ch == '\0') {
1017
0
                    break;
1018
0
                } else if (ch == kPad64) {
1019
                    // back up one character; we'll read it again when we check
1020
                    // for the correct number of equals signs at the end.
1021
0
                    ++szsrc;
1022
0
                    --src;
1023
0
                    break;
1024
0
                } else {
1025
0
                    return -1;
1026
0
                }
1027
0
            }
1028
1029
            // Each input character gives us six bits of output.
1030
0
            temp = (temp << 6) | decode;
1031
0
            ++state;
1032
0
            if (state == 4) {
1033
                // If we've accumulated 24 bits of output, write that out as
1034
                // three bytes.
1035
0
                if (dest) {
1036
0
                    if (destidx + 3 > szdest) return -1;
1037
0
                    dest[destidx + 2] = temp;
1038
0
                    temp >>= 8;
1039
0
                    dest[destidx + 1] = temp;
1040
0
                    temp >>= 8;
1041
0
                    dest[destidx] = temp;
1042
0
                }
1043
0
                destidx += 3;
1044
0
                state = 0;
1045
0
                temp = 0;
1046
0
            }
1047
0
        }
1048
0
    }
1049
1050
    // Process the leftover data contained in 'temp' at the end of the input.
1051
0
    int expected_equals = 0;
1052
0
    switch (state) {
1053
0
    case 0:
1054
        // Nothing left over; output is a multiple of 3 bytes.
1055
0
        break;
1056
1057
0
    case 1:
1058
        // Bad input; we have 6 bits left over.
1059
0
        return -1;
1060
1061
0
    case 2:
1062
        // Produce one more output byte from the 12 input bits we have left.
1063
0
        if (dest) {
1064
0
            if (destidx + 1 > szdest) return -1;
1065
0
            temp >>= 4;
1066
0
            dest[destidx] = temp;
1067
0
        }
1068
0
        ++destidx;
1069
0
        expected_equals = 2;
1070
0
        break;
1071
1072
0
    case 3:
1073
        // Produce two more output bytes from the 18 input bits we have left.
1074
0
        if (dest) {
1075
0
            if (destidx + 2 > szdest) return -1;
1076
0
            temp >>= 2;
1077
0
            dest[destidx + 1] = temp;
1078
0
            temp >>= 8;
1079
0
            dest[destidx] = temp;
1080
0
        }
1081
0
        destidx += 2;
1082
0
        expected_equals = 1;
1083
0
        break;
1084
1085
0
    default:
1086
        // state should have no other values at this point.
1087
0
        LOG(FATAL) << "This can't happen; base64 decoder state = " << state;
1088
0
    }
1089
1090
    // The remainder of the string should be all whitespace, mixed with
1091
    // exactly 0 equals signs, or exactly 'expected_equals' equals
1092
    // signs.  (Always accepting 0 equals signs is a google extension
1093
    // not covered in the RFC.)
1094
1095
0
    int equals = 0;
1096
0
    while (szsrc > 0 && *src) {
1097
0
        if (*src == kPad64)
1098
0
            ++equals;
1099
0
        else if (!ascii_isspace(*src))
1100
0
            return -1;
1101
0
        --szsrc;
1102
0
        ++src;
1103
0
    }
1104
1105
0
    return (equals == 0 || equals == expected_equals) ? destidx : -1;
1106
0
}
1107
1108
// The arrays below were generated by the following code
1109
// #include <sys/time.h>
1110
// #include <stdlib.h>
1111
// #include <string.h>
1112
// main()
1113
// {
1114
//   static const char Base64[] =
1115
//     "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
1116
//   char *pos;
1117
//   int idx, i, j;
1118
//   printf("    ");
1119
//   for (i = 0; i < 255; i += 8) {
1120
//     for (j = i; j < i + 8; j++) {
1121
//       pos = strchr(Base64, j);
1122
//       if ((pos == NULL) || (j == 0))
1123
//         idx = -1;
1124
//       else
1125
//         idx = pos - Base64;
1126
//       if (idx == -1)
1127
//         printf(" %2d,     ", idx);
1128
//       else
1129
//         printf(" %2d/*%c*/,", idx, j);
1130
//     }
1131
//     printf("\n    ");
1132
//   }
1133
// }
1134
//
1135
// where the value of "Base64[]" was replaced by one of the base-64 conversion
1136
// tables from the functions below.
1137
// Reverse lookup for the standard base64 alphabet: kUnBase64[c] is the 6-bit
// value of byte c, or -1 when c is not a data character.  256 entries, laid
// out 16 per row; the row comment gives the byte range covered.
static const signed char kUnBase64[] = {
        // 0x00 - 0x0F
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        // 0x10 - 0x1F
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        // 0x20 - 0x2F: '+' -> 62, '/' -> 63
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1, -1, 63,
        // 0x30 - 0x3F: '0'..'9' -> 52..61
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
        // 0x40 - 0x4F: 'A'..'O' -> 0..14
        -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
        // 0x50 - 0x5F: 'P'..'Z' -> 15..25
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, -1,
        // 0x60 - 0x6F: 'a'..'o' -> 26..40
        -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
        // 0x70 - 0x7F: 'p'..'z' -> 41..51
        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
        // 0x80 - 0xFF: never valid
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
1167
// Reverse lookup for the web-safe base64 alphabet ('-' and '_' in place of
// '+' and '/'): kUnWebSafeBase64[c] is the 6-bit value of byte c, or -1 when
// c is not a data character.  256 entries, laid out 16 per row.
static const signed char kUnWebSafeBase64[] = {
        // 0x00 - 0x0F
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        // 0x10 - 0x1F
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        // 0x20 - 0x2F: '-' -> 62
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 62, -1, -1,
        // 0x30 - 0x3F: '0'..'9' -> 52..61
        52, 53, 54, 55, 56, 57, 58, 59, 60, 61, -1, -1, -1, -1, -1, -1,
        // 0x40 - 0x4F: 'A'..'O' -> 0..14
        -1, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
        // 0x50 - 0x5F: 'P'..'Z' -> 15..25, '_' -> 63
        15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, -1, -1, -1, -1, 63,
        // 0x60 - 0x6F: 'a'..'o' -> 26..40
        -1, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
        // 0x70 - 0x7F: 'p'..'z' -> 41..51
        41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, -1, -1, -1, -1, -1,
        // 0x80 - 0xFF: never valid
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1};
1197
1198
0
int Base64Unescape(const char* src, int szsrc, char* dest, int szdest) {
1199
0
    return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnBase64);
1200
0
}
1201
1202
0
int WebSafeBase64Unescape(const char* src, int szsrc, char* dest, int szdest) {
1203
0
    return Base64UnescapeInternal(src, szsrc, dest, szdest, kUnWebSafeBase64);
1204
0
}
1205
1206
static bool Base64UnescapeInternal(const char* src, int slen, string* dest,
1207
0
                                   const signed char* unbase64) {
1208
    // Determine the size of the output string.  Base64 encodes every 3 bytes into
1209
    // 4 characters.  any leftover chars are added directly for good measure.
1210
    // This is documented in the base64 RFC: http://www.ietf.org/rfc/rfc3548.txt
1211
0
    const int dest_len = 3 * (slen / 4) + (slen % 4);
1212
1213
0
    dest->clear();
1214
0
    dest->resize(dest_len);
1215
1216
    // We are getting the destination buffer by getting the beginning of the
1217
    // string and converting it into a char *.
1218
0
    const int len =
1219
0
            Base64UnescapeInternal(src, slen, string_as_array(dest), dest->size(), unbase64);
1220
0
    if (len < 0) {
1221
0
        dest->clear();
1222
0
        return false;
1223
0
    }
1224
1225
    // could be shorter if there was padding
1226
0
    DCHECK_LE(len, dest_len);
1227
0
    dest->resize(len);
1228
1229
0
    return true;
1230
0
}
1231
1232
0
bool Base64Unescape(const char* src, int slen, string* dest) {
1233
0
    return Base64UnescapeInternal(src, slen, dest, kUnBase64);
1234
0
}
1235
1236
0
bool WebSafeBase64Unescape(const char* src, int slen, string* dest) {
1237
0
    return Base64UnescapeInternal(src, slen, dest, kUnWebSafeBase64);
1238
0
}
1239
1240
// Encodes 'szsrc' bytes from 'src' into 'dest' (capacity 'szdest') using the
// 64-character alphabet 'base64'.  When 'do_padding' is true a final partial
// group is padded with '=' to four characters.  No terminating NUL is written.
//
// Returns the number of characters written.  Returns 0 when 'szsrc' <= 0 and
// also when 'dest' is too small (in which case part of 'dest' may already
// have been written).
int Base64EscapeInternal(const unsigned char* src, int szsrc, char* dest, int szdest,
                         const char* base64, bool do_padding) {
    static const char kPad64 = '=';

    if (szsrc <= 0) return 0;

    char* cur_dest = dest;
    const unsigned char* cur_src = src;

    // Three bytes of data encode to four characters, so whole three-byte
    // chunks can be converted independently of each other.
    while (szsrc > 2) {
        if ((szdest -= 4) < 0) return 0;
        cur_dest[0] = base64[cur_src[0] >> 2];
        cur_dest[1] = base64[((cur_src[0] & 0x03) << 4) + (cur_src[1] >> 4)];
        cur_dest[2] = base64[((cur_src[1] & 0x0f) << 2) + (cur_src[2] >> 6)];
        cur_dest[3] = base64[cur_src[2] & 0x3f];

        cur_dest += 4;
        cur_src += 3;
        szsrc -= 3;
    }

    // szsrc started positive and only ever drops by 3 while it is > 2, so it
    // is now 0, 1 or 2; no other value needs handling.
    if (szsrc == 1) {
        // One byte left: two characters, and (optionally) two pad characters.
        if ((szdest -= 2) < 0) return 0;
        cur_dest[0] = base64[cur_src[0] >> 2];
        cur_dest[1] = base64[(cur_src[0] & 0x03) << 4];
        cur_dest += 2;
        if (do_padding) {
            if ((szdest -= 2) < 0) return 0;
            cur_dest[0] = kPad64;
            cur_dest[1] = kPad64;
            cur_dest += 2;
        }
    } else if (szsrc == 2) {
        // Two bytes left: three characters, and (optionally) one pad character.
        if ((szdest -= 3) < 0) return 0;
        cur_dest[0] = base64[cur_src[0] >> 2];
        cur_dest[1] = base64[((cur_src[0] & 0x03) << 4) + (cur_src[1] >> 4)];
        cur_dest[2] = base64[(cur_src[1] & 0x0f) << 2];
        cur_dest += 3;
        if (do_padding) {
            if ((szdest -= 1) < 0) return 0;
            cur_dest[0] = kPad64;
            cur_dest += 1;
        }
    }
    return static_cast<int>(cur_dest - dest);
}
1304
1305
// Standard base64 alphabet: the character at index i encodes the 6-bit value i.
static const char kBase64Chars[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ" // values  0..25
        "abcdefghijklmnopqrstuvwxyz" // values 26..51
        "0123456789"                 // values 52..61
        "+/";                        // values 62, 63

// Web-safe variant: identical except that '-' and '_' stand for 62 and 63.
static const char kWebSafeBase64Chars[] =
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ" // values  0..25
        "abcdefghijklmnopqrstuvwxyz" // values 26..51
        "0123456789"                 // values 52..61
        "-_";                        // values 62, 63
1310
1311
0
int Base64Escape(const unsigned char* src, int szsrc, char* dest, int szdest) {
1312
0
    return Base64EscapeInternal(src, szsrc, dest, szdest, kBase64Chars, true);
1313
0
}
1314
int WebSafeBase64Escape(const unsigned char* src, int szsrc, char* dest, int szdest,
1315
0
                        bool do_padding) {
1316
0
    return Base64EscapeInternal(src, szsrc, dest, szdest, kWebSafeBase64Chars, do_padding);
1317
0
}
1318
1319
void Base64EscapeInternal(const unsigned char* src, int szsrc, string* dest, bool do_padding,
1320
0
                          const char* base64_chars) {
1321
0
    const int calc_escaped_size = CalculateBase64EscapedLen(szsrc, do_padding);
1322
0
    dest->clear();
1323
0
    dest->resize(calc_escaped_size, '\0');
1324
0
    const int escaped_len = Base64EscapeInternal(src, szsrc, string_as_array(dest), dest->size(),
1325
0
                                                 base64_chars, do_padding);
1326
0
    DCHECK_EQ(calc_escaped_size, escaped_len);
1327
0
}
1328
1329
0
void Base64Escape(const unsigned char* src, int szsrc, string* dest, bool do_padding) {
1330
0
    Base64EscapeInternal(src, szsrc, dest, do_padding, kBase64Chars);
1331
0
}
1332
1333
0
void WebSafeBase64Escape(const unsigned char* src, int szsrc, string* dest, bool do_padding) {
1334
0
    Base64EscapeInternal(src, szsrc, dest, do_padding, kWebSafeBase64Chars);
1335
0
}
1336
1337
0
void Base64Escape(const string& src, string* dest) {
1338
0
    Base64Escape(reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest, true);
1339
0
}
1340
1341
0
void WebSafeBase64Escape(const string& src, string* dest) {
1342
0
    WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest,
1343
0
                        false);
1344
0
}
1345
1346
0
void WebSafeBase64EscapeWithPadding(const string& src, string* dest) {
1347
0
    WebSafeBase64Escape(reinterpret_cast<const unsigned char*>(src.data()), src.size(), dest, true);
1348
0
}
1349
1350
// Returns true iff c may appear in Base 32 text as handled here: an upper-case
// letter 'A'..'Z', a digit '2'..'7', or the padding character '='.
bool ValidBase32Byte(char c) {
    const bool is_letter = 'A' <= c && c <= 'Z';
    const bool is_digit = '2' <= c && c <= '7';
    const bool is_padding = (c == '=');
    return is_letter || is_digit || is_padding;
}
1354
1355
// kBase32NumUnescapedBytes[N] is the number of unescaped bytes represented by
// N Base32 escaped characters, for N in 0..8 (a full group of 8 characters
// carries 5 bytes).  In valid input N is one of 0, 2, 4, 5, 7 or 8.
//
// The remaining values of N (1, 3, 6) cannot occur in valid input; they map
// to 5 so that a buffer sized from this table is never too small.
//
// See http://tools.ietf.org/html/rfc4648#section-6 for details.
static const int kBase32NumUnescapedBytes[9] = {
        0, // 0 characters
        5, // 1 character  (invalid, sized for safety)
        1, // 2 characters
        5, // 3 characters (invalid, sized for safety)
        2, // 4 characters
        3, // 5 characters
        5, // 6 characters (invalid, sized for safety)
        4, // 7 characters
        5, // 8 characters
};
1366
1367
0
int Base32Unescape(const char* src, int slen, char* dest, int szdest) {
1368
0
    int destidx = 0;
1369
0
    char escaped_bytes[8];
1370
0
    unsigned char unescaped_bytes[5];
1371
0
    while (slen > 0) {
1372
        // Collect the next 8 escaped bytes and convert to upper case.  If there
1373
        // are less than 8 bytes left, pad with '=', but keep track of the number
1374
        // of non-padded bytes for later.
1375
0
        int non_padded_len = 8;
1376
0
        for (int i = 0; i < 8; ++i) {
1377
0
            escaped_bytes[i] = (i < slen) ? ascii_toupper(src[i]) : '=';
1378
0
            if (!ValidBase32Byte(escaped_bytes[i])) {
1379
0
                return -1;
1380
0
            }
1381
            // Stop counting escaped bytes at first '='.
1382
0
            if (escaped_bytes[i] == '=' && non_padded_len == 8) {
1383
0
                non_padded_len = i;
1384
0
            }
1385
0
        }
1386
1387
        // Convert the 8 escaped bytes to 5 unescaped bytes and copy to dest.
1388
0
        EightBase32DigitsToFiveBytes(escaped_bytes, unescaped_bytes);
1389
0
        const int num_unescaped = kBase32NumUnescapedBytes[non_padded_len];
1390
0
        for (int i = 0; i < num_unescaped; ++i) {
1391
0
            if (destidx == szdest) {
1392
                // No more room in dest, so terminate early.
1393
0
                return -1;
1394
0
            }
1395
0
            dest[destidx] = unescaped_bytes[i];
1396
0
            ++destidx;
1397
0
        }
1398
0
        src += 8;
1399
0
        slen -= 8;
1400
0
    }
1401
0
    return destidx;
1402
0
}
1403
1404
0
bool Base32Unescape(const char* src, int slen, string* dest) {
1405
    // Determine the size of the output string.
1406
0
    const int dest_len = 5 * (slen / 8) + kBase32NumUnescapedBytes[slen % 8];
1407
1408
0
    dest->clear();
1409
0
    dest->resize(dest_len);
1410
1411
    // We are getting the destination buffer by getting the beginning of the
1412
    // string and converting it into a char *.
1413
0
    const int len = Base32Unescape(src, slen, string_as_array(dest), dest->size());
1414
0
    if (len < 0) {
1415
0
        dest->clear();
1416
0
        return false;
1417
0
    }
1418
1419
    // Could be shorter if there was padding.
1420
0
    DCHECK_LE(len, dest_len);
1421
0
    dest->resize(len);
1422
1423
0
    return true;
1424
0
}
1425
1426
// Encodes exactly five input bytes as eight base32 digits.
//
//   in_bytes  five bytes to encode
//   out       receives eight characters (no terminator is written)
//   alphabet  32-entry table mapping a 5-bit value to its digit
//
// The five bytes form one 40-bit block which is cut into eight 5-bit words,
// most significant bits first:
//
//     |:::::::|:::::::|:::::::|:::::::|:::::::
//     +----+----+----+----+----+----+----+----
//
void GeneralFiveBytesToEightBase32Digits(const unsigned char* in_bytes, char* out,
                                         const char* alphabet) {
    // Gather the 40 bits into a single integer, first byte on top.
    unsigned long long block = 0;
    for (int i = 0; i < 5; ++i) {
        block = (block << 8) | in_bytes[i];
    }

    // Peel off 5-bit words from the top: shifts are 35, 30, ..., 0.
    for (int digit = 0; digit < 8; ++digit) {
        const int shift = 35 - 5 * digit;
        out[digit] = alphabet[(block >> shift) & 0x1F];
    }
}
1445
1446
static int GeneralBase32Escape(const unsigned char* src, size_t szsrc, char* dest, size_t szdest,
1447
0
                               const char* alphabet) {
1448
0
    static const char kPad32 = '=';
1449
1450
0
    if (szsrc == 0) return 0;
1451
1452
0
    char* cur_dest = dest;
1453
0
    const unsigned char* cur_src = src;
1454
1455
    // Five bytes of data encodes to eight characters of cyphertext.
1456
    // So we can pump through three-byte chunks atomically.
1457
0
    while (szsrc > 4) { // keep going until we have less than 40 bits
1458
0
        if (szdest < 8) return 0;
1459
0
        szdest -= 8;
1460
1461
0
        GeneralFiveBytesToEightBase32Digits(cur_src, cur_dest, alphabet);
1462
1463
0
        cur_dest += 8;
1464
0
        cur_src += 5;
1465
0
        szsrc -= 5;
1466
0
    }
1467
1468
    // Now deal with the tail (<=4 bytes).
1469
0
    if (szsrc > 0) {
1470
0
        if (szdest < 8) return 0;
1471
0
        szdest -= 8;
1472
0
        unsigned char last_chunk[5];
1473
0
        memcpy(last_chunk, cur_src, szsrc);
1474
1475
0
        for (size_t i = szsrc; i < 5; ++i) {
1476
0
            last_chunk[i] = '\0';
1477
0
        }
1478
1479
0
        GeneralFiveBytesToEightBase32Digits(last_chunk, cur_dest, alphabet);
1480
0
        int filled = (szsrc * 8) / 5 + 1;
1481
0
        cur_dest += filled;
1482
1483
        // Add on the padding.
1484
0
        for (int i = 0; i < (8 - filled); ++i) {
1485
0
            *(cur_dest++) = kPad32;
1486
0
        }
1487
0
    }
1488
1489
0
    return cur_dest - dest;
1490
0
}
1491
1492
0
static bool GeneralBase32Escape(const string& src, string* dest, const char* alphabet) {
1493
0
    const int max_escaped_size = CalculateBase32EscapedLen(src.length());
1494
0
    dest->clear();
1495
0
    dest->resize(max_escaped_size + 1, '\0');
1496
0
    const int escaped_len =
1497
0
            GeneralBase32Escape(reinterpret_cast<const unsigned char*>(src.c_str()), src.length(),
1498
0
                                &*dest->begin(), dest->size(), alphabet);
1499
1500
0
    DCHECK_LE(max_escaped_size, escaped_len);
1501
1502
0
    if (escaped_len < 0) {
1503
0
        dest->clear();
1504
0
        return false;
1505
0
    }
1506
1507
0
    dest->resize(escaped_len);
1508
0
    return true;
1509
0
}
1510
1511
// Standard base32 digits: values 0-25 map to 'A'-'Z', values 26-31 map to
// '2'-'7'. Exactly 32 entries, no terminating NUL.
static const char Base32Alphabet[] = {
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', //  0 ..  7
        'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', //  8 .. 15
        'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', // 16 .. 23
        'Y', 'Z', '2', '3', '4', '5', '6', '7', // 24 .. 31
};
1514
1515
0
// Base32-encodes `szsrc` bytes from `src` into `dest` (capacity `szdest`)
// with the standard alphabet (Base32Alphabet). Forwards to
// GeneralBase32Escape(), which returns the number of characters written and
// returns 0 both for empty input and when `dest` is too small.
int Base32Escape(const unsigned char* src, size_t szsrc, char* dest, size_t szdest) {
1516
0
    return GeneralBase32Escape(src, szsrc, dest, szdest, Base32Alphabet);
1517
0
}
1518
1519
0
// String overload: base32-encodes `src` into `*dest` with the standard
// alphabet (Base32Alphabet) by forwarding to the string overload of
// GeneralBase32Escape() and returning its result.
bool Base32Escape(const string& src, string* dest) {
1520
0
    return GeneralBase32Escape(src, dest, Base32Alphabet);
1521
0
}
1522
1523
0
// Encodes the five bytes at `in_bytes` as eight base32 digits written to
// `out`, using the standard alphabet (Base32Alphabet). No terminator is
// written.
void FiveBytesToEightBase32Digits(const unsigned char* in_bytes, char* out) {
1524
0
    GeneralFiveBytesToEightBase32Digits(in_bytes, out, Base32Alphabet);
1525
0
}
1526
1527
// "Base32hex" digits: values 0-9 map to '0'-'9', values 10-31 map to
// 'A'-'V'. Exactly 32 entries, no terminating NUL.
static const char Base32HexAlphabet[] = {
        '0', '1', '2', '3', '4', '5', '6', '7', //  0 ..  7
        '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', //  8 .. 15
        'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', // 16 .. 23
        'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', // 24 .. 31
};
1531
1532
0
// Same as the raw Base32Escape() overload but uses Base32HexAlphabet
// ('0'-'9', 'A'-'V') for the digits. Forwards to GeneralBase32Escape(),
// which returns the number of characters written and returns 0 both for
// empty input and when `dest` is too small.
int Base32HexEscape(const unsigned char* src, size_t szsrc, char* dest, size_t szdest) {
1533
0
    return GeneralBase32Escape(src, szsrc, dest, szdest, Base32HexAlphabet);
1534
0
}
1535
1536
0
// String overload: encodes `src` into `*dest` using Base32HexAlphabet by
// forwarding to the string overload of GeneralBase32Escape() and returning
// its result.
bool Base32HexEscape(const string& src, string* dest) {
1537
0
    return GeneralBase32Escape(src, dest, Base32HexAlphabet);
1538
0
}
1539
1540
0
int CalculateBase32EscapedLen(size_t input_len) {
1541
0
    DCHECK_LE(input_len, numeric_limits<size_t>::max() / 8);
1542
0
    size_t intermediate_result = 8 * input_len + 4;
1543
0
    size_t len = intermediate_result / 5;
1544
0
    len = (len + 7) & ~7;
1545
0
    return len;
1546
0
}
1547
1548
// ----------------------------------------------------------------------
1549
// EightBase32DigitsToTenHexDigits()
1550
//   Converts an 8-digit base32 string to a 10-digit hex string.
1551
//
1552
//   *in must point to 8 base32 digits.
1553
//   *out must point to 10 bytes.
1554
//
1555
//   Base32 uses A-Z,2-7 to represent the numbers 0-31.
1556
//   See RFC3548 at http://www.ietf.org/rfc/rfc3548.txt
1557
//   for details on base32.
1558
// ----------------------------------------------------------------------
1559
1560
0
// Two-step conversion: decode the eight base32 digits at `in` into five raw
// bytes, then write those bytes to `out` as ten hex characters via b2a_hex().
void EightBase32DigitsToTenHexDigits(const char* in, char* out) {
1561
0
    unsigned char bytes[5];
1562
0
    EightBase32DigitsToFiveBytes(in, bytes);
1563
0
    b2a_hex(bytes, out, 5);
1564
0
}
1565
1566
0
// Decodes eight base32 digits into five raw bytes.
//
//   in         eight characters drawn from 'A'-'Z', '2'-'7' or the padding
//              character '=' (which decodes as value 0)
//   bytes_out  receives five bytes
//
// No validation is done here. Any other character maps to the filler value
// 99 and yields meaningless output, so callers must validate beforehand.
void EightBase32DigitsToFiveBytes(const char* in, unsigned char* bytes_out) {
    // Inverse of the standard alphabet, indexed by unsigned character code.
    static const unsigned char kInverseAlphabet[256] = {
            // 0x00 - 0x2F
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            // 0x30 - 0x3F: '2'..'7' -> 26..31, '=' -> 0
            99, 99, 26, 27, 28, 29, 30, 31, 99, 99, 99, 99, 99, 0, 99, 99,
            // 0x40 - 0x4F: 'A'..'O' -> 0..14
            99, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
            // 0x50 - 0x5F: 'P'..'Z' -> 15..25
            15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 99, 99, 99, 99, 99,
            // 0x60 - 0xFF
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99,
            99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99, 99};

    // Look up all eight digit values first. The index goes through
    // unsigned char so that a byte >= 0x80 cannot become a negative index
    // on platforms where plain char is signed.
    unsigned int d[8];
    for (int i = 0; i < 8; ++i) {
        d[i] = kInverseAlphabet[static_cast<unsigned char>(in[i])];
    }

    // Repack eight 5-bit values into five 8-bit bytes. Bits shifted past
    // bit 7 are discarded by the narrowing cast.
    bytes_out[0] = static_cast<unsigned char>(d[0] << 3 | d[1] >> 2);
    bytes_out[1] = static_cast<unsigned char>(d[1] << 6 | d[2] << 1 | d[3] >> 4);
    bytes_out[2] = static_cast<unsigned char>(d[3] << 4 | d[4] >> 1);
    bytes_out[3] = static_cast<unsigned char>(d[4] << 7 | d[5] << 2 | d[6] >> 3);
    bytes_out[4] = static_cast<unsigned char>(d[6] << 5 | d[7]);
}
1614
1615
// ----------------------------------------------------------------------
1616
// TenHexDigitsToEightBase32Digits()
1617
//   Converts a 10-digit hex string to an 8-digit base32 string.
1618
//
1619
//   *in must point to 10 hex digits.
1620
//   *out must point to 8 bytes.
1621
//
1622
//   See RFC3548 at http://www.ietf.org/rfc/rfc3548.txt
1623
//   for details on base32.
1624
// ----------------------------------------------------------------------
1625
0
// Two-step conversion: parse the ten hex characters at `in` into five raw
// bytes with a2b_hex(), then encode those bytes as eight base32 digits
// written to `out`.
void TenHexDigitsToEightBase32Digits(const char* in, char* out) {
1626
0
    unsigned char bytes[5];
1627
1628
    // Convert hex to raw bytes.
1629
0
    a2b_hex(in, bytes, 5);
1630
0
    FiveBytesToEightBase32Digits(bytes, out);
1631
0
}
1632
1633
// ----------------------------------------------------------------------
1634
// EscapeFileName / UnescapeFileName
1635
// ----------------------------------------------------------------------
1636
// Characters that EscapeFileName() copies through unchanged: ASCII letters,
// digits, and '-', '_', '.'.
static const Charmap escape_file_name_exceptions(
1637
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ" // letters
1638
        "0123456789"                                           // digits
1639
        "-_.");
1640
1641
0
void EscapeFileName(const StringPiece& src, string* dst) {
1642
    // Reserve at least src.size() chars
1643
0
    dst->reserve(dst->size() + src.size());
1644
1645
0
    for (char c : src) {
1646
        // We do not use "isalpha" because we want the behavior to be
1647
        // independent of the current locale settings.
1648
0
        if (escape_file_name_exceptions.contains(c)) {
1649
0
            dst->push_back(c);
1650
1651
0
        } else if (c == '/') {
1652
0
            dst->push_back('~');
1653
1654
0
        } else {
1655
0
            char tmp[2];
1656
0
            b2a_hex(reinterpret_cast<const unsigned char*>(&c), tmp, 1);
1657
0
            dst->push_back('%');
1658
0
            dst->append(tmp, 2);
1659
0
        }
1660
0
    }
1661
0
}
1662
1663
0
void UnescapeFileName(const StringPiece& src_piece, string* dst) {
1664
0
    const char* src = src_piece.data();
1665
0
    const int len = src_piece.size();
1666
0
    for (int i = 0; i < len; ++i) {
1667
0
        const char c = src[i];
1668
0
        if (c == '~') {
1669
0
            dst->push_back('/');
1670
1671
0
        } else if ((c == '%') && (i + 2 < len)) {
1672
0
            unsigned char tmp[1];
1673
0
            a2b_hex(src + i + 1, &tmp[0], 1);
1674
0
            dst->push_back(tmp[0]);
1675
0
            i += 2;
1676
1677
0
        } else {
1678
0
            dst->push_back(c);
1679
0
        }
1680
0
    }
1681
0
}
1682
1683
// Maps a character code to the value of the hex digit it represents.
// '0'-'9' give 0-9; 'A'-'F' and 'a'-'f' give 10-15. Every other character
// maps to 0, so non-hex input is silently read as zero.
static char hex_value[256] = {
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x00
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x10
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x20
        0, 1,  2,  3,  4,  5,  6,  7, 8, 9, 0, 0, 0, 0, 0, 0, // 0x30: '0'..'9'
        0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x40: 'A'..'F'
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x50
        0, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x60: 'a'..'f'
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x70
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x80
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0x90
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xA0
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xB0
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xC0
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xD0
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0, // 0xE0
        0, 0,  0,  0,  0,  0,  0,  0, 0, 0, 0, 0, 0, 0, 0, 0  // 0xF0
};

// Lower-case hex digit for each value 0-15.
static char hex_char[] = "0123456789abcdef";
1698
1699
// This is a templated function so that T can be either a char*
1700
// or a string.  This works because we use the [] operator to access
1701
// individual characters at a time.
1702
template <typename T>
1703
0
void a2b_hex_t(const char* a, T b, int num) {
1704
0
    for (int i = 0; i < num; i++) {
1705
0
        b[i] = (hex_value[a[i * 2] & 0xFF] << 4) + (hex_value[a[i * 2 + 1] & 0xFF]);
1706
0
    }
1707
0
}
Unexecuted instantiation: _ZN7strings9a2b_hex_tIPhEEvPKcT_i
Unexecuted instantiation: _ZN7strings9a2b_hex_tIPcEEvPKcT_i
Unexecuted instantiation: _ZN7strings9a2b_hex_tIRNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEEvPKcT_i
1708
1709
0
// Packs a string of binary digits into bytes, eight digits per byte.
//
// A '0' character is a clear bit; any other character is a set bit. When
// `byte_order_msb` is true the first digit of each group lands in bit 7,
// otherwise in bit 0. The result has ceil(a.size() / 8) bytes; digits missing
// from the last group are treated as clear. Reading stops for good at the
// first NUL character, and the remaining output bytes are zero.
string a2b_bin(const string& a, bool byte_order_msb) {
    const int total_bytes = (a.size() + 7) / 8;
    const char* cursor = a.c_str();

    string packed;
    for (int n = 0; n < total_bytes; ++n) {
        unsigned char byte = 0;
        for (int bit = 0; bit < 8 && *cursor != '\0'; ++bit) {
            const bool is_set = (*cursor++ != '0');
            if (is_set) {
                const int shift = byte_order_msb ? 7 - bit : bit;
                byte |= (1 << shift);
            }
        }
        packed.append(1, byte);
    }
    return packed;
}
1726
1727
// This is a templated function so that T can be either a char*
1728
// or a string.  This works because we use the [] operator to access
1729
// individual characters at a time.
1730
template <typename T>
1731
0
void b2a_hex_t(const unsigned char* b, T a, int num) {
1732
0
    for (int i = 0; i < num; i++) {
1733
0
        a[i * 2 + 0] = hex_char[b[i] >> 4];
1734
0
        a[i * 2 + 1] = hex_char[b[i] & 0xf];
1735
0
    }
1736
0
}
Unexecuted instantiation: _ZN7strings9b2a_hex_tIPcEEvPKhT_i
Unexecuted instantiation: _ZN7strings9b2a_hex_tIRNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEEEEvPKhT_i
1737
1738
0
// Expands every byte of `b` into eight '0'/'1' characters. When
// `byte_order_msb` is true bit 7 is emitted first, otherwise bit 0 is.
string b2a_bin(const string& b, bool byte_order_msb) {
    string bits;
    for (string::const_iterator it = b.begin(); it != b.end(); ++it) {
        const char byte = *it;
        for (int k = 0; k < 8; ++k) {
            const int shift = byte_order_msb ? 7 - k : k;
            const bool is_set = (byte & (1 << shift)) != 0;
            bits.append(1, is_set ? '1' : '0');
        }
    }
    return bits;
}
1748
1749
0
// Writes `num` bytes from `b` as 2 * `num` lower-case hex characters into the
// raw buffer `a`. No terminator is written.
void b2a_hex(const unsigned char* b, char* a, int num) {
1750
0
    b2a_hex_t<char*>(b, a, num);
1751
0
}
1752
1753
0
// Parses 2 * `num` hex characters from `a` into `num` bytes stored in the
// unsigned buffer `b`. Non-hex characters are read as 0 (see hex_value).
void a2b_hex(const char* a, unsigned char* b, int num) {
1754
0
    a2b_hex_t<unsigned char*>(a, b, num);
1755
0
}
1756
1757
0
// Parses 2 * `num` hex characters from `a` into `num` bytes stored in the
// char buffer `b`. Non-hex characters are read as 0 (see hex_value).
void a2b_hex(const char* a, char* b, int num) {
1758
0
    a2b_hex_t<char*>(a, b, num);
1759
0
}
1760
1761
0
// Returns the lower-case hex encoding of the `len` bytes at `b`; the result
// is 2 * `len` characters long.
string b2a_hex(const char* b, int len) {
1762
0
    string result;
1763
0
    // Two output characters per input byte.
    result.resize(len << 1);
1764
0
    b2a_hex_t<string&>(reinterpret_cast<const unsigned char*>(b), result, len);
1765
0
    return result;
1766
0
}
1767
1768
0
// Returns the lower-case hex encoding of the bytes viewed by `b`.
string b2a_hex(const StringPiece& b) {
1769
0
    return b2a_hex(b.data(), b.size());
1770
0
}
1771
1772
0
// Returns the bytes spelled by the hex string `a`. Only a.size() / 2 bytes
// are produced, so a trailing unpaired character is ignored.
string a2b_hex(const string& a) {
1773
0
    string result;
1774
0
    a2b_hex(a.c_str(), &result, a.size() / 2);
1775
1776
0
    return result;
1777
0
}
1778
1779
0
// Replaces `*to` with the lower-case hex encoding of the `num` bytes at
// `from`; `*to` ends up 2 * `num` characters long.
void b2a_hex(const unsigned char* from, string* to, int num) {
1780
0
    to->resize(num << 1);
1781
0
    b2a_hex_t<string&>(from, *to, num);
1782
0
}
1783
1784
0
// Replaces `*to` with the `num` bytes spelled by the 2 * `num` hex characters
// at `from`. Non-hex characters are read as 0 (see hex_value).
void a2b_hex(const char* from, string* to, int num) {
1785
0
    to->resize(num);
1786
0
    a2b_hex_t<string&>(from, *to, num);
1787
0
}
1788
1789
// Characters that ShellEscape() treats as safe: a non-empty string made up
// only of these is returned without any quoting.
const char* kDontNeedShellEscapeChars =
1790
        "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_.=/:,@";
1791
1792
0
// Returns `src` quoted for use as a single shell word.
//
//   - Non-empty and made up only of kDontNeedShellEscapeChars: returned as-is.
//   - Otherwise, if it contains no single quote: wrapped in single quotes
//     (this is also what the empty string gets).
//   - Otherwise: wrapped in double quotes with a backslash placed before each
//     backslash, '$', '"' and '`'.
string ShellEscape(StringPiece src) {
    const bool is_plain = !src.empty() &&
                          src.find_first_not_of(kDontNeedShellEscapeChars) == StringPiece::npos;
    if (is_plain) {
        return src.ToString();
    }

    if (src.find('\'') == StringPiece::npos) {
        return StrCat("'", src, "'");
    }

    string quoted = "\"";
    for (const char ch : src) {
        if (ch == '\\' || ch == '$' || ch == '"' || ch == '`') {
            quoted.push_back('\\');
        }
        quoted.push_back(ch);
    }
    quoted.push_back('"');
    return quoted;
}
1817
1818
// Two lower-case hex characters for every byte value: the encoding of byte
// value v is at kHexTable[2 * v] and kHexTable[2 * v + 1]. 512 characters
// plus the terminating NUL.
static const char kHexTable[513] =
1819
        "000102030405060708090a0b0c0d0e0f"
1820
        "101112131415161718191a1b1c1d1e1f"
1821
        "202122232425262728292a2b2c2d2e2f"
1822
        "303132333435363738393a3b3c3d3e3f"
1823
        "404142434445464748494a4b4c4d4e4f"
1824
        "505152535455565758595a5b5c5d5e5f"
1825
        "606162636465666768696a6b6c6d6e6f"
1826
        "707172737475767778797a7b7c7d7e7f"
1827
        "808182838485868788898a8b8c8d8e8f"
1828
        "909192939495969798999a9b9c9d9e9f"
1829
        "a0a1a2a3a4a5a6a7a8a9aaabacadaeaf"
1830
        "b0b1b2b3b4b5b6b7b8b9babbbcbdbebf"
1831
        "c0c1c2c3c4c5c6c7c8c9cacbcccdcecf"
1832
        "d0d1d2d3d4d5d6d7d8d9dadbdcdddedf"
1833
        "e0e1e2e3e4e5e6e7e8e9eaebecedeeef"
1834
        "f0f1f2f3f4f5f6f7f8f9fafbfcfdfeff";
1835
1836
//------------------------------------------------------------------------
1837
// ByteStringToAscii
1838
//  Reads at most bytes_to_read from binary_string and prints it to
1839
//  ascii_string in downcased hex.
1840
//------------------------------------------------------------------------
1841
0
void ByteStringToAscii(string const& binary_string, int bytes_to_read, string* ascii_string) {
1842
0
    if (binary_string.size() < bytes_to_read) {
1843
0
        bytes_to_read = binary_string.size();
1844
0
    }
1845
1846
0
    CHECK_GE(bytes_to_read, 0);
1847
0
    ascii_string->resize(bytes_to_read * 2);
1848
1849
0
    string::const_iterator in = binary_string.begin();
1850
0
    string::iterator out = ascii_string->begin();
1851
1852
0
    for (int i = 0; i < bytes_to_read; i++) {
1853
0
        *out++ = kHexTable[(*in) * 2];
1854
0
        *out++ = kHexTable[(*in) * 2 + 1];
1855
0
        ++in;
1856
0
    }
1857
0
}
1858
1859
//------------------------------------------------------------------------
1860
// ByteStringFromAscii
1861
//  Converts the hex from ascii_string into binary data and
1862
//  writes the binary data into binary_string.
1863
//  Empty input successfully converts to empty output.
1864
//  Returns false and may modify output if it is
1865
//  unable to parse the hex string.
1866
//------------------------------------------------------------------------
1867
0
bool ByteStringFromAscii(string const& hex_string, string* binary_string) {
1868
0
    binary_string->clear();
1869
1870
0
    if ((hex_string.size() % 2) != 0) {
1871
0
        return false;
1872
0
    }
1873
1874
0
    int value = 0;
1875
0
    for (int i = 0; i < hex_string.size(); i++) {
1876
0
        char c = hex_string[i];
1877
1878
0
        if (!ascii_isxdigit(c)) {
1879
0
            return false;
1880
0
        }
1881
1882
0
        if (ascii_isdigit(c)) {
1883
0
            value += c - '0';
1884
0
        } else if (ascii_islower(c)) {
1885
0
            value += 10 + c - 'a';
1886
0
        } else {
1887
0
            value += 10 + c - 'A';
1888
0
        }
1889
1890
0
        if (i & 1) {
1891
0
            binary_string->push_back(value);
1892
0
            value = 0;
1893
0
        } else {
1894
0
            value <<= 4;
1895
0
        }
1896
0
    }
1897
1898
0
    return true;
1899
0
}
1900
1901
// ----------------------------------------------------------------------
1902
// CleanStringLineEndings()
1903
//   Clean up a multi-line string to conform to Unix line endings.
1904
//   Reads from src and appends to dst, so usually dst should be empty.
1905
//
1906
//   If there is no line ending at the end of a non-empty string, it can
1907
//   be added automatically.
1908
//
1909
//   Four different types of input are correctly handled:
1910
//
1911
//     - Unix/Linux files: line ending is LF, pass through unchanged
1912
//
1913
//     - DOS/Windows files: line ending is CRLF: convert to LF
1914
//
1915
//     - Legacy Mac files: line ending is CR: convert to LF
1916
//
1917
//     - Garbled files: random line endings, convert gracefully
1918
//                      lonely CR, lonely LF, CRLF: convert to LF
1919
//
1920
//   @param src The multi-line string to convert
1921
//   @param dst The converted string is appended to this string
1922
//   @param auto_end_last_line Automatically terminate the last line
1923
//
1924
//   Limitations:
1925
//
1926
//     This does not do the right thing for CRCRLF files created by
1927
//     broken programs that do another Unix->DOS conversion on files
1928
//     that are already in CRLF format.  For this, a brute-force two-pass
1929
//     approach would be needed that
1930
//
1931
//       (1) determines the presence of LF (first one is ok)
1932
//       (2) if yes, removes any CR, else convert every CR to LF
1933
1934
0
void CleanStringLineEndings(const string& src, string* dst, bool auto_end_last_line) {
1935
0
    if (dst->empty()) {
1936
0
        dst->append(src);
1937
0
        CleanStringLineEndings(dst, auto_end_last_line);
1938
0
    } else {
1939
0
        string tmp = src;
1940
0
        CleanStringLineEndings(&tmp, auto_end_last_line);
1941
0
        dst->append(tmp);
1942
0
    }
1943
0
}
1944
1945
0
// Normalises the line endings of `*str` in place: "\r\n", a lone "\r" and a
// lone "\n" each become a single "\n". The text is compacted within its own
// buffer (output never runs ahead of input) and the string is resized at the
// end. If `auto_end_last_line` is true and the non-empty result does not end
// in '\n', one is appended.
void CleanStringLineEndings(string* str, bool auto_end_last_line) {
1946
0
    // Next position to write in the compacted output.
    int output_pos = 0;
1947
0
    // True while a '\r' has been consumed but its '\n' not yet emitted.
    bool r_seen = false;
1948
0
    int len = str->size();
1949
1950
0
    char* p = string_as_array(str);
1951
1952
0
    for (int input_pos = 0; input_pos < len;) {
1953
0
        // Fast path: only when no '\r' is pending and more than 8 bytes
        // remain, examine 8 bytes at once.
        if (!r_seen && input_pos + 8 < len) {
1954
0
            uint64 v = UNALIGNED_LOAD64(p + input_pos);
1955
            // Loop over groups of 8 bytes at a time until we come across
1956
            // a word that has a byte whose value is less than or equal to
1957
            // '\r' (i.e. could contain a \n (0x0a) or a \r (0x0d) ).
1958
            //
1959
            // We use a has_less macro that quickly tests a whole 64-bit
1960
            // word to see if any of the bytes has a value < N.
1961
            //
1962
            // For more details, see:
1963
            //   http://graphics.stanford.edu/~seander/bithacks.html#HasLessInWord
1964
0
#define has_less(x, n) (((x) - ~0ULL / 255 * (n)) & ~(x) & ~0ULL / 255 * 128)
1965
0
            if (!has_less(v, '\r' + 1)) {
1966
0
#undef has_less
1967
                // No byte in this word has a value that could be a \r or a \n
1968
0
                // Copy the word down only if earlier bytes were dropped.
                if (output_pos != input_pos) UNALIGNED_STORE64(p + output_pos, v);
1969
0
                input_pos += 8;
1970
0
                output_pos += 8;
1971
0
                continue;
1972
0
            }
1973
0
        }
1974
0
        // Slow path: handle a single byte.
        string::const_reference in = p[input_pos];
1975
0
        if (in == '\r') {
1976
0
            // A second '\r' in a row: the pending one was a line ending of
            // its own, so emit its '\n' now.
            if (r_seen) p[output_pos++] = '\n';
1977
0
            r_seen = true;
1978
0
        } else if (in == '\n') {
1979
0
            // Covers both a lone '\n' and the '\n' of "\r\n": exactly one
            // '\n' is produced and any pending '\r' is absorbed.
            if (input_pos != output_pos)
1980
0
                p[output_pos++] = '\n';
1981
0
            else
1982
0
                output_pos++;
1983
0
            r_seen = false;
1984
0
        } else {
1985
0
            // Ordinary byte: a pending lone '\r' becomes '\n' first.
            if (r_seen) p[output_pos++] = '\n';
1986
0
            r_seen = false;
1987
0
            if (input_pos != output_pos)
1988
0
                p[output_pos++] = in;
1989
0
            else
1990
0
                output_pos++;
1991
0
        }
1992
0
        input_pos++;
1993
0
    }
1994
0
    // Emit a final '\n' for a trailing '\r', or when the caller asked for the
    // last line to be terminated and it is not.
    if (r_seen || (auto_end_last_line && output_pos > 0 && p[output_pos - 1] != '\n')) {
1995
0
        str->resize(output_pos + 1);
1996
0
        str->operator[](output_pos) = '\n';
1997
0
    } else if (output_pos < len) {
1998
0
        // Bytes were dropped: trim the string to the compacted length.
        str->resize(output_pos);
1999
0
    }
2000
0
}
2001
2002
} // namespace strings