be/src/storage/itoken_extractor.cpp

Source
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

#include "storage/itoken_extractor.h"

#include <stdint.h>

#include "util/simd/vstring_function.h"

namespace doris {

/// Reports the n-gram that begins at byte offset `*pos` and then moves `*pos` forward by one
/// UTF-8 code point, so that repeated calls walk over overlapping n-grams.
///
/// @param data          input bytes; only the first `length` bytes are ever read.
/// @param length        number of valid bytes in `data`.
/// @param pos           in/out: byte offset of the n-gram to extract; advanced by the byte
///                      length of the code point found at the old offset (never past `length`).
/// @param token_start   out: set to the incoming value of `*pos`.
/// @param token_length  out: number of bytes covered by the code points that were collected.
/// @return true when exactly `n` code points were collected, false when the input ran out first
///         (the outputs then describe the shorter tail).
///
/// The byte length announced by a lead byte is clamped to the bytes actually remaining, so a
/// truncated multi-byte sequence at the end of the buffer can no longer produce a token that
/// extends past `data + length`. Results for well-formed UTF-8 are unchanged.
bool NgramTokenExtractor::next_in_string(const char* data, size_t length, size_t* __restrict pos,
                                         size_t* __restrict token_start,
                                         size_t* __restrict token_length) const {
    const size_t begin = *pos;
    size_t consumed = 0;
    size_t code_points = 0;

    while (code_points < n && begin + consumed < length) {
        const size_t offset = begin + consumed;
        size_t sz = get_utf8_byte_length(static_cast<uint8_t>(data[offset]));
        // A lead byte may claim more bytes than the buffer still holds (truncated input).
        const size_t remaining = length - offset;
        if (sz > remaining) {
            sz = remaining;
        }
        consumed += sz;
        ++code_points;
    }

    *token_start = begin;
    *token_length = consumed;

    // Slide the window by one code point. Skip the read entirely when `begin` is already at
    // or beyond the end: data[begin] would be out of bounds.
    if (begin < length) {
        size_t step = get_utf8_byte_length(static_cast<uint8_t>(data[begin]));
        const size_t remaining = length - begin;
        if (step > remaining) {
            step = remaining;
        }
        *pos = begin + step;
    }

    return code_points == n;
}

/// Extracts the next n-gram of literal characters from a LIKE-style pattern.
///
/// Scanning starts at byte offset `*pos`. Pattern syntax handled here:
///   - an unescaped `%` or `_` is a wildcard: the partially built token is discarded and the
///     scan restarts right after the wildcard (`*pos` is moved there as well);
///   - `\` escapes the next character; an escaped `%`, `_` or `\` is appended literally, and
///     for any other escaped character the backslash is simply dropped;
///   - everything else is copied one UTF-8 code point at a time.
///
/// @param data    pattern bytes; only the first `length` bytes are read.
/// @param length  number of valid bytes in `data`.
/// @param pos     in/out: byte offset to start from. On success it is advanced by the byte
///                length of the code point at the position where the returned token started.
/// @param token   out: cleared on entry; holds the n-gram bytes (escapes removed) on success.
///                On failure it may hold fewer than `n` code points.
/// @return true as soon as `n` code points have been gathered, false if the pattern ends first.
///
/// The byte length announced by a lead byte is clamped to the bytes remaining, so a truncated
/// multi-byte sequence at the end of the pattern is not read past `data + length`.
bool NgramTokenExtractor::next_in_string_like(const char* data, size_t length, size_t* pos,
                                              std::string& token) const {
    token.clear();

    size_t code_points = 0;
    bool escaped = false;
    for (size_t i = *pos; i < length;) {
        const char c = data[i];
        if (escaped && (c == '%' || c == '_' || c == '\\')) {
            /// Escaped metacharacter: it is part of the literal text.
            token += c;
            ++code_points;
            escaped = false;
            ++i;
        } else if (!escaped && (c == '%' || c == '_')) {
            /// This token is too small, go to the next.
            token.clear();
            code_points = 0;
            escaped = false;
            *pos = ++i;
        } else if (!escaped && c == '\\') {
            /// Remember the escape; the next iteration decides what it applies to.
            escaped = true;
            ++i;
        } else {
            size_t sz = get_utf8_byte_length(static_cast<uint8_t>(c));
            /// Never copy more bytes than the pattern still contains (truncated sequence).
            const size_t remaining = length - i;
            if (sz > remaining) {
                sz = remaining;
            }
            token.append(data + i, sz);
            i += sz;
            ++code_points;
            escaped = false;
        }

        if (code_points == n) {
            /// For n >= 1 at least one character was consumed at or after *pos inside this
            /// loop, hence *pos < length and the read below is in bounds.
            *pos += get_utf8_byte_length(static_cast<uint8_t>(data[*pos]));
            return true;
        }
    }

    return false;
}
} // namespace doris

Line	Count	Source
1		// Licensed to the Apache Software Foundation (ASF) under one
2		// or more contributor license agreements. See the NOTICE file
3		// distributed with this work for additional information
4		// regarding copyright ownership. The ASF licenses this file
5		// to you under the Apache License, Version 2.0 (the
6		// "License"); you may not use this file except in compliance
7		// with the License. You may obtain a copy of the License at
8		//
9		// http://www.apache.org/licenses/LICENSE-2.0
10		//
11		// Unless required by applicable law or agreed to in writing,
12		// software distributed under the License is distributed on an
13		// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14		// KIND, either express or implied. See the License for the
15		// specific language governing permissions and limitations
16		// under the License.
17
18		#include "storage/itoken_extractor.h"
19
20		#include <stdint.h>
21
22		#include "util/simd/vstring_function.h"
23
24		namespace doris {
25
26		bool NgramTokenExtractor::next_in_string(const char* data, size_t length, size_t* __restrict pos,
27		size_t* __restrict token_start,
28	1.01M	size_t* __restrict token_length) const {
29	1.01M	*token_start = *pos;
30	1.01M	*token_length = 0;
31	1.01M	size_t code_points = 0;
32	9.59M	for (; code_points < n && *token_start + *token_length < length; ++code_points) {
33	8.58M	size_t sz = get_utf8_byte_length(static_cast<uint8_t>(data[*token_start + *token_length]));
34	8.58M	*token_length += sz;
35	8.58M	}
36	1.01M	*pos += get_utf8_byte_length(static_cast<uint8_t>(data[*pos]));
37	1.01M	return code_points == n;
38	1.01M	}
39
40		bool NgramTokenExtractor::next_in_string_like(const char* data, size_t length, size_t* pos,
41	419	std::string& token) const {
42	419	token.clear();
43
44	419	size_t code_points = 0;
45	419	bool escaped = false;
46	1.50k	for (size_t i = *pos; i < length;) {
47	1.45k	if (escaped && (data[i] == '%' || data[i] == '_' || data[i] == '\\')) {
48	5	token += data[i];
49	5	++code_points;
50	5	escaped = false;
51	5	++i;
52	1.45k	} else if (!escaped && (data[i] == '%' || data[i] == '_')) {
53		/// This token is too small, go to the next.
54	57	token.clear();
55	57	code_points = 0;
56	57	escaped = false;
57	57	*pos = ++i;
58	1.39k	} else if (!escaped && data[i] == '\\') {
59	5	escaped = true;
60	5	++i;
61	1.39k	} else {
62	1.39k	const size_t sz = get_utf8_byte_length(static_cast<uint8_t>(data[i]));
63	2.82k	for (size_t j = 0; j < sz; ++j) {
64	1.43k	token += data[i + j];
65	1.43k	}
66	1.39k	i += sz;
67	1.39k	++code_points;
68	1.39k	escaped = false;
69	1.39k	}
70
71	1.45k	if (code_points == n) {
72	369	*pos += get_utf8_byte_length(static_cast<uint8_t>(data[*pos]));
73	369	return true;
74	369	}
75	1.45k	}
76
77	50	return false;
78	419	}
79		} // namespace doris

Coverage Report

Created: 2026-03-16 13:13