be/src/exprs/function/function_encode_varchar.cpp
Line | Count | Source |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | #include <cstddef> |
19 | | #include <limits> |
20 | | #include <type_traits> |
21 | | |
22 | | #include "common/exception.h" |
23 | | #include "common/status.h" |
24 | | #include "core/column/column_const.h" |
25 | | #include "core/column/column_vector.h" |
26 | | #include "core/data_type/data_type.h" |
27 | | #include "core/data_type/data_type_number.h" |
28 | | #include "core/data_type/data_type_string.h" |
29 | | #include "core/data_type/primitive_type.h" |
30 | | #include "core/types.h" |
31 | | #include "exprs/function/function.h" |
32 | | #include "exprs/function/function_helpers.h" |
33 | | #include "exprs/function/simple_function_factory.h" |
34 | | #include "util/simd/reverse_copy_bytes.h" |
35 | | |
36 | | namespace doris { |
37 | | |
38 | | struct EncodeAsSmallInt { |
39 | | static constexpr auto name = "encode_as_smallint"; |
40 | | }; |
41 | | |
42 | | struct EncodeAsInt { |
43 | | static constexpr auto name = "encode_as_int"; |
44 | | }; |
45 | | |
46 | | struct EncodeAsBigInt { |
47 | | static constexpr auto name = "encode_as_bigint"; |
48 | | }; |
49 | | |
50 | | struct EncodeAsLargeInt { |
51 | | static constexpr auto name = "encode_as_largeint"; |
52 | | }; |
53 | | |
54 | | template <typename Name, PrimitiveType ReturnType> |
55 | | class FunctionEncodeVarchar : public IFunction { |
56 | | public: |
57 | | static constexpr auto name = Name::name; |
58 | 72 | static FunctionPtr create() { return std::make_shared<FunctionEncodeVarchar>(); }_ZN5doris21FunctionEncodeVarcharINS_16EncodeAsSmallIntELNS_13PrimitiveTypeE4EE6createEv Line | Count | Source | 58 | 7 | static FunctionPtr create() { return std::make_shared<FunctionEncodeVarchar>(); } |
_ZN5doris21FunctionEncodeVarcharINS_11EncodeAsIntELNS_13PrimitiveTypeE5EE6createEv Line | Count | Source | 58 | 11 | static FunctionPtr create() { return std::make_shared<FunctionEncodeVarchar>(); } |
_ZN5doris21FunctionEncodeVarcharINS_14EncodeAsBigIntELNS_13PrimitiveTypeE6EE6createEv Line | Count | Source | 58 | 19 | static FunctionPtr create() { return std::make_shared<FunctionEncodeVarchar>(); } |
_ZN5doris21FunctionEncodeVarcharINS_16EncodeAsLargeIntELNS_13PrimitiveTypeE7EE6createEv Line | Count | Source | 58 | 35 | static FunctionPtr create() { return std::make_shared<FunctionEncodeVarchar>(); } |
|
59 | | |
60 | 4 | String get_name() const override { return name; }_ZNK5doris21FunctionEncodeVarcharINS_16EncodeAsSmallIntELNS_13PrimitiveTypeE4EE8get_nameB5cxx11Ev Line | Count | Source | 60 | 1 | String get_name() const override { return name; } |
_ZNK5doris21FunctionEncodeVarcharINS_11EncodeAsIntELNS_13PrimitiveTypeE5EE8get_nameB5cxx11Ev Line | Count | Source | 60 | 1 | String get_name() const override { return name; } |
_ZNK5doris21FunctionEncodeVarcharINS_14EncodeAsBigIntELNS_13PrimitiveTypeE6EE8get_nameB5cxx11Ev Line | Count | Source | 60 | 1 | String get_name() const override { return name; } |
_ZNK5doris21FunctionEncodeVarcharINS_16EncodeAsLargeIntELNS_13PrimitiveTypeE7EE8get_nameB5cxx11Ev Line | Count | Source | 60 | 1 | String get_name() const override { return name; } |
|
61 | | |
62 | 64 | size_t get_number_of_arguments() const override { return 1; }_ZNK5doris21FunctionEncodeVarcharINS_16EncodeAsSmallIntELNS_13PrimitiveTypeE4EE23get_number_of_argumentsEv Line | Count | Source | 62 | 5 | size_t get_number_of_arguments() const override { return 1; } |
_ZNK5doris21FunctionEncodeVarcharINS_11EncodeAsIntELNS_13PrimitiveTypeE5EE23get_number_of_argumentsEv Line | Count | Source | 62 | 9 | size_t get_number_of_arguments() const override { return 1; } |
_ZNK5doris21FunctionEncodeVarcharINS_14EncodeAsBigIntELNS_13PrimitiveTypeE6EE23get_number_of_argumentsEv Line | Count | Source | 62 | 17 | size_t get_number_of_arguments() const override { return 1; } |
_ZNK5doris21FunctionEncodeVarcharINS_16EncodeAsLargeIntELNS_13PrimitiveTypeE7EE23get_number_of_argumentsEv Line | Count | Source | 62 | 33 | size_t get_number_of_arguments() const override { return 1; } |
|
63 | | |
64 | 64 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { |
65 | 64 | return std::make_shared<typename PrimitiveTypeTraits<ReturnType>::DataType>(); |
66 | 64 | } _ZNK5doris21FunctionEncodeVarcharINS_16EncodeAsSmallIntELNS_13PrimitiveTypeE4EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 64 | 5 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 65 | 5 | return std::make_shared<typename PrimitiveTypeTraits<ReturnType>::DataType>(); | 66 | 5 | } |
_ZNK5doris21FunctionEncodeVarcharINS_11EncodeAsIntELNS_13PrimitiveTypeE5EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 64 | 9 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 65 | 9 | return std::make_shared<typename PrimitiveTypeTraits<ReturnType>::DataType>(); | 66 | 9 | } |
_ZNK5doris21FunctionEncodeVarcharINS_14EncodeAsBigIntELNS_13PrimitiveTypeE6EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 64 | 17 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 65 | 17 | return std::make_shared<typename PrimitiveTypeTraits<ReturnType>::DataType>(); | 66 | 17 | } |
_ZNK5doris21FunctionEncodeVarcharINS_16EncodeAsLargeIntELNS_13PrimitiveTypeE7EE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS8_EE Line | Count | Source | 64 | 33 | DataTypePtr get_return_type_impl(const DataTypes& arguments) const override { | 65 | 33 | return std::make_shared<typename PrimitiveTypeTraits<ReturnType>::DataType>(); | 66 | 33 | } |
|
67 | | |
68 | | Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments, |
69 | 64 | uint32_t result, size_t input_rows_count) const override { |
70 | 64 | const ColumnString* col_str = |
71 | 64 | assert_cast<const ColumnString*>(block.get_by_position(arguments[0]).column.get()); |
72 | | |
73 | | // max_row_byte_size = size of string + size of offset value |
74 | 64 | size_t max_str_size = col_str->get_max_row_byte_size() - sizeof(UInt32); |
75 | | |
76 | 64 | if (max_str_size > sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType) - 1) { |
77 | 4 | return Status::InternalError( |
78 | 4 | "String is too long to encode, input string size {}, max valid string " |
79 | 4 | "size for {} is {}", |
80 | 4 | max_str_size, name, |
81 | 4 | sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType) - 1); |
82 | 4 | } |
83 | | |
84 | 60 | auto col_res = PrimitiveTypeTraits<ReturnType>::ColumnType::create(input_rows_count, 0); |
85 | 60 | auto& col_res_data = col_res->get_data(); |
86 | | |
87 | 245k | for (size_t i = 0; i < input_rows_count; ++i) { |
88 | 245k | const char* str_ptr = col_str->get_data_at(i).data; |
89 | 245k | auto str_size = static_cast<UInt8>(col_str->get_data_at(i).size); |
90 | 245k | auto* res = &col_res_data[i]; |
91 | 245k | auto* __restrict ui8_ptr = reinterpret_cast<UInt8*>(res); |
92 | | |
93 | | // "reverse" the order of string on little endian machine. |
94 | 245k | simd::reverse_copy_bytes(ui8_ptr, |
95 | 245k | sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType), |
96 | 245k | str_ptr, str_size); |
97 | | // The lowest byte of the integer stores the size of the string, left-shifted by 1 bit so that we get |
98 | | // the correct size after right-shifting by 1 |
99 | 245k | memset(ui8_ptr, str_size << 1, 1); |
100 | 245k | *res >>= 1; |
101 | | // operator &= can not be applied to Int128 |
102 | 245k | *res = *res & |
103 | 245k | std::numeric_limits<typename PrimitiveTypeTraits<ReturnType>::CppType>::max(); |
104 | 245k | } |
105 | | |
106 | 60 | block.get_by_position(result).column = std::move(col_res); |
107 | | |
108 | 60 | return Status::OK(); |
109 | 64 | } _ZNK5doris21FunctionEncodeVarcharINS_16EncodeAsSmallIntELNS_13PrimitiveTypeE4EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 69 | 5 | uint32_t result, size_t input_rows_count) const override { | 70 | 5 | const ColumnString* col_str = | 71 | 5 | assert_cast<const ColumnString*>(block.get_by_position(arguments[0]).column.get()); | 72 | | | 73 | | // max_row_byte_size = size of string + size of offset value | 74 | 5 | size_t max_str_size = col_str->get_max_row_byte_size() - sizeof(UInt32); | 75 | | | 76 | 5 | if (max_str_size > sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType) - 1) { | 77 | 1 | return Status::InternalError( | 78 | 1 | "String is too long to encode, input string size {}, max valid string " | 79 | 1 | "size for {} is {}", | 80 | 1 | max_str_size, name, | 81 | 1 | sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType) - 1); | 82 | 1 | } | 83 | | | 84 | 4 | auto col_res = PrimitiveTypeTraits<ReturnType>::ColumnType::create(input_rows_count, 0); | 85 | 4 | auto& col_res_data = col_res->get_data(); | 86 | | | 87 | 16.3k | for (size_t i = 0; i < input_rows_count; ++i) { | 88 | 16.3k | const char* str_ptr = col_str->get_data_at(i).data; | 89 | 16.3k | auto str_size = static_cast<UInt8>(col_str->get_data_at(i).size); | 90 | 16.3k | auto* res = &col_res_data[i]; | 91 | 16.3k | auto* __restrict ui8_ptr = reinterpret_cast<UInt8*>(res); | 92 | | | 93 | | // "reverse" the order of string on little endian machine. 
| 94 | 16.3k | simd::reverse_copy_bytes(ui8_ptr, | 95 | 16.3k | sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType), | 96 | 16.3k | str_ptr, str_size); | 97 | | // The lowest byte of the integer stores the size of the string, left-shifted by 1 bit so that we get | 98 | | // the correct size after right-shifting by 1 | 99 | 16.3k | memset(ui8_ptr, str_size << 1, 1); | 100 | 16.3k | *res >>= 1; | 101 | | // operator &= can not be applied to Int128 | 102 | 16.3k | *res = *res & | 103 | 16.3k | std::numeric_limits<typename PrimitiveTypeTraits<ReturnType>::CppType>::max(); | 104 | 16.3k | } | 105 | | | 106 | 4 | block.get_by_position(result).column = std::move(col_res); | 107 | | | 108 | 4 | return Status::OK(); | 109 | 5 | } |
_ZNK5doris21FunctionEncodeVarcharINS_11EncodeAsIntELNS_13PrimitiveTypeE5EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 69 | 9 | uint32_t result, size_t input_rows_count) const override { | 70 | 9 | const ColumnString* col_str = | 71 | 9 | assert_cast<const ColumnString*>(block.get_by_position(arguments[0]).column.get()); | 72 | | | 73 | | // max_row_byte_size = size of string + size of offset value | 74 | 9 | size_t max_str_size = col_str->get_max_row_byte_size() - sizeof(UInt32); | 75 | | | 76 | 9 | if (max_str_size > sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType) - 1) { | 77 | 1 | return Status::InternalError( | 78 | 1 | "String is too long to encode, input string size {}, max valid string " | 79 | 1 | "size for {} is {}", | 80 | 1 | max_str_size, name, | 81 | 1 | sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType) - 1); | 82 | 1 | } | 83 | | | 84 | 8 | auto col_res = PrimitiveTypeTraits<ReturnType>::ColumnType::create(input_rows_count, 0); | 85 | 8 | auto& col_res_data = col_res->get_data(); | 86 | | | 87 | 32.7k | for (size_t i = 0; i < input_rows_count; ++i) { | 88 | 32.7k | const char* str_ptr = col_str->get_data_at(i).data; | 89 | 32.7k | auto str_size = static_cast<UInt8>(col_str->get_data_at(i).size); | 90 | 32.7k | auto* res = &col_res_data[i]; | 91 | 32.7k | auto* __restrict ui8_ptr = reinterpret_cast<UInt8*>(res); | 92 | | | 93 | | // "reverse" the order of string on little endian machine. 
| 94 | 32.7k | simd::reverse_copy_bytes(ui8_ptr, | 95 | 32.7k | sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType), | 96 | 32.7k | str_ptr, str_size); | 97 | | // The lowest byte of the integer stores the size of the string, left-shifted by 1 bit so that we get | 98 | | // the correct size after right-shifting by 1 | 99 | 32.7k | memset(ui8_ptr, str_size << 1, 1); | 100 | 32.7k | *res >>= 1; | 101 | | // operator &= can not be applied to Int128 | 102 | 32.7k | *res = *res & | 103 | 32.7k | std::numeric_limits<typename PrimitiveTypeTraits<ReturnType>::CppType>::max(); | 104 | 32.7k | } | 105 | | | 106 | 8 | block.get_by_position(result).column = std::move(col_res); | 107 | | | 108 | 8 | return Status::OK(); | 109 | 9 | } |
_ZNK5doris21FunctionEncodeVarcharINS_14EncodeAsBigIntELNS_13PrimitiveTypeE6EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 69 | 17 | uint32_t result, size_t input_rows_count) const override { | 70 | 17 | const ColumnString* col_str = | 71 | 17 | assert_cast<const ColumnString*>(block.get_by_position(arguments[0]).column.get()); | 72 | | | 73 | | // max_row_byte_size = size of string + size of offset value | 74 | 17 | size_t max_str_size = col_str->get_max_row_byte_size() - sizeof(UInt32); | 75 | | | 76 | 17 | if (max_str_size > sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType) - 1) { | 77 | 1 | return Status::InternalError( | 78 | 1 | "String is too long to encode, input string size {}, max valid string " | 79 | 1 | "size for {} is {}", | 80 | 1 | max_str_size, name, | 81 | 1 | sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType) - 1); | 82 | 1 | } | 83 | | | 84 | 16 | auto col_res = PrimitiveTypeTraits<ReturnType>::ColumnType::create(input_rows_count, 0); | 85 | 16 | auto& col_res_data = col_res->get_data(); | 86 | | | 87 | 65.5k | for (size_t i = 0; i < input_rows_count; ++i) { | 88 | 65.5k | const char* str_ptr = col_str->get_data_at(i).data; | 89 | 65.5k | auto str_size = static_cast<UInt8>(col_str->get_data_at(i).size); | 90 | 65.5k | auto* res = &col_res_data[i]; | 91 | 65.5k | auto* __restrict ui8_ptr = reinterpret_cast<UInt8*>(res); | 92 | | | 93 | | // "reverse" the order of string on little endian machine. 
| 94 | 65.5k | simd::reverse_copy_bytes(ui8_ptr, | 95 | 65.5k | sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType), | 96 | 65.5k | str_ptr, str_size); | 97 | | // The lowest byte of the integer stores the size of the string, left-shifted by 1 bit so that we get | 98 | | // the correct size after right-shifting by 1 | 99 | 65.5k | memset(ui8_ptr, str_size << 1, 1); | 100 | 65.5k | *res >>= 1; | 101 | | // operator &= can not be applied to Int128 | 102 | 65.5k | *res = *res & | 103 | 65.5k | std::numeric_limits<typename PrimitiveTypeTraits<ReturnType>::CppType>::max(); | 104 | 65.5k | } | 105 | | | 106 | 16 | block.get_by_position(result).column = std::move(col_res); | 107 | | | 108 | 16 | return Status::OK(); | 109 | 17 | } |
_ZNK5doris21FunctionEncodeVarcharINS_16EncodeAsLargeIntELNS_13PrimitiveTypeE7EE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm Line | Count | Source | 69 | 33 | uint32_t result, size_t input_rows_count) const override { | 70 | 33 | const ColumnString* col_str = | 71 | 33 | assert_cast<const ColumnString*>(block.get_by_position(arguments[0]).column.get()); | 72 | | | 73 | | // max_row_byte_size = size of string + size of offset value | 74 | 33 | size_t max_str_size = col_str->get_max_row_byte_size() - sizeof(UInt32); | 75 | | | 76 | 33 | if (max_str_size > sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType) - 1) { | 77 | 1 | return Status::InternalError( | 78 | 1 | "String is too long to encode, input string size {}, max valid string " | 79 | 1 | "size for {} is {}", | 80 | 1 | max_str_size, name, | 81 | 1 | sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType) - 1); | 82 | 1 | } | 83 | | | 84 | 32 | auto col_res = PrimitiveTypeTraits<ReturnType>::ColumnType::create(input_rows_count, 0); | 85 | 32 | auto& col_res_data = col_res->get_data(); | 86 | | | 87 | 131k | for (size_t i = 0; i < input_rows_count; ++i) { | 88 | 131k | const char* str_ptr = col_str->get_data_at(i).data; | 89 | 131k | auto str_size = static_cast<UInt8>(col_str->get_data_at(i).size); | 90 | 131k | auto* res = &col_res_data[i]; | 91 | 131k | auto* __restrict ui8_ptr = reinterpret_cast<UInt8*>(res); | 92 | | | 93 | | // "reverse" the order of string on little endian machine. 
| 94 | 131k | simd::reverse_copy_bytes(ui8_ptr, | 95 | 131k | sizeof(typename PrimitiveTypeTraits<ReturnType>::CppType), | 96 | 131k | str_ptr, str_size); | 97 | | // The lowest byte of the integer stores the size of the string, left-shifted by 1 bit so that we get | 98 | | // the correct size after right-shifting by 1 | 99 | 131k | memset(ui8_ptr, str_size << 1, 1); | 100 | 131k | *res >>= 1; | 101 | | // operator &= can not be applied to Int128 | 102 | 131k | *res = *res & | 103 | 131k | std::numeric_limits<typename PrimitiveTypeTraits<ReturnType>::CppType>::max(); | 104 | 131k | } | 105 | | | 106 | 32 | block.get_by_position(result).column = std::move(col_res); | 107 | | | 108 | 32 | return Status::OK(); | 109 | 33 | } |
|
110 | | }; |
111 | | |
112 | 1 | void register_function_encode_varchar(SimpleFunctionFactory& factory) { |
113 | 1 | factory.register_function<FunctionEncodeVarchar<EncodeAsSmallInt, TYPE_SMALLINT>>(); |
114 | 1 | factory.register_function<FunctionEncodeVarchar<EncodeAsInt, TYPE_INT>>(); |
115 | 1 | factory.register_function<FunctionEncodeVarchar<EncodeAsBigInt, TYPE_BIGINT>>(); |
116 | 1 | factory.register_function<FunctionEncodeVarchar<EncodeAsLargeInt, TYPE_LARGEINT>>(); |
117 | 1 | } |
118 | | |
119 | | } // namespace doris |