Coverage Report

Created: 2026-03-15 08:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
be/src/exprs/function/array/function_array_distance.h
Line
Count
Source
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
#pragma once
19
20
#include <faiss/impl/platform_macros.h>
21
#include <faiss/utils/distances.h>
22
#include <gen_cpp/Types_types.h>
23
24
#include "common/exception.h"
25
#include "common/status.h"
26
#include "core/assert_cast.h"
27
#include "core/column/column.h"
28
#include "core/column/column_array.h"
29
#include "core/column/column_nullable.h"
30
#include "core/data_type/data_type.h"
31
#include "core/data_type/data_type_array.h"
32
#include "core/data_type/data_type_nullable.h"
33
#include "core/data_type/data_type_number.h"
34
#include "core/data_type/primitive_type.h"
35
#include "core/types.h"
36
#include "exec/common/util.hpp"
37
#include "exprs/function/array/function_array_utils.h"
38
#include "exprs/function/function.h"
39
40
namespace doris {
41
42
class L1Distance {
43
public:
44
    static constexpr auto name = "l1_distance";
45
0
    static float distance(const float* x, const float* y, size_t d) {
46
0
        return faiss::fvec_L1(x, y, d);
47
0
    }
48
};
49
50
class L2Distance {
51
public:
52
    static constexpr auto name = "l2_distance";
53
0
    static float distance(const float* x, const float* y, size_t d) {
54
0
        return std::sqrt(faiss::fvec_L2sqr(x, y, d));
55
0
    }
56
};
57
58
class InnerProduct {
59
public:
60
    static constexpr auto name = "inner_product";
61
0
    static float distance(const float* x, const float* y, size_t d) {
62
0
        return faiss::fvec_inner_product(x, y, d);
63
0
    }
64
};
65
66
class CosineDistance {
67
public:
68
    static constexpr auto name = "cosine_distance";
69
    static float distance(const float* x, const float* y, size_t d);
70
};
71
72
class CosineSimilarity {
73
public:
74
    static constexpr auto name = "cosine_similarity";
75
    static float distance(const float* x, const float* y, size_t d);
76
};
77
78
class L2DistanceApproximate : public L2Distance {
79
public:
80
    static constexpr auto name = "l2_distance_approximate";
81
};
82
83
class InnerProductApproximate : public InnerProduct {
84
public:
85
    static constexpr auto name = "inner_product_approximate";
86
};
87
88
template <typename DistanceImpl>
89
class FunctionArrayDistance : public IFunction {
90
public:
91
    using DataType = PrimitiveTypeTraits<TYPE_FLOAT>::DataType;
92
    using ColumnType = PrimitiveTypeTraits<TYPE_FLOAT>::ColumnType;
93
94
    static constexpr auto name = DistanceImpl::name;
95
7
    String get_name() const override { return name; }
_ZNK5doris21FunctionArrayDistanceINS_10L1DistanceEE8get_nameB5cxx11Ev
Line
Count
Source
95
1
    String get_name() const override { return name; }
_ZNK5doris21FunctionArrayDistanceINS_10L2DistanceEE8get_nameB5cxx11Ev
Line
Count
Source
95
1
    String get_name() const override { return name; }
_ZNK5doris21FunctionArrayDistanceINS_14CosineDistanceEE8get_nameB5cxx11Ev
Line
Count
Source
95
1
    String get_name() const override { return name; }
_ZNK5doris21FunctionArrayDistanceINS_16CosineSimilarityEE8get_nameB5cxx11Ev
Line
Count
Source
95
1
    String get_name() const override { return name; }
_ZNK5doris21FunctionArrayDistanceINS_12InnerProductEE8get_nameB5cxx11Ev
Line
Count
Source
95
1
    String get_name() const override { return name; }
_ZNK5doris21FunctionArrayDistanceINS_21L2DistanceApproximateEE8get_nameB5cxx11Ev
Line
Count
Source
95
1
    String get_name() const override { return name; }
_ZNK5doris21FunctionArrayDistanceINS_23InnerProductApproximateEE8get_nameB5cxx11Ev
Line
Count
Source
95
1
    String get_name() const override { return name; }
96
33
    static FunctionPtr create() { return std::make_shared<FunctionArrayDistance<DistanceImpl>>(); }
_ZN5doris21FunctionArrayDistanceINS_10L1DistanceEE6createEv
Line
Count
Source
96
2
    static FunctionPtr create() { return std::make_shared<FunctionArrayDistance<DistanceImpl>>(); }
_ZN5doris21FunctionArrayDistanceINS_10L2DistanceEE6createEv
Line
Count
Source
96
2
    static FunctionPtr create() { return std::make_shared<FunctionArrayDistance<DistanceImpl>>(); }
_ZN5doris21FunctionArrayDistanceINS_14CosineDistanceEE6createEv
Line
Count
Source
96
2
    static FunctionPtr create() { return std::make_shared<FunctionArrayDistance<DistanceImpl>>(); }
_ZN5doris21FunctionArrayDistanceINS_16CosineSimilarityEE6createEv
Line
Count
Source
96
12
    static FunctionPtr create() { return std::make_shared<FunctionArrayDistance<DistanceImpl>>(); }
_ZN5doris21FunctionArrayDistanceINS_12InnerProductEE6createEv
Line
Count
Source
96
2
    static FunctionPtr create() { return std::make_shared<FunctionArrayDistance<DistanceImpl>>(); }
_ZN5doris21FunctionArrayDistanceINS_21L2DistanceApproximateEE6createEv
Line
Count
Source
96
11
    static FunctionPtr create() { return std::make_shared<FunctionArrayDistance<DistanceImpl>>(); }
_ZN5doris21FunctionArrayDistanceINS_23InnerProductApproximateEE6createEv
Line
Count
Source
96
2
    static FunctionPtr create() { return std::make_shared<FunctionArrayDistance<DistanceImpl>>(); }
97
19
    size_t get_number_of_arguments() const override { return 2; }
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_10L1DistanceEE23get_number_of_argumentsEv
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_10L2DistanceEE23get_number_of_argumentsEv
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_14CosineDistanceEE23get_number_of_argumentsEv
_ZNK5doris21FunctionArrayDistanceINS_16CosineSimilarityEE23get_number_of_argumentsEv
Line
Count
Source
97
10
    size_t get_number_of_arguments() const override { return 2; }
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_12InnerProductEE23get_number_of_argumentsEv
_ZNK5doris21FunctionArrayDistanceINS_21L2DistanceApproximateEE23get_number_of_argumentsEv
Line
Count
Source
97
9
    size_t get_number_of_arguments() const override { return 2; }
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_23InnerProductApproximateEE23get_number_of_argumentsEv
98
99
19
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
100
19
        if (arguments.size() != 2) {
101
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Invalid number of arguments");
102
0
        }
103
104
        // primitive_type of Nullable is its nested type.
105
19
        if (arguments[0]->get_primitive_type() != TYPE_ARRAY ||
106
19
            arguments[1]->get_primitive_type() != TYPE_ARRAY) {
107
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
108
0
                                   "Arguments for function {} must be arrays", get_name());
109
0
        }
110
111
19
        return std::make_shared<DataTypeFloat32>();
112
19
    }
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_10L1DistanceEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_10L2DistanceEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_14CosineDistanceEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE
_ZNK5doris21FunctionArrayDistanceINS_16CosineSimilarityEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE
Line
Count
Source
99
10
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
100
10
        if (arguments.size() != 2) {
101
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Invalid number of arguments");
102
0
        }
103
104
        // primitive_type of Nullable is its nested type.
105
10
        if (arguments[0]->get_primitive_type() != TYPE_ARRAY ||
106
10
            arguments[1]->get_primitive_type() != TYPE_ARRAY) {
107
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
108
0
                                   "Arguments for function {} must be arrays", get_name());
109
0
        }
110
111
10
        return std::make_shared<DataTypeFloat32>();
112
10
    }
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_12InnerProductEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE
_ZNK5doris21FunctionArrayDistanceINS_21L2DistanceApproximateEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE
Line
Count
Source
99
9
    DataTypePtr get_return_type_impl(const DataTypes& arguments) const override {
100
9
        if (arguments.size() != 2) {
101
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT, "Invalid number of arguments");
102
0
        }
103
104
        // primitive_type of Nullable is its nested type.
105
9
        if (arguments[0]->get_primitive_type() != TYPE_ARRAY ||
106
9
            arguments[1]->get_primitive_type() != TYPE_ARRAY) {
107
0
            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
108
0
                                   "Arguments for function {} must be arrays", get_name());
109
0
        }
110
111
9
        return std::make_shared<DataTypeFloat32>();
112
9
    }
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_23InnerProductApproximateEE20get_return_type_implERKSt6vectorISt10shared_ptrIKNS_9IDataTypeEESaIS7_EE
113
114
    // All array distance functions has always not nullable return type.
115
    // We want to make sure throw exception if input columns contain NULL.
116
29
    bool use_default_implementation_for_nulls() const override { return false; }
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_10L1DistanceEE36use_default_implementation_for_nullsEv
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_10L2DistanceEE36use_default_implementation_for_nullsEv
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_14CosineDistanceEE36use_default_implementation_for_nullsEv
_ZNK5doris21FunctionArrayDistanceINS_16CosineSimilarityEE36use_default_implementation_for_nullsEv
Line
Count
Source
116
20
    bool use_default_implementation_for_nulls() const override { return false; }
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_12InnerProductEE36use_default_implementation_for_nullsEv
_ZNK5doris21FunctionArrayDistanceINS_21L2DistanceApproximateEE36use_default_implementation_for_nullsEv
Line
Count
Source
116
9
    bool use_default_implementation_for_nulls() const override { return false; }
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_23InnerProductApproximateEE36use_default_implementation_for_nullsEv
117
118
    Status execute_impl(FunctionContext* context, Block& block, const ColumnNumbers& arguments,
119
10
                        uint32_t result, size_t input_rows_count) const override {
120
10
        const auto& arg1 = block.get_by_position(arguments[0]);
121
10
        const auto& arg2 = block.get_by_position(arguments[1]);
122
123
10
        auto col1 = arg1.column->convert_to_full_column_if_const();
124
10
        auto col2 = arg2.column->convert_to_full_column_if_const();
125
10
        if (col1->size() != col2->size()) {
126
0
            return Status::RuntimeError(
127
0
                    fmt::format("function {} have different input array sizes: {} and {}",
128
0
                                get_name(), col1->size(), col2->size()));
129
0
        }
130
131
10
        const ColumnArray* arr1 = nullptr;
132
10
        const ColumnArray* arr2 = nullptr;
133
134
10
        if (col1->is_nullable()) {
135
10
            if (col1->has_null()) {
136
0
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
137
0
                                       "First argument for function {} cannot be null", get_name());
138
0
            }
139
10
            auto nullable1 = assert_cast<const ColumnNullable*>(col1.get());
140
10
            arr1 = assert_cast<const ColumnArray*>(nullable1->get_nested_column_ptr().get());
141
10
        } else {
142
0
            arr1 = assert_cast<const ColumnArray*>(col1.get());
143
0
        }
144
145
10
        if (col2->is_nullable()) {
146
10
            if (col2->has_null()) {
147
0
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
148
0
                                       "Second argument for function {} cannot be null",
149
0
                                       get_name());
150
0
            }
151
10
            auto nullable2 = assert_cast<const ColumnNullable*>(col2.get());
152
10
            arr2 = assert_cast<const ColumnArray*>(nullable2->get_nested_column_ptr().get());
153
10
        } else {
154
0
            arr2 = assert_cast<const ColumnArray*>(col2.get());
155
0
        }
156
157
10
        const ColumnFloat32* float1 = nullptr;
158
10
        const ColumnFloat32* float2 = nullptr;
159
10
        if (arr1->get_data_ptr()->is_nullable()) {
160
10
            if (arr1->get_data_ptr()->has_null()) {
161
0
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
162
0
                                       "First argument for function {} cannot have null",
163
0
                                       get_name());
164
0
            }
165
10
            auto nullable1 = assert_cast<const ColumnNullable*>(arr1->get_data_ptr().get());
166
10
            float1 = assert_cast<const ColumnFloat32*>(nullable1->get_nested_column_ptr().get());
167
10
        } else {
168
0
            float1 = assert_cast<const ColumnFloat32*>(arr1->get_data_ptr().get());
169
0
        }
170
171
10
        if (arr2->get_data_ptr()->is_nullable()) {
172
10
            if (arr2->get_data_ptr()->has_null()) {
173
0
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
174
0
                                       "Second argument for function {} cannot have null",
175
0
                                       get_name());
176
0
            }
177
10
            auto nullable2 = assert_cast<const ColumnNullable*>(arr2->get_data_ptr().get());
178
10
            float2 = assert_cast<const ColumnFloat32*>(nullable2->get_nested_column_ptr().get());
179
10
        } else {
180
0
            float2 = assert_cast<const ColumnFloat32*>(arr2->get_data_ptr().get());
181
0
        }
182
183
10
        const ColumnOffset64* offset1 =
184
10
                assert_cast<const ColumnArray::ColumnOffsets*>(arr1->get_offsets_ptr().get());
185
10
        const ColumnOffset64* offset2 =
186
10
                assert_cast<const ColumnArray::ColumnOffsets*>(arr2->get_offsets_ptr().get());
187
        // prepare return data
188
10
        auto dst = ColumnType::create(input_rows_count);
189
10
        auto& dst_data = dst->get_data();
190
191
10
        size_t elemt_cnt = offset1->size();
192
22
        for (ssize_t row = 0; row < elemt_cnt; ++row) {
193
            // Calculate actual array sizes for current row.
194
            // For nullable arrays, we cannot compare absolute offset values directly because:
195
            // 1. When a row is null, its offset might equal the previous offset (no elements added)
196
            // 2. Or it might include the array size even if the row is null (implementation dependent)
197
            // Therefore, we must calculate the actual array size as: offsets[row] - offsets[row-1]
198
12
            ssize_t size1 = offset1->get_data()[row] - offset1->get_data()[row - 1];
199
12
            ssize_t size2 = offset2->get_data()[row] - offset2->get_data()[row - 1];
200
201
12
            if (size1 != size2) [[unlikely]] {
202
0
                return Status::InvalidArgument(
203
0
                        "function {} have different input element sizes of array: {} and {}",
204
0
                        get_name(), size1, size2);
205
0
            }
206
12
            dst_data[row] = DistanceImpl::distance(
207
12
                    float1->get_data().data() + offset1->get_data()[row - 1],
208
12
                    float2->get_data().data() + offset2->get_data()[row - 1], size1);
209
12
        }
210
211
10
        block.replace_by_position(result, std::move(dst));
212
10
        return Status::OK();
213
10
    }
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_10L1DistanceEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_10L2DistanceEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_14CosineDistanceEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
_ZNK5doris21FunctionArrayDistanceINS_16CosineSimilarityEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
Line
Count
Source
119
10
                        uint32_t result, size_t input_rows_count) const override {
120
10
        const auto& arg1 = block.get_by_position(arguments[0]);
121
10
        const auto& arg2 = block.get_by_position(arguments[1]);
122
123
10
        auto col1 = arg1.column->convert_to_full_column_if_const();
124
10
        auto col2 = arg2.column->convert_to_full_column_if_const();
125
10
        if (col1->size() != col2->size()) {
126
0
            return Status::RuntimeError(
127
0
                    fmt::format("function {} have different input array sizes: {} and {}",
128
0
                                get_name(), col1->size(), col2->size()));
129
0
        }
130
131
10
        const ColumnArray* arr1 = nullptr;
132
10
        const ColumnArray* arr2 = nullptr;
133
134
10
        if (col1->is_nullable()) {
135
10
            if (col1->has_null()) {
136
0
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
137
0
                                       "First argument for function {} cannot be null", get_name());
138
0
            }
139
10
            auto nullable1 = assert_cast<const ColumnNullable*>(col1.get());
140
10
            arr1 = assert_cast<const ColumnArray*>(nullable1->get_nested_column_ptr().get());
141
10
        } else {
142
0
            arr1 = assert_cast<const ColumnArray*>(col1.get());
143
0
        }
144
145
10
        if (col2->is_nullable()) {
146
10
            if (col2->has_null()) {
147
0
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
148
0
                                       "Second argument for function {} cannot be null",
149
0
                                       get_name());
150
0
            }
151
10
            auto nullable2 = assert_cast<const ColumnNullable*>(col2.get());
152
10
            arr2 = assert_cast<const ColumnArray*>(nullable2->get_nested_column_ptr().get());
153
10
        } else {
154
0
            arr2 = assert_cast<const ColumnArray*>(col2.get());
155
0
        }
156
157
10
        const ColumnFloat32* float1 = nullptr;
158
10
        const ColumnFloat32* float2 = nullptr;
159
10
        if (arr1->get_data_ptr()->is_nullable()) {
160
10
            if (arr1->get_data_ptr()->has_null()) {
161
0
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
162
0
                                       "First argument for function {} cannot have null",
163
0
                                       get_name());
164
0
            }
165
10
            auto nullable1 = assert_cast<const ColumnNullable*>(arr1->get_data_ptr().get());
166
10
            float1 = assert_cast<const ColumnFloat32*>(nullable1->get_nested_column_ptr().get());
167
10
        } else {
168
0
            float1 = assert_cast<const ColumnFloat32*>(arr1->get_data_ptr().get());
169
0
        }
170
171
10
        if (arr2->get_data_ptr()->is_nullable()) {
172
10
            if (arr2->get_data_ptr()->has_null()) {
173
0
                throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
174
0
                                       "Second argument for function {} cannot have null",
175
0
                                       get_name());
176
0
            }
177
10
            auto nullable2 = assert_cast<const ColumnNullable*>(arr2->get_data_ptr().get());
178
10
            float2 = assert_cast<const ColumnFloat32*>(nullable2->get_nested_column_ptr().get());
179
10
        } else {
180
0
            float2 = assert_cast<const ColumnFloat32*>(arr2->get_data_ptr().get());
181
0
        }
182
183
10
        const ColumnOffset64* offset1 =
184
10
                assert_cast<const ColumnArray::ColumnOffsets*>(arr1->get_offsets_ptr().get());
185
10
        const ColumnOffset64* offset2 =
186
10
                assert_cast<const ColumnArray::ColumnOffsets*>(arr2->get_offsets_ptr().get());
187
        // prepare return data
188
10
        auto dst = ColumnType::create(input_rows_count);
189
10
        auto& dst_data = dst->get_data();
190
191
10
        size_t elemt_cnt = offset1->size();
192
22
        for (ssize_t row = 0; row < elemt_cnt; ++row) {
193
            // Calculate actual array sizes for current row.
194
            // For nullable arrays, we cannot compare absolute offset values directly because:
195
            // 1. When a row is null, its offset might equal the previous offset (no elements added)
196
            // 2. Or it might include the array size even if the row is null (implementation dependent)
197
            // Therefore, we must calculate the actual array size as: offsets[row] - offsets[row-1]
198
12
            ssize_t size1 = offset1->get_data()[row] - offset1->get_data()[row - 1];
199
12
            ssize_t size2 = offset2->get_data()[row] - offset2->get_data()[row - 1];
200
201
12
            if (size1 != size2) [[unlikely]] {
202
0
                return Status::InvalidArgument(
203
0
                        "function {} have different input element sizes of array: {} and {}",
204
0
                        get_name(), size1, size2);
205
0
            }
206
12
            dst_data[row] = DistanceImpl::distance(
207
12
                    float1->get_data().data() + offset1->get_data()[row - 1],
208
12
                    float2->get_data().data() + offset2->get_data()[row - 1], size1);
209
12
        }
210
211
10
        block.replace_by_position(result, std::move(dst));
212
10
        return Status::OK();
213
10
    }
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_12InnerProductEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_21L2DistanceApproximateEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
Unexecuted instantiation: _ZNK5doris21FunctionArrayDistanceINS_23InnerProductApproximateEE12execute_implEPNS_15FunctionContextERNS_5BlockERKSt6vectorIjSaIjEEjm
214
};
215
216
} // namespace doris