/root/doris/contrib/openblas/kernel/arm/sum.c
Line | Count | Source |
1 | | /*************************************************************************** |
2 | | Copyright (c) 2013, The OpenBLAS Project |
3 | | All rights reserved. |
4 | | Redistribution and use in source and binary forms, with or without |
5 | | modification, are permitted provided that the following conditions are |
6 | | met: |
7 | | 1. Redistributions of source code must retain the above copyright |
8 | | notice, this list of conditions and the following disclaimer. |
9 | | 2. Redistributions in binary form must reproduce the above copyright |
10 | | notice, this list of conditions and the following disclaimer in |
11 | | the documentation and/or other materials provided with the |
12 | | distribution. |
13 | | 3. Neither the name of the OpenBLAS project nor the names of |
14 | | its contributors may be used to endorse or promote products |
15 | | derived from this software without specific prior written permission. |
16 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | | ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE |
20 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
21 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
22 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
23 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
24 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE |
25 | | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 | | *****************************************************************************/ |
27 | | |
28 | | /************************************************************************************** |
29 | | * trivial copy of asum.c with the ABS() removed * |
30 | | **************************************************************************************/ |
31 | | |
32 | | #include "common.h" |
33 | | #include "../simd/intrin.h" |
34 | | #include <math.h> |
35 | | |
36 | | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) |
37 | 0 | { |
38 | 0 | BLASLONG i = 0; |
39 | 0 | FLOAT sumf = 0.0; |
40 | 0 | if (n <= 0 || inc_x <= 0) |
41 | 0 | return (sumf); |
42 | 0 | n *= inc_x; |
43 | 0 | if (inc_x == 1) |
44 | 0 | { |
45 | | #if V_SIMD && (!defined(DOUBLE) || (defined(DOUBLE) && V_SIMD_F64 && V_SIMD > 128)) |
46 | | #ifdef DOUBLE |
47 | 0 | const int vstep = v_nlanes_f64; |
48 | | const int unrollx4 = n & (-vstep * 4); |
49 | | const int unrollx = n & -vstep; |
50 | 0 | v_f64 vsum0 = v_zero_f64(); |
51 | 0 | v_f64 vsum1 = v_zero_f64(); |
52 | 0 | v_f64 vsum2 = v_zero_f64(); |
53 | 0 | v_f64 vsum3 = v_zero_f64(); |
54 | 0 | for (; i < unrollx4; i += vstep * 4) |
55 | 0 | { |
56 | 0 | vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); |
57 | 0 | vsum1 = v_add_f64(vsum1, v_loadu_f64(x + i + vstep)); |
58 | 0 | vsum2 = v_add_f64(vsum2, v_loadu_f64(x + i + vstep * 2)); |
59 | 0 | vsum3 = v_add_f64(vsum3, v_loadu_f64(x + i + vstep * 3)); |
60 | 0 | } |
61 | 0 | vsum0 = v_add_f64( |
62 | 0 | v_add_f64(vsum0, vsum1), v_add_f64(vsum2, vsum3)); |
63 | 0 | for (; i < unrollx; i += vstep) |
64 | 0 | { |
65 | 0 | vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); |
66 | 0 | } |
67 | | sumf = v_sum_f64(vsum0); |
68 | | #else |
69 | 0 | const int vstep = v_nlanes_f32; |
70 | | const int unrollx4 = n & (-vstep * 4); |
71 | | const int unrollx = n & -vstep; |
72 | 0 | v_f32 vsum0 = v_zero_f32(); |
73 | 0 | v_f32 vsum1 = v_zero_f32(); |
74 | 0 | v_f32 vsum2 = v_zero_f32(); |
75 | 0 | v_f32 vsum3 = v_zero_f32(); |
76 | 0 | for (; i < unrollx4; i += vstep * 4) |
77 | 0 | { |
78 | 0 | vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); |
79 | 0 | vsum1 = v_add_f32(vsum1, v_loadu_f32(x + i + vstep)); |
80 | 0 | vsum2 = v_add_f32(vsum2, v_loadu_f32(x + i + vstep * 2)); |
81 | 0 | vsum3 = v_add_f32(vsum3, v_loadu_f32(x + i + vstep * 3)); |
82 | 0 | } |
83 | 0 | vsum0 = v_add_f32( |
84 | 0 | v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3)); |
85 | 0 | for (; i < unrollx; i += vstep) |
86 | 0 | { |
87 | 0 | vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); |
88 | 0 | } |
89 | | sumf = v_sum_f32(vsum0); |
90 | | #endif |
91 | | #else |
92 | | int n1 = n & -4; |
93 | 0 | for (; i < n1; i += 4) |
94 | 0 | { |
95 | 0 | sumf += x[i] + x[i + 1] + x[i + 2] + x[i + 3]; |
96 | 0 | } |
97 | | #endif |
98 | 0 | } |
99 | 0 | while (i < n) |
100 | 0 | { |
101 | 0 | sumf += x[i]; |
102 | 0 | i += inc_x; |
103 | 0 | } |
104 | 0 | return (sumf); |
105 | 0 | } |
Unexecuted instantiation: ssum_k_PRESCOTT
Unexecuted instantiation: dsum_k_PRESCOTT
Unexecuted instantiation: ssum_k_CORE2
Unexecuted instantiation: dsum_k_CORE2
Unexecuted instantiation: ssum_k_NEHALEM
Unexecuted instantiation: dsum_k_NEHALEM
Unexecuted instantiation: ssum_k_BARCELONA
Unexecuted instantiation: dsum_k_BARCELONA
Unexecuted instantiation: ssum_k_SANDYBRIDGE
Unexecuted instantiation: dsum_k_SANDYBRIDGE
Unexecuted instantiation: ssum_k_BULLDOZER
Unexecuted instantiation: dsum_k_BULLDOZER
Unexecuted instantiation: ssum_k_PILEDRIVER
Unexecuted instantiation: dsum_k_PILEDRIVER
Unexecuted instantiation: ssum_k_STEAMROLLER
Unexecuted instantiation: dsum_k_STEAMROLLER
Unexecuted instantiation: ssum_k_EXCAVATOR
Unexecuted instantiation: dsum_k_EXCAVATOR
Unexecuted instantiation: ssum_k_HASWELL
Unexecuted instantiation: dsum_k_HASWELL
Unexecuted instantiation: ssum_k_ZEN
Unexecuted instantiation: dsum_k_ZEN
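The header comment in the listing notes that this kernel is asum.c with the ABS() removed: it accumulates the signed values x[0], x[inc_x], x[2*inc_x], ... rather than their absolute values, and the V_SIMD branches plus the 4-way unrolled scalar loop only change how that accumulation is done for the contiguous (inc_x == 1) case. The sketch below is a minimal standalone illustration of those semantics, not the OpenBLAS build path: FLOAT/BLASLONG are replaced with plain double/long, and the ref_sum/ref_asum names are illustrative, not OpenBLAS symbols.

#include <math.h>
#include <stdio.h>

/* Plain strided sum, the semantics of sum.c above: signed values
 * accumulate as-is (no absolute value). Types simplified to double/long. */
static double ref_sum(long n, const double *x, long inc_x)
{
    double s = 0.0;
    if (n <= 0 || inc_x <= 0)
        return s;
    for (long i = 0; i < n; i++)
        s += x[i * inc_x];
    return s;
}

/* The asum-style counterpart for contrast: same loop, but over |x[i]|. */
static double ref_asum(long n, const double *x, long inc_x)
{
    double s = 0.0;
    if (n <= 0 || inc_x <= 0)
        return s;
    for (long i = 0; i < n; i++)
        s += fabs(x[i * inc_x]);
    return s;
}

int main(void)
{
    const double x[] = {1.0, -2.0, 3.0, -4.0};
    printf("sum  = %g\n", ref_sum(4, x, 1));   /* -2: signs are kept */
    printf("asum = %g\n", ref_asum(4, x, 1));  /* 10: magnitudes only */
    return 0;
}

The SIMD paths in the kernel reach the same result by keeping four partial vector sums (vsum0..vsum3) across an unrolled loop and reducing them at the end; that is a throughput optimization and does not change the value computed, apart from the usual floating-point reassociation effects.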