/root/doris/contrib/openblas/kernel/generic/dot.c
Line | Count | Source |
1 | | /*************************************************************************** |
2 | | Copyright (c) 2014, The OpenBLAS Project |
3 | | All rights reserved. |
4 | | Redistribution and use in source and binary forms, with or without |
5 | | modification, are permitted provided that the following conditions are |
6 | | met: |
7 | | 1. Redistributions of source code must retain the above copyright |
8 | | notice, this list of conditions and the following disclaimer. |
9 | | 2. Redistributions in binary form must reproduce the above copyright |
10 | | notice, this list of conditions and the following disclaimer in |
11 | | the documentation and/or other materials provided with the |
12 | | distribution. |
13 | | 3. Neither the name of the OpenBLAS project nor the names of |
14 | | its contributors may be used to endorse or promote products |
15 | | derived from this software without specific prior written permission. |
16 | | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
17 | | AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
18 | | IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
19 | | ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE |
20 | | LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
21 | | DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
22 | | SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
23 | | CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
24 | | OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE |
25 | | USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 | | *****************************************************************************/ |
27 | | |
28 | | |
29 | | #include "common.h" |
30 | | #include "../simd/intrin.h" |
31 | | #if defined(DSDOT) |
32 | | double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) |
33 | | #else |
34 | | FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y) |
35 | | #endif |
36 | 0 | { |
37 | 0 | BLASLONG i=0; |
38 | 0 | BLASLONG ix=0,iy=0; |
39 | |
|
40 | | #if defined(DSDOT) |
41 | | double dot = 0.0 ; |
42 | | #else |
43 | 0 | FLOAT dot = 0.0 ; |
44 | | #endif |
45 | |
|
46 | 0 | if ( n < 1 ) return(dot); |
47 | | |
48 | 0 | if ( (inc_x == 1) && (inc_y == 1) ) |
49 | 0 | { |
50 | | #if V_SIMD && !defined(DSDOT) |
51 | 0 | const int vstep = v_nlanes_f32; |
52 | | const int unrollx4 = n & (-vstep * 4); |
53 | | const int unrollx = n & -vstep; |
54 | 0 | v_f32 vsum0 = v_zero_f32(); |
55 | 0 | v_f32 vsum1 = v_zero_f32(); |
56 | 0 | v_f32 vsum2 = v_zero_f32(); |
57 | 0 | v_f32 vsum3 = v_zero_f32(); |
58 | 0 | while(i < unrollx4) |
59 | 0 | { |
60 | 0 | vsum0 = v_muladd_f32( |
61 | 0 | v_loadu_f32(x + i), v_loadu_f32(y + i), vsum0 |
62 | 0 | ); |
63 | 0 | vsum1 = v_muladd_f32( |
64 | 0 | v_loadu_f32(x + i + vstep), v_loadu_f32(y + i + vstep), vsum1 |
65 | 0 | ); |
66 | 0 | vsum2 = v_muladd_f32( |
67 | 0 | v_loadu_f32(x + i + vstep*2), v_loadu_f32(y + i + vstep*2), vsum2 |
68 | 0 | ); |
69 | 0 | vsum3 = v_muladd_f32( |
70 | 0 | v_loadu_f32(x + i + vstep*3), v_loadu_f32(y + i + vstep*3), vsum3 |
71 | 0 | ); |
72 | 0 | i += vstep*4; |
73 | 0 | } |
74 | 0 | vsum0 = v_add_f32( |
75 | 0 | v_add_f32(vsum0, vsum1), v_add_f32(vsum2 , vsum3) |
76 | | ); |
77 | 0 | while(i < unrollx) |
78 | 0 | { |
79 | 0 | vsum0 = v_muladd_f32( |
80 | 0 | v_loadu_f32(x + i), v_loadu_f32(y + i), vsum0 |
81 | 0 | ); |
82 | 0 | i += vstep; |
83 | 0 | } |
84 | | dot = v_sum_f32(vsum0); |
85 | | #elif defined(DSDOT) |
86 | | int n1 = n & -4; |
87 | 0 | for (; i < n1; i += 4) |
88 | 0 | { |
89 | 0 | dot += (double) y[i] * (double) x[i] |
90 | 0 | + (double) y[i+1] * (double) x[i+1] |
91 | 0 | + (double) y[i+2] * (double) x[i+2] |
92 | 0 | + (double) y[i+3] * (double) x[i+3] ; |
93 | 0 | } |
94 | | #else |
95 | | int n1 = n & -4; |
96 | | for (; i < n1; i += 4) |
97 | | { |
98 | | dot += y[i] * x[i] |
99 | | + y[i+1] * x[i+1] |
100 | | + y[i+2] * x[i+2] |
101 | | + y[i+3] * x[i+3] ; |
102 | | } |
103 | | #endif |
104 | 0 | while(i < n) |
105 | 0 | { |
106 | |
|
107 | | #if defined(DSDOT) |
108 | | dot += (double) y[i] * (double) x[i] ; |
109 | | #else |
110 | | dot += y[i] * x[i] ; |
111 | | #endif |
112 | 0 | i++ ; |
113 | |
|
114 | 0 | } |
115 | 0 | return(dot); |
116 | | |
117 | |
|
118 | 0 | } |
119 | | |
120 | 0 | while(i < n) |
121 | 0 | { |
122 | |
|
123 | | #if defined(DSDOT) |
124 | | dot += (double) y[iy] * (double) x[ix] ; |
125 | | #else |
126 | | dot += y[iy] * x[ix] ; |
127 | | #endif |
128 | 0 | ix += inc_x ; |
129 | 0 | iy += inc_y ; |
130 | 0 | i++ ; |
131 | |
|
132 | 0 | } |
133 | 0 | return(dot); |
134 | |
|
135 | 0 | } Unexecuted instantiation: sdot_k_PRESCOTT Unexecuted instantiation: dsdot_k_PRESCOTT Unexecuted instantiation: sdot_k_CORE2 Unexecuted instantiation: dsdot_k_CORE2 Unexecuted instantiation: sdot_k_BARCELONA Unexecuted instantiation: dsdot_k_BARCELONA |
136 | | |
137 | | |