/root/doris/contrib/openblas/driver/level3/level3.c
Line | Count | Source |
1 | | /*********************************************************************/ |
2 | | /* Copyright 2009, 2010 The University of Texas at Austin. */ |
3 | | /* All rights reserved. */ |
4 | | /* */ |
5 | | /* Redistribution and use in source and binary forms, with or */ |
6 | | /* without modification, are permitted provided that the following */ |
7 | | /* conditions are met: */ |
8 | | /* */ |
9 | | /* 1. Redistributions of source code must retain the above */ |
10 | | /* copyright notice, this list of conditions and the following */ |
11 | | /* disclaimer. */ |
12 | | /* */ |
13 | | /* 2. Redistributions in binary form must reproduce the above */ |
14 | | /* copyright notice, this list of conditions and the following */ |
15 | | /* disclaimer in the documentation and/or other materials */ |
16 | | /* provided with the distribution. */ |
17 | | /* */ |
18 | | /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ |
19 | | /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ |
20 | | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ |
21 | | /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ |
22 | | /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ |
23 | | /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ |
24 | | /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ |
25 | | /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ |
26 | | /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ |
27 | | /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ |
28 | | /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ |
29 | | /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ |
30 | | /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ |
31 | | /* POSSIBILITY OF SUCH DAMAGE. */ |
32 | | /* */ |
33 | | /* The views and conclusions contained in the software and */ |
34 | | /* documentation are those of the authors and should not be */ |
35 | | /* interpreted as representing official policies, either expressed */ |
36 | | /* or implied, of The University of Texas at Austin. */ |
37 | | /*********************************************************************/ |
38 | | |
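39 | | /* This file is a template for level 3 operations: CNAME is not defined here, and each *_OPERATION macro below is only a default that applies when it has not been defined already. */ |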
40 | | |
/*
 * BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC)
 *
 * Default beta step: forwards the sub-block of C that starts at row M_FROM,
 * column N_FROM and spans (M_TO - M_FROM) x (N_TO - N_FROM) to GEMM_BETA.
 * The A/B operand slots of GEMM_BETA are passed as NULL with zero strides.
 *
 *   - real build:            BETA[0] is passed
 *   - COMPLEX build:         BETA[0] and BETA[1] are passed
 *   - XDOUBLE+QUAD_PRECISION: BETA itself is passed
 *
 * Every macro parameter is parenthesized on use so that expression
 * arguments (e.g. "js + 1") keep their intended grouping.
 */
#ifndef BETA_OPERATION
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
#ifndef COMPLEX
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
	GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
		  (BETA)[0], NULL, 0, NULL, 0, \
		  (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, (LDC))
#else
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
	GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
		  (BETA)[0], (BETA)[1], NULL, 0, NULL, 0, \
		  (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, (LDC))
#endif
#else
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
	GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
		  (BETA), NULL, 0, NULL, 0, \
		  (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, (LDC))
#endif
#endif
61 | | |
/*
 * ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER)
 *
 * Default packing step for the A operand into BUFFER.
 *   - N* / R* variants: GEMM_ITCOPY, source offset (Y) + (X) * LDA
 *   - all other variants: GEMM_INCOPY, source offset (X) + (Y) * LDA
 *
 * The expansion deliberately carries no trailing ';' -- the caller writes
 * it -- so the macro behaves like one statement inside an unbraced
 * if/else.
 */
#ifndef ICOPY_OPERATION
#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
    defined(RN) || defined(RT) || defined(RC) || defined(RR)
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
	GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
	GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
#endif
#endif
70 | | |
/*
 * OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER)
 *
 * Default packing step for the B operand (passed in the A/LDA slots) into
 * BUFFER.
 *   - *N / *R variants: GEMM_ONCOPY, source offset (X) + (Y) * LDA
 *   - all other variants: GEMM_OTCOPY, source offset (Y) + (X) * LDA
 *
 * The expansion deliberately carries no trailing ';' -- the caller writes
 * it -- so the macro behaves like one statement inside an unbraced
 * if/else.
 */
#ifndef OCOPY_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
	GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
	GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
#endif
#endif
79 | | |
/*
 * KERNEL_FUNC: default compute kernel, chosen from the transposition /
 * conjugation variant the file is being compiled for.  Left undefined when
 * none of the sixteen variant macros is set.
 *
 *   NN NT TN TT -> GEMM_KERNEL_N
 *   CN CT RN RT -> GEMM_KERNEL_L
 *   NC TC NR TR -> GEMM_KERNEL_R
 *   CC CR RC RR -> GEMM_KERNEL_B
 */
#if !defined(KERNEL_FUNC)

#if defined(TT) || defined(TN) || defined(NT) || defined(NN)
#define KERNEL_FUNC	GEMM_KERNEL_N
#endif

#if defined(RT) || defined(RN) || defined(CT) || defined(CN)
#define KERNEL_FUNC	GEMM_KERNEL_L
#endif

#if defined(TR) || defined(NR) || defined(TC) || defined(NC)
#define KERNEL_FUNC	GEMM_KERNEL_R
#endif

#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
#define KERNEL_FUNC	GEMM_KERNEL_B
#endif

#endif /* !KERNEL_FUNC */
94 | | |
/*
 * KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y)
 *
 * Default kernel call: runs KERNEL_FUNC on the packed panels SA and SB and
 * points it at the element of C at row X, column Y (offset
 * ((X) + (Y) * LDC) * COMPSIZE).
 *
 *   - real build:            ALPHA[0] is passed
 *   - COMPLEX build:         ALPHA[0] and ALPHA[1] are passed
 *   - XDOUBLE+QUAD_PRECISION: ALPHA itself is passed
 *
 * LDC and ALPHA are parenthesized on use so that expression arguments keep
 * their intended grouping in the offset arithmetic.
 */
#ifndef KERNEL_OPERATION
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
	KERNEL_FUNC(M, N, K, (ALPHA)[0], SA, SB, \
		    (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, (LDC))
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
	KERNEL_FUNC(M, N, K, (ALPHA)[0], (ALPHA)[1], SA, SB, \
		    (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, (LDC))
#endif
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
	KERNEL_FUNC(M, N, K, (ALPHA), SA, SB, \
		    (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, (LDC))
#endif
#endif
109 | | |
/*
 * FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L)
 *
 * Default fused call: hands the kernel the unpacked B operand together
 * with the packed buffers, plus the element of C at row I, column J.
 *
 *   - *N / *R variants: FUSED_GEMM_KERNEL_N, B offset (L) + (J) * LDB
 *   - all other variants: FUSED_GEMM_KERNEL_T, B offset (J) + (L) * LDB
 *
 * COMPLEX builds pass ALPHA[0] and ALPHA[1]; real builds pass ALPHA[0].
 * LDB, LDC and ALPHA are parenthesized on use so that expression arguments
 * keep their intended grouping in the offset arithmetic.
 */
#ifndef FUSED_KERNEL_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
	FUSED_GEMM_KERNEL_N(M, N, K, (ALPHA)[0], SA, SB, \
		(FLOAT *)(B) + ((L) + (J) * (LDB)) * COMPSIZE, (LDB), \
		(FLOAT *)(C) + ((I) + (J) * (LDC)) * COMPSIZE, (LDC))
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
	FUSED_GEMM_KERNEL_N(M, N, K, (ALPHA)[0], (ALPHA)[1], SA, SB, \
		(FLOAT *)(B) + ((L) + (J) * (LDB)) * COMPSIZE, (LDB), \
		(FLOAT *)(C) + ((I) + (J) * (LDC)) * COMPSIZE, (LDC))
#endif
#else
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
	FUSED_GEMM_KERNEL_T(M, N, K, (ALPHA)[0], SA, SB, \
		(FLOAT *)(B) + ((J) + (L) * (LDB)) * COMPSIZE, (LDB), \
		(FLOAT *)(C) + ((I) + (J) * (LDC)) * COMPSIZE, (LDC))
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
	FUSED_GEMM_KERNEL_T(M, N, K, (ALPHA)[0], (ALPHA)[1], SA, SB, \
		(FLOAT *)(B) + ((J) + (L) * (LDB)) * COMPSIZE, (LDB), \
		(FLOAT *)(C) + ((I) + (J) * (LDC)) * COMPSIZE, (LDC))
#endif
#endif
#endif
135 | | |
/*
 * Default operand accessors.  Each one reads the like-named lower-case
 * field of the `args` pointer that is in scope at the point of use, and
 * applies only when the name has not been defined already.
 */
#if !defined(A)
#define A	(args->a)
#endif

#if !defined(LDA)
#define LDA	(args->lda)
#endif

#if !defined(B)
#define B	(args->b)
#endif

#if !defined(LDB)
#define LDB	(args->ldb)
#endif

#if !defined(C)
#define C	(args->c)
#endif

#if !defined(LDC)
#define LDC	(args->ldc)
#endif

#if !defined(M)
#define M	(args->m)
#endif

#if !defined(N)
#define N	(args->n)
#endif

#if !defined(K)
#define K	(args->k)
#endif
163 | | |
/*
 * Cycle-count instrumentation.  Without TIMING both macros expand to
 * nothing.  With TIMING, START_RPCC() stores rpcc() in the local
 * `rpcc_counter`, and STOP_RPCC(COUNTER) adds the cycles elapsed since
 * then to COUNTER.
 */
#ifndef TIMING
#define START_RPCC()
#define STOP_RPCC(COUNTER)
#else
#define START_RPCC()		rpcc_counter = rpcc()
#define STOP_RPCC(COUNTER)	COUNTER += rpcc() - rpcc_counter
#endif
171 | | |
172 | | int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, |
173 | 304 | XFLOAT *sa, XFLOAT *sb, BLASLONG dummy){ |
174 | 304 | BLASLONG k, lda, ldb, ldc; |
175 | 304 | FLOAT *alpha, *beta; |
176 | 304 | IFLOAT *a, *b; |
177 | 304 | FLOAT *c; |
178 | 304 | BLASLONG m_from, m_to, n_from, n_to; |
179 | | |
180 | 304 | BLASLONG ls, is, js; |
181 | 304 | BLASLONG min_l, min_i, min_j; |
182 | 304 | #if !defined(FUSED_GEMM) || defined(TIMING) |
183 | 304 | BLASLONG jjs, min_jj; |
184 | 304 | #endif |
185 | | |
186 | 304 | BLASLONG l1stride, gemm_p, l2size; |
187 | | |
188 | | #if defined(XDOUBLE) && defined(QUAD_PRECISION) |
189 | | xidouble xalpha; |
190 | | #endif |
191 | | |
192 | | #ifdef TIMING |
193 | | unsigned long long rpcc_counter; |
194 | | unsigned long long innercost = 0; |
195 | | unsigned long long outercost = 0; |
196 | | unsigned long long kernelcost = 0; |
197 | | double total; |
198 | | #endif |
199 | | |
200 | 304 | k = K; |
201 | | |
202 | 304 | a = (IFLOAT *)A; |
203 | 304 | b = (IFLOAT *)B; |
204 | 304 | c = (FLOAT *)C; |
205 | | |
206 | 304 | lda = LDA; |
207 | 304 | ldb = LDB; |
208 | 304 | ldc = LDC; |
209 | | |
210 | 304 | alpha = (FLOAT *)args -> alpha; |
211 | 304 | beta = (FLOAT *)args -> beta; |
212 | | |
213 | 304 | m_from = 0; |
214 | 304 | m_to = M; |
215 | | |
216 | 304 | if (range_m) { |
217 | 0 | m_from = *(((BLASLONG *)range_m) + 0); |
218 | 0 | m_to = *(((BLASLONG *)range_m) + 1); |
219 | 0 | } |
220 | | |
221 | 304 | n_from = 0; |
222 | 304 | n_to = N; |
223 | | |
224 | 304 | if (range_n) { |
225 | 0 | n_from = *(((BLASLONG *)range_n) + 0); |
226 | 0 | n_to = *(((BLASLONG *)range_n) + 1); |
227 | 0 | } |
228 | | |
229 | 304 | if (beta) { |
230 | 304 | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) |
231 | 304 | #ifndef COMPLEX |
232 | 304 | if (beta[0] != ONE |
233 | | #else |
234 | | if ((beta[0] != ONE) || (beta[1] != ZERO) |
235 | | #endif |
236 | | #else |
237 | | if (((beta[0].x[1] != 0x3fff000000000000UL) || beta[0].x[0] != 0) |
238 | | #ifdef COMPLEX |
239 | | &&(((beta[1].x[0] | beta[1].x[1]) << 1) != 0) |
240 | | #endif |
241 | | #endif |
242 | 304 | ) { |
243 | | #if defined(XDOUBLE) && defined(QUAD_PRECISION) |
244 | | xidouble xbeta; |
245 | | |
246 | | qtox(&xbeta, beta); |
247 | | #endif |
248 | 300 | BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc); |
249 | 300 | } |
250 | 304 | } |
251 | | |
252 | 304 | if ((k == 0) || (alpha == NULL)) return 0; |
253 | | |
254 | 304 | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) |
255 | 304 | if ( alpha[0] == ZERO |
256 | | #ifdef COMPLEX |
257 | | && alpha[1] == ZERO |
258 | | #endif |
259 | 304 | ) return 0; |
260 | | #else |
261 | | if (((alpha[0].x[0] | alpha[0].x[1] |
262 | | #ifdef COMPLEX |
263 | | | alpha[1].x[0] | alpha[1].x[1] |
264 | | #endif |
265 | | ) << 1) == 0) return 0; |
266 | | #endif |
267 | | |
268 | | #if defined(XDOUBLE) && defined(QUAD_PRECISION) |
269 | | qtox(&xalpha, alpha); |
270 | | #endif |
271 | | |
272 | 304 | l2size = GEMM_P * GEMM_Q; |
273 | | |
274 | | #if 0 |
275 | | fprintf(stderr, "GEMM(Single): M_from : %ld M_to : %ld N_from : %ld N_to : %ld k : %ld\n", m_from, m_to, n_from, n_to, k); |
276 | | fprintf(stderr, "GEMM(Single):: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R); |
277 | | // fprintf(stderr, "GEMM: SA .. %p SB .. %p\n", sa, sb); |
278 | | |
279 | | // fprintf(stderr, "A = %p B = %p C = %p\n\tlda = %ld ldb = %ld ldc = %ld\n", a, b, c, lda, ldb, ldc); |
280 | | #endif |
281 | | |
282 | | #ifdef TIMING |
283 | | innercost = 0; |
284 | | outercost = 0; |
285 | | kernelcost = 0; |
286 | | #endif |
287 | | |
288 | 608 | for(js = n_from; js < n_to; js += GEMM_R){ |
289 | 304 | min_j = n_to - js; |
290 | 304 | if (min_j > GEMM_R) min_j = GEMM_R; |
291 | | |
292 | 762 | for(ls = 0; ls < k; ls += min_l){ |
293 | | |
294 | 458 | min_l = k - ls; |
295 | | |
296 | 458 | if (min_l >= GEMM_Q * 2) { |
297 | | // gemm_p = GEMM_P; |
298 | 0 | min_l = GEMM_Q; |
299 | 458 | } else { |
300 | 458 | if (min_l > GEMM_Q) { |
301 | 154 | min_l = ((min_l / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; |
302 | 154 | } |
303 | 458 | gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; |
304 | 766 | while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; |
305 | 458 | } |
306 | | |
307 | 458 | BLASLONG pad_min_l = min_l; |
308 | | #if defined(HALF) |
309 | | #if defined(DYNAMIC_ARCH) |
310 | | pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); |
311 | | #else |
312 | | pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; |
313 | | #endif |
314 | | #endif |
315 | | |
316 | | /* First, we have to move data A to L2 cache */ |
317 | 458 | min_i = m_to - m_from; |
318 | 458 | l1stride = 1; |
319 | | |
320 | 458 | if (min_i >= GEMM_P * 2) { |
321 | 0 | min_i = GEMM_P; |
322 | 458 | } else { |
323 | 458 | if (min_i > GEMM_P) { |
324 | 0 | min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; |
325 | 458 | } else { |
326 | 458 | l1stride = 0; |
327 | 458 | } |
328 | 458 | } |
329 | | |
330 | 458 | START_RPCC(); |
331 | | |
332 | 458 | ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); |
333 | | |
334 | 458 | STOP_RPCC(innercost); |
335 | | |
336 | | #if defined(FUSED_GEMM) && !defined(TIMING) |
337 | | |
338 | | FUSED_KERNEL_OPERATION(min_i, min_j, min_l, alpha, |
339 | | sa, sb, b, ldb, c, ldc, m_from, js, ls); |
340 | | |
341 | | |
342 | | #else |
343 | 6.43k | for(jjs = js; jjs < js + min_j; jjs += min_jj){ |
344 | 5.97k | min_jj = min_j + js - jjs; |
345 | | #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) |
346 | | /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ |
347 | | if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; |
348 | | #else |
349 | 5.97k | if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; |
350 | 755 | else |
351 | | /* |
352 | | if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; |
353 | | else |
354 | | */ |
355 | 755 | if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; |
356 | 5.97k | #endif |
357 | | |
358 | | |
359 | 5.97k | START_RPCC(); |
360 | | |
361 | 5.97k | OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, |
362 | 5.97k | sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride); |
363 | | |
364 | 5.97k | STOP_RPCC(outercost); |
365 | | |
366 | 5.97k | START_RPCC(); |
367 | | |
368 | 5.97k | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) |
369 | 5.97k | KERNEL_OPERATION(min_i, min_jj, min_l, alpha, |
370 | 5.97k | sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); |
371 | | #else |
372 | | KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha, |
373 | | sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); |
374 | | #endif |
375 | | |
376 | 5.97k | STOP_RPCC(kernelcost); |
377 | 5.97k | } |
378 | 458 | #endif |
379 | | |
380 | 458 | for(is = m_from + min_i; is < m_to; is += min_i){ |
381 | 0 | min_i = m_to - is; |
382 | |
|
383 | 0 | if (min_i >= GEMM_P * 2) { |
384 | 0 | min_i = GEMM_P; |
385 | 0 | } else |
386 | 0 | if (min_i > GEMM_P) { |
387 | 0 | min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; |
388 | 0 | } |
389 | |
|
390 | 0 | START_RPCC(); |
391 | |
|
392 | 0 | ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); |
393 | |
|
394 | 0 | STOP_RPCC(innercost); |
395 | |
|
396 | 0 | START_RPCC(); |
397 | |
|
398 | 0 | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) |
399 | 0 | KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); |
400 | | #else |
401 | | KERNEL_OPERATION(min_i, min_j, min_l, (void *)&xalpha, sa, sb, c, ldc, is, js); |
402 | | #endif |
403 | |
|
404 | 0 | STOP_RPCC(kernelcost); |
405 | |
|
406 | 0 | } /* end of is */ |
407 | 458 | } /* end of js */ |
408 | 304 | } /* end of ls */ |
409 | | |
410 | | |
411 | | #ifdef TIMING |
412 | | total = (double)outercost + (double)innercost + (double)kernelcost; |
413 | | |
414 | | printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", |
415 | | innercost / total * 100., outercost / total * 100., |
416 | | kernelcost / total * 100., |
417 | | (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2., |
418 | | (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); |
419 | | |
420 | | #endif |
421 | | |
422 | 304 | return 0; |
423 | 304 | } Unexecuted instantiation: sgemm_nn Unexecuted instantiation: dgemm_nn Unexecuted instantiation: sgemm_nt Unexecuted instantiation: dgemm_nt Line | Count | Source | 173 | 304 | XFLOAT *sa, XFLOAT *sb, BLASLONG dummy){ | 174 | 304 | BLASLONG k, lda, ldb, ldc; | 175 | 304 | FLOAT *alpha, *beta; | 176 | 304 | IFLOAT *a, *b; | 177 | 304 | FLOAT *c; | 178 | 304 | BLASLONG m_from, m_to, n_from, n_to; | 179 | | | 180 | 304 | BLASLONG ls, is, js; | 181 | 304 | BLASLONG min_l, min_i, min_j; | 182 | 304 | #if !defined(FUSED_GEMM) || defined(TIMING) | 183 | 304 | BLASLONG jjs, min_jj; | 184 | 304 | #endif | 185 | | | 186 | 304 | BLASLONG l1stride, gemm_p, l2size; | 187 | | | 188 | | #if defined(XDOUBLE) && defined(QUAD_PRECISION) | 189 | | xidouble xalpha; | 190 | | #endif | 191 | | | 192 | | #ifdef TIMING | 193 | | unsigned long long rpcc_counter; | 194 | | unsigned long long innercost = 0; | 195 | | unsigned long long outercost = 0; | 196 | | unsigned long long kernelcost = 0; | 197 | | double total; | 198 | | #endif | 199 | | | 200 | 304 | k = K; | 201 | | | 202 | 304 | a = (IFLOAT *)A; | 203 | 304 | b = (IFLOAT *)B; | 204 | 304 | c = (FLOAT *)C; | 205 | | | 206 | 304 | lda = LDA; | 207 | 304 | ldb = LDB; | 208 | 304 | ldc = LDC; | 209 | | | 210 | 304 | alpha = (FLOAT *)args -> alpha; | 211 | 304 | beta = (FLOAT *)args -> beta; | 212 | | | 213 | 304 | m_from = 0; | 214 | 304 | m_to = M; | 215 | | | 216 | 304 | if (range_m) { | 217 | 0 | m_from = *(((BLASLONG *)range_m) + 0); | 218 | 0 | m_to = *(((BLASLONG *)range_m) + 1); | 219 | 0 | } | 220 | | | 221 | 304 | n_from = 0; | 222 | 304 | n_to = N; | 223 | | | 224 | 304 | if (range_n) { | 225 | 0 | n_from = *(((BLASLONG *)range_n) + 0); | 226 | 0 | n_to = *(((BLASLONG *)range_n) + 1); | 227 | 0 | } | 228 | | | 229 | 304 | if (beta) { | 230 | 304 | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) | 231 | 304 | #ifndef COMPLEX | 232 | 304 | if (beta[0] != ONE | 233 | | #else | 234 | | if ((beta[0] != ONE) || 
(beta[1] != ZERO) | 235 | | #endif | 236 | | #else | 237 | | if (((beta[0].x[1] != 0x3fff000000000000UL) || beta[0].x[0] != 0) | 238 | | #ifdef COMPLEX | 239 | | &&(((beta[1].x[0] | beta[1].x[1]) << 1) != 0) | 240 | | #endif | 241 | | #endif | 242 | 304 | ) { | 243 | | #if defined(XDOUBLE) && defined(QUAD_PRECISION) | 244 | | xidouble xbeta; | 245 | | | 246 | | qtox(&xbeta, beta); | 247 | | #endif | 248 | 300 | BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc); | 249 | 300 | } | 250 | 304 | } | 251 | | | 252 | 304 | if ((k == 0) || (alpha == NULL)) return 0; | 253 | | | 254 | 304 | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) | 255 | 304 | if ( alpha[0] == ZERO | 256 | | #ifdef COMPLEX | 257 | | && alpha[1] == ZERO | 258 | | #endif | 259 | 304 | ) return 0; | 260 | | #else | 261 | | if (((alpha[0].x[0] | alpha[0].x[1] | 262 | | #ifdef COMPLEX | 263 | | | alpha[1].x[0] | alpha[1].x[1] | 264 | | #endif | 265 | | ) << 1) == 0) return 0; | 266 | | #endif | 267 | | | 268 | | #if defined(XDOUBLE) && defined(QUAD_PRECISION) | 269 | | qtox(&xalpha, alpha); | 270 | | #endif | 271 | | | 272 | 304 | l2size = GEMM_P * GEMM_Q; | 273 | | | 274 | | #if 0 | 275 | | fprintf(stderr, "GEMM(Single): M_from : %ld M_to : %ld N_from : %ld N_to : %ld k : %ld\n", m_from, m_to, n_from, n_to, k); | 276 | | fprintf(stderr, "GEMM(Single):: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R); | 277 | | // fprintf(stderr, "GEMM: SA .. %p SB .. 
%p\n", sa, sb); | 278 | | | 279 | | // fprintf(stderr, "A = %p B = %p C = %p\n\tlda = %ld ldb = %ld ldc = %ld\n", a, b, c, lda, ldb, ldc); | 280 | | #endif | 281 | | | 282 | | #ifdef TIMING | 283 | | innercost = 0; | 284 | | outercost = 0; | 285 | | kernelcost = 0; | 286 | | #endif | 287 | | | 288 | 608 | for(js = n_from; js < n_to; js += GEMM_R){ | 289 | 304 | min_j = n_to - js; | 290 | 304 | if (min_j > GEMM_R) min_j = GEMM_R; | 291 | | | 292 | 762 | for(ls = 0; ls < k; ls += min_l){ | 293 | | | 294 | 458 | min_l = k - ls; | 295 | | | 296 | 458 | if (min_l >= GEMM_Q * 2) { | 297 | | // gemm_p = GEMM_P; | 298 | 0 | min_l = GEMM_Q; | 299 | 458 | } else { | 300 | 458 | if (min_l > GEMM_Q) { | 301 | 154 | min_l = ((min_l / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | 302 | 154 | } | 303 | 458 | gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | 304 | 766 | while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; | 305 | 458 | } | 306 | | | 307 | 458 | BLASLONG pad_min_l = min_l; | 308 | | #if defined(HALF) | 309 | | #if defined(DYNAMIC_ARCH) | 310 | | pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); | 311 | | #else | 312 | | pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; | 313 | | #endif | 314 | | #endif | 315 | | | 316 | | /* First, we have to move data A to L2 cache */ | 317 | 458 | min_i = m_to - m_from; | 318 | 458 | l1stride = 1; | 319 | | | 320 | 458 | if (min_i >= GEMM_P * 2) { | 321 | 0 | min_i = GEMM_P; | 322 | 458 | } else { | 323 | 458 | if (min_i > GEMM_P) { | 324 | 0 | min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | 325 | 458 | } else { | 326 | 458 | l1stride = 0; | 327 | 458 | } | 328 | 458 | } | 329 | | | 330 | 458 | START_RPCC(); | 331 | | | 332 | 458 | ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); | 333 | | | 334 | 458 | STOP_RPCC(innercost); | 335 | | | 336 | | #if defined(FUSED_GEMM) && !defined(TIMING) | 337 | | | 
338 | | FUSED_KERNEL_OPERATION(min_i, min_j, min_l, alpha, | 339 | | sa, sb, b, ldb, c, ldc, m_from, js, ls); | 340 | | | 341 | | | 342 | | #else | 343 | 6.43k | for(jjs = js; jjs < js + min_j; jjs += min_jj){ | 344 | 5.97k | min_jj = min_j + js - jjs; | 345 | | #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | 346 | | /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ | 347 | | if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | 348 | | #else | 349 | 5.97k | if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | 350 | 755 | else | 351 | | /* | 352 | | if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; | 353 | | else | 354 | | */ | 355 | 755 | if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | 356 | 5.97k | #endif | 357 | | | 358 | | | 359 | 5.97k | START_RPCC(); | 360 | | | 361 | 5.97k | OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, | 362 | 5.97k | sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride); | 363 | | | 364 | 5.97k | STOP_RPCC(outercost); | 365 | | | 366 | 5.97k | START_RPCC(); | 367 | | | 368 | 5.97k | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) | 369 | 5.97k | KERNEL_OPERATION(min_i, min_jj, min_l, alpha, | 370 | 5.97k | sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); | 371 | | #else | 372 | | KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha, | 373 | | sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); | 374 | | #endif | 375 | | | 376 | 5.97k | STOP_RPCC(kernelcost); | 377 | 5.97k | } | 378 | 458 | #endif | 379 | | | 380 | 458 | for(is = m_from + min_i; is < m_to; is += min_i){ | 381 | 0 | min_i = m_to - is; | 382 | |
| 383 | 0 | if (min_i >= GEMM_P * 2) { | 384 | 0 | min_i = GEMM_P; | 385 | 0 | } else | 386 | 0 | if (min_i > GEMM_P) { | 387 | 0 | min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | 388 | 0 | } | 389 | |
| 390 | 0 | START_RPCC(); | 391 | |
| 392 | 0 | ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); | 393 | |
| 394 | 0 | STOP_RPCC(innercost); | 395 | |
| 396 | 0 | START_RPCC(); | 397 | |
| 398 | 0 | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) | 399 | 0 | KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); | 400 | | #else | 401 | | KERNEL_OPERATION(min_i, min_j, min_l, (void *)&xalpha, sa, sb, c, ldc, is, js); | 402 | | #endif | 403 | |
| 404 | 0 | STOP_RPCC(kernelcost); | 405 | |
| 406 | 0 | } /* end of is */ | 407 | 458 | } /* end of js */ | 408 | 304 | } /* end of ls */ | 409 | | | 410 | | | 411 | | #ifdef TIMING | 412 | | total = (double)outercost + (double)innercost + (double)kernelcost; | 413 | | | 414 | | printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", | 415 | | innercost / total * 100., outercost / total * 100., | 416 | | kernelcost / total * 100., | 417 | | (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2., | 418 | | (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); | 419 | | | 420 | | #endif | 421 | | | 422 | 304 | return 0; | 423 | 304 | } |
Unexecuted instantiation: dgemm_tn Unexecuted instantiation: sgemm_tt Unexecuted instantiation: dgemm_tt |