/root/doris/contrib/openblas/driver/level3/level3.c
Line | Count | Source |
1 | | /*********************************************************************/ |
2 | | /* Copyright 2009, 2010 The University of Texas at Austin. */ |
3 | | /* All rights reserved. */ |
4 | | /* */ |
5 | | /* Redistribution and use in source and binary forms, with or */ |
6 | | /* without modification, are permitted provided that the following */ |
7 | | /* conditions are met: */ |
8 | | /* */ |
9 | | /* 1. Redistributions of source code must retain the above */ |
10 | | /* copyright notice, this list of conditions and the following */ |
11 | | /* disclaimer. */ |
12 | | /* */ |
13 | | /* 2. Redistributions in binary form must reproduce the above */ |
14 | | /* copyright notice, this list of conditions and the following */ |
15 | | /* disclaimer in the documentation and/or other materials */ |
16 | | /* provided with the distribution. */ |
17 | | /* */ |
18 | | /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ |
19 | | /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ |
20 | | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ |
21 | | /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ |
22 | | /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ |
23 | | /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ |
24 | | /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ |
25 | | /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ |
26 | | /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ |
27 | | /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ |
28 | | /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ |
29 | | /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ |
30 | | /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ |
31 | | /* POSSIBILITY OF SUCH DAMAGE. */ |
32 | | /* */ |
33 | | /* The views and conclusions contained in the software and */ |
34 | | /* documentation are those of the authors and should not be */ |
35 | | /* interpreted as representing official policies, either expressed */ |
36 | | /* or implied, of The University of Texas at Austin. */ |
37 | | /*********************************************************************/ |
38 | | |
39 | | /* This file is a template for level 3 operations. */ |
40 | | |
/*
 * BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC)
 *
 * Calls GEMM_BETA on the sub-block of C that starts at element
 * (M_FROM, N_FROM) and has (M_TO - M_FROM) rows and (N_TO - N_FROM) columns.
 * The start address is C + (M_FROM + N_FROM * LDC) * COMPSIZE.
 *
 * Three variants are selected at compile time:
 *   - real:          passes BETA[0]
 *   - COMPLEX:       passes BETA[0] and BETA[1]
 *   - XDOUBLE with QUAD_PRECISION: passes the BETA pointer itself
 *
 * Every macro argument is parenthesized so that expression arguments
 * (for example "js + 1") keep their intended value after expansion.
 * An includer may predefine BETA_OPERATION to override this default.
 */
#ifndef BETA_OPERATION
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
#ifndef COMPLEX
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
            (BETA)[0], NULL, 0, NULL, 0, \
            (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, (LDC))
#else
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
            (BETA)[0], (BETA)[1], NULL, 0, NULL, 0, \
            (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, (LDC))
#endif
#else
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
            (BETA), NULL, 0, NULL, 0, \
            (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, (LDC))
#endif
#endif
61 | | |
/*
 * ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER)
 *
 * Calls the inner copy routine with BUFFER as its last argument and a source
 * pointer offset into A:
 *   - when one of NN/NT/NC/NR/RN/RT/RC/RR is defined: GEMM_ITCOPY with
 *     source A + (Y + X * LDA) * COMPSIZE
 *   - otherwise: GEMM_INCOPY with source A + (X + Y * LDA) * COMPSIZE
 *
 * The expansion is a single expression with no trailing semicolon, so the
 * caller writes "ICOPY_OPERATION(...);" and the macro is safe inside an
 * unbraced if/else.
 * An includer may predefine ICOPY_OPERATION to override this default.
 */
#ifndef ICOPY_OPERATION
#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
    defined(RN) || defined(RT) || defined(RC) || defined(RR)
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
#endif
#endif
70 | | |
/*
 * OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER)
 *
 * Calls the outer copy routine with BUFFER as its last argument and a source
 * pointer offset into A:
 *   - when one of NN/TN/CN/RN/NR/TR/CR/RR is defined: GEMM_ONCOPY with
 *     source A + (X + Y * LDA) * COMPSIZE
 *   - otherwise: GEMM_OTCOPY with source A + (Y + X * LDA) * COMPSIZE
 *
 * The expansion is a single expression with no trailing semicolon, so the
 * caller writes "OCOPY_OPERATION(...);" and the macro is safe inside an
 * unbraced if/else.
 * An includer may predefine OCOPY_OPERATION to override this default.
 */
#ifndef OCOPY_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
#endif
#endif
79 | | |
/*
 * KERNEL_FUNC: the compute kernel used by KERNEL_OPERATION, chosen from the
 * transposition-mode macro that the build defines:
 *   NN NT TN TT -> GEMM_KERNEL_N
 *   CN CT RN RT -> GEMM_KERNEL_L
 *   NC TC NR TR -> GEMM_KERNEL_R
 *   CC CR RC RR -> GEMM_KERNEL_B
 * Left undefined when none of these modes is defined.
 * An includer may predefine KERNEL_FUNC to override this selection.
 */
#if !defined(KERNEL_FUNC)
#  if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#    define KERNEL_FUNC GEMM_KERNEL_N
#  elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
#    define KERNEL_FUNC GEMM_KERNEL_L
#  elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
#    define KERNEL_FUNC GEMM_KERNEL_R
#  elif defined(CC) || defined(CR) || defined(RC) || defined(RR)
#    define KERNEL_FUNC GEMM_KERNEL_B
#  endif
#endif
94 | | |
/*
 * KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y)
 *
 * Calls KERNEL_FUNC with the copy buffers SA and SB and a destination pointer
 * of C + (X + Y * LDC) * COMPSIZE.
 *
 * Three variants are selected at compile time:
 *   - real:          passes ALPHA[0]
 *   - COMPLEX:       passes ALPHA[0] and ALPHA[1]
 *   - XDOUBLE with QUAD_PRECISION: passes the ALPHA pointer itself
 *
 * LDC and ALPHA are parenthesized so that expression arguments keep their
 * intended value after expansion.
 * An includer may predefine KERNEL_OPERATION to override this default.
 */
#ifndef KERNEL_OPERATION
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, (ALPHA)[0], SA, SB, \
              (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, (LDC))
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, (ALPHA)[0], (ALPHA)[1], SA, SB, \
              (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, (LDC))
#endif
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, (ALPHA), SA, SB, \
              (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, (LDC))
#endif
#endif
109 | | |
/*
 * FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L)
 *
 * Calls a fused kernel that receives both a pointer into B and a pointer into
 * C, the latter at C + (I + J * LDC) * COMPSIZE:
 *   - when one of NN/TN/CN/RN/NR/TR/CR/RR is defined: FUSED_GEMM_KERNEL_N
 *     with B + (L + J * LDB) * COMPSIZE
 *   - otherwise: FUSED_GEMM_KERNEL_T with B + (J + L * LDB) * COMPSIZE
 * The COMPLEX variants pass ALPHA[0] and ALPHA[1]; the real variants pass
 * ALPHA[0] only.
 *
 * LDB, LDC and ALPHA are parenthesized so that expression arguments keep
 * their intended value after expansion.
 * An includer may predefine FUSED_KERNEL_OPERATION to override this default.
 */
#ifndef FUSED_KERNEL_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_N(M, N, K, (ALPHA)[0], SA, SB, \
    (FLOAT *)(B) + ((L) + (J) * (LDB)) * COMPSIZE, (LDB), \
    (FLOAT *)(C) + ((I) + (J) * (LDC)) * COMPSIZE, (LDC))
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_N(M, N, K, (ALPHA)[0], (ALPHA)[1], SA, SB, \
    (FLOAT *)(B) + ((L) + (J) * (LDB)) * COMPSIZE, (LDB), \
    (FLOAT *)(C) + ((I) + (J) * (LDC)) * COMPSIZE, (LDC))
#endif
#else
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_T(M, N, K, (ALPHA)[0], SA, SB, \
    (FLOAT *)(B) + ((J) + (L) * (LDB)) * COMPSIZE, (LDB), \
    (FLOAT *)(C) + ((I) + (J) * (LDC)) * COMPSIZE, (LDC))
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_T(M, N, K, (ALPHA)[0], (ALPHA)[1], SA, SB, \
    (FLOAT *)(B) + ((J) + (L) * (LDB)) * COMPSIZE, (LDB), \
    (FLOAT *)(C) + ((I) + (J) * (LDC)) * COMPSIZE, (LDC))
#endif
#endif
#endif
135 | | |
/*
 * Default accessors for the fields of the "args" argument block.  Each name
 * expands to a member access on a variable called "args" that must be in
 * scope at the point of use.  An includer may predefine any of these names
 * to substitute its own expression.
 */

/* Matrix pointers and their leading dimensions. */
#if !defined(A)
#define A   args->a
#endif
#if !defined(LDA)
#define LDA args->lda
#endif
#if !defined(B)
#define B   args->b
#endif
#if !defined(LDB)
#define LDB args->ldb
#endif
#if !defined(C)
#define C   args->c
#endif
#if !defined(LDC)
#define LDC args->ldc
#endif

/* Problem dimensions. */
#if !defined(M)
#define M   args->m
#endif
#if !defined(N)
#define N   args->n
#endif
#if !defined(K)
#define K   args->k
#endif
163 | | |
/*
 * Cycle-counter helpers.  When TIMING is defined, START_RPCC() stores rpcc()
 * in the local variable rpcc_counter and STOP_RPCC(COUNTER) adds the value
 * of rpcc() minus rpcc_counter to COUNTER.  Without TIMING both expand to
 * nothing.
 */
#if defined(TIMING)
#define START_RPCC()        rpcc_counter = rpcc()
#define STOP_RPCC(COUNTER)  COUNTER += rpcc() - rpcc_counter
#else
#define START_RPCC()
#define STOP_RPCC(COUNTER)
#endif
171 | | |
172 | | int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, |
173 | 0 | XFLOAT *sa, XFLOAT *sb, BLASLONG dummy){ |
174 | 0 | BLASLONG k, lda, ldb, ldc; |
175 | 0 | FLOAT *alpha, *beta; |
176 | 0 | IFLOAT *a, *b; |
177 | 0 | FLOAT *c; |
178 | 0 | BLASLONG m_from, m_to, n_from, n_to; |
179 | |
|
180 | 0 | BLASLONG ls, is, js; |
181 | 0 | BLASLONG min_l, min_i, min_j; |
182 | 0 | #if !defined(FUSED_GEMM) || defined(TIMING) |
183 | 0 | BLASLONG jjs, min_jj; |
184 | 0 | #endif |
185 | |
|
186 | 0 | BLASLONG l1stride, gemm_p, l2size; |
187 | |
|
188 | | #if defined(XDOUBLE) && defined(QUAD_PRECISION) |
189 | | xidouble xalpha; |
190 | | #endif |
191 | |
|
192 | | #ifdef TIMING |
193 | | unsigned long long rpcc_counter; |
194 | | unsigned long long innercost = 0; |
195 | | unsigned long long outercost = 0; |
196 | | unsigned long long kernelcost = 0; |
197 | | double total; |
198 | | #endif |
199 | |
|
200 | 0 | k = K; |
201 | |
|
202 | 0 | a = (IFLOAT *)A; |
203 | 0 | b = (IFLOAT *)B; |
204 | 0 | c = (FLOAT *)C; |
205 | |
|
206 | 0 | lda = LDA; |
207 | 0 | ldb = LDB; |
208 | 0 | ldc = LDC; |
209 | |
|
210 | 0 | alpha = (FLOAT *)args -> alpha; |
211 | 0 | beta = (FLOAT *)args -> beta; |
212 | |
|
213 | 0 | m_from = 0; |
214 | 0 | m_to = M; |
215 | |
|
216 | 0 | if (range_m) { |
217 | 0 | m_from = *(((BLASLONG *)range_m) + 0); |
218 | 0 | m_to = *(((BLASLONG *)range_m) + 1); |
219 | 0 | } |
220 | |
|
221 | 0 | n_from = 0; |
222 | 0 | n_to = N; |
223 | |
|
224 | 0 | if (range_n) { |
225 | 0 | n_from = *(((BLASLONG *)range_n) + 0); |
226 | 0 | n_to = *(((BLASLONG *)range_n) + 1); |
227 | 0 | } |
228 | |
|
229 | 0 | if (beta) { |
230 | 0 | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) |
231 | 0 | #ifndef COMPLEX |
232 | 0 | if (beta[0] != ONE |
233 | | #else |
234 | | if ((beta[0] != ONE) || (beta[1] != ZERO) |
235 | | #endif |
236 | | #else |
237 | | if (((beta[0].x[1] != 0x3fff000000000000UL) || beta[0].x[0] != 0) |
238 | | #ifdef COMPLEX |
239 | | &&(((beta[1].x[0] | beta[1].x[1]) << 1) != 0) |
240 | | #endif |
241 | | #endif |
242 | 0 | ) { |
243 | | #if defined(XDOUBLE) && defined(QUAD_PRECISION) |
244 | | xidouble xbeta; |
245 | | |
246 | | qtox(&xbeta, beta); |
247 | | #endif |
248 | 0 | BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc); |
249 | 0 | } |
250 | 0 | } |
251 | |
|
252 | 0 | if ((k == 0) || (alpha == NULL)) return 0; |
253 | | |
254 | 0 | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) |
255 | 0 | if ( alpha[0] == ZERO |
256 | | #ifdef COMPLEX |
257 | | && alpha[1] == ZERO |
258 | | #endif |
259 | 0 | ) return 0; |
260 | | #else |
261 | | if (((alpha[0].x[0] | alpha[0].x[1] |
262 | | #ifdef COMPLEX |
263 | | | alpha[1].x[0] | alpha[1].x[1] |
264 | | #endif |
265 | | ) << 1) == 0) return 0; |
266 | | #endif |
267 | | |
268 | | #if defined(XDOUBLE) && defined(QUAD_PRECISION) |
269 | | qtox(&xalpha, alpha); |
270 | | #endif |
271 | | |
272 | 0 | l2size = GEMM_P * GEMM_Q; |
273 | |
|
274 | | #if 0 |
275 | | fprintf(stderr, "GEMM(Single): M_from : %ld M_to : %ld N_from : %ld N_to : %ld k : %ld\n", m_from, m_to, n_from, n_to, k); |
276 | | fprintf(stderr, "GEMM(Single):: P = %4ld Q = %4ld R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R); |
277 | | // fprintf(stderr, "GEMM: SA .. %p SB .. %p\n", sa, sb); |
278 | | |
279 | | // fprintf(stderr, "A = %p B = %p C = %p\n\tlda = %ld ldb = %ld ldc = %ld\n", a, b, c, lda, ldb, ldc); |
280 | | #endif |
281 | |
|
282 | | #ifdef TIMING |
283 | | innercost = 0; |
284 | | outercost = 0; |
285 | | kernelcost = 0; |
286 | | #endif |
287 | |
|
288 | 0 | for(js = n_from; js < n_to; js += GEMM_R){ |
289 | 0 | min_j = n_to - js; |
290 | 0 | if (min_j > GEMM_R) min_j = GEMM_R; |
291 | |
|
292 | 0 | for(ls = 0; ls < k; ls += min_l){ |
293 | |
|
294 | 0 | min_l = k - ls; |
295 | |
|
296 | 0 | if (min_l >= GEMM_Q * 2) { |
297 | | // gemm_p = GEMM_P; |
298 | 0 | min_l = GEMM_Q; |
299 | 0 | } else { |
300 | 0 | if (min_l > GEMM_Q) { |
301 | 0 | min_l = ((min_l / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; |
302 | 0 | } |
303 | 0 | gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; |
304 | 0 | while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; |
305 | 0 | } |
306 | |
|
307 | 0 | BLASLONG pad_min_l = min_l; |
308 | | #if defined(HALF) |
309 | | #if defined(DYNAMIC_ARCH) |
310 | | pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); |
311 | | #else |
312 | | pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; |
313 | | #endif |
314 | | #endif |
315 | | |
316 | | /* First, we have to move data A to L2 cache */ |
317 | 0 | min_i = m_to - m_from; |
318 | 0 | l1stride = 1; |
319 | |
|
320 | 0 | if (min_i >= GEMM_P * 2) { |
321 | 0 | min_i = GEMM_P; |
322 | 0 | } else { |
323 | 0 | if (min_i > GEMM_P) { |
324 | 0 | min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; |
325 | 0 | } else { |
326 | 0 | l1stride = 0; |
327 | 0 | } |
328 | 0 | } |
329 | |
|
330 | 0 | START_RPCC(); |
331 | |
|
332 | 0 | ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); |
333 | |
|
334 | 0 | STOP_RPCC(innercost); |
335 | |
|
336 | | #if defined(FUSED_GEMM) && !defined(TIMING) |
337 | | |
338 | | FUSED_KERNEL_OPERATION(min_i, min_j, min_l, alpha, |
339 | | sa, sb, b, ldb, c, ldc, m_from, js, ls); |
340 | | |
341 | | |
342 | | #else |
343 | 0 | for(jjs = js; jjs < js + min_j; jjs += min_jj){ |
344 | 0 | min_jj = min_j + js - jjs; |
345 | | #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) |
346 | | /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ |
347 | | if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; |
348 | | #else |
349 | 0 | if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; |
350 | 0 | else |
351 | | /* |
352 | | if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; |
353 | | else |
354 | | */ |
355 | 0 | if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; |
356 | 0 | #endif |
357 | | |
358 | |
|
359 | 0 | START_RPCC(); |
360 | |
|
361 | 0 | OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, |
362 | 0 | sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride); |
363 | |
|
364 | 0 | STOP_RPCC(outercost); |
365 | |
|
366 | 0 | START_RPCC(); |
367 | |
|
368 | 0 | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) |
369 | 0 | KERNEL_OPERATION(min_i, min_jj, min_l, alpha, |
370 | 0 | sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); |
371 | | #else |
372 | | KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha, |
373 | | sa, sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride, c, ldc, m_from, jjs); |
374 | | #endif |
375 | |
|
376 | 0 | STOP_RPCC(kernelcost); |
377 | 0 | } |
378 | 0 | #endif |
379 | |
|
380 | 0 | for(is = m_from + min_i; is < m_to; is += min_i){ |
381 | 0 | min_i = m_to - is; |
382 | |
|
383 | 0 | if (min_i >= GEMM_P * 2) { |
384 | 0 | min_i = GEMM_P; |
385 | 0 | } else |
386 | 0 | if (min_i > GEMM_P) { |
387 | 0 | min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; |
388 | 0 | } |
389 | |
|
390 | 0 | START_RPCC(); |
391 | |
|
392 | 0 | ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); |
393 | |
|
394 | 0 | STOP_RPCC(innercost); |
395 | |
|
396 | 0 | START_RPCC(); |
397 | |
|
398 | 0 | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) |
399 | 0 | KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); |
400 | | #else |
401 | | KERNEL_OPERATION(min_i, min_j, min_l, (void *)&xalpha, sa, sb, c, ldc, is, js); |
402 | | #endif |
403 | |
|
404 | 0 | STOP_RPCC(kernelcost); |
405 | |
|
406 | 0 | } /* end of is */ |
407 | 0 | } /* end of js */ |
408 | 0 | } /* end of ls */ |
409 | | |
410 | |
|
411 | | #ifdef TIMING |
412 | | total = (double)outercost + (double)innercost + (double)kernelcost; |
413 | | |
414 | | printf( "Copy A : %5.2f Copy B: %5.2f Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", |
415 | | innercost / total * 100., outercost / total * 100., |
416 | | kernelcost / total * 100., |
417 | | (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2., |
418 | | (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); |
419 | | |
420 | | #endif |
421 | |
|
422 | 0 | return 0; |
423 | 0 | } Unexecuted instantiation: sgemm_nn Unexecuted instantiation: dgemm_nn Unexecuted instantiation: sgemm_nt Unexecuted instantiation: dgemm_nt Unexecuted instantiation: sgemm_tn Unexecuted instantiation: dgemm_tn Unexecuted instantiation: sgemm_tt Unexecuted instantiation: dgemm_tt |