/root/doris/contrib/openblas/driver/level3/level3.c
| Line | Count | Source | 
| 1 |  | /*********************************************************************/ | 
| 2 |  | /* Copyright 2009, 2010 The University of Texas at Austin.           */ | 
| 3 |  | /* All rights reserved.                                              */ | 
| 4 |  | /*                                                                   */ | 
| 5 |  | /* Redistribution and use in source and binary forms, with or        */ | 
| 6 |  | /* without modification, are permitted provided that the following   */ | 
| 7 |  | /* conditions are met:                                               */ | 
| 8 |  | /*                                                                   */ | 
| 9 |  | /*   1. Redistributions of source code must retain the above         */ | 
| 10 |  | /*      copyright notice, this list of conditions and the following  */ | 
| 11 |  | /*      disclaimer.                                                  */ | 
| 12 |  | /*                                                                   */ | 
| 13 |  | /*   2. Redistributions in binary form must reproduce the above      */ | 
| 14 |  | /*      copyright notice, this list of conditions and the following  */ | 
| 15 |  | /*      disclaimer in the documentation and/or other materials       */ | 
| 16 |  | /*      provided with the distribution.                              */ | 
| 17 |  | /*                                                                   */ | 
| 18 |  | /*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */ | 
| 19 |  | /*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */ | 
| 20 |  | /*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */ | 
| 21 |  | /*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */ | 
| 22 |  | /*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */ | 
| 23 |  | /*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */ | 
| 24 |  | /*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */ | 
| 25 |  | /*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */ | 
| 26 |  | /*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */ | 
| 27 |  | /*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */ | 
| 28 |  | /*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */ | 
| 29 |  | /*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */ | 
| 30 |  | /*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */ | 
| 31 |  | /*    POSSIBILITY OF SUCH DAMAGE.                                    */ | 
| 32 |  | /*                                                                   */ | 
| 33 |  | /* The views and conclusions contained in the software and           */ | 
| 34 |  | /* documentation are those of the authors and should not be          */ | 
| 35 |  | /* interpreted as representing official policies, either expressed   */ | 
| 36 |  | /* or implied, of The University of Texas at Austin.                 */ | 
| 37 |  | /*********************************************************************/ | 
| 38 |  |  | 
| 39 |  | /* This file is a template for level 3 operations. */ | 
| 40 |  |  | 
/*
 * BETA_OPERATION: hand the sub-block of C that starts at row M_FROM, column
 * N_FROM and spans (M_TO - M_FROM) x (N_TO - N_FROM) elements to GEMM_BETA
 * together with BETA.  The third GEMM_BETA argument is always 0 and both
 * operand pointers are NULL here.
 *
 *   - real, non-quad build    : BETA[0] is passed by value
 *   - complex, non-quad build : BETA[0] and BETA[1] are passed by value
 *   - XDOUBLE + QUAD_PRECISION: BETA itself is passed through unchanged
 *
 * Every macro argument is parenthesized so that expression arguments
 * (e.g. "n0 + 1") keep their intended grouping.  A definition supplied by the
 * including file takes precedence over this default.
 */
#ifndef BETA_OPERATION
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
#ifndef COMPLEX
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
      (BETA)[0], NULL, 0, NULL, 0, \
      (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, (LDC))
#else
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
      (BETA)[0], (BETA)[1], NULL, 0, NULL, 0, \
      (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, (LDC))
#endif
#else
#define BETA_OPERATION(M_FROM, M_TO, N_FROM, N_TO, BETA, C, LDC) \
  GEMM_BETA((M_TO) - (M_FROM), (N_TO) - (N_FROM), 0, \
      (BETA), NULL, 0, NULL, 0, \
      (FLOAT *)(C) + ((M_FROM) + (N_FROM) * (LDC)) * COMPSIZE, (LDC))
#endif
#endif
| 61 |  |  | 
/*
 * ICOPY_OPERATION: pack an M x N piece of A into BUFFER.
 *
 * When the first operand-mode letter is N or R (NN, NT, NC, NR, RN, RT, RC,
 * RR) the source pointer is advanced by (Y) + (X) * (LDA) elements and
 * GEMM_ITCOPY is used; for every other mode the offset is (X) + (Y) * (LDA)
 * and GEMM_INCOPY is used.  Offsets are scaled by COMPSIZE.
 *
 * The expansion is a single expression without a trailing semicolon, so the
 * macro behaves like an ordinary function call (safe inside an unbraced
 * if/else); call sites supply their own ';'.  A definition supplied by the
 * including file takes precedence over this default.
 */
#ifndef ICOPY_OPERATION
#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \
    defined(RN) || defined(RT) || defined(RC) || defined(RR)
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  GEMM_ITCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
#else
#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  GEMM_INCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
#endif
#endif
| 70 |  |  | 
/*
 * OCOPY_OPERATION: pack an M x N piece of the second operand into BUFFER.
 *
 * When the second operand-mode letter is N or R (NN, TN, CN, RN, NR, TR, CR,
 * RR) the source pointer is advanced by (X) + (Y) * (LDA) elements and
 * GEMM_ONCOPY is used; for every other mode the offset is (Y) + (X) * (LDA)
 * and GEMM_OTCOPY is used.  Offsets are scaled by COMPSIZE.
 *
 * The expansion is a single expression without a trailing semicolon, so the
 * macro behaves like an ordinary function call (safe inside an unbraced
 * if/else); call sites supply their own ';'.  A definition supplied by the
 * including file takes precedence over this default.
 */
#ifndef OCOPY_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  GEMM_ONCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER)
#else
#define OCOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) \
  GEMM_OTCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER)
#endif
#endif
| 79 |  |  | 
/*
 * KERNEL_FUNC: compute kernel chosen from the operand-mode macro, unless the
 * including file has already defined it.
 *
 *   NN NT TN TT -> GEMM_KERNEL_N
 *   CN CT RN RT -> GEMM_KERNEL_L
 *   NC TC NR TR -> GEMM_KERNEL_R
 *   CC CR RC RR -> GEMM_KERNEL_B
 *
 * If no mode macro is defined, KERNEL_FUNC stays undefined.
 */
#if !defined(KERNEL_FUNC)

#  if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#    define KERNEL_FUNC GEMM_KERNEL_N
#  endif

#  if defined(CN) || defined(CT) || defined(RN) || defined(RT)
#    define KERNEL_FUNC GEMM_KERNEL_L
#  endif

#  if defined(NC) || defined(TC) || defined(NR) || defined(TR)
#    define KERNEL_FUNC GEMM_KERNEL_R
#  endif

#  if defined(CC) || defined(CR) || defined(RC) || defined(RR)
#    define KERNEL_FUNC GEMM_KERNEL_B
#  endif

#endif /* !KERNEL_FUNC */
| 94 |  |  | 
/*
 * KERNEL_OPERATION: invoke KERNEL_FUNC on packed panels SA and SB for an
 * M x N x K product, writing to C advanced by (X) + (Y) * (LDC) elements
 * (scaled by COMPSIZE).
 *
 *   - real, non-quad build    : ALPHA[0] is passed by value
 *   - complex, non-quad build : ALPHA[0] and ALPHA[1] are passed by value
 *   - XDOUBLE + QUAD_PRECISION: ALPHA itself is passed through unchanged
 *
 * LDC and ALPHA are parenthesized so that expression arguments keep their
 * intended grouping in the offset computation.  A definition supplied by the
 * including file takes precedence over this default.
 */
#ifndef KERNEL_OPERATION
#if !defined(XDOUBLE) || !defined(QUAD_PRECISION)
#ifndef COMPLEX
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, (ALPHA)[0], SA, SB, (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, (LDC))
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, (ALPHA)[0], (ALPHA)[1], SA, SB, (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, (LDC))
#endif
#else
#define KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, C, LDC, X, Y) \
  KERNEL_FUNC(M, N, K, (ALPHA), SA, SB, (FLOAT *)(C) + ((X) + (Y) * (LDC)) * COMPSIZE, (LDC))
#endif
#endif
| 109 |  |  | 
/*
 * FUSED_KERNEL_OPERATION: single call that receives the packed SA buffer, the
 * SB scratch buffer, a pointer into the unpacked B operand and a pointer into
 * C.  C is advanced by (I) + (J) * (LDC) elements.
 *
 * When the second operand-mode letter is N or R (NN, TN, CN, RN, NR, TR, CR,
 * RR) B is advanced by (L) + (J) * (LDB) and FUSED_GEMM_KERNEL_N is called;
 * otherwise B is advanced by (J) + (L) * (LDB) and FUSED_GEMM_KERNEL_T is
 * called.  Offsets are scaled by COMPSIZE.  Complex builds pass ALPHA[0] and
 * ALPHA[1]; real builds pass ALPHA[0] only.
 *
 * LDB, LDC and ALPHA are parenthesized so that expression arguments keep
 * their intended grouping.  A definition supplied by the including file takes
 * precedence over this default.
 */
#ifndef FUSED_KERNEL_OPERATION
#if defined(NN) || defined(TN) || defined(CN) || defined(RN) || \
    defined(NR) || defined(TR) || defined(CR) || defined(RR)
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_N(M, N, K, (ALPHA)[0], SA, SB, \
  (FLOAT *)(B) + ((L) + (J) * (LDB)) * COMPSIZE, (LDB), (FLOAT *)(C) + ((I) + (J) * (LDC)) * COMPSIZE, (LDC))
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_N(M, N, K, (ALPHA)[0], (ALPHA)[1], SA, SB, \
  (FLOAT *)(B) + ((L) + (J) * (LDB)) * COMPSIZE, (LDB), (FLOAT *)(C) + ((I) + (J) * (LDC)) * COMPSIZE, (LDC))
#endif
#else
#ifndef COMPLEX
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_T(M, N, K, (ALPHA)[0], SA, SB, \
  (FLOAT *)(B) + ((J) + (L) * (LDB)) * COMPSIZE, (LDB), (FLOAT *)(C) + ((I) + (J) * (LDC)) * COMPSIZE, (LDC))
#else
#define FUSED_KERNEL_OPERATION(M, N, K, ALPHA, SA, SB, B, LDB, C, LDC, I, J, L) \
  FUSED_GEMM_KERNEL_T(M, N, K, (ALPHA)[0], (ALPHA)[1], SA, SB, \
  (FLOAT *)(B) + ((J) + (L) * (LDB)) * COMPSIZE, (LDB), (FLOAT *)(C) + ((I) + (J) * (LDC)) * COMPSIZE, (LDC))
#endif
#endif
#endif
| 135 |  |  | 
/*
 * Default operand accessors.  Each name expands to a field of the local
 * `args` pointer unless the including file defined it beforehand.
 */
#if !defined(A)
#  define A args -> a
#endif
#if !defined(LDA)
#  define LDA args -> lda
#endif

#if !defined(B)
#  define B args -> b
#endif
#if !defined(LDB)
#  define LDB args -> ldb
#endif

#if !defined(C)
#  define C args -> c
#endif
#if !defined(LDC)
#  define LDC args -> ldc
#endif

/* Problem dimensions. */
#if !defined(M)
#  define M args -> m
#endif
#if !defined(N)
#  define N args -> n
#endif
#if !defined(K)
#  define K args -> k
#endif
| 163 |  |  | 
/*
 * Cycle-count instrumentation.  With TIMING defined, START_RPCC() stores
 * rpcc() in the local `rpcc_counter` and STOP_RPCC(COUNTER) adds the elapsed
 * difference to COUNTER; without TIMING both expand to nothing.
 */
#if defined(TIMING)
#  define START_RPCC()    rpcc_counter = rpcc()
#  define STOP_RPCC(COUNTER)  COUNTER  += rpcc() - rpcc_counter
#else
#  define START_RPCC()
#  define STOP_RPCC(COUNTER)
#endif /* TIMING */
| 171 |  |  | 
| 172 |  | int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, | 
| 173 | 0 |       XFLOAT *sa, XFLOAT *sb, BLASLONG dummy){ | 
| 174 | 0 |   BLASLONG k, lda, ldb, ldc; | 
| 175 | 0 |   FLOAT *alpha, *beta; | 
| 176 | 0 |   IFLOAT *a, *b; | 
| 177 | 0 |   FLOAT *c; | 
| 178 | 0 |   BLASLONG m_from, m_to, n_from, n_to; | 
| 179 |  | 
 | 
| 180 | 0 |   BLASLONG ls, is, js; | 
| 181 | 0 |   BLASLONG min_l, min_i, min_j; | 
| 182 | 0 | #if !defined(FUSED_GEMM) || defined(TIMING) | 
| 183 | 0 |   BLASLONG jjs, min_jj; | 
| 184 | 0 | #endif | 
| 185 |  | 
 | 
| 186 | 0 |   BLASLONG l1stride, gemm_p, l2size; | 
| 187 |  | 
 | 
| 188 |  | #if defined(XDOUBLE) && defined(QUAD_PRECISION) | 
| 189 |  |   xidouble xalpha; | 
| 190 |  | #endif | 
| 191 |  | 
 | 
| 192 |  | #ifdef TIMING | 
| 193 |  |   unsigned long long rpcc_counter; | 
| 194 |  |   unsigned long long innercost  = 0; | 
| 195 |  |   unsigned long long outercost  = 0; | 
| 196 |  |   unsigned long long kernelcost = 0; | 
| 197 |  |   double total; | 
| 198 |  | #endif | 
| 199 |  | 
 | 
| 200 | 0 |   k = K; | 
| 201 |  | 
 | 
| 202 | 0 |   a = (IFLOAT *)A; | 
| 203 | 0 |   b = (IFLOAT *)B; | 
| 204 | 0 |   c = (FLOAT *)C; | 
| 205 |  | 
 | 
| 206 | 0 |   lda = LDA; | 
| 207 | 0 |   ldb = LDB; | 
| 208 | 0 |   ldc = LDC; | 
| 209 |  | 
 | 
| 210 | 0 |   alpha = (FLOAT *)args -> alpha; | 
| 211 | 0 |   beta  = (FLOAT *)args -> beta; | 
| 212 |  | 
 | 
| 213 | 0 |   m_from = 0; | 
| 214 | 0 |   m_to   = M; | 
| 215 |  | 
 | 
| 216 | 0 |   if (range_m) { | 
| 217 | 0 |     m_from = *(((BLASLONG *)range_m) + 0); | 
| 218 | 0 |     m_to   = *(((BLASLONG *)range_m) + 1); | 
| 219 | 0 |   } | 
| 220 |  | 
 | 
| 221 | 0 |   n_from = 0; | 
| 222 | 0 |   n_to   = N; | 
| 223 |  | 
 | 
| 224 | 0 |   if (range_n) { | 
| 225 | 0 |     n_from = *(((BLASLONG *)range_n) + 0); | 
| 226 | 0 |     n_to   = *(((BLASLONG *)range_n) + 1); | 
| 227 | 0 |   } | 
| 228 |  | 
 | 
| 229 | 0 |   if (beta) { | 
| 230 | 0 | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) | 
| 231 | 0 | #ifndef COMPLEX | 
| 232 | 0 |     if (beta[0] != ONE | 
| 233 |  | #else | 
| 234 |  |     if ((beta[0] != ONE) || (beta[1] != ZERO) | 
| 235 |  | #endif | 
| 236 |  | #else | 
| 237 |  |     if (((beta[0].x[1] != 0x3fff000000000000UL) || beta[0].x[0] != 0) | 
| 238 |  | #ifdef COMPLEX | 
| 239 |  |   &&(((beta[1].x[0] | beta[1].x[1]) << 1) != 0) | 
| 240 |  | #endif | 
| 241 |  | #endif | 
| 242 | 0 |   ) { | 
| 243 |  | #if defined(XDOUBLE) && defined(QUAD_PRECISION) | 
| 244 |  |     xidouble xbeta; | 
| 245 |  |  | 
| 246 |  |     qtox(&xbeta, beta); | 
| 247 |  | #endif | 
| 248 | 0 |     BETA_OPERATION(m_from, m_to, n_from, n_to, beta, c, ldc); | 
| 249 | 0 |   } | 
| 250 | 0 |   } | 
| 251 |  | 
 | 
| 252 | 0 |   if ((k == 0) || (alpha == NULL)) return 0; | 
| 253 |  |  | 
| 254 | 0 | #if !defined(XDOUBLE) || !defined(QUAD_PRECISION) | 
| 255 | 0 |   if ( alpha[0] == ZERO | 
| 256 |  | #ifdef COMPLEX | 
| 257 |  |       && alpha[1] == ZERO | 
| 258 |  | #endif | 
| 259 | 0 |    ) return 0;  | 
| 260 |  | #else | 
| 261 |  |   if (((alpha[0].x[0] | alpha[0].x[1] | 
| 262 |  | #ifdef COMPLEX | 
| 263 |  |        | alpha[1].x[0] | alpha[1].x[1] | 
| 264 |  | #endif | 
| 265 |  |        ) << 1) == 0) return 0; | 
| 266 |  | #endif | 
| 267 |  |  | 
| 268 |  | #if defined(XDOUBLE)  && defined(QUAD_PRECISION) | 
| 269 |  |   qtox(&xalpha, alpha); | 
| 270 |  | #endif | 
| 271 |  |  | 
| 272 | 0 |   l2size = GEMM_P * GEMM_Q; | 
| 273 |  | 
 | 
| 274 |  | #if 0 | 
| 275 |  |   fprintf(stderr, "GEMM(Single): M_from : %ld  M_to : %ld  N_from : %ld  N_to : %ld  k : %ld\n", m_from, m_to, n_from, n_to, k); | 
| 276 |  |   fprintf(stderr, "GEMM(Single):: P = %4ld  Q = %4ld  R = %4ld\n", (BLASLONG)GEMM_P, (BLASLONG)GEMM_Q, (BLASLONG)GEMM_R); | 
| 277 |  |   //  fprintf(stderr, "GEMM: SA .. %p  SB .. %p\n", sa, sb); | 
| 278 |  |  | 
| 279 |  |   //  fprintf(stderr, "A = %p  B = %p  C = %p\n\tlda = %ld  ldb = %ld ldc = %ld\n", a, b, c, lda, ldb, ldc); | 
| 280 |  | #endif | 
| 281 |  | 
 | 
| 282 |  | #ifdef TIMING | 
| 283 |  |   innercost = 0; | 
| 284 |  |   outercost = 0; | 
| 285 |  |   kernelcost = 0; | 
| 286 |  | #endif | 
| 287 |  | 
 | 
| 288 | 0 |   for(js = n_from; js < n_to; js += GEMM_R){ | 
| 289 | 0 |     min_j = n_to - js; | 
| 290 | 0 |     if (min_j > GEMM_R) min_j = GEMM_R; | 
| 291 |  | 
 | 
| 292 | 0 |     for(ls = 0; ls < k; ls += min_l){ | 
| 293 |  | 
 | 
| 294 | 0 |       min_l = k - ls; | 
| 295 |  | 
 | 
| 296 | 0 |       if (min_l >= GEMM_Q * 2) { | 
| 297 |  |   // gemm_p = GEMM_P; | 
| 298 | 0 |   min_l  = GEMM_Q; | 
| 299 | 0 |       } else { | 
| 300 | 0 |   if (min_l > GEMM_Q) { | 
| 301 | 0 |     min_l = ((min_l / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | 
| 302 | 0 |   } | 
| 303 | 0 |   gemm_p = ((l2size / min_l + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | 
| 304 | 0 |   while (gemm_p * min_l > l2size) gemm_p -= GEMM_UNROLL_M; | 
| 305 | 0 |       } | 
| 306 |  | 
 | 
| 307 | 0 |       BLASLONG pad_min_l = min_l; | 
| 308 |  | #if defined(HALF) | 
| 309 |  | #if defined(DYNAMIC_ARCH) | 
| 310 |  |       pad_min_l = (min_l + gotoblas->sbgemm_align_k - 1) & ~(gotoblas->sbgemm_align_k-1); | 
| 311 |  | #else | 
| 312 |  |       pad_min_l = (min_l + SBGEMM_ALIGN_K - 1) & ~(SBGEMM_ALIGN_K - 1);; | 
| 313 |  | #endif | 
| 314 |  | #endif | 
| 315 |  |  | 
| 316 |  |       /* First, we have to move data A to L2 cache */ | 
| 317 | 0 |       min_i = m_to - m_from; | 
| 318 | 0 |       l1stride = 1; | 
| 319 |  | 
 | 
| 320 | 0 |       if (min_i >= GEMM_P * 2) { | 
| 321 | 0 |   min_i = GEMM_P; | 
| 322 | 0 |       } else { | 
| 323 | 0 |   if (min_i > GEMM_P) { | 
| 324 | 0 |     min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | 
| 325 | 0 |   } else { | 
| 326 | 0 |     l1stride = 0; | 
| 327 | 0 |   } | 
| 328 | 0 |       } | 
| 329 |  | 
 | 
| 330 | 0 |       START_RPCC(); | 
| 331 |  | 
 | 
| 332 | 0 |       ICOPY_OPERATION(min_l, min_i, a, lda, ls, m_from, sa); | 
| 333 |  | 
 | 
| 334 | 0 |       STOP_RPCC(innercost); | 
| 335 |  | 
 | 
| 336 |  | #if defined(FUSED_GEMM) && !defined(TIMING) | 
| 337 |  |  | 
| 338 |  |       FUSED_KERNEL_OPERATION(min_i, min_j, min_l, alpha, | 
| 339 |  |            sa, sb, b, ldb, c, ldc, m_from, js, ls); | 
| 340 |  |  | 
| 341 |  |  | 
| 342 |  | #else | 
| 343 | 0 |       for(jjs = js; jjs < js + min_j; jjs += min_jj){ | 
| 344 | 0 |   min_jj = min_j + js - jjs; | 
| 345 |  | #if defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) | 
| 346 |  |   /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve best performance */ | 
| 347 |  |   if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; | 
| 348 |  | #else | 
| 349 | 0 |         if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; | 
| 350 | 0 |         else | 
| 351 |  | /* | 
| 352 |  |     if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; | 
| 353 |  |           else | 
| 354 |  | */ | 
| 355 | 0 |               if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; | 
| 356 | 0 | #endif | 
| 357 |  |  | 
| 358 |  | 
 | 
| 359 | 0 |   START_RPCC(); | 
| 360 |  | 
 | 
| 361 | 0 |   OCOPY_OPERATION(min_l, min_jj, b, ldb, ls, jjs, | 
| 362 | 0 |       sb + pad_min_l * (jjs - js) * COMPSIZE * l1stride); | 
| 363 |  | 
 | 
| 364 | 0 |   STOP_RPCC(outercost); | 
| 365 |  | 
 | 
| 366 | 0 |   START_RPCC(); | 
| 367 |  | 
 | 
| 368 | 0 | #if !defined(XDOUBLE)  || !defined(QUAD_PRECISION) | 
| 369 | 0 |   KERNEL_OPERATION(min_i, min_jj, min_l, alpha, | 
| 370 | 0 |        sa, sb + pad_min_l * (jjs - js)  * COMPSIZE * l1stride, c, ldc, m_from, jjs); | 
| 371 |  | #else | 
| 372 |  |   KERNEL_OPERATION(min_i, min_jj, min_l, (void *)&xalpha, | 
| 373 |  |        sa, sb + pad_min_l * (jjs - js)  * COMPSIZE * l1stride, c, ldc, m_from, jjs); | 
| 374 |  | #endif | 
| 375 |  | 
 | 
| 376 | 0 |   STOP_RPCC(kernelcost); | 
| 377 | 0 |       } | 
| 378 | 0 | #endif | 
| 379 |  | 
 | 
| 380 | 0 |       for(is = m_from + min_i; is < m_to; is += min_i){ | 
| 381 | 0 |   min_i = m_to - is; | 
| 382 |  | 
 | 
| 383 | 0 |   if (min_i >= GEMM_P * 2) { | 
| 384 | 0 |     min_i = GEMM_P; | 
| 385 | 0 |   } else | 
| 386 | 0 |     if (min_i > GEMM_P) { | 
| 387 | 0 |       min_i = ((min_i / 2 + GEMM_UNROLL_M - 1)/GEMM_UNROLL_M) * GEMM_UNROLL_M; | 
| 388 | 0 |     } | 
| 389 |  | 
 | 
| 390 | 0 |   START_RPCC(); | 
| 391 |  | 
 | 
| 392 | 0 |   ICOPY_OPERATION(min_l, min_i, a, lda, ls, is, sa); | 
| 393 |  | 
 | 
| 394 | 0 |   STOP_RPCC(innercost); | 
| 395 |  | 
 | 
| 396 | 0 |   START_RPCC(); | 
| 397 |  | 
 | 
| 398 | 0 | #if !defined(XDOUBLE)  || !defined(QUAD_PRECISION) | 
| 399 | 0 |   KERNEL_OPERATION(min_i, min_j, min_l, alpha, sa, sb, c, ldc, is, js); | 
| 400 |  | #else | 
| 401 |  |   KERNEL_OPERATION(min_i, min_j, min_l, (void *)&xalpha, sa, sb, c, ldc, is, js); | 
| 402 |  | #endif | 
| 403 |  | 
 | 
| 404 | 0 |   STOP_RPCC(kernelcost); | 
| 405 |  | 
 | 
| 406 | 0 |       } /* end of is */ | 
| 407 | 0 |     } /* end of js */ | 
| 408 | 0 |   } /* end of ls */ | 
| 409 |  |  | 
| 410 |  | 
 | 
| 411 |  | #ifdef TIMING | 
| 412 |  |   total = (double)outercost + (double)innercost + (double)kernelcost; | 
| 413 |  |  | 
| 414 |  |   printf( "Copy A : %5.2f Copy  B: %5.2f  Kernel : %5.2f  kernel Effi. : %5.2f Total Effi. : %5.2f\n", | 
| 415 |  |      innercost / total * 100., outercost / total * 100., | 
| 416 |  |     kernelcost / total * 100., | 
| 417 |  |     (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / (double)kernelcost * 100. * (double)COMPSIZE / 2., | 
| 418 |  |     (double)(m_to - m_from) * (double)(n_to - n_from) * (double)k / total * 100. * (double)COMPSIZE / 2.); | 
| 419 |  |  | 
| 420 |  | #endif | 
| 421 |  | 
 | 
| 422 | 0 |   return 0; | 
| 423 | 0 | } Unexecuted instantiation: sgemm_nnUnexecuted instantiation: dgemm_nnUnexecuted instantiation: sgemm_ntUnexecuted instantiation: dgemm_ntUnexecuted instantiation: sgemm_tnUnexecuted instantiation: dgemm_tnUnexecuted instantiation: sgemm_ttUnexecuted instantiation: dgemm_tt |