Coverage Report

Created: 2025-09-11 18:52

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/root/doris/contrib/openblas/driver/others/parameter.c
Line
Count
Source
1
/*********************************************************************/
2
/* Copyright 2009, 2010 The University of Texas at Austin.           */
3
/* All rights reserved.                                              */
4
/*                                                                   */
5
/* Redistribution and use in source and binary forms, with or        */
6
/* without modification, are permitted provided that the following   */
7
/* conditions are met:                                               */
8
/*                                                                   */
9
/*   1. Redistributions of source code must retain the above         */
10
/*      copyright notice, this list of conditions and the following  */
11
/*      disclaimer.                                                  */
12
/*                                                                   */
13
/*   2. Redistributions in binary form must reproduce the above      */
14
/*      copyright notice, this list of conditions and the following  */
15
/*      disclaimer in the documentation and/or other materials       */
16
/*      provided with the distribution.                              */
17
/*                                                                   */
18
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32
/*                                                                   */
33
/* The views and conclusions contained in the software and           */
34
/* documentation are those of the authors and should not be          */
35
/* interpreted as representing official policies, either expressed   */
36
/* or implied, of The University of Texas at Austin.                 */
37
/*********************************************************************/
38
39
#include <stdio.h>
40
#include <string.h>
41
#include "common.h"
42
43
extern int openblas_block_factor(void);
44
int get_L2_size(void);
45
46
#define DEFAULT_GEMM_P 128
47
#define DEFAULT_GEMM_Q 128
48
#define DEFAULT_GEMM_R 128
49
#define DEFAULT_GEMM_OFFSET_A 0
50
#define DEFAULT_GEMM_OFFSET_B 0
51
52
/* Global Parameter */
53
#if GEMM_OFFSET_A == gemm_offset_a
54
BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A;
55
#else
56
BLASLONG gemm_offset_a = GEMM_OFFSET_A;
57
#endif
58
59
#if GEMM_OFFSET_B == gemm_offset_b
60
BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
61
#else
62
BLASLONG gemm_offset_b = GEMM_OFFSET_B;
63
#endif
64
65
#if SBGEMM_P == sbgemm_p
66
BLASLONG sbgemm_p = DEFAULT_GEMM_P;
67
#else
68
BLASLONG sbgemm_p = SBGEMM_P;
69
#endif
70
#if SGEMM_P == sgemm_p
71
BLASLONG sgemm_p = DEFAULT_GEMM_P;
72
#else
73
BLASLONG sgemm_p = SGEMM_P;
74
#endif
75
#if DGEMM_P == dgemm_p
76
BLASLONG dgemm_p = DEFAULT_GEMM_P;
77
#else
78
BLASLONG dgemm_p = DGEMM_P;
79
#endif
80
#if CGEMM_P == cgemm_p
81
BLASLONG cgemm_p = DEFAULT_GEMM_P;
82
#else
83
BLASLONG cgemm_p = CGEMM_P;
84
#endif
85
#if ZGEMM_P == zgemm_p
86
BLASLONG zgemm_p = DEFAULT_GEMM_P;
87
#else
88
BLASLONG zgemm_p = ZGEMM_P;
89
#endif
90
91
#if SBGEMM_Q == sbgemm_q
92
BLASLONG sbgemm_q = DEFAULT_GEMM_Q;
93
#else
94
BLASLONG sbgemm_q = SBGEMM_Q;
95
#endif
96
#if SGEMM_Q == sgemm_q
97
BLASLONG sgemm_q = DEFAULT_GEMM_Q;
98
#else
99
BLASLONG sgemm_q = SGEMM_Q;
100
#endif
101
#if DGEMM_Q == dgemm_q
102
BLASLONG dgemm_q = DEFAULT_GEMM_Q;
103
#else
104
BLASLONG dgemm_q = DGEMM_Q;
105
#endif
106
#if CGEMM_Q == cgemm_q
107
BLASLONG cgemm_q = DEFAULT_GEMM_Q;
108
#else
109
BLASLONG cgemm_q = CGEMM_Q;
110
#endif
111
#if ZGEMM_Q == zgemm_q
112
BLASLONG zgemm_q = DEFAULT_GEMM_Q;
113
#else
114
BLASLONG zgemm_q = ZGEMM_Q;
115
#endif
116
117
#if SBGEMM_R == sbgemm_r
118
BLASLONG sbgemm_r = DEFAULT_GEMM_R;
119
#else
120
BLASLONG sbgemm_r = SBGEMM_R;
121
#endif
122
#if SGEMM_R == sgemm_r
123
BLASLONG sgemm_r = DEFAULT_GEMM_R;
124
#else
125
BLASLONG sgemm_r = SGEMM_R;
126
#endif
127
#if DGEMM_R == dgemm_r
128
BLASLONG dgemm_r = DEFAULT_GEMM_R;
129
#else
130
BLASLONG dgemm_r = DGEMM_R;
131
#endif
132
#if CGEMM_R == cgemm_r
133
BLASLONG cgemm_r = DEFAULT_GEMM_R;
134
#else
135
BLASLONG cgemm_r = CGEMM_R;
136
#endif
137
#if ZGEMM_R == zgemm_r
138
BLASLONG zgemm_r = DEFAULT_GEMM_R;
139
#else
140
BLASLONG zgemm_r = ZGEMM_R;
141
#endif
142
143
#if defined(EXPRECISION) || defined(QUAD_PRECISION)
144
#if QGEMM_P == qgemm_p
145
BLASLONG qgemm_p = DEFAULT_GEMM_P;
146
#else
147
BLASLONG qgemm_p = QGEMM_P;
148
#endif
149
#if XGEMM_P == xgemm_p
150
BLASLONG xgemm_p = DEFAULT_GEMM_P;
151
#else
152
BLASLONG xgemm_p = XGEMM_P;
153
#endif
154
#if QGEMM_Q == qgemm_q
155
BLASLONG qgemm_q = DEFAULT_GEMM_Q;
156
#else
157
BLASLONG qgemm_q = QGEMM_Q;
158
#endif
159
#if XGEMM_Q == xgemm_q
160
BLASLONG xgemm_q = DEFAULT_GEMM_Q;
161
#else
162
BLASLONG xgemm_q = XGEMM_Q;
163
#endif
164
#if QGEMM_R == qgemm_r
165
BLASLONG qgemm_r = DEFAULT_GEMM_R;
166
#else
167
BLASLONG qgemm_r = QGEMM_R;
168
#endif
169
#if XGEMM_R == xgemm_r
170
BLASLONG xgemm_r = DEFAULT_GEMM_R;
171
#else
172
BLASLONG xgemm_r = XGEMM_R;
173
#endif
174
#endif
175
176
#if defined(ARCH_X86) || defined(ARCH_X86_64)
177
178
0
int get_L2_size(void){
179
180
0
  int eax, ebx, ecx, edx;
181
182
0
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
183
0
    defined(CORE_PRESCOTT) || defined(CORE_CORE2)       || defined(PENRYN) || defined(DUNNINGTON) || \
184
0
    defined(CORE_NEHALEM)  || defined(CORE_SANDYBRIDGE) || defined(ATOM)   || defined(GENERIC)    || \
185
0
    defined(PILEDRIVER)    || defined(HASWELL)          || defined(STEAMROLLER) || defined(EXCAVATOR) || \
186
0
    defined(ZEN)           || defined(SKYLAKEX)         || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
187
188
0
  cpuid(0x80000006, &eax, &ebx, &ecx, &edx);
189
190
0
  return BITMASK(ecx, 16, 0xffff);
191
192
#else
193
194
  int info[15];
195
  int i;
196
197
  cpuid(2, &eax, &ebx, &ecx, &edx);
198
199
  info[ 0] = BITMASK(eax,  8, 0xff);
200
  info[ 1] = BITMASK(eax, 16, 0xff);
201
  info[ 2] = BITMASK(eax, 24, 0xff);
202
203
  info[ 3] = BITMASK(ebx,  0, 0xff);
204
  info[ 4] = BITMASK(ebx,  8, 0xff);
205
  info[ 5] = BITMASK(ebx, 16, 0xff);
206
  info[ 6] = BITMASK(ebx, 24, 0xff);
207
208
  info[ 7] = BITMASK(ecx,  0, 0xff);
209
  info[ 8] = BITMASK(ecx,  8, 0xff);
210
  info[ 9] = BITMASK(ecx, 16, 0xff);
211
  info[10] = BITMASK(ecx, 24, 0xff);
212
213
  info[11] = BITMASK(edx,  0, 0xff);
214
  info[12] = BITMASK(edx,  8, 0xff);
215
  info[13] = BITMASK(edx, 16, 0xff);
216
  info[14] = BITMASK(edx, 24, 0xff);
217
218
  for (i = 0; i < 15; i++){
219
220
    switch (info[i]){
221
      case 0x3b :
222
      case 0x41 :
223
      case 0x79 :
224
  return  128;
225
  break;
226
227
      case 0x3c :
228
      case 0x42 :
229
      case 0x7a :
230
      case 0x7e :
231
      case 0x82 :
232
  return  256;
233
  break;
234
235
      case 0x43 :
236
      case 0x7b :
237
      case 0x7f :
238
      case 0x83 :
239
      case 0x86 :
240
  return  512;
241
  break;
242
243
      case 0x44 :
244
      case 0x78 :
245
      case 0x7c :
246
      case 0x84 :
247
      case 0x87 :
248
  return 1024;
249
  break;
250
251
      case 0x45 :
252
      case 0x7d :
253
      case 0x85 :
254
  return 2048;
255
256
      case 0x49 :
257
  return 4096;
258
  break;
259
    }
260
  }
261
262
  /* Never reached */
263
  return 0;
264
#endif
265
0
}
266
267
1
void blas_set_parameter(void){
268
269
1
  int factor;
270
1
#if defined(BULLDOZER) || defined(PILEDRIVER)  || defined(SANDYBRIDGE) || defined(NEHALEM) || \
271
1
    defined(HASWELL)   || defined(STEAMROLLER) || defined(EXCAVATOR)   || defined(ZEN)     || \
272
1
    defined(SKYLAKEX)  || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)
273
1
  int size = 16;
274
#else
275
  int size = get_L2_size();
276
#endif
277
278
#if defined(CORE_KATMAI)  || defined(CORE_COPPERMINE) || defined(CORE_BANIAS)
279
  size >>= 7;
280
281
#if defined(CORE_BANIAS) && (HAVE_HIT > 1)
282
  sgemm_p =  64 / HAVE_HIT * size;
283
  dgemm_p =  32 / HAVE_HIT * size;
284
  cgemm_p =  32 / HAVE_HIT * size;
285
  zgemm_p =  16 / HAVE_HIT * size;
286
#ifdef EXPRECISION
287
  qgemm_p =  16 / HAVE_HIT * size;
288
  xgemm_p =   8 / HAVE_HIT * size;
289
#endif
290
#ifdef QUAD_PRECISION
291
  qgemm_p =   8 / HAVE_HIT * size;
292
  xgemm_p =   4 / HAVE_HIT * size;
293
#endif
294
#else
295
  sgemm_p =  64 * size;
296
  dgemm_p =  32 * size;
297
  cgemm_p =  32 * size;
298
  zgemm_p =  16 * size;
299
#ifdef EXPRECISION
300
  qgemm_p =  16 * size;
301
  xgemm_p =   8 * size;
302
#endif
303
#ifdef QUAD_PRECISION
304
  qgemm_p =   8 * size;
305
  xgemm_p =   4 * size;
306
#endif
307
#endif
308
#endif
309
310
#if defined(CORE_NORTHWOOD)
311
  size >>= 7;
312
313
#ifdef ALLOC_HUGETLB
314
  sgemm_p = 128 * size;
315
  dgemm_p =  64 * size;
316
  cgemm_p =  64 * size;
317
  zgemm_p =  32 * size;
318
#ifdef EXPRECISION
319
  qgemm_p =  32 * size;
320
  xgemm_p =  16 * size;
321
#endif
322
#ifdef QUAD_PRECISION
323
  qgemm_p =  16 * size;
324
  xgemm_p =   8 * size;
325
#endif
326
#else
327
  sgemm_p =  96 * size;
328
  dgemm_p =  48 * size;
329
  cgemm_p =  48 * size;
330
  zgemm_p =  24 * size;
331
#ifdef EXPRECISION
332
  qgemm_p =  24 * size;
333
  xgemm_p =  12 * size;
334
#endif
335
#ifdef QUAD_PRECISION
336
  qgemm_p =  12 * size;
337
  xgemm_p =   6 * size;
338
#endif
339
#endif
340
#endif
341
342
#if defined(CORE_CORE2)
343
344
  size >>= 9;
345
346
  sgemm_p =  92 * size;
347
  dgemm_p =  46 * size;
348
  cgemm_p =  46 * size;
349
  zgemm_p =  23 * size;
350
351
#ifdef EXPRECISION
352
  qgemm_p =  23 * size;
353
  xgemm_p =  11 * size;
354
#endif
355
#ifdef QUAD_PRECISION
356
  qgemm_p =  11 * size;
357
  xgemm_p =   5 * size;
358
#endif
359
#endif
360
361
#if defined(PENRYN)
362
363
  size >>= 9;
364
365
  sgemm_p = 1024;
366
  dgemm_p =  512;
367
  cgemm_p =  512;
368
  zgemm_p =  256;
369
370
#ifdef EXPRECISION
371
  qgemm_p =  256;
372
  xgemm_p =  128;
373
#endif
374
#ifdef QUAD_PRECISION
375
  qgemm_p =  21 * size + 4;
376
  xgemm_p =  10 * size + 2;
377
#endif
378
#endif
379
380
#if defined(DUNNINGTON)
381
382
  size >>= 9;
383
384
  sgemm_p = 384;
385
  dgemm_p = 384;
386
  cgemm_p = 384;
387
  zgemm_p = 384;
388
389
#ifdef EXPRECISION
390
  qgemm_p = 384;
391
  xgemm_p = 384;
392
#endif
393
#ifdef QUAD_PRECISION
394
  qgemm_p =  21 * size + 4;
395
  xgemm_p =  10 * size + 2;
396
#endif
397
#endif
398
399
#if defined(NEHALEM)
400
  sgemm_p = 1024;
401
  dgemm_p =  512;
402
  cgemm_p =  512;
403
  zgemm_p =  256;
404
#ifdef EXPRECISION
405
  qgemm_p =  256;
406
  xgemm_p =  128;
407
#endif
408
#endif
409
410
#if defined(SANDYBRIDGE)
411
  sgemm_p = 1024;
412
  dgemm_p =  512;
413
  cgemm_p =  512;
414
  zgemm_p =  256;
415
#ifdef EXPRECISION
416
  qgemm_p =  256;
417
  xgemm_p =  128;
418
#endif
419
#endif
420
421
#if defined(CORE_PRESCOTT)  || defined(GENERIC)
422
  size >>= 6;
423
424
  if (size > 16) size = 16;
425
426
  sgemm_p =  56 * size;
427
  dgemm_p =  28 * size;
428
  cgemm_p =  28 * size;
429
  zgemm_p =  14 * size;
430
#ifdef EXPRECISION
431
  qgemm_p =  14 * size;
432
  xgemm_p =   7 * size;
433
#endif
434
#ifdef QUAD_PRECISION
435
  qgemm_p =   7 * size;
436
  xgemm_p =   3 * size;
437
#endif
438
#endif
439
440
#if defined(CORE_OPTERON)
441
  sgemm_p =  224 + 14 * (size >> 5);
442
  dgemm_p =  112 + 14 * (size >> 6);
443
  cgemm_p =  116 + 14 * (size >> 6);
444
  zgemm_p =   58 + 14 * (size >> 7);
445
#ifdef EXPRECISION
446
  qgemm_p =   58 + 14 * (size >> 7);
447
  xgemm_p =   29 + 14 * (size >> 8);
448
#endif
449
#ifdef QUAD_PRECISION
450
  qgemm_p =   29 + 14 * (size >> 8);
451
  xgemm_p =   15 + 14 * (size >> 9);
452
#endif
453
#endif
454
455
#if defined(ATOM)
456
  size >>= 8;
457
458
  sgemm_p =  256;
459
  dgemm_p =  128;
460
  cgemm_p =  128;
461
  zgemm_p =   64;
462
#ifdef EXPRECISION
463
  qgemm_p =   64;
464
  xgemm_p =   32;
465
#endif
466
#ifdef QUAD_PRECISION
467
  qgemm_p =   32;
468
  xgemm_p =   16;
469
#endif
470
#endif
471
472
#if defined(CORE_BARCELONA) || defined(CORE_BOBCAT)
473
  size >>= 8;
474
475
  sgemm_p = 232 * size;
476
  dgemm_p = 116 * size;
477
  cgemm_p = 116 * size;
478
  zgemm_p =  58 * size;
479
#ifdef EXPRECISION
480
  qgemm_p =  58 * size;
481
  xgemm_p =  26 * size;
482
#endif
483
#ifdef QUAD_PRECISION
484
  qgemm_p =  26 * size;
485
  xgemm_p =  13 * size;
486
#endif
487
#endif
488
489
1
  factor=openblas_block_factor();
490
1
  if (factor>0) {
491
0
    if (factor <  10) factor =  10;
492
0
    if (factor > 200) factor = 200;
493
494
0
    sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L;
495
0
    dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L;
496
0
    cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L;
497
0
    zgemm_p = ((long)((double)zgemm_p * (double)factor * 1.e-2)) & ~7L;
498
#ifdef EXPRECISION
499
    qgemm_p = ((long)((double)qgemm_p * (double)factor * 1.e-2)) & ~7L;
500
    xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L;
501
#endif
502
0
  }
503
504
1
  if (sgemm_p == 0) sgemm_p = 64;
505
1
  if (dgemm_p == 0) dgemm_p = 64;
506
1
  if (cgemm_p == 0) cgemm_p = 64;
507
1
  if (zgemm_p == 0) zgemm_p = 64;
508
#ifdef EXPRECISION
509
  if (qgemm_p == 0) qgemm_p = 64;
510
  if (xgemm_p == 0) xgemm_p = 64;
511
#endif
512
513
#ifdef QUAD_PRECISION
514
  if (qgemm_p == 0) qgemm_p = 64;
515
  if (xgemm_p == 0) xgemm_p = 64;
516
#endif
517
518
1
  sgemm_p = ((sgemm_p + SGEMM_UNROLL_M - 1)/SGEMM_UNROLL_M) * SGEMM_UNROLL_M;
519
1
  dgemm_p = ((dgemm_p + DGEMM_UNROLL_M - 1)/DGEMM_UNROLL_M) * DGEMM_UNROLL_M;
520
1
  cgemm_p = ((cgemm_p + CGEMM_UNROLL_M - 1)/CGEMM_UNROLL_M) * CGEMM_UNROLL_M;
521
1
  zgemm_p = ((zgemm_p + ZGEMM_UNROLL_M - 1)/ZGEMM_UNROLL_M) * ZGEMM_UNROLL_M;
522
#ifdef QUAD_PRECISION
523
  qgemm_p = ((qgemm_p + QGEMM_UNROLL_M - 1)/QGEMM_UNROLL_M) * QGEMM_UNROLL_M;
524
  xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M;
525
#endif
526
527
#ifdef BUILD_BFLOAT16
528
  sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q *  4)) - 15) & ~15;
529
#endif
530
1
  sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q *  4)) - 15) & ~15;
531
1
  dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q *  8)) - 15) & ~15;
532
1
  cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q *  8)) - 15) & ~15;
533
1
  zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
534
#if defined(EXPRECISION) || defined(QUAD_PRECISION)
535
  qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
536
  xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
537
#endif
538
539
#if 0
540
  fprintf(stderr, "SGEMM ... %3d, %3d, %3d\n", SGEMM_P, SGEMM_Q, SGEMM_R);
541
  fprintf(stderr, "DGEMM ... %3d, %3d, %3d\n", DGEMM_P, DGEMM_Q, DGEMM_R);
542
  fprintf(stderr, "CGEMM ... %3d, %3d, %3d\n", CGEMM_P, CGEMM_Q, CGEMM_R);
543
  fprintf(stderr, "ZGEMM ... %3d, %3d, %3d\n", ZGEMM_P, ZGEMM_Q, ZGEMM_R);
544
#endif
545
546
1
  return;
547
1
}
548
549
#if 0
550
551
int get_current_cpu_info(void){
552
553
  int nlprocs, ncores, cmplegacy;
554
  int htt     = 0;
555
  int apicid  = 0;
556
557
#if defined(CORE_PRESCOTT) || defined(CORE_OPTERON)
558
  int eax, ebx, ecx, edx;
559
560
  cpuid(1, &eax, &ebx, &ecx, &edx);
561
  nlprocs = BITMASK(ebx, 16, 0xff);
562
  apicid  = BITMASK(ebx, 24, 0xff);
563
  htt     = BITMASK(edx, 28, 0x01);
564
#endif
565
566
#if defined(CORE_PRESCOTT)
567
  cpuid(4, &eax, &ebx, &ecx, &edx);
568
  ncores = BITMASK(eax, 26, 0x3f);
569
570
  if (htt == 0)  nlprocs = 0;
571
#endif
572
573
#if defined(CORE_OPTERON)
574
  cpuid(0x80000008, &eax, &ebx, &ecx, &edx);
575
  ncores = BITMASK(ecx,  0, 0xff);
576
577
  cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
578
  cmplegacy = BITMASK(ecx,  1, 0x01);
579
580
  if (htt == 0) {
581
    nlprocs = 0;
582
    ncores  = 0;
583
    cmplegacy = 0;
584
  }
585
#endif
586
587
  ncores  ++;
588
589
  fprintf(stderr, "APICID = %d  Number of core = %d\n", apicid, ncores);
590
591
  return 0;
592
}
593
#endif
594
595
#endif
596
597
#if defined(ARCH_IA64)
598
599
static inline BLASULONG cpuid(BLASULONG regnum){
600
  BLASULONG value;
601
602
#ifndef __ECC
603
  asm ("mov %0=cpuid[%r1]" : "=r"(value) : "rO"(regnum));
604
#else
605
 value = __getIndReg(_IA64_REG_INDR_CPUID, regnum);
606
#endif
607
608
  return value;
609
}
610
611
#if 1
612
613
void blas_set_parameter(void){
614
615
  BLASULONG cpuid3, size;
616
617
  cpuid3 = cpuid(3);
618
619
  size = BITMASK(cpuid3, 16, 0xff);
620
621
  sbgemm_p = 192 * (size + 1);
622
  sgemm_p = 192 * (size + 1);
623
  dgemm_p =  96 * (size + 1);
624
  cgemm_p =  96 * (size + 1);
625
  zgemm_p =  48 * (size + 1);
626
#ifdef EXPRECISION
627
  qgemm_p =  64 * (size + 1);
628
  xgemm_p =  32 * (size + 1);
629
#endif
630
#ifdef QUAD_PRECISION
631
  qgemm_p =  32 * (size + 1);
632
  xgemm_p =  16 * (size + 1);
633
#endif
634
635
#ifdef BUILD_BFLOAT16
636
  sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q *  4)) - 15) & ~15;
637
#endif
638
  sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q *  4)) - 15) & ~15;
639
  dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q *  8)) - 15) & ~15;
640
  cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q *  8)) - 15) & ~15;
641
  zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
642
#if defined(EXPRECISION) || defined(QUAD_PRECISION)
643
  qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
644
  xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
645
#endif
646
647
  return;
648
}
649
650
#else
651
652
#define IA64_SYS_NAME  "/sys/devices/system/cpu/cpu0/cache/index3/size"
653
#define IA64_PROC_NAME "/proc/pal/cpu0/cache_info"
654
655
void blas_set_parameter(void){
656
657
  BLASULONG cpuid3;
658
  int size = 0;
659
660
#if 1
661
  char buffer[128];
662
  FILE *infile;
663
664
  if ((infile = fopen(IA64_SYS_NAME, "r")) != NULL) {
665
666
    fgets(buffer, sizeof(buffer), infile);
667
    fclose(infile);
668
669
    size = atoi(buffer) / 1536;
670
  }
671
672
  if (size <= 0) {
673
    if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) {
674
675
      while(fgets(buffer, sizeof(buffer), infile) != NULL) {
676
  if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break;
677
      }
678
679
      fgets(buffer, sizeof(buffer), infile);
680
681
      fclose(infile);
682
683
      *strstr(buffer, "bytes") = (char)NULL;
684
685
      size = atoi(strchr(buffer, ':') + 1) / 1572864;
686
    }
687
  }
688
#endif
689
690
  /* The last resort */
691
692
  if (size <= 0) {
693
    cpuid3 = cpuid(3);
694
695
    size = BITMASK(cpuid3, 16, 0xff) + 1;
696
  }
697
698
  sgemm_p = 320 * size;
699
  dgemm_p = 160 * size;
700
  cgemm_p = 160 * size;
701
  zgemm_p =  80 * size;
702
#ifdef EXPRECISION
703
  qgemm_p =  80 * size;
704
  xgemm_p =  40 * size;
705
#endif
706
707
  sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q *  4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q *  4)) - 15) & ~15;
708
  dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q *  8)) - 15) & ~15;
709
  cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q *  8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q *  8)) - 15) & ~15;
710
  zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15;
711
#ifdef EXPRECISION
712
  qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15;
713
  xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15;
714
#endif
715
716
  return;
717
}
718
719
#endif
720
721
#endif
722
723
#if defined(ARCH_MIPS64)
724
void blas_set_parameter(void){
725
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
726
#ifdef SMP
727
  if(blas_num_threads == 1){
728
#endif
729
    //single thread
730
    dgemm_r = 1024;
731
#ifdef SMP
732
  }else{
733
    //multi thread
734
    dgemm_r = 200;
735
  }
736
#endif
737
#endif
738
739
}
740
#endif
741
742
#if defined(ARCH_LOONGARCH64)
743
int get_L3_size() {
744
  int ret = 0, id = 0x14;
745
  __asm__ volatile (
746
    "cpucfg %[ret], %[id]"
747
    : [ret]"=r"(ret)
748
    : [id]"r"(id)
749
    : "memory"
750
  );
751
  return ((ret & 0xffff) + 1) * pow(2, ((ret >> 16) & 0xff)) * pow(2, ((ret >> 24) & 0x7f)) / 1024 / 1024; // MB
752
}
753
754
void blas_set_parameter(void){
755
#if defined(LA464)
756
  int L3_size = get_L3_size();
757
#ifdef SMP
758
  if(blas_num_threads == 1){
759
#endif
760
    //single thread
761
    if (L3_size == 32){ // 3C5000 and 3D5000
762
      sgemm_p = 256;
763
      sgemm_q = 384;
764
      sgemm_r = 8192;
765
766
      dgemm_p = 112;
767
      dgemm_q = 289;
768
      dgemm_r = 4096;
769
770
      cgemm_p = 128;
771
      cgemm_q = 256;
772
      cgemm_r = 4096;
773
774
      zgemm_p = 128;
775
      zgemm_q = 128;
776
      zgemm_r = 2048;
777
    } else { // 3A5000 and 3C5000L
778
      sgemm_p = 256;
779
      sgemm_q = 384;
780
      sgemm_r = 4096;
781
782
      dgemm_p = 112;
783
      dgemm_q = 300;
784
      dgemm_r = 3024;
785
786
      cgemm_p = 128;
787
      cgemm_q = 256;
788
      cgemm_r = 2048;
789
790
      zgemm_p = 128;
791
      zgemm_q = 128;
792
      zgemm_r = 1024;
793
    }
794
#ifdef SMP
795
  }else{
796
    //multi thread
797
    if (L3_size == 32){ // 3C5000 and 3D5000
798
      sgemm_p = 256;
799
      sgemm_q = 384;
800
      sgemm_r = 1024;
801
802
      dgemm_p = 112;
803
      dgemm_q = 289;
804
      dgemm_r = 342;
805
806
      cgemm_p = 128;
807
      cgemm_q = 256;
808
      cgemm_r = 512;
809
810
      zgemm_p = 128;
811
      zgemm_q = 128;
812
      zgemm_r = 512;
813
    } else { // 3A5000 and 3C5000L
814
      sgemm_p = 256;
815
      sgemm_q = 384;
816
      sgemm_r = 2048;
817
818
      dgemm_p = 112;
819
      dgemm_q = 300;
820
      dgemm_r = 738;
821
822
      cgemm_p = 128;
823
      cgemm_q = 256;
824
      cgemm_r = 1024;
825
826
      zgemm_p = 128;
827
      zgemm_q = 128;
828
      zgemm_r = 1024;
829
    }
830
  }
831
#endif
832
#endif
833
}
834
#endif
835
836
#if defined(ARCH_ARM64)
837
838
void blas_set_parameter(void)
839
{
840
}
841
842
#endif