/root/doris/contrib/openblas/driver/others/parameter.c
Line | Count | Source |
1 | | /*********************************************************************/ |
2 | | /* Copyright 2009, 2010 The University of Texas at Austin. */ |
3 | | /* All rights reserved. */ |
4 | | /* */ |
5 | | /* Redistribution and use in source and binary forms, with or */ |
6 | | /* without modification, are permitted provided that the following */ |
7 | | /* conditions are met: */ |
8 | | /* */ |
9 | | /* 1. Redistributions of source code must retain the above */ |
10 | | /* copyright notice, this list of conditions and the following */ |
11 | | /* disclaimer. */ |
12 | | /* */ |
13 | | /* 2. Redistributions in binary form must reproduce the above */ |
14 | | /* copyright notice, this list of conditions and the following */ |
15 | | /* disclaimer in the documentation and/or other materials */ |
16 | | /* provided with the distribution. */ |
17 | | /* */ |
18 | | /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ |
19 | | /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ |
20 | | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ |
21 | | /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ |
22 | | /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ |
23 | | /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ |
24 | | /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ |
25 | | /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ |
26 | | /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ |
27 | | /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ |
28 | | /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ |
29 | | /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ |
30 | | /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ |
31 | | /* POSSIBILITY OF SUCH DAMAGE. */ |
32 | | /* */ |
33 | | /* The views and conclusions contained in the software and */ |
34 | | /* documentation are those of the authors and should not be */ |
35 | | /* interpreted as representing official policies, either expressed */ |
36 | | /* or implied, of The University of Texas at Austin. */ |
37 | | /*********************************************************************/ |
38 | | |
39 | | #include <stdio.h> |
40 | | #include <string.h> |
41 | | #include "common.h" |
42 | | |
43 | | extern int openblas_block_factor(void); |
44 | | int get_L2_size(void); |
45 | | |
46 | | #define DEFAULT_GEMM_P 128 |
47 | | #define DEFAULT_GEMM_Q 128 |
48 | | #define DEFAULT_GEMM_R 128 |
49 | | #define DEFAULT_GEMM_OFFSET_A 0 |
50 | | #define DEFAULT_GEMM_OFFSET_B 0 |
51 | | |
52 | | /* Global Parameter */ |
/*
 * Definitions of the global GEMM blocking parameters.
 *
 * Every variable is initialised from the upper-case macro of the same name.
 * The test "#if MACRO == lower_case_name" relies on the preprocessor
 * replacing any identifier that survives macro expansion with 0.  The
 * comparison is therefore true when MACRO is undefined, evaluates to 0, or
 * expands to a plain identifier; in those cases the DEFAULT_GEMM_* value
 * defined above is used.  Otherwise MACRO is a non-zero compile-time
 * constant and becomes the initial value.
 * NOTE(review): the macros are not defined in this file; presumably they
 * expand to the variable name itself in builds where the value is tuned at
 * run time -- confirm against the headers pulled in by common.h.
 */
53 | | #if GEMM_OFFSET_A == gemm_offset_a |
54 | | BLASLONG gemm_offset_a = DEFAULT_GEMM_OFFSET_A; |
55 | | #else |
56 | | BLASLONG gemm_offset_a = GEMM_OFFSET_A; |
57 | | #endif |
58 | | |
59 | | #if GEMM_OFFSET_B == gemm_offset_b |
60 | | BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; |
61 | | #else |
62 | | BLASLONG gemm_offset_b = GEMM_OFFSET_B; |
63 | | #endif |
64 | | |
/* "P" parameters, one per precision prefix (sb, s, d, c, z). */
65 | | #if SBGEMM_P == sbgemm_p |
66 | | BLASLONG sbgemm_p = DEFAULT_GEMM_P; |
67 | | #else |
68 | | BLASLONG sbgemm_p = SBGEMM_P; |
69 | | #endif |
70 | | #if SGEMM_P == sgemm_p |
71 | | BLASLONG sgemm_p = DEFAULT_GEMM_P; |
72 | | #else |
73 | | BLASLONG sgemm_p = SGEMM_P; |
74 | | #endif |
75 | | #if DGEMM_P == dgemm_p |
76 | | BLASLONG dgemm_p = DEFAULT_GEMM_P; |
77 | | #else |
78 | | BLASLONG dgemm_p = DGEMM_P; |
79 | | #endif |
80 | | #if CGEMM_P == cgemm_p |
81 | | BLASLONG cgemm_p = DEFAULT_GEMM_P; |
82 | | #else |
83 | | BLASLONG cgemm_p = CGEMM_P; |
84 | | #endif |
85 | | #if ZGEMM_P == zgemm_p |
86 | | BLASLONG zgemm_p = DEFAULT_GEMM_P; |
87 | | #else |
88 | | BLASLONG zgemm_p = ZGEMM_P; |
89 | | #endif |
90 | | |
/* "Q" parameters, same scheme. */
91 | | #if SBGEMM_Q == sbgemm_q |
92 | | BLASLONG sbgemm_q = DEFAULT_GEMM_Q; |
93 | | #else |
94 | | BLASLONG sbgemm_q = SBGEMM_Q; |
95 | | #endif |
96 | | #if SGEMM_Q == sgemm_q |
97 | | BLASLONG sgemm_q = DEFAULT_GEMM_Q; |
98 | | #else |
99 | | BLASLONG sgemm_q = SGEMM_Q; |
100 | | #endif |
101 | | #if DGEMM_Q == dgemm_q |
102 | | BLASLONG dgemm_q = DEFAULT_GEMM_Q; |
103 | | #else |
104 | | BLASLONG dgemm_q = DGEMM_Q; |
105 | | #endif |
106 | | #if CGEMM_Q == cgemm_q |
107 | | BLASLONG cgemm_q = DEFAULT_GEMM_Q; |
108 | | #else |
109 | | BLASLONG cgemm_q = CGEMM_Q; |
110 | | #endif |
111 | | #if ZGEMM_Q == zgemm_q |
112 | | BLASLONG zgemm_q = DEFAULT_GEMM_Q; |
113 | | #else |
114 | | BLASLONG zgemm_q = ZGEMM_Q; |
115 | | #endif |
116 | | |
/* "R" parameters, same scheme. */
117 | | #if SBGEMM_R == sbgemm_r |
118 | | BLASLONG sbgemm_r = DEFAULT_GEMM_R; |
119 | | #else |
120 | | BLASLONG sbgemm_r = SBGEMM_R; |
121 | | #endif |
122 | | #if SGEMM_R == sgemm_r |
123 | | BLASLONG sgemm_r = DEFAULT_GEMM_R; |
124 | | #else |
125 | | BLASLONG sgemm_r = SGEMM_R; |
126 | | #endif |
127 | | #if DGEMM_R == dgemm_r |
128 | | BLASLONG dgemm_r = DEFAULT_GEMM_R; |
129 | | #else |
130 | | BLASLONG dgemm_r = DGEMM_R; |
131 | | #endif |
132 | | #if CGEMM_R == cgemm_r |
133 | | BLASLONG cgemm_r = DEFAULT_GEMM_R; |
134 | | #else |
135 | | BLASLONG cgemm_r = CGEMM_R; |
136 | | #endif |
137 | | #if ZGEMM_R == zgemm_r |
138 | | BLASLONG zgemm_r = DEFAULT_GEMM_R; |
139 | | #else |
140 | | BLASLONG zgemm_r = ZGEMM_R; |
141 | | #endif |
142 | | |
/* The q* and x* variants exist only in EXPRECISION or QUAD_PRECISION builds. */
143 | | #if defined(EXPRECISION) || defined(QUAD_PRECISION) |
144 | | #if QGEMM_P == qgemm_p |
145 | | BLASLONG qgemm_p = DEFAULT_GEMM_P; |
146 | | #else |
147 | | BLASLONG qgemm_p = QGEMM_P; |
148 | | #endif |
149 | | #if XGEMM_P == xgemm_p |
150 | | BLASLONG xgemm_p = DEFAULT_GEMM_P; |
151 | | #else |
152 | | BLASLONG xgemm_p = XGEMM_P; |
153 | | #endif |
154 | | #if QGEMM_Q == qgemm_q |
155 | | BLASLONG qgemm_q = DEFAULT_GEMM_Q; |
156 | | #else |
157 | | BLASLONG qgemm_q = QGEMM_Q; |
158 | | #endif |
159 | | #if XGEMM_Q == xgemm_q |
160 | | BLASLONG xgemm_q = DEFAULT_GEMM_Q; |
161 | | #else |
162 | | BLASLONG xgemm_q = XGEMM_Q; |
163 | | #endif |
164 | | #if QGEMM_R == qgemm_r |
165 | | BLASLONG qgemm_r = DEFAULT_GEMM_R; |
166 | | #else |
167 | | BLASLONG qgemm_r = QGEMM_R; |
168 | | #endif |
169 | | #if XGEMM_R == xgemm_r |
170 | | BLASLONG xgemm_r = DEFAULT_GEMM_R; |
171 | | #else |
172 | | BLASLONG xgemm_r = XGEMM_R; |
173 | | #endif |
174 | | #endif |
175 | | |
176 | | #if defined(ARCH_X86) || defined(ARCH_X86_64) |
177 | | |
/*
 * get_L2_size: query CPUID for the L2 cache size.
 *
 * For the cores named in the #if list the value is taken from ECX of
 * extended leaf 0x80000006 via BITMASK(ecx, 16, 0xffff).  For every other
 * core the descriptor bytes returned by leaf 2 are scanned in the order
 * EAX[1..3], EBX[0..3], ECX[0..3], EDX[0..3] and the first recognised
 * descriptor decides the result.  Returns 0 when none is recognised.
 * NOTE(review): the returned unit is presumably kilobytes -- confirm against
 * the CPUID documentation.
 */
int get_L2_size(void){

  int eax, ebx, ecx, edx;

#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \
    defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
    defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \
    defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || \
    defined(ZEN) || defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS)

  cpuid(0x80000006, &eax, &ebx, &ecx, &edx);

  return BITMASK(ecx, 16, 0xffff);

#else

  int regs[4];
  int reg, byte;

  cpuid(2, &eax, &ebx, &ecx, &edx);

  regs[0] = eax;
  regs[1] = ebx;
  regs[2] = ecx;
  regs[3] = edx;

  for (reg = 0; reg < 4; reg++){

    /* The lowest byte of EAX is not a descriptor and is skipped. */
    for (byte = (reg == 0) ? 1 : 0; byte < 4; byte++){

      switch (BITMASK(regs[reg], (8 * byte), 0xff)){
      case 0x3b : case 0x41 : case 0x79 :
        return  128;

      case 0x3c : case 0x42 : case 0x7a : case 0x7e : case 0x82 :
        return  256;

      case 0x43 : case 0x7b : case 0x7f : case 0x83 : case 0x86 :
        return  512;

      case 0x44 : case 0x78 : case 0x7c : case 0x84 : case 0x87 :
        return 1024;

      case 0x45 : case 0x7d : case 0x85 :
        return 2048;

      case 0x49 :
        return 4096;
      }
    }
  }

  /* No recognised L2 descriptor among the 15 bytes. */
  return 0;
#endif
}
266 | | |
/*
 * blas_set_parameter (x86 / x86-64 build): set the GEMM blocking globals.
 *
 * Order of operations:
 *   1. `size` is the constant 16 for the cores named in the first #if,
 *      otherwise the value returned by get_L2_size().
 *   2. Each per-core #if section assigns the *gemm_p values, either scaled
 *      from `size` or as fixed numbers.
 *   3. If openblas_block_factor() returns a positive value it is clamped to
 *      [10, 200] and applied as a percentage; the result is rounded down to
 *      a multiple of 8 (the "& ~7L").
 *   4. Any *gemm_p that ended up 0 is replaced by 64.
 *   5. *gemm_p is rounded up to a multiple of the matching *GEMM_UNROLL_M.
 *   6. *gemm_r is derived from BUFFER_SIZE, the P/Q macros, GEMM_OFFSET_A
 *      and GEMM_ALIGN.
 */
267 | 1 | void blas_set_parameter(void){ |
268 | | |
269 | 1 | int factor; |
270 | 1 | #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || \ |
271 | 1 | defined(HASWELL) || defined(STEAMROLLER) || defined(EXCAVATOR) || defined(ZEN) || \ |
272 | 1 | defined(SKYLAKEX) || defined(COOPERLAKE) || defined(SAPPHIRERAPIDS) |
273 | 1 | int size = 16; |
274 | | #else |
275 | | int size = get_L2_size(); |
276 | | #endif |
277 | | |
278 | | #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) |
279 | | size >>= 7; |
280 | | |
281 | | #if defined(CORE_BANIAS) && (HAVE_HIT > 1) |
282 | | sgemm_p = 64 / HAVE_HIT * size; |
283 | | dgemm_p = 32 / HAVE_HIT * size; |
284 | | cgemm_p = 32 / HAVE_HIT * size; |
285 | | zgemm_p = 16 / HAVE_HIT * size; |
286 | | #ifdef EXPRECISION |
287 | | qgemm_p = 16 / HAVE_HIT * size; |
288 | | xgemm_p = 8 / HAVE_HIT * size; |
289 | | #endif |
290 | | #ifdef QUAD_PRECISION |
291 | | qgemm_p = 8 / HAVE_HIT * size; |
292 | | xgemm_p = 4 / HAVE_HIT * size; |
293 | | #endif |
294 | | #else |
295 | | sgemm_p = 64 * size; |
296 | | dgemm_p = 32 * size; |
297 | | cgemm_p = 32 * size; |
298 | | zgemm_p = 16 * size; |
299 | | #ifdef EXPRECISION |
300 | | qgemm_p = 16 * size; |
301 | | xgemm_p = 8 * size; |
302 | | #endif |
303 | | #ifdef QUAD_PRECISION |
304 | | qgemm_p = 8 * size; |
305 | | xgemm_p = 4 * size; |
306 | | #endif |
307 | | #endif |
308 | | #endif |
309 | | |
310 | | #if defined(CORE_NORTHWOOD) |
311 | | size >>= 7; |
312 | | |
313 | | #ifdef ALLOC_HUGETLB |
314 | | sgemm_p = 128 * size; |
315 | | dgemm_p = 64 * size; |
316 | | cgemm_p = 64 * size; |
317 | | zgemm_p = 32 * size; |
318 | | #ifdef EXPRECISION |
319 | | qgemm_p = 32 * size; |
320 | | xgemm_p = 16 * size; |
321 | | #endif |
322 | | #ifdef QUAD_PRECISION |
323 | | qgemm_p = 16 * size; |
324 | | xgemm_p = 8 * size; |
325 | | #endif |
326 | | #else |
327 | | sgemm_p = 96 * size; |
328 | | dgemm_p = 48 * size; |
329 | | cgemm_p = 48 * size; |
330 | | zgemm_p = 24 * size; |
331 | | #ifdef EXPRECISION |
332 | | qgemm_p = 24 * size; |
333 | | xgemm_p = 12 * size; |
334 | | #endif |
335 | | #ifdef QUAD_PRECISION |
336 | | qgemm_p = 12 * size; |
337 | | xgemm_p = 6 * size; |
338 | | #endif |
339 | | #endif |
340 | | #endif |
341 | | |
342 | | #if defined(CORE_CORE2) |
343 | | |
344 | | size >>= 9; |
345 | | |
346 | | sgemm_p = 92 * size; |
347 | | dgemm_p = 46 * size; |
348 | | cgemm_p = 46 * size; |
349 | | zgemm_p = 23 * size; |
350 | | |
351 | | #ifdef EXPRECISION |
352 | | qgemm_p = 23 * size; |
353 | | xgemm_p = 11 * size; |
354 | | #endif |
355 | | #ifdef QUAD_PRECISION |
356 | | qgemm_p = 11 * size; |
357 | | xgemm_p = 5 * size; |
358 | | #endif |
359 | | #endif |
360 | | |
/* PENRYN and DUNNINGTON use fixed values; `size` is only read in the QUAD_PRECISION lines. */
361 | | #if defined(PENRYN) |
362 | | |
363 | | size >>= 9; |
364 | | |
365 | | sgemm_p = 1024; |
366 | | dgemm_p = 512; |
367 | | cgemm_p = 512; |
368 | | zgemm_p = 256; |
369 | | |
370 | | #ifdef EXPRECISION |
371 | | qgemm_p = 256; |
372 | | xgemm_p = 128; |
373 | | #endif |
374 | | #ifdef QUAD_PRECISION |
375 | | qgemm_p = 21 * size + 4; |
376 | | xgemm_p = 10 * size + 2; |
377 | | #endif |
378 | | #endif |
379 | | |
380 | | #if defined(DUNNINGTON) |
381 | | |
382 | | size >>= 9; |
383 | | |
384 | | sgemm_p = 384; |
385 | | dgemm_p = 384; |
386 | | cgemm_p = 384; |
387 | | zgemm_p = 384; |
388 | | |
389 | | #ifdef EXPRECISION |
390 | | qgemm_p = 384; |
391 | | xgemm_p = 384; |
392 | | #endif |
393 | | #ifdef QUAD_PRECISION |
394 | | qgemm_p = 21 * size + 4; |
395 | | xgemm_p = 10 * size + 2; |
396 | | #endif |
397 | | #endif |
398 | | |
399 | | #if defined(NEHALEM) |
400 | | sgemm_p = 1024; |
401 | | dgemm_p = 512; |
402 | | cgemm_p = 512; |
403 | | zgemm_p = 256; |
404 | | #ifdef EXPRECISION |
405 | | qgemm_p = 256; |
406 | | xgemm_p = 128; |
407 | | #endif |
408 | | #endif |
409 | | |
410 | | #if defined(SANDYBRIDGE) |
411 | | sgemm_p = 1024; |
412 | | dgemm_p = 512; |
413 | | cgemm_p = 512; |
414 | | zgemm_p = 256; |
415 | | #ifdef EXPRECISION |
416 | | qgemm_p = 256; |
417 | | xgemm_p = 128; |
418 | | #endif |
419 | | #endif |
420 | | |
421 | | #if defined(CORE_PRESCOTT) || defined(GENERIC) |
422 | | size >>= 6; |
423 | | |
/* Cap the scaled size so the multipliers below never exceed 16x. */
424 | | if (size > 16) size = 16; |
425 | | |
426 | | sgemm_p = 56 * size; |
427 | | dgemm_p = 28 * size; |
428 | | cgemm_p = 28 * size; |
429 | | zgemm_p = 14 * size; |
430 | | #ifdef EXPRECISION |
431 | | qgemm_p = 14 * size; |
432 | | xgemm_p = 7 * size; |
433 | | #endif |
434 | | #ifdef QUAD_PRECISION |
435 | | qgemm_p = 7 * size; |
436 | | xgemm_p = 3 * size; |
437 | | #endif |
438 | | #endif |
439 | | |
440 | | #if defined(CORE_OPTERON) |
441 | | sgemm_p = 224 + 14 * (size >> 5); |
442 | | dgemm_p = 112 + 14 * (size >> 6); |
443 | | cgemm_p = 116 + 14 * (size >> 6); |
444 | | zgemm_p = 58 + 14 * (size >> 7); |
445 | | #ifdef EXPRECISION |
446 | | qgemm_p = 58 + 14 * (size >> 7); |
447 | | xgemm_p = 29 + 14 * (size >> 8); |
448 | | #endif |
449 | | #ifdef QUAD_PRECISION |
450 | | qgemm_p = 29 + 14 * (size >> 8); |
451 | | xgemm_p = 15 + 14 * (size >> 9); |
452 | | #endif |
453 | | #endif |
454 | | |
455 | | #if defined(ATOM) |
456 | | size >>= 8; |
457 | | |
458 | | sgemm_p = 256; |
459 | | dgemm_p = 128; |
460 | | cgemm_p = 128; |
461 | | zgemm_p = 64; |
462 | | #ifdef EXPRECISION |
463 | | qgemm_p = 64; |
464 | | xgemm_p = 32; |
465 | | #endif |
466 | | #ifdef QUAD_PRECISION |
467 | | qgemm_p = 32; |
468 | | xgemm_p = 16; |
469 | | #endif |
470 | | #endif |
471 | | |
472 | | #if defined(CORE_BARCELONA) || defined(CORE_BOBCAT) |
473 | | size >>= 8; |
474 | | |
475 | | sgemm_p = 232 * size; |
476 | | dgemm_p = 116 * size; |
477 | | cgemm_p = 116 * size; |
478 | | zgemm_p = 58 * size; |
479 | | #ifdef EXPRECISION |
480 | | qgemm_p = 58 * size; |
481 | | xgemm_p = 26 * size; |
482 | | #endif |
483 | | #ifdef QUAD_PRECISION |
484 | | qgemm_p = 26 * size; |
485 | | xgemm_p = 13 * size; |
486 | | #endif |
487 | | #endif |
488 | | |
/*
 * Optional scaling: factor is a percentage, clamped to 10..200; the scaled
 * value is rounded down to a multiple of 8.
 * NOTE(review): qgemm_p / xgemm_p are scaled only under EXPRECISION, not
 * under QUAD_PRECISION -- confirm this is intended.
 */
489 | 1 | factor=openblas_block_factor(); |
490 | 1 | if (factor>0) { |
491 | 0 | if (factor < 10) factor = 10; |
492 | 0 | if (factor > 200) factor = 200; |
493 | |
|
494 | 0 | sgemm_p = ((long)((double)sgemm_p * (double)factor * 1.e-2)) & ~7L; |
495 | 0 | dgemm_p = ((long)((double)dgemm_p * (double)factor * 1.e-2)) & ~7L; |
496 | 0 | cgemm_p = ((long)((double)cgemm_p * (double)factor * 1.e-2)) & ~7L; |
497 | 0 | zgemm_p = ((long)((double)zgemm_p * (double)factor * 1.e-2)) & ~7L; |
498 | | #ifdef EXPRECISION |
499 | | qgemm_p = ((long)((double)qgemm_p * (double)factor * 1.e-2)) & ~7L; |
500 | | xgemm_p = ((long)((double)xgemm_p * (double)factor * 1.e-2)) & ~7L; |
501 | | #endif |
502 | 0 | } |
503 | | |
/* A zero P value (e.g. size shifted down to 0, or scaled below 8) falls back to 64. */
504 | 1 | if (sgemm_p == 0) sgemm_p = 64; |
505 | 1 | if (dgemm_p == 0) dgemm_p = 64; |
506 | 1 | if (cgemm_p == 0) cgemm_p = 64; |
507 | 1 | if (zgemm_p == 0) zgemm_p = 64; |
508 | | #ifdef EXPRECISION |
509 | | if (qgemm_p == 0) qgemm_p = 64; |
510 | | if (xgemm_p == 0) xgemm_p = 64; |
511 | | #endif |
512 | | |
513 | | #ifdef QUAD_PRECISION |
514 | | if (qgemm_p == 0) qgemm_p = 64; |
515 | | if (xgemm_p == 0) xgemm_p = 64; |
516 | | #endif |
517 | | |
/*
 * Round each P up to a multiple of its UNROLL_M.
 * NOTE(review): the q/x variants are rounded only under QUAD_PRECISION, not
 * under EXPRECISION -- the opposite of the scaling step above; confirm.
 */
518 | 1 | sgemm_p = ((sgemm_p + SGEMM_UNROLL_M - 1)/SGEMM_UNROLL_M) * SGEMM_UNROLL_M; |
519 | 1 | dgemm_p = ((dgemm_p + DGEMM_UNROLL_M - 1)/DGEMM_UNROLL_M) * DGEMM_UNROLL_M; |
520 | 1 | cgemm_p = ((cgemm_p + CGEMM_UNROLL_M - 1)/CGEMM_UNROLL_M) * CGEMM_UNROLL_M; |
521 | 1 | zgemm_p = ((zgemm_p + ZGEMM_UNROLL_M - 1)/ZGEMM_UNROLL_M) * ZGEMM_UNROLL_M; |
522 | | #ifdef QUAD_PRECISION |
523 | | qgemm_p = ((qgemm_p + QGEMM_UNROLL_M - 1)/QGEMM_UNROLL_M) * QGEMM_UNROLL_M; |
524 | | xgemm_p = ((xgemm_p + XGEMM_UNROLL_M - 1)/XGEMM_UNROLL_M) * XGEMM_UNROLL_M; |
525 | | #endif |
526 | | |
/*
 * R = (BUFFER_SIZE minus the aligned P*Q panel) divided by the bytes of one
 * Q-long row, then minus 15 and rounded down to a multiple of 16.
 * NOTE(review): the literals 4/8/16/32 are presumably the element sizes in
 * bytes for each precision -- confirm.
 */
527 | | #ifdef BUILD_BFLOAT16 |
528 | | sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; |
529 | | #endif |
530 | 1 | sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; |
531 | 1 | dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; |
532 | 1 | cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; |
533 | 1 | zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15; |
534 | | #if defined(EXPRECISION) || defined(QUAD_PRECISION) |
535 | | qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15; |
536 | | xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15; |
537 | | #endif |
538 | | |
/* Debug dump of the chosen parameters, compiled out. */
539 | | #if 0 |
540 | | fprintf(stderr, "SGEMM ... %3d, %3d, %3d\n", SGEMM_P, SGEMM_Q, SGEMM_R); |
541 | | fprintf(stderr, "DGEMM ... %3d, %3d, %3d\n", DGEMM_P, DGEMM_Q, DGEMM_R); |
542 | | fprintf(stderr, "CGEMM ... %3d, %3d, %3d\n", CGEMM_P, CGEMM_Q, CGEMM_R); |
543 | | fprintf(stderr, "ZGEMM ... %3d, %3d, %3d\n", ZGEMM_P, ZGEMM_Q, ZGEMM_R); |
544 | | #endif |
545 | | |
546 | 1 | return; |
547 | 1 | } |
548 | | |
549 | | #if 0 |
550 | | |
/*
 * get_current_cpu_info: diagnostic helper that reads a few CPUID fields and
 * prints the APIC id and core count to stderr.  Always returns 0.  The
 * function is compiled out by the surrounding "#if 0".
 * NOTE(review): `ncores` is assigned only inside the CORE_PRESCOTT and
 * CORE_OPTERON sections, so for any other core it would be incremented and
 * printed uninitialised; `nlprocs` and `cmplegacy` are written but never
 * read.
 */
551 | | int get_current_cpu_info(void){ |
552 | | |
553 | | int nlprocs, ncores, cmplegacy; |
554 | | int htt = 0; |
555 | | int apicid = 0; |
556 | | |
557 | | #if defined(CORE_PRESCOTT) || defined(CORE_OPTERON) |
558 | | int eax, ebx, ecx, edx; |
559 | | |
/* Leaf 1: fields taken from EBX bits 16-23 and 24-31, and EDX bit 28. */
560 | | cpuid(1, &eax, &ebx, &ecx, &edx); |
561 | | nlprocs = BITMASK(ebx, 16, 0xff); |
562 | | apicid = BITMASK(ebx, 24, 0xff); |
563 | | htt = BITMASK(edx, 28, 0x01); |
564 | | #endif |
565 | | |
566 | | #if defined(CORE_PRESCOTT) |
567 | | cpuid(4, &eax, &ebx, &ecx, &edx); |
568 | | ncores = BITMASK(eax, 26, 0x3f); |
569 | | |
570 | | if (htt == 0) nlprocs = 0; |
571 | | #endif |
572 | | |
573 | | #if defined(CORE_OPTERON) |
574 | | cpuid(0x80000008, &eax, &ebx, &ecx, &edx); |
575 | | ncores = BITMASK(ecx, 0, 0xff); |
576 | | |
577 | | cpuid(0x80000001, &eax, &ebx, &ecx, &edx); |
578 | | cmplegacy = BITMASK(ecx, 1, 0x01); |
579 | | |
580 | | if (htt == 0) { |
581 | | nlprocs = 0; |
582 | | ncores = 0; |
583 | | cmplegacy = 0; |
584 | | } |
585 | | #endif |
586 | | |
/* The value read above is incremented by one before it is printed as the core count. */
587 | | ncores ++; |
588 | | |
589 | | fprintf(stderr, "APICID = %d Number of core = %d\n", apicid, ncores); |
590 | | |
591 | | return 0; |
592 | | } |
593 | | #endif |
594 | | |
595 | | #endif |
596 | | |
597 | | #if defined(ARCH_IA64) |
598 | | |
599 | | static inline BLASULONG cpuid(BLASULONG regnum){ |
600 | | BLASULONG value; |
601 | | |
602 | | #ifndef __ECC |
603 | | asm ("mov %0=cpuid[%r1]" : "=r"(value) : "rO"(regnum)); |
604 | | #else |
605 | | value = __getIndReg(_IA64_REG_INDR_CPUID, regnum); |
606 | | #endif |
607 | | |
608 | | return value; |
609 | | } |
610 | | |
611 | | #if 1 |
612 | | |
613 | | void blas_set_parameter(void){ |
614 | | |
615 | | BLASULONG cpuid3, size; |
616 | | |
617 | | cpuid3 = cpuid(3); |
618 | | |
619 | | size = BITMASK(cpuid3, 16, 0xff); |
620 | | |
621 | | sbgemm_p = 192 * (size + 1); |
622 | | sgemm_p = 192 * (size + 1); |
623 | | dgemm_p = 96 * (size + 1); |
624 | | cgemm_p = 96 * (size + 1); |
625 | | zgemm_p = 48 * (size + 1); |
626 | | #ifdef EXPRECISION |
627 | | qgemm_p = 64 * (size + 1); |
628 | | xgemm_p = 32 * (size + 1); |
629 | | #endif |
630 | | #ifdef QUAD_PRECISION |
631 | | qgemm_p = 32 * (size + 1); |
632 | | xgemm_p = 16 * (size + 1); |
633 | | #endif |
634 | | |
635 | | #ifdef BUILD_BFLOAT16 |
636 | | sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; |
637 | | #endif |
638 | | sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; |
639 | | dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; |
640 | | cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; |
641 | | zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15; |
642 | | #if defined(EXPRECISION) || defined(QUAD_PRECISION) |
643 | | qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15; |
644 | | xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15; |
645 | | #endif |
646 | | |
647 | | return; |
648 | | } |
649 | | |
650 | | #else |
651 | | |
652 | | #define IA64_SYS_NAME "/sys/devices/system/cpu/cpu0/cache/index3/size" |
653 | | #define IA64_PROC_NAME "/proc/pal/cpu0/cache_info" |
654 | | |
655 | | void blas_set_parameter(void){ |
656 | | |
657 | | BLASULONG cpuid3; |
658 | | int size = 0; |
659 | | |
660 | | #if 1 |
661 | | char buffer[128]; |
662 | | FILE *infile; |
663 | | |
664 | | if ((infile = fopen(IA64_SYS_NAME, "r")) != NULL) { |
665 | | |
666 | | fgets(buffer, sizeof(buffer), infile); |
667 | | fclose(infile); |
668 | | |
669 | | size = atoi(buffer) / 1536; |
670 | | } |
671 | | |
672 | | if (size <= 0) { |
673 | | if ((infile = fopen(IA64_PROC_NAME, "r")) != NULL) { |
674 | | |
675 | | while(fgets(buffer, sizeof(buffer), infile) != NULL) { |
676 | | if ((!strncmp("Data/Instruction Cache level 3", buffer, 30))) break; |
677 | | } |
678 | | |
679 | | fgets(buffer, sizeof(buffer), infile); |
680 | | |
681 | | fclose(infile); |
682 | | |
683 | | *strstr(buffer, "bytes") = (char)NULL; |
684 | | |
685 | | size = atoi(strchr(buffer, ':') + 1) / 1572864; |
686 | | } |
687 | | } |
688 | | #endif |
689 | | |
690 | | /* The last resort */ |
691 | | |
692 | | if (size <= 0) { |
693 | | cpuid3 = cpuid(3); |
694 | | |
695 | | size = BITMASK(cpuid3, 16, 0xff) + 1; |
696 | | } |
697 | | |
698 | | sgemm_p = 320 * size; |
699 | | dgemm_p = 160 * size; |
700 | | cgemm_p = 160 * size; |
701 | | zgemm_p = 80 * size; |
702 | | #ifdef EXPRECISION |
703 | | qgemm_p = 80 * size; |
704 | | xgemm_p = 40 * size; |
705 | | #endif |
706 | | |
707 | | sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; |
708 | | dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; |
709 | | cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; |
710 | | zgemm_r = (((BUFFER_SIZE - ((ZGEMM_P * ZGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (ZGEMM_Q * 16)) - 15) & ~15; |
711 | | #ifdef EXPRECISION |
712 | | qgemm_r = (((BUFFER_SIZE - ((QGEMM_P * QGEMM_Q * 16 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (QGEMM_Q * 16)) - 15) & ~15; |
713 | | xgemm_r = (((BUFFER_SIZE - ((XGEMM_P * XGEMM_Q * 32 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (XGEMM_Q * 32)) - 15) & ~15; |
714 | | #endif |
715 | | |
716 | | return; |
717 | | } |
718 | | |
719 | | #endif |
720 | | |
721 | | #endif |
722 | | |
723 | | #if defined(ARCH_MIPS64) |
/*
 * blas_set_parameter (MIPS64 build): on LOONGSON3R3 / LOONGSON3R4 choose
 * dgemm_r -- 1024 when running single-threaded (always the case without
 * SMP), 200 when blas_num_threads is not 1.  Other cores are left untouched.
 */
void blas_set_parameter(void){
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
#ifdef SMP
  dgemm_r = (blas_num_threads == 1) ? 1024 : 200;
#else
  dgemm_r = 1024;
#endif
#endif
}
740 | | #endif |
741 | | |
742 | | #if defined(ARCH_LOONGARCH64) |
/*
 * get_L3_size: return the L3 cache size in MB, derived from CPUCFG word
 * 0x14.
 *
 * The word is decoded as (bits 0-15 + 1) * 2^(bits 16-23) * 2^(bits 24-30)
 * bytes, which is then divided by 2^20 and truncated.
 * NOTE(review): the three fields are presumably "ways - 1", log2(sets) and
 * log2(line size) -- confirm against the LoongArch reference manual.
 *
 * The computation uses integer shifts only, so it is exact and does not
 * depend on pow() being declared; results that would not fit in an int are
 * saturated to the largest int instead of being converted out of range.
 */
int get_L3_size(void) {
  int ret = 0, id = 0x14;
  unsigned int cfg, ways, shift;
  unsigned long long mbytes;
  const unsigned int int_max = ~0u >> 1;

  __asm__ volatile (
    "cpucfg %[ret], %[id]"
    : [ret]"=r"(ret)
    : [id]"r"(id)
    : "memory"
  );

  cfg   = (unsigned int)ret;
  ways  = (cfg & 0xffff) + 1;                            /* 1 .. 65536 */
  shift = ((cfg >> 16) & 0xff) + ((cfg >> 24) & 0x7f);   /* combined log2 */

  /* Total bytes = ways << shift; dividing by 2^20 gives MB. */
  if (shift < 20) return (int)(ways >> (20 - shift));

  /* ways >= 1, so more than 31 remaining bits cannot fit in an int. */
  if (shift - 20 > 31) return (int)int_max;

  mbytes = (unsigned long long)ways << (shift - 20);

  return (mbytes > int_max) ? (int)int_max : (int)mbytes;
}
753 | | |
/*
 * blas_set_parameter (LoongArch64 build): on LA464 pick the GEMM blocking
 * globals from two inputs -- whether get_L3_size() reports 32 (commented in
 * the original as "3C5000 and 3D5000", otherwise "3A5000 and 3C5000L") and
 * whether the library runs single-threaded (always the case without SMP).
 * Other cores are left untouched.
 */
void blas_set_parameter(void){
#if defined(LA464)
  const int l3_is_32 = (get_L3_size() == 32);
#ifdef SMP
  const int single_thread = (blas_num_threads == 1);
#else
  const int single_thread = 1;
#endif

  /* Identical in every configuration. */
  sgemm_p = 256;
  sgemm_q = 384;
  dgemm_p = 112;
  cgemm_p = 128;
  cgemm_q = 256;
  zgemm_p = 128;
  zgemm_q = 128;

  /* Depends on the L3 size only. */
  dgemm_q = l3_is_32 ? 289 : 300;

  /* Depends on both the L3 size and the threading mode. */
  if (single_thread) {
    sgemm_r = l3_is_32 ? 8192 : 4096;
    dgemm_r = l3_is_32 ? 4096 : 3024;
    cgemm_r = l3_is_32 ? 4096 : 2048;
    zgemm_r = l3_is_32 ? 2048 : 1024;
  } else {
    sgemm_r = l3_is_32 ? 1024 : 2048;
    dgemm_r = l3_is_32 ?  342 :  738;
    cgemm_r = l3_is_32 ?  512 : 1024;
    zgemm_r = l3_is_32 ?  512 : 1024;
  }
#endif
}
834 | | #endif |
835 | | |
836 | | #if defined(ARCH_ARM64) |
837 | | |
/*
 * blas_set_parameter (ARM64 build): intentionally empty -- the blocking
 * globals keep the values they were initialised with.
 */
838 | | void blas_set_parameter(void) |
839 | | { |
840 | | } |
841 | | |
842 | | #endif |