/root/doris/contrib/openblas/interface/syrk.c
Line | Count | Source |
1 | | /*********************************************************************/ |
2 | | /* Copyright 2009, 2010 The University of Texas at Austin. */ |
3 | | /* All rights reserved. */ |
4 | | /* */ |
5 | | /* Redistribution and use in source and binary forms, with or */ |
6 | | /* without modification, are permitted provided that the following */ |
7 | | /* conditions are met: */ |
8 | | /* */ |
9 | | /* 1. Redistributions of source code must retain the above */ |
10 | | /* copyright notice, this list of conditions and the following */ |
11 | | /* disclaimer. */ |
12 | | /* */ |
13 | | /* 2. Redistributions in binary form must reproduce the above */ |
14 | | /* copyright notice, this list of conditions and the following */ |
15 | | /* disclaimer in the documentation and/or other materials */ |
16 | | /* provided with the distribution. */ |
17 | | /* */ |
18 | | /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ |
19 | | /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ |
20 | | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ |
21 | | /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ |
22 | | /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ |
23 | | /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ |
24 | | /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ |
25 | | /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ |
26 | | /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ |
27 | | /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ |
28 | | /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ |
29 | | /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ |
30 | | /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ |
31 | | /* POSSIBILITY OF SUCH DAMAGE. */ |
32 | | /* */ |
33 | | /* The views and conclusions contained in the software and */ |
34 | | /* documentation are those of the authors and should not be */ |
35 | | /* interpreted as representing official policies, either expressed */ |
36 | | /* or implied, of The University of Texas at Austin. */ |
37 | | /*********************************************************************/ |
38 | | |
39 | | #include <stdio.h> |
40 | | #include <ctype.h> |
41 | | #include "common.h" |
42 | | #ifdef FUNCTION_PROFILE |
43 | | #include "functable.h" |
44 | | #endif |
45 | | |
/*
 * Minimum amount of work before the threaded path is considered.  The driver
 * in this file compares (n + 1) * n * k against
 * SMP_THRESHOLD_MIN * GEMM_MULTITHREAD_THRESHOLD.
 */
#if defined(COMPLEX)
#define SMP_THRESHOLD_MIN 14824.
#else
#define SMP_THRESHOLD_MIN 109944.
#endif

/*
 * Routine name handed to xerbla when argument validation fails.  The first
 * letter follows the precision/type macros (XDOUBLE, DOUBLE, COMPLEX); HEMM
 * switches complex builds from the xSYRK names to the xHERK names.
 */
#if !defined(COMPLEX)
#  if defined(XDOUBLE)
#    define ERROR_NAME "QSYRK "
#  elif defined(DOUBLE)
#    define ERROR_NAME "DSYRK "
#  else
#    define ERROR_NAME "SSYRK "
#  endif
#elif defined(HEMM)
#  if defined(XDOUBLE)
#    define ERROR_NAME "XHERK "
#  elif defined(DOUBLE)
#    define ERROR_NAME "ZHERK "
#  else
#    define ERROR_NAME "CHERK "
#  endif
#else
#  if defined(XDOUBLE)
#    define ERROR_NAME "XSYRK "
#  elif defined(DOUBLE)
#    define ERROR_NAME "ZSYRK "
#  else
#    define ERROR_NAME "CSYRK "
#  endif
#endif

/* Default scale factor for the threading threshold unless the build set one. */
#if !defined(GEMM_MULTITHREAD_THRESHOLD)
#define GEMM_MULTITHREAD_THRESHOLD 4
#endif
79 | | |
80 | | static int (*syrk[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = { |
81 | | #ifndef HEMM |
82 | | SYRK_UN, SYRK_UC, SYRK_LN, SYRK_LC, |
83 | | #if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) |
84 | | SYRK_THREAD_UN, SYRK_THREAD_UC, SYRK_THREAD_LN, SYRK_THREAD_LC, |
85 | | #endif |
86 | | #else |
87 | | HERK_UN, HERK_UC, HERK_LN, HERK_LC, |
88 | | #if defined(SMP) && !defined(USE_SIMPLE_THREADED_LEVEL3) |
89 | | HERK_THREAD_UN, HERK_THREAD_UC, HERK_THREAD_LN, HERK_THREAD_LC, |
90 | | #endif |
91 | | #endif |
92 | | }; |
93 | | |
94 | | #ifndef CBLAS |
95 | | |
96 | | void NAME(char *UPLO, char *TRANS, |
97 | | blasint *N, blasint *K, |
98 | | FLOAT *alpha, FLOAT *a, blasint *ldA, |
99 | 0 | FLOAT *beta, FLOAT *c, blasint *ldC){ |
100 | |
|
101 | 0 | char uplo_arg = *UPLO; |
102 | 0 | char trans_arg = *TRANS; |
103 | |
|
104 | 0 | blas_arg_t args; |
105 | |
|
106 | 0 | FLOAT *buffer; |
107 | 0 | FLOAT *sa, *sb; |
108 | |
|
109 | 0 | #ifdef SMP |
110 | 0 | double NNK; |
111 | | #ifdef USE_SIMPLE_THREADED_LEVEL3 |
112 | | #ifndef COMPLEX |
113 | | #ifdef XDOUBLE |
114 | | int mode = BLAS_XDOUBLE | BLAS_REAL; |
115 | | #elif defined(DOUBLE) |
116 | | int mode = BLAS_DOUBLE | BLAS_REAL; |
117 | | #else |
118 | | int mode = BLAS_SINGLE | BLAS_REAL; |
119 | | #endif |
120 | | #else |
121 | | #ifdef XDOUBLE |
122 | | int mode = BLAS_XDOUBLE | BLAS_COMPLEX; |
123 | | #elif defined(DOUBLE) |
124 | | int mode = BLAS_DOUBLE | BLAS_COMPLEX; |
125 | | #else |
126 | | int mode = BLAS_SINGLE | BLAS_COMPLEX; |
127 | | #endif |
128 | | #endif |
129 | | #endif |
130 | 0 | #endif |
131 | |
|
132 | 0 | blasint info; |
133 | 0 | int uplo; |
134 | 0 | int trans; |
135 | 0 | int nrowa; |
136 | |
|
137 | 0 | PRINT_DEBUG_NAME; |
138 | |
|
139 | 0 | args.n = *N; |
140 | 0 | args.k = *K; |
141 | |
|
142 | 0 | args.a = (void *)a; |
143 | 0 | args.c = (void *)c; |
144 | |
|
145 | 0 | args.lda = *ldA; |
146 | 0 | args.ldc = *ldC; |
147 | |
|
148 | 0 | args.alpha = (void *)alpha; |
149 | 0 | args.beta = (void *)beta; |
150 | |
|
151 | 0 | TOUPPER(uplo_arg); |
152 | 0 | TOUPPER(trans_arg); |
153 | |
|
154 | 0 | uplo = -1; |
155 | 0 | trans = -1; |
156 | |
|
157 | 0 | if (uplo_arg == 'U') uplo = 0; |
158 | 0 | if (uplo_arg == 'L') uplo = 1; |
159 | | |
160 | |
|
161 | 0 | #ifndef COMPLEX |
162 | 0 | if (trans_arg == 'N') trans = 0; |
163 | 0 | if (trans_arg == 'T') trans = 1; |
164 | 0 | if (trans_arg == 'C') trans = 1; |
165 | | #else |
166 | | #ifdef HEMM |
167 | | if (trans_arg == 'N') trans = 0; |
168 | | if (trans_arg == 'C') trans = 1; |
169 | | #else |
170 | | if (trans_arg == 'N') trans = 0; |
171 | | if (trans_arg == 'T') trans = 1; |
172 | | #endif |
173 | | |
174 | | #endif |
175 | |
|
176 | 0 | nrowa = args.n; |
177 | 0 | if (trans & 1) nrowa = args.k; |
178 | |
|
179 | 0 | info = 0; |
180 | |
|
181 | 0 | if (args.ldc < MAX(1,args.n)) info = 10; |
182 | 0 | if (args.lda < MAX(1,nrowa)) info = 7; |
183 | 0 | if (args.k < 0) info = 4; |
184 | 0 | if (args.n < 0) info = 3; |
185 | 0 | if (trans < 0) info = 2; |
186 | 0 | if (uplo < 0) info = 1; |
187 | |
|
188 | 0 | if (info != 0) { |
189 | 0 | BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); |
190 | 0 | return; |
191 | 0 | } |
192 | | |
193 | | #else |
194 | | |
195 | | void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, |
196 | | blasint n, blasint k, |
197 | | #if !defined(COMPLEX) || defined(HEMM) |
198 | | FLOAT alpha, |
199 | | #else |
200 | | void *valpha, |
201 | | #endif |
202 | | #if !defined(COMPLEX) |
203 | | FLOAT *a, blasint lda, |
204 | | #else |
205 | | void *va, blasint lda, |
206 | | #endif |
207 | | #if !defined(COMPLEX) || defined(HEMM) |
208 | | FLOAT beta, |
209 | | #else |
210 | | void *vbeta, |
211 | | #endif |
212 | | #if !defined(COMPLEX) |
213 | | FLOAT *c, blasint ldc) { |
214 | | #else |
215 | | void *vc, blasint ldc) { |
216 | | #endif |
217 | | |
218 | | #ifdef COMPLEX |
219 | | #if !defined(HEMM) |
220 | | FLOAT* alpha = (FLOAT*) valpha; |
221 | | FLOAT* beta = (FLOAT*) vbeta; |
222 | | #endif |
223 | | FLOAT* a = (FLOAT*) va; |
224 | | FLOAT* c = (FLOAT*) vc; |
225 | | #endif |
226 | | |
227 | | blas_arg_t args; |
228 | | int uplo, trans; |
229 | | blasint info, nrowa; |
230 | | |
231 | | FLOAT *buffer; |
232 | | FLOAT *sa, *sb; |
233 | | |
234 | | #ifdef SMP |
235 | | double NNK; |
236 | | |
237 | | #ifdef USE_SIMPLE_THREADED_LEVEL3 |
238 | | #ifndef COMPLEX |
239 | | #ifdef XDOUBLE |
240 | | int mode = BLAS_XDOUBLE | BLAS_REAL; |
241 | | #elif defined(DOUBLE) |
242 | | int mode = BLAS_DOUBLE | BLAS_REAL; |
243 | | #else |
244 | | int mode = BLAS_SINGLE | BLAS_REAL; |
245 | | #endif |
246 | | #else |
247 | | #ifdef XDOUBLE |
248 | | int mode = BLAS_XDOUBLE | BLAS_COMPLEX; |
249 | | #elif defined(DOUBLE) |
250 | | int mode = BLAS_DOUBLE | BLAS_COMPLEX; |
251 | | #else |
252 | | int mode = BLAS_SINGLE | BLAS_COMPLEX; |
253 | | #endif |
254 | | #endif |
255 | | #endif |
256 | | #endif |
257 | | |
258 | | PRINT_DEBUG_CNAME; |
259 | | |
260 | | args.n = n; |
261 | | args.k = k; |
262 | | |
263 | | args.a = (void *)a; |
264 | | args.c = (void *)c; |
265 | | |
266 | | args.lda = lda; |
267 | | args.ldc = ldc; |
268 | | |
269 | | #if !defined(COMPLEX) || defined(HEMM) |
270 | | args.alpha = (void *)α |
271 | | args.beta = (void *)β |
272 | | #else |
273 | | args.alpha = (void *)alpha; |
274 | | args.beta = (void *)beta; |
275 | | #endif |
276 | | |
277 | | trans = -1; |
278 | | uplo = -1; |
279 | | info = 0; |
280 | | |
281 | | if (order == CblasColMajor) { |
282 | | if (Uplo == CblasUpper) uplo = 0; |
283 | | if (Uplo == CblasLower) uplo = 1; |
284 | | |
285 | | if (Trans == CblasNoTrans) trans = 0; |
286 | | #ifndef COMPLEX |
287 | | if (Trans == CblasTrans) trans = 1; |
288 | | if (Trans == CblasConjNoTrans) trans = 0; |
289 | | if (Trans == CblasConjTrans) trans = 1; |
290 | | #elif !defined(HEMM) |
291 | | if (Trans == CblasTrans) trans = 1; |
292 | | #else |
293 | | if (Trans == CblasConjTrans) trans = 1; |
294 | | #endif |
295 | | |
296 | | info = -1; |
297 | | |
298 | | nrowa = args.n; |
299 | | if (trans & 1) nrowa = args.k; |
300 | | |
301 | | if (args.ldc < MAX(1,args.n)) info = 10; |
302 | | if (args.lda < MAX(1,nrowa)) info = 7; |
303 | | if (args.k < 0) info = 4; |
304 | | if (args.n < 0) info = 3; |
305 | | if (trans < 0) info = 2; |
306 | | if (uplo < 0) info = 1; |
307 | | } |
308 | | |
309 | | if (order == CblasRowMajor) { |
310 | | if (Uplo == CblasUpper) uplo = 1; |
311 | | if (Uplo == CblasLower) uplo = 0; |
312 | | |
313 | | if (Trans == CblasNoTrans) trans = 1; |
314 | | #ifndef COMPLEX |
315 | | if (Trans == CblasTrans) trans = 0; |
316 | | if (Trans == CblasConjNoTrans) trans = 1; |
317 | | if (Trans == CblasConjTrans) trans = 0; |
318 | | #elif !defined(HEMM) |
319 | | if (Trans == CblasTrans) trans = 0; |
320 | | #else |
321 | | if (Trans == CblasConjTrans) trans = 0; |
322 | | #endif |
323 | | |
324 | | info = -1; |
325 | | |
326 | | nrowa = args.n; |
327 | | if (trans & 1) nrowa = args.k; |
328 | | |
329 | | if (args.ldc < MAX(1,args.n)) info = 10; |
330 | | if (args.lda < MAX(1,nrowa)) info = 7; |
331 | | if (args.k < 0) info = 4; |
332 | | if (args.n < 0) info = 3; |
333 | | if (trans < 0) info = 2; |
334 | | if (uplo < 0) info = 1; |
335 | | } |
336 | | |
337 | | if (info >= 0) { |
338 | | BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); |
339 | | return; |
340 | | } |
341 | | |
342 | | #endif |
343 | | |
344 | 0 | if (args.n == 0) return; |
345 | | |
346 | 0 | IDEBUG_START; |
347 | |
|
348 | 0 | FUNCTION_PROFILE_START(); |
349 | |
|
350 | 0 | buffer = (FLOAT *)blas_memory_alloc(0); |
351 | |
|
352 | 0 | sa = (FLOAT *)((BLASLONG)buffer + GEMM_OFFSET_A); |
353 | 0 | sb = (FLOAT *)(((BLASLONG)sa + ((GEMM_P * GEMM_Q * COMPSIZE * SIZE + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); |
354 | |
|
355 | 0 | #ifdef SMP |
356 | | #ifdef USE_SIMPLE_THREADED_LEVEL3 |
357 | | if (!trans){ |
358 | | mode |= (BLAS_TRANSA_N | BLAS_TRANSB_T); |
359 | | } else { |
360 | | mode |= (BLAS_TRANSA_T | BLAS_TRANSB_N); |
361 | | } |
362 | | mode |= (uplo << BLAS_UPLO_SHIFT); |
363 | | #endif |
364 | |
|
365 | 0 | args.common = NULL; |
366 | |
|
367 | 0 | NNK = (double)(args.n+1)*(double)args.n*(double)args.k; |
368 | 0 | if (NNK <= (SMP_THRESHOLD_MIN * GEMM_MULTITHREAD_THRESHOLD)) { |
369 | 0 | args.nthreads = 1; |
370 | 0 | } else { |
371 | 0 | args.nthreads = num_cpu_avail(3); |
372 | 0 | } |
373 | |
|
374 | 0 | if (args.nthreads == 1) { |
375 | 0 | #endif |
376 | |
|
377 | 0 | (syrk[(uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); |
378 | |
|
379 | 0 | #ifdef SMP |
380 | 0 | } else { |
381 | |
|
382 | 0 | #ifndef USE_SIMPLE_THREADED_LEVEL3 |
383 | |
|
384 | 0 | (syrk[4 | (uplo << 1) | trans ])(&args, NULL, NULL, sa, sb, 0); |
385 | |
|
386 | | #else |
387 | | |
388 | | syrk_thread(mode, &args, NULL, NULL, syrk[(uplo << 1) | trans ], sa, sb, args.nthreads); |
389 | | |
390 | | #endif |
391 | |
|
392 | 0 | } |
393 | 0 | #endif |
394 | |
|
395 | 0 | blas_memory_free(buffer); |
396 | |
|
397 | 0 | FUNCTION_PROFILE_END(COMPSIZE * COMPSIZE, args.n * args.k + args.n * args.n / 2, args.n * args.n * args.k); |
398 | |
|
399 | 0 | IDEBUG_END; |
400 | |
|
401 | 0 | return; |
402 | 0 | } |