contrib/openblas/driver/others/blas_l1_thread.c
Line | Count | Source |
1 | | /*********************************************************************/ |
2 | | /* Copyright 2009, 2010 The University of Texas at Austin. */ |
3 | | /* All rights reserved. */ |
4 | | /* */ |
5 | | /* Redistribution and use in source and binary forms, with or */ |
6 | | /* without modification, are permitted provided that the following */ |
7 | | /* conditions are met: */ |
8 | | /* */ |
9 | | /* 1. Redistributions of source code must retain the above */ |
10 | | /* copyright notice, this list of conditions and the following */ |
11 | | /* disclaimer. */ |
12 | | /* */ |
13 | | /* 2. Redistributions in binary form must reproduce the above */ |
14 | | /* copyright notice, this list of conditions and the following */ |
15 | | /* disclaimer in the documentation and/or other materials */ |
16 | | /* provided with the distribution. */ |
17 | | /* */ |
18 | | /* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ |
19 | | /* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ |
20 | | /* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ |
21 | | /* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ |
22 | | /* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ |
23 | | /* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ |
24 | | /* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ |
25 | | /* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ |
26 | | /* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ |
27 | | /* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ |
28 | | /* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ |
29 | | /* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ |
30 | | /* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ |
31 | | /* POSSIBILITY OF SUCH DAMAGE. */ |
32 | | /* */ |
33 | | /* The views and conclusions contained in the software and */ |
34 | | /* documentation are those of the authors and should not be */ |
35 | | /* interpreted as representing official policies, either expressed */ |
36 | | /* or implied, of The University of Texas at Austin. */ |
37 | | /*********************************************************************/ |
38 | | |
39 | | #include <stdio.h> |
40 | | #include <stdlib.h> |
41 | | #include "common.h" |
42 | | |
43 | | int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, |
44 | | void *a, BLASLONG lda, |
45 | | void *b, BLASLONG ldb, |
46 | 0 | void *c, BLASLONG ldc, int (*function)(void), int nthreads){ |
47 | |
|
48 | 0 | blas_queue_t queue[MAX_CPU_NUMBER]; |
49 | 0 | blas_arg_t args [MAX_CPU_NUMBER]; |
50 | |
|
51 | 0 | BLASLONG i, width, astride, bstride; |
52 | 0 | int num_cpu, calc_type_a, calc_type_b; |
53 | |
|
54 | 0 | switch (mode & BLAS_PREC) { |
55 | 0 | case BLAS_INT8 : |
56 | 0 | case BLAS_BFLOAT16: |
57 | 0 | case BLAS_SINGLE : |
58 | 0 | case BLAS_DOUBLE : |
59 | 0 | case BLAS_XDOUBLE : |
60 | 0 | calc_type_a = calc_type_b = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0); |
61 | 0 | break; |
62 | 0 | case BLAS_STOBF16 : |
63 | 0 | calc_type_a = 2 + ((mode & BLAS_COMPLEX) != 0); |
64 | 0 | calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0); |
65 | 0 | break; |
66 | 0 | case BLAS_DTOBF16 : |
67 | 0 | calc_type_a = 3 + ((mode & BLAS_COMPLEX) != 0); |
68 | 0 | calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0); |
69 | 0 | break; |
70 | 0 | case BLAS_BF16TOS : |
71 | 0 | calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0); |
72 | 0 | calc_type_b = 2 + ((mode & BLAS_COMPLEX) != 0); |
73 | 0 | break; |
74 | 0 | case BLAS_BF16TOD : |
75 | 0 | calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0); |
76 | 0 | calc_type_b = 3 + ((mode & BLAS_COMPLEX) != 0); |
77 | 0 | break; |
78 | 0 | default: |
79 | 0 | calc_type_a = calc_type_b = 0; |
80 | 0 | break; |
81 | 0 | } |
82 | | |
83 | 0 | if(!(mode & BLAS_PTHREAD)) mode |= BLAS_LEGACY; |
84 | |
|
85 | 0 | for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]); |
86 | |
|
87 | 0 | num_cpu = 0; |
88 | 0 | i = m; |
89 | |
|
90 | 0 | while (i > 0){ |
91 | | |
92 | | /* Adjust Parameters */ |
93 | 0 | width = blas_quickdivide(i + nthreads - num_cpu - 1, |
94 | 0 | nthreads - num_cpu); |
95 | |
|
96 | 0 | i -= width; |
97 | 0 | if (i < 0) width = width + i; |
98 | |
|
99 | 0 | astride = width * lda; |
100 | |
|
101 | 0 | if (!(mode & BLAS_TRANSB_T)) { |
102 | 0 | bstride = width * ldb; |
103 | 0 | } else { |
104 | 0 | bstride = width; |
105 | 0 | } |
106 | |
|
107 | 0 | astride <<= calc_type_a; |
108 | 0 | bstride <<= calc_type_b; |
109 | |
|
110 | 0 | args[num_cpu].m = width; |
111 | 0 | args[num_cpu].n = n; |
112 | 0 | args[num_cpu].k = k; |
113 | 0 | args[num_cpu].a = (void *)a; |
114 | 0 | args[num_cpu].b = (void *)b; |
115 | 0 | args[num_cpu].c = (void *)c; |
116 | 0 | args[num_cpu].lda = lda; |
117 | 0 | args[num_cpu].ldb = ldb; |
118 | 0 | args[num_cpu].ldc = ldc; |
119 | 0 | args[num_cpu].alpha = alpha; |
120 | |
|
121 | 0 | queue[num_cpu].mode = mode; |
122 | 0 | queue[num_cpu].routine = function; |
123 | 0 | queue[num_cpu].args = &args[num_cpu]; |
124 | 0 | queue[num_cpu].next = &queue[num_cpu + 1]; |
125 | |
|
126 | 0 | a = (void *)((BLASULONG)a + astride); |
127 | 0 | b = (void *)((BLASULONG)b + bstride); |
128 | |
|
129 | 0 | num_cpu ++; |
130 | 0 | } |
131 | |
|
132 | 0 | if (num_cpu) { |
133 | 0 | queue[num_cpu - 1].next = NULL; |
134 | |
|
135 | 0 | exec_blas(num_cpu, queue); |
136 | 0 | } |
137 | |
|
138 | 0 | return 0; |
139 | 0 | } |
140 | | |
141 | | int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, |
142 | | void *a, BLASLONG lda, |
143 | | void *b, BLASLONG ldb, |
144 | 0 | void *c, BLASLONG ldc, int (*function)(void), int nthreads){ |
145 | |
|
146 | 0 | blas_queue_t queue[MAX_CPU_NUMBER]; |
147 | 0 | blas_arg_t args [MAX_CPU_NUMBER]; |
148 | |
|
149 | 0 | BLASLONG i, width, astride, bstride; |
150 | 0 | int num_cpu, calc_type_a, calc_type_b; |
151 | |
|
152 | 0 | switch (mode & BLAS_PREC) { |
153 | 0 | case BLAS_INT8 : |
154 | 0 | case BLAS_BFLOAT16: |
155 | 0 | case BLAS_SINGLE : |
156 | 0 | case BLAS_DOUBLE : |
157 | 0 | case BLAS_XDOUBLE : |
158 | 0 | calc_type_a = calc_type_b = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0); |
159 | 0 | break; |
160 | 0 | case BLAS_STOBF16 : |
161 | 0 | calc_type_a = 2 + ((mode & BLAS_COMPLEX) != 0); |
162 | 0 | calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0); |
163 | 0 | break; |
164 | 0 | case BLAS_DTOBF16 : |
165 | 0 | calc_type_a = 3 + ((mode & BLAS_COMPLEX) != 0); |
166 | 0 | calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0); |
167 | 0 | break; |
168 | 0 | case BLAS_BF16TOS : |
169 | 0 | calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0); |
170 | 0 | calc_type_b = 2 + ((mode & BLAS_COMPLEX) != 0); |
171 | 0 | break; |
172 | 0 | case BLAS_BF16TOD : |
173 | 0 | calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0); |
174 | 0 | calc_type_b = 3 + ((mode & BLAS_COMPLEX) != 0); |
175 | 0 | break; |
176 | 0 | default: |
177 | 0 | calc_type_a = calc_type_b = 0; |
178 | 0 | break; |
179 | 0 | } |
180 | | |
181 | 0 | mode |= BLAS_LEGACY; |
182 | |
|
183 | 0 | for (i = 0; i < nthreads; i++) blas_queue_init(&queue[i]); |
184 | |
|
185 | 0 | num_cpu = 0; |
186 | 0 | i = m; |
187 | |
|
188 | 0 | while (i > 0){ |
189 | | |
190 | | /* Adjust Parameters */ |
191 | 0 | width = blas_quickdivide(i + nthreads - num_cpu - 1, |
192 | 0 | nthreads - num_cpu); |
193 | |
|
194 | 0 | i -= width; |
195 | 0 | if (i < 0) width = width + i; |
196 | |
|
197 | 0 | astride = width * lda; |
198 | |
|
199 | 0 | if (!(mode & BLAS_TRANSB_T)) { |
200 | 0 | bstride = width * ldb; |
201 | 0 | } else { |
202 | 0 | bstride = width; |
203 | 0 | } |
204 | |
|
205 | 0 | astride <<= calc_type_a; |
206 | 0 | bstride <<= calc_type_b; |
207 | |
|
208 | 0 | args[num_cpu].m = width; |
209 | 0 | args[num_cpu].n = n; |
210 | 0 | args[num_cpu].k = k; |
211 | 0 | args[num_cpu].a = (void *)a; |
212 | 0 | args[num_cpu].b = (void *)b; |
213 | 0 | args[num_cpu].c = (void *)((char *)c + num_cpu * sizeof(double)*2); |
214 | 0 | args[num_cpu].lda = lda; |
215 | 0 | args[num_cpu].ldb = ldb; |
216 | 0 | args[num_cpu].ldc = ldc; |
217 | 0 | args[num_cpu].alpha = alpha; |
218 | |
|
219 | 0 | queue[num_cpu].mode = mode; |
220 | 0 | queue[num_cpu].routine = function; |
221 | 0 | queue[num_cpu].args = &args[num_cpu]; |
222 | 0 | queue[num_cpu].next = &queue[num_cpu + 1]; |
223 | |
|
224 | 0 | a = (void *)((BLASULONG)a + astride); |
225 | 0 | b = (void *)((BLASULONG)b + bstride); |
226 | |
|
227 | 0 | num_cpu ++; |
228 | 0 | } |
229 | |
|
230 | 0 | if (num_cpu) { |
231 | 0 | queue[num_cpu - 1].next = NULL; |
232 | |
|
233 | 0 | exec_blas(num_cpu, queue); |
234 | 0 | } |
235 | |
|
236 | 0 | return 0; |
237 | 0 | } |