Coverage Report

Created: 2026-03-16 12:03

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
contrib/openblas/driver/others/blas_server_omp.c
Line
Count
Source
1
/*********************************************************************/
2
/* Copyright 2009, 2010 The University of Texas at Austin.           */
3
/* All rights reserved.                                              */
4
/*                                                                   */
5
/* Redistribution and use in source and binary forms, with or        */
6
/* without modification, are permitted provided that the following   */
7
/* conditions are met:                                               */
8
/*                                                                   */
9
/*   1. Redistributions of source code must retain the above         */
10
/*      copyright notice, this list of conditions and the following  */
11
/*      disclaimer.                                                  */
12
/*                                                                   */
13
/*   2. Redistributions in binary form must reproduce the above      */
14
/*      copyright notice, this list of conditions and the following  */
15
/*      disclaimer in the documentation and/or other materials       */
16
/*      provided with the distribution.                              */
17
/*                                                                   */
18
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
19
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
20
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
21
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
22
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
23
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
24
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
25
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
26
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
27
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
28
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
29
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
30
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
31
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
32
/*                                                                   */
33
/* The views and conclusions contained in the software and           */
34
/* documentation are those of the authors and should not be          */
35
/* interpreted as representing official policies, either expressed   */
36
/* or implied, of The University of Texas at Austin.                 */
37
/*********************************************************************/
38
39
#include <stdbool.h>
40
#include <stdio.h>
41
#include <stdlib.h>
42
//#include <sys/mman.h>
43
#include "common.h"
44
45
#ifndef USE_OPENMP
46
47
#include "blas_server.c"
48
49
#else
50
51
/* Branch-prediction hints. With GNU-compatible compilers they map onto
 * __builtin_expect (the argument is normalised to 0/1 first); elsewhere
 * they expand to the bare expression. Each is only defined if no earlier
 * definition exists. */
#if !defined(likely)
#  if defined(__GNUC__)
#    define likely(x) __builtin_expect(!!(x), 1)
#  else
#    define likely(x) (x)
#  endif
#endif

#if !defined(unlikely)
#  if defined(__GNUC__)
#    define unlikely(x) __builtin_expect(!!(x), 0)
#  else
#    define unlikely(x) (x)
#  endif
#endif

/* Schedule kind placed in the schedule() clause of the OpenMP loops in this
 * file; may be predefined by the build to override the default. */
#if !defined(OMP_SCHED)
#  define OMP_SCHED static
#endif
69
70
int blas_server_avail = 0;
71
int blas_omp_number_max = 0;
72
int blas_omp_threads_local = 1;
73
74
extern int openblas_omp_adaptive_env(void);
75
76
static void * blas_thread_buffer[MAX_PARALLEL_NUMBER][MAX_CPU_NUMBER];
77
#ifdef HAVE_C11
78
static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
79
#else
80
static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER];
81
#endif
82
83
9
static void adjust_thread_buffers(void) {
84
85
9
  int i=0, j=0;
86
87
  //adjust buffer for each thread
88
18
  for(i=0; i < MAX_PARALLEL_NUMBER; i++) {
89
133
    for(j=0; j < blas_cpu_number; j++){
90
124
      if(blas_thread_buffer[i][j] == NULL){
91
124
        blas_thread_buffer[i][j] = blas_memory_alloc(2);
92
124
      }
93
124
    }
94
25
    for(; j < MAX_CPU_NUMBER; j++){
95
16
      if(blas_thread_buffer[i][j] != NULL){
96
0
        blas_memory_free(blas_thread_buffer[i][j]);
97
0
        blas_thread_buffer[i][j] = NULL;
98
0
      }
99
16
    }
100
9
  }
101
9
}
102
103
0
void goto_set_num_threads(int num_threads) {
104
105
0
  if (num_threads < 1) num_threads = blas_num_threads;
106
107
0
  if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
108
109
0
  if (num_threads > blas_num_threads) {
110
0
    blas_num_threads = num_threads;
111
0
  }
112
113
0
  blas_cpu_number  = num_threads;
114
115
0
  adjust_thread_buffers();
116
#if defined(ARCH_MIPS64) || defined(ARCH_LOONGARCH64)
117
#ifndef DYNAMIC_ARCH
118
  //set parameters for different number of threads.
119
  blas_set_parameter();
120
#endif
121
#endif
122
123
0
}
124
0
/*
 * Public alias: forwards the request unchanged to goto_set_num_threads().
 */
void openblas_set_num_threads(int num_threads)
{
  goto_set_num_threads(num_threads);
}
128
129
#ifdef OS_LINUX
130
131
0
/*
 * Affinity cannot be set through this entry point in the OpenMP build.
 * All arguments are ignored; a hint is written to stderr.
 *
 * Returns -1 always.
 */
int openblas_setaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) {
  static const char notice[] =
    "OpenBLAS: use OpenMP environment variables for setting cpu affinity\n";

  (void)thread_idx;
  (void)cpusetsize;
  (void)cpu_set;

  fputs(notice, stderr);
  return -1;
}
135
0
/*
 * Affinity cannot be queried through this entry point in the OpenMP build.
 * All arguments are ignored; a hint is written to stderr.
 *
 * Returns -1 always.
 */
int openblas_getaffinity(int thread_idx, size_t cpusetsize, cpu_set_t* cpu_set) {
  static const char notice[] =
    "OpenBLAS: use OpenMP environment variables for querying cpu affinity\n";

  (void)thread_idx;
  (void)cpusetsize;
  (void)cpu_set;

  fputs(notice, stderr);
  return -1;
}
139
#endif
140
141
9
int blas_thread_init(void){
142
143
#if defined(__FreeBSD__) && defined(__clang__)
144
extern int openblas_omp_num_threads_env(void);
145
146
   if(blas_omp_number_max <= 0)
147
     blas_omp_number_max= openblas_omp_num_threads_env();
148
   if (blas_omp_number_max <= 0) 
149
     blas_omp_number_max=MAX_CPU_NUMBER;
150
#else
151
9
    blas_omp_number_max = omp_get_max_threads();
152
9
#endif
153
154
9
  blas_get_cpu_number();
155
156
9
  adjust_thread_buffers();
157
158
9
  blas_server_avail = 1;
159
160
9
  return 0;
161
9
}
162
163
164
int BLASFUNC(blas_thread_shutdown)(void){
164
164
  int i=0, j=0;
165
164
  blas_server_avail = 0;
166
167
328
  for(i=0; i<MAX_PARALLEL_NUMBER; i++) {
168
2.39k
    for(j=0; j<MAX_CPU_NUMBER; j++){
169
2.23k
      if(blas_thread_buffer[i][j]!=NULL){
170
60
        blas_memory_free(blas_thread_buffer[i][j]);
171
60
        blas_thread_buffer[i][j]=NULL;
172
60
      }
173
2.23k
    }
174
164
  }
175
176
164
  return 0;
177
164
}
178
179
0
/*
 * Call a legacy-interface routine.
 *
 * func: routine to call; converted to a function pointer whose signature
 *       depends on the BLAS_COMPLEX bit and the BLAS_PREC field of mode.
 * mode: flag word selecting real/complex and the precision.
 * args: supplies m, n, k, alpha, a/lda, b/ldb, c/ldc for the call.
 * sb:   passed through as the final argument of the routine.
 *
 * Real routines receive alpha[0]; complex routines receive alpha[0] and
 * alpha[1]. A precision that matches none of the handled cases results in
 * no call at all.
 */
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){

  if (mode & BLAS_COMPLEX) {

#ifdef EXPRECISION
    if ((mode & BLAS_PREC) == BLAS_XDOUBLE) {
      /* COMPLEX / Extended Double */
      void (*kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
                     xdouble *, BLASLONG, xdouble *, BLASLONG,
                     xdouble *, BLASLONG, void *) = func;
      xdouble *alpha = (xdouble *)args -> alpha;

      kernel(args -> m, args -> n, args -> k,
             alpha[0], alpha[1],
             args -> a, args -> lda,
             args -> b, args -> ldb,
             args -> c, args -> ldc, sb);
      return;
    }
#endif

    if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
      /* COMPLEX / Double */
      void (*kernel)(BLASLONG, BLASLONG, BLASLONG, double, double,
                     double *, BLASLONG, double *, BLASLONG,
                     double *, BLASLONG, void *) = func;
      double *alpha = (double *)args -> alpha;

      kernel(args -> m, args -> n, args -> k,
             alpha[0], alpha[1],
             args -> a, args -> lda,
             args -> b, args -> ldb,
             args -> c, args -> ldc, sb);
      return;
    }

    if ((mode & BLAS_PREC) == BLAS_SINGLE) {
      /* COMPLEX / Single */
      void (*kernel)(BLASLONG, BLASLONG, BLASLONG, float, float,
                     float *, BLASLONG, float *, BLASLONG,
                     float *, BLASLONG, void *) = func;
      float *alpha = (float *)args -> alpha;

      kernel(args -> m, args -> n, args -> k,
             alpha[0], alpha[1],
             args -> a, args -> lda,
             args -> b, args -> ldb,
             args -> c, args -> ldc, sb);
      return;
    }

    /* COMPLEX / Other types in future */
    return;
  }

#ifdef EXPRECISION
  if ((mode & BLAS_PREC) == BLAS_XDOUBLE) {
    /* REAL / Extended Double */
    void (*kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble,
                   xdouble *, BLASLONG, xdouble *, BLASLONG,
                   xdouble *, BLASLONG, void *) = func;
    xdouble *alpha = (xdouble *)args -> alpha;

    kernel(args -> m, args -> n, args -> k,
           alpha[0],
           args -> a, args -> lda,
           args -> b, args -> ldb,
           args -> c, args -> ldc, sb);
    return;
  }
#endif

  if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
    /* REAL / Double */
    void (*kernel)(BLASLONG, BLASLONG, BLASLONG, double,
                   double *, BLASLONG, double *, BLASLONG,
                   double *, BLASLONG, void *) = func;
    double *alpha = (double *)args -> alpha;

    kernel(args -> m, args -> n, args -> k,
           alpha[0],
           args -> a, args -> lda,
           args -> b, args -> ldb,
           args -> c, args -> ldc, sb);
    return;
  }

  if ((mode & BLAS_PREC) == BLAS_SINGLE) {
    /* REAL / Single */
    void (*kernel)(BLASLONG, BLASLONG, BLASLONG, float,
                   float *, BLASLONG, float *, BLASLONG,
                   float *, BLASLONG, void *) = func;
    float *alpha = (float *)args -> alpha;

    kernel(args -> m, args -> n, args -> k,
           alpha[0],
           args -> a, args -> lda,
           args -> b, args -> ldb,
           args -> c, args -> ldc, sb);
    return;
  }

#ifdef BUILD_BFLOAT16
  if ((mode & BLAS_PREC) == BLAS_BFLOAT16) {
    /* REAL / BFLOAT16 */
    void (*kernel)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
                   bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
                   bfloat16 *, BLASLONG, void *) = func;
    bfloat16 *alpha = (bfloat16 *)args -> alpha;

    kernel(args -> m, args -> n, args -> k,
           alpha[0],
           args -> a, args -> lda,
           args -> b, args -> ldb,
           args -> c, args -> ldc, sb);
    return;
  }

  if ((mode & BLAS_PREC) == BLAS_STOBF16) {
    /* REAL / BLAS_STOBF16 */
    void (*kernel)(BLASLONG, BLASLONG, BLASLONG, float,
                   float *, BLASLONG, bfloat16 *, BLASLONG,
                   float *, BLASLONG, void *) = func;
    float *alpha = (float *)args -> alpha;

    kernel(args -> m, args -> n, args -> k,
           alpha[0],
           args -> a, args -> lda,
           args -> b, args -> ldb,
           args -> c, args -> ldc, sb);
    return;
  }

  if ((mode & BLAS_PREC) == BLAS_DTOBF16) {
    /* REAL / BLAS_DTOBF16 */
    void (*kernel)(BLASLONG, BLASLONG, BLASLONG, double,
                   double *, BLASLONG, bfloat16 *, BLASLONG,
                   double *, BLASLONG, void *) = func;
    double *alpha = (double *)args -> alpha;

    kernel(args -> m, args -> n, args -> k,
           alpha[0],
           args -> a, args -> lda,
           args -> b, args -> ldb,
           args -> c, args -> ldc, sb);
    return;
  }
#endif

  /* REAL / Other types in future */
}
301
302
0
/*
 * Run one queue entry on the calling thread.
 *
 * thread_num: index of the calling thread; used to pick this thread's entry
 *             in blas_thread_buffer. In this file it is either
 *             omp_get_thread_num() or a value supplied by a user-registered
 *             threading callback.
 * queue:      the job to run; its sa/sb fields are filled in here when both
 *             are NULL on entry and the job is not a BLAS_PTHREAD job.
 * buf_index:  buffer slot claimed by exec_blas() for this batch.
 *
 * If no preallocated buffer can be used (entry is NULL, or thread_num /
 * buf_index lie outside the table), a temporary buffer is obtained from
 * blas_memory_alloc(2) and released again before returning.
 */
static void exec_threads(int thread_num, blas_queue_t *queue, int buf_index){

  void *buffer, *sa, *sb;
  int release_flag=0;

  buffer = NULL;
  sa = queue -> sa;
  sb = queue -> sb;

#ifdef CONSISTENT_FPCSR
#ifdef __aarch64__
  __asm__ __volatile__ ("msr fpcr, %0" : : "r" (queue -> sse_mode));
#else
  __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
  __asm__ __volatile__ ("fldcw %0"   : : "m" (queue -> x87_mode));
#endif
#endif

  if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) {

    /* Only index the table with in-range values; anything else is routed to
     * the fallback allocation below instead of reading out of bounds. */
    if ((thread_num >= 0) && (thread_num < MAX_CPU_NUMBER) &&
        (buf_index  >= 0) && (buf_index  < MAX_PARALLEL_NUMBER)) {
      buffer = blas_thread_buffer[buf_index][thread_num];
    }

    //fallback
    if(buffer==NULL) {
      buffer = blas_memory_alloc(2);
      release_flag=1;
    }

    if (sa == NULL) {
      sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
      queue->sa=sa;
    }

    if (sb == NULL) {
      /* sb is placed after the A panel: sa plus the aligned P*Q panel size
       * for the job's precision, plus GEMM_OFFSET_B. */
      if (!(queue -> mode & BLAS_COMPLEX)){
#ifdef EXPRECISION
        if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
          sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
                  + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
        } else
#endif
        if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
#if defined ( BUILD_DOUBLE) || defined (BUILD_COMPLEX16)
          sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
                  + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif
        } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE){
#if defined (BUILD_SINGLE) || defined (BUILD_COMPLEX)
          sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
                  + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#endif
        } else {
          /* Other types in future */
        }
      } else {
#ifdef EXPRECISION
        if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
          sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
                  + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
        } else
#endif
        if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
#ifdef BUILD_COMPLEX16
          sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
                  + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#else
          fprintf(stderr,"UNHANDLED COMPLEX16\n");
#endif
        } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
#ifdef BUILD_COMPLEX
          sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
                  + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
#else
          fprintf(stderr,"UNHANDLED COMPLEX\n");
#endif
        } else {
          /* Other types in future */
        }
      }
      queue->sb=sb;
    }
  }

  /* Dispatch on the calling convention encoded in the mode flags. */
  if (queue -> mode & BLAS_LEGACY) {
    legacy_exec(queue -> routine, queue -> mode, queue -> args, sb);
  } else
    if (queue -> mode & BLAS_PTHREAD) {
      void (*pthreadcompat)(void *) = queue -> routine;
      (pthreadcompat)(queue -> args);

    } else {
      int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;

      (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);

    }

  /* Temporary buffer from the fallback path is not kept. */
  if (release_flag) blas_memory_free(buffer);

}
403
404
0
/*
 * Execute a batch of queue entries in parallel.
 *
 * num:   number of entries in queue; nothing is done when num <= 0.
 * queue: array of num jobs; nothing is done when it is NULL.
 *
 * A buffer slot is claimed (spinning until one is free), the jobs are run
 * through exec_threads() either via the registered threading callback or
 * via an OpenMP parallel loop, and the slot is released afterwards.
 *
 * Returns 0 always.
 */
int exec_blas(BLASLONG num, blas_queue_t *queue){

  BLASLONG job, slot, buf_index;

  // Handle lazy re-init of the thread-pool after a POSIX fork
  if (unlikely(blas_server_avail == 0)) {
    blas_thread_init();
  }

  if ((num <= 0) || (queue == NULL)) {
    return 0;
  }

#ifdef CONSISTENT_FPCSR
  /* Capture the caller's floating-point control state into every job;
   * exec_threads() loads it back on the executing thread. */
  for (job = 0; job < num; job ++) {
#ifdef __aarch64__
    __asm__ __volatile__ ("mrs %0, fpcr" : "=r" (queue[job].sse_mode));
#else
    __asm__ __volatile__ ("fnstcw %0"  : "=m" (queue[job].x87_mode));
    __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue[job].sse_mode));
#endif
  }
#endif

  /* Scan the slots repeatedly until one is claimed. */
  do {
    for (slot = 0; slot < MAX_PARALLEL_NUMBER; slot++) {
#ifdef HAVE_C11
      _Bool expected = false;
      if (atomic_compare_exchange_weak(&blas_buffer_inuse[slot], &expected, true)) {
#else
      if (blas_buffer_inuse[slot] == false) {
        blas_buffer_inuse[slot] = true;
#endif
        buf_index = slot;
        break;
      }
    }
  } while (slot == MAX_PARALLEL_NUMBER);

  if (openblas_threads_callback_) {
    /* Caller-managed threading: a callback has been registered, so hand it
     * exec_threads together with the queue. */
#ifndef USE_SIMPLE_THREADED_LEVEL3
    for (job = 0; job < num; job ++) {
      queue[job].position = job;
    }
#endif
    openblas_threads_callback_(1, (openblas_dojob_callback) exec_threads, num, sizeof(blas_queue_t), (void*) queue, buf_index);

  } else if (openblas_omp_adaptive_env() != 0) {

    /* Adaptive mode: request exactly num threads for the region. */
#pragma omp parallel for num_threads(num) schedule(OMP_SCHED)
    for (job = 0; job < num; job ++) {
#ifndef USE_SIMPLE_THREADED_LEVEL3
      queue[job].position = job;
#endif
      exec_threads(omp_get_thread_num(), &queue[job], buf_index);
    }

  } else {

#pragma omp parallel for schedule(OMP_SCHED)
    for (job = 0; job < num; job ++) {
#ifndef USE_SIMPLE_THREADED_LEVEL3
      queue[job].position = job;
#endif
      exec_threads(omp_get_thread_num(), &queue[job], buf_index);
    }

  }

  /* Give the slot back. */
#ifdef HAVE_C11
  atomic_store(&blas_buffer_inuse[buf_index], false);
#else
  blas_buffer_inuse[buf_index] = false;
#endif

  return 0;
}
478
479
#endif