Coverage Report

Created: 2026-03-12 17:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
contrib/openblas/driver/others/memory.c
Line
Count
Source
1
/*****************************************************************************
2
Copyright (c) 2011-2014, The OpenBLAS Project
3
All rights reserved.
4
5
Redistribution and use in source and binary forms, with or without
6
modification, are permitted provided that the following conditions are
7
met:
8
9
   1. Redistributions of source code must retain the above copyright
10
      notice, this list of conditions and the following disclaimer.
11
12
   2. Redistributions in binary form must reproduce the above copyright
13
      notice, this list of conditions and the following disclaimer in
14
      the documentation and/or other materials provided with the
15
      distribution.
16
   3. Neither the name of the OpenBLAS project nor the names of
17
      its contributors may be used to endorse or promote products
18
      derived from this software without specific prior written
19
      permission.
20
21
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
25
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
28
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
29
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
30
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31
32
**********************************************************************************/
33
34
/*********************************************************************/
35
/* Copyright 2009, 2010 The University of Texas at Austin.           */
36
/* All rights reserved.                                              */
37
/*                                                                   */
38
/* Redistribution and use in source and binary forms, with or        */
39
/* without modification, are permitted provided that the following   */
40
/* conditions are met:                                               */
41
/*                                                                   */
42
/*   1. Redistributions of source code must retain the above         */
43
/*      copyright notice, this list of conditions and the following  */
44
/*      disclaimer.                                                  */
45
/*                                                                   */
46
/*   2. Redistributions in binary form must reproduce the above      */
47
/*      copyright notice, this list of conditions and the following  */
48
/*      disclaimer in the documentation and/or other materials       */
49
/*      provided with the distribution.                              */
50
/*                                                                   */
51
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
52
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
53
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
54
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
55
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
56
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
57
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
58
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
59
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
60
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
61
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
62
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
63
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
64
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
65
/*                                                                   */
66
/* The views and conclusions contained in the software and           */
67
/* documentation are those of the authors and should not be          */
68
/* interpreted as representing official policies, either expressed   */
69
/* or implied, of The University of Texas at Austin.                 */
70
/*********************************************************************/
71
72
//#undef  DEBUG
73
74
#include "common.h"
75
76
0
#define NEW_BUFFERS 512
77
#ifndef likely
78
#ifdef __GNUC__
79
152
#define likely(x) __builtin_expect(!!(x), 1)
80
340
#define unlikely(x) __builtin_expect(!!(x), 0)
81
#else
82
#define likely(x) (x)
83
#define unlikely(x) (x)
84
#endif
85
#endif
86
87
#if defined(USE_TLS) && defined(SMP)
88
#define COMPILE_TLS
89
90
#if USE_TLS != 1
91
#undef COMPILE_TLS
92
#endif
93
94
#if defined(__GLIBC_PREREQ)
95
#if !__GLIBC_PREREQ(2,20)
96
#undef COMPILE_TLS
97
#endif
98
#endif
99
#endif
100
101
/* Memory buffer must fit two matrix subblocks of maximal size */
102
#define XSTR(x) STR(x)
103
#define STR(x) #x
104
#if BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 * 2) || \
105
    BUFFER_SIZE < (SGEMM_DEFAULT_P * SGEMM_DEFAULT_R * 4 * 2) || \
106
    BUFFER_SIZE < (SGEMM_DEFAULT_R * SGEMM_DEFAULT_Q * 4 * 2)
107
#warning BUFFER_SIZE is too small for P, Q, and R of SGEMM - large calculations may crash !
108
#endif
109
#if BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 * 2) || \
110
    BUFFER_SIZE < (DGEMM_DEFAULT_P * DGEMM_DEFAULT_R * 8 * 2) || \
111
    BUFFER_SIZE < (DGEMM_DEFAULT_R * DGEMM_DEFAULT_Q * 8 * 2)
112
#warning BUFFER_SIZE is too small for P, Q, and R of DGEMM - large calculations may crash !
113
#endif
114
#if BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 * 2) || \
115
    BUFFER_SIZE < (CGEMM_DEFAULT_P * CGEMM_DEFAULT_R * 8 * 2) || \
116
    BUFFER_SIZE < (CGEMM_DEFAULT_R * CGEMM_DEFAULT_Q * 8 * 2)
117
#warning BUFFER_SIZE is too small for P, Q, and R of CGEMM - large calculations may crash !
118
#endif
119
#if BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 * 2) || \
120
    BUFFER_SIZE < (ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_R * 16 * 2) || \
121
    BUFFER_SIZE < (ZGEMM_DEFAULT_R * ZGEMM_DEFAULT_Q * 16 * 2)
122
#warning BUFFER_SIZE is too small for P, Q, and R of ZGEMM - large calculations may crash !
123
#endif
124
125
#if defined(COMPILE_TLS)
126
127
#include <errno.h>
128
129
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
130
#define ALLOC_WINDOWS
131
#ifndef MEM_LARGE_PAGES
132
#define MEM_LARGE_PAGES  0x20000000
133
#endif
134
#else
135
#define ALLOC_MMAP
136
#define ALLOC_MALLOC
137
#endif
138
139
#include <stdlib.h>
140
#include <stdio.h>
141
#include <fcntl.h>
142
143
#if !defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)
144
#include <sys/mman.h>
145
#ifndef NO_SYSV_IPC
146
#include <sys/shm.h>
147
#endif
148
#include <sys/ipc.h>
149
#endif
150
151
#include <sys/types.h>
152
153
#ifdef OS_LINUX
154
#include <sys/sysinfo.h>
155
#include <sched.h>
156
#include <errno.h>
157
#include <linux/unistd.h>
158
#include <sys/syscall.h>
159
#include <sys/time.h>
160
#include <sys/resource.h>
161
#endif
162
163
#ifdef OS_HAIKU
164
#include <unistd.h>
165
#endif
166
167
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
168
#include <sys/sysctl.h>
169
#include <sys/resource.h>
170
#endif
171
172
#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
173
#include <conio.h>
174
#undef  printf
175
#define printf _cprintf
176
#endif
177
178
#ifdef OS_LINUX
179
180
#ifndef MPOL_PREFERRED
181
#define MPOL_PREFERRED  1
182
#endif
183
184
#endif
185
186
#if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
187
#define NO_WARMUP
188
#endif
189
190
#ifndef SHM_HUGETLB
191
#define SHM_HUGETLB 04000
192
#endif
193
194
#ifndef FIXED_PAGESIZE
195
#define FIXED_PAGESIZE 4096
196
#endif
197
198
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
199
200
#if defined(_MSC_VER) && !defined(__clang__)
201
#define CONSTRUCTOR __cdecl
202
#define DESTRUCTOR __cdecl
203
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
204
#define CONSTRUCTOR __attribute__ ((constructor))
205
#define DESTRUCTOR  __attribute__ ((destructor))
206
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
207
#define CONSTRUCTOR __attribute__ ((constructor(101)))
208
#define DESTRUCTOR  __attribute__ ((destructor(101)))
209
#else
210
#define CONSTRUCTOR __attribute__ ((constructor))
211
#define DESTRUCTOR  __attribute__ ((destructor))
212
#endif
213
214
#ifdef DYNAMIC_ARCH
215
gotoblas_t *gotoblas = NULL;
216
#endif
217
extern void openblas_warning(int verbose, const char * msg);
218
219
#ifndef SMP
220
221
#define blas_cpu_number 1
222
#define blas_num_threads 1
223
224
/* Dummy Function */
225
/* Single-threaded build: exactly one processor is ever reported. */
int  goto_get_num_procs  (void) { return 1; }

/* Single-threaded build: the requested thread count is accepted and ignored. */
void goto_set_num_threads(int num_threads) { (void)num_threads; }
227
228
#else
229
230
#if defined(OS_LINUX) || defined(OS_SUNOS)
231
#ifndef NO_AFFINITY
232
int get_num_procs(void);
233
#else
234
int get_num_procs(void) {
235
  static int nums = 0;
236
  int ret;
237
#if defined(__GLIBC_PREREQ)
238
  cpu_set_t cpuset,*cpusetp;
239
  size_t size;
240
241
#if !__GLIBC_PREREQ(2, 7)
242
  int i;
243
#if !__GLIBC_PREREQ(2, 6)
244
  int n;
245
#endif
246
#endif
247
#endif
248
249
  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
250
251
#if defined(USE_OPENMP)
252
#if _OPENMP >= 201511
253
    int i,n;
254
    n = 0;
255
    ret = omp_get_num_places();
256
    if (ret > 0) for (i=0; i<ret;i++) n+= omp_get_place_num_procs(i);
257
    if (n > 0) nums = n;
258
#endif
259
    return (nums > 0 ? nums : 2);
260
#endif
261
262
#if !defined(OS_LINUX)
263
  return (nums > 0 ? nums : 2);
264
#endif
265
266
#if !defined(__GLIBC_PREREQ)
267
  return (nums > 0 ? nums :2);
268
#else
269
 #if !__GLIBC_PREREQ(2, 3)
270
  return (nums > 0 ? nums :2);
271
 #endif
272
273
 #if !__GLIBC_PREREQ(2, 7)
274
  ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
275
  if (ret!=0) return (nums > 0 ? nums :2);
276
  n=0;
277
  #if !__GLIBC_PREREQ(2, 6)
278
  for (i=0;i<nums;i++)
279
     if (CPU_ISSET(i,&cpuset)) n++;
280
  nums=n;
281
  #else
282
  nums = CPU_COUNT(sizeof(cpuset),&cpuset);
283
  #endif
284
  return (nums > 0 ? nums :2);
285
 #else
286
  if (nums >= CPU_SETSIZE) {
287
    cpusetp = CPU_ALLOC(nums);
288
      if (cpusetp == NULL) {
289
        return (nums > 0 ? nums :2);
290
      }
291
    size = CPU_ALLOC_SIZE(nums);
292
    ret = sched_getaffinity(0,size,cpusetp);
293
    if (ret!=0) {
294
      CPU_FREE(cpusetp);
295
      return (nums > 0 ? nums :2);
296
    }
297
    ret = CPU_COUNT_S(size,cpusetp);
298
    if (ret > 0 && ret < nums) nums = ret;
299
    CPU_FREE(cpusetp);
300
    return (nums > 0 ? nums :2);
301
  } else {
302
    ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
303
    if (ret!=0) {
304
      return (nums > 0 ? nums :2);
305
    }
306
    ret = CPU_COUNT(&cpuset);
307
    if (ret > 0 && ret < nums) nums = ret;
308
    return (nums > 0 ? nums :2);
309
  }
310
 #endif
311
#endif
312
}
313
#endif
314
#endif
315
316
#ifdef OS_ANDROID
317
/*
 * Android: returns sysconf(_SC_NPROCESSORS_CONF), queried once and then
 * served from a function-local cache (a cached value of 0 is re-queried).
 */
int get_num_procs(void) {
  static int cached_count = 0;

  if (cached_count == 0) {
    cached_count = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached_count;
}
322
#endif
323
324
#ifdef OS_HAIKU
325
/*
 * Haiku: returns sysconf(_SC_NPROCESSORS_CONF), queried once and then
 * served from a function-local cache (a cached value of 0 is re-queried).
 */
int get_num_procs(void) {
  static int cached_count = 0;

  if (cached_count == 0) {
    cached_count = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached_count;
}
330
#endif
331
332
#ifdef OS_AIX
333
/*
 * AIX: returns sysconf(_SC_NPROCESSORS_CONF), queried once and then
 * served from a function-local cache (a cached value of 0 is re-queried).
 */
int get_num_procs(void) {
  static int cached_count = 0;

  if (cached_count == 0) {
    cached_count = sysconf(_SC_NPROCESSORS_CONF);
  }
  return cached_count;
}
338
#endif
339
340
341
342
#ifdef OS_WINDOWS
343
344
int get_num_procs(void) {
345
346
  static int nums = 0;
347
348
  if (nums == 0) {
349
350
    SYSTEM_INFO sysinfo;
351
352
    GetSystemInfo(&sysinfo);
353
354
    nums = sysinfo.dwNumberOfProcessors;
355
  }
356
357
  return nums;
358
}
359
360
#endif
361
362
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
363
364
int get_num_procs(void) {
365
366
  static int nums = 0;
367
368
  int m[2];
369
  size_t len;
370
371
  if (nums == 0) {
372
    m[0] = CTL_HW;
373
    m[1] = HW_NCPU;
374
    len = sizeof(int);
375
    sysctl(m, 2, &nums, &len, NULL, 0);
376
  }
377
378
  return nums;
379
}
380
381
#endif
382
383
#if defined(OS_DARWIN)
384
int get_num_procs(void) {
385
  static int nums = 0;
386
  size_t len;
387
  if (nums == 0){
388
    len = sizeof(int);
389
    sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0);
390
  }
391
  return nums;
392
}
393
/*
394
void set_stack_limit(int limitMB){
395
  int result=0;
396
  struct rlimit rl;
397
  rlim_t StackSize;
398
399
  StackSize=limitMB*1024*1024;
400
  result=getrlimit(RLIMIT_STACK, &rl);
401
  if(result==0){
402
    if(rl.rlim_cur < StackSize){
403
      rl.rlim_cur=StackSize;
404
      result=setrlimit(RLIMIT_STACK, &rl);
405
      if(result !=0){
406
        fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
407
      }
408
    }
409
  }
410
}
411
*/
412
#endif
413
414
415
/*
416
OpenBLAS uses the number of CPU cores for multithreading.
417
It can be set by openblas_set_num_threads(int num_threads);
418
*/
419
int blas_cpu_number  = 0;
420
/*
421
The numbers of threads in the thread pool.
422
This value is equal to or larger than blas_cpu_number, which means some threads may be sleeping.
423
*/
424
int blas_num_threads = 0;
425
426
int  goto_get_num_procs  (void) {
427
  return blas_cpu_number;
428
}
429
430
static void blas_memory_init(void);
431
432
/*
 * Registers fork handlers via pthread_atfork(): blas_thread_shutdown runs
 * before fork() and blas_memory_init runs in the child. Nothing is
 * registered for the parent after the fork.
 *
 * This covers the OpenBLAS-managed pthread pool (builds made with
 * "make USE_OPENMP=0"). Hangs remain possible when OpenBLAS is built against
 * the libgomp implementation of OpenMP; that problem is tracked at
 *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
 * In the meantime build with USE_OPENMP=0 or link against another OpenMP
 * implementation.
 *
 * The body is compiled out on native Windows, on Android, and whenever
 * SMP_SERVER is not defined.
 */
void openblas_fork_handler(void)
{
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
  if (pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, blas_memory_init) != 0) {
    /* Registration failure is reported but is not fatal. */
    openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
  }
#endif
}
448
449
extern int openblas_num_threads_env(void);
450
extern int openblas_goto_num_threads_env(void);
451
extern int openblas_omp_num_threads_env(void);
452
453
int blas_get_cpu_number(void){
454
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
455
  int max_num;
456
#endif
457
  int blas_goto_num   = 0;
458
  int blas_omp_num    = 0;
459
460
  if (blas_num_threads) return blas_num_threads;
461
462
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
463
  max_num = get_num_procs();
464
#endif
465
466
  // blas_goto_num = 0;
467
#ifndef USE_OPENMP_UNUSED
468
  blas_goto_num=openblas_num_threads_env();
469
  if (blas_goto_num < 0) blas_goto_num = 0;
470
471
  if (blas_goto_num == 0) {
472
    blas_goto_num=openblas_goto_num_threads_env();
473
    if (blas_goto_num < 0) blas_goto_num = 0;
474
  }
475
476
#endif
477
478
  // blas_omp_num = 0;
479
  blas_omp_num=openblas_omp_num_threads_env();
480
  if (blas_omp_num < 0) blas_omp_num = 0;
481
482
  if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
483
  else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
484
  else blas_num_threads = MAX_CPU_NUMBER;
485
486
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
487
  if (blas_num_threads > max_num) blas_num_threads = max_num;
488
#endif
489
490
  if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
491
492
#ifdef DEBUG
493
  printf( "Adjusted number of threads : %3d\n", blas_num_threads);
494
#endif
495
496
  blas_cpu_number = blas_num_threads;
497
498
  return blas_num_threads;
499
}
500
#endif
501
502
503
/*
 * Public query for the processor count: forwards to get_num_procs() in SMP
 * builds and is the constant 1 otherwise.
 */
int openblas_get_num_procs(void) {
#ifdef SMP
  return get_num_procs();
#else
  return 1;
#endif
}
510
511
/*
 * Public query for the thread count. SMP builds first call
 * blas_get_cpu_number(), which sets blas_cpu_number when it has not been
 * decided yet, and then return blas_cpu_number; non-SMP builds return 1.
 */
int openblas_get_num_threads(void) {
#ifdef SMP
  /* Called for its side effect only; the global is what gets returned. */
  blas_get_cpu_number();
  return blas_cpu_number;
#else
  return 1;
#endif
}
520
521
int hugetlb_allocated = 0;
522
523
#if defined(OS_WINDOWS)
524
#define LIKELY_ONE(x) (x)
525
#else
526
#define LIKELY_ONE(x) (__builtin_expect(x, 1))
527
#endif
528
529
/* Stores information about the allocation and how to release it */
530
/*
 * Bookkeeping header for one buffer allocation. The allocators in this file
 * write it at the start of the memory they obtain and record in it how that
 * memory has to be given back.
 */
struct alloc_t {
  int used;                                /* non-zero while the buffer is in use */
  int attr;                                /* extra datum for release_func (e.g. an fd or shm id) */
  void (*release_func)(struct alloc_t *);  /* routine that releases this allocation */
  /* Filler that brings the listed members plus padding up to 64 bytes. */
  char pad[64 - (2 * sizeof(int) + sizeof(void(*)))];
};
540
541
/* Convenience macros for storing release funcs */
542
/*
 * Records `func` as the release routine in the alloc_t header located at
 * `address`, unless `address` is the (void *)-1 failure sentinel.
 *
 * Wrapped in do { } while (0) so that the macro behaves as one statement
 * (safe in front of `else`); `address` is evaluated exactly once.
 */
#define STORE_RELEASE_FUNC(address, func)                                   \
  do {                                                                      \
    void *store_release_addr_ = (void *)(address);                          \
    if (store_release_addr_ != (void *)-1) {                                \
      ((struct alloc_t *)store_release_addr_)->release_func = (func);       \
    }                                                                       \
  } while (0)
547
548
/*
 * Records `func` and `attr_value` in the alloc_t header located at `address`,
 * unless `address` is the (void *)-1 failure sentinel.
 *
 * The third parameter must not be spelled `attr`: a macro parameter with that
 * name would also replace the `attr` member name in the expansion, so an
 * argument such as `fd` would expand to `->fd = fd`.
 * Wrapped in do { } while (0) so that the macro behaves as one statement;
 * `address` is evaluated exactly once.
 */
#define STORE_RELEASE_FUNC_WITH_ATTR(address, func, attr_value)             \
  do {                                                                      \
    void *store_release_addr_ = (void *)(address);                          \
    if (store_release_addr_ != (void *)-1) {                                \
      struct alloc_t *store_release_info_ =                                 \
          (struct alloc_t *)store_release_addr_;                            \
      store_release_info_->release_func = (func);                           \
      store_release_info_->attr = (attr_value);                             \
    }                                                                       \
  } while (0)
554
555
/* The number of bytes that will be allocated for each buffer. When allocating
556
   memory, we store an alloc_t followed by the actual buffer memory. This means
557
   that each allocation always has its associated alloc_t, without the need
558
   for an auxiliary tracking structure. */
559
static const int allocation_block_size = BUFFER_SIZE + sizeof(struct alloc_t);
560
561
#if defined(SMP)
562
#  if defined(OS_WINDOWS)
563
static DWORD local_storage_key = 0;
564
DWORD lsk;
565
566
#  else
567
static pthread_key_t local_storage_key = 0;
568
pthread_key_t lsk;
569
#  endif /* defined(OS_WINDOWS) */
570
#endif /* defined(SMP) */
571
572
#if defined(OS_LINUX) && !defined(NO_WARMUP)
573
static int hot_alloc = 0;
574
#endif
575
576
/* Global lock for memory allocation */
577
578
#if   defined(USE_PTHREAD_LOCK)
579
static pthread_mutex_t    alloc_lock = PTHREAD_MUTEX_INITIALIZER;
580
#elif defined(USE_PTHREAD_SPINLOCK)
581
static pthread_spinlock_t alloc_lock = 0;
582
#else
583
static BLASULONG  alloc_lock = 0UL;
584
#endif
585
586
#if   defined(USE_PTHREAD_LOCK)
587
static pthread_mutex_t    key_lock = PTHREAD_MUTEX_INITIALIZER;
588
#elif defined(USE_PTHREAD_SPINLOCK)
589
static pthread_spinlock_t key_lock = 0;
590
#else
591
static BLASULONG  key_lock = 0UL;
592
#endif
593
594
/* Returns a pointer to the start of the per-thread memory allocation data */
595
static __inline struct alloc_t ** get_memory_table(void) {
596
#if defined(SMP)
597
LOCK_COMMAND(&key_lock);
598
lsk=local_storage_key;
599
UNLOCK_COMMAND(&key_lock);
600
  if (!lsk) {
601
    blas_memory_init();
602
  }
603
#  if defined(OS_WINDOWS)
604
  struct alloc_t ** local_memory_table = (struct alloc_t **)TlsGetValue(local_storage_key);
605
#  else
606
  struct alloc_t ** local_memory_table = (struct alloc_t **)pthread_getspecific(local_storage_key);
607
#  endif /* defined(OS_WINDOWS) */
608
#else
609
  static struct alloc_t ** local_memory_table = NULL;
610
#endif /* defined(SMP) */
611
#if defined (SMP)
612
LOCK_COMMAND(&key_lock);
613
lsk=local_storage_key;
614
UNLOCK_COMMAND(&key_lock);
615
  if (lsk && !local_memory_table) {
616
#else
617
 if (!local_memory_table) {
618
#endif /* defined(SMP) */
619
    local_memory_table = (struct alloc_t **)malloc(sizeof(struct alloc_t *) * NUM_BUFFERS);
620
    memset(local_memory_table, 0, sizeof(struct alloc_t *) * NUM_BUFFERS);
621
#if defined(SMP)
622
#  if defined(OS_WINDOWS)
623
LOCK_COMMAND(&key_lock);
624
    TlsSetValue(local_storage_key, (void*)local_memory_table);
625
UNLOCK_COMMAND(&key_lock);
626
#  else
627
LOCK_COMMAND(&key_lock);
628
    pthread_setspecific(local_storage_key, (void*)local_memory_table);
629
UNLOCK_COMMAND(&key_lock);
630
#  endif /* defined(OS_WINDOWS) */
631
#endif /* defined(SMP) */
632
  }
633
  return local_memory_table;
634
}
635
636
#ifdef ALLOC_MMAP
637
638
static void alloc_mmap_free(struct alloc_t *alloc_info){
639
640
  if (munmap(alloc_info, allocation_block_size)) {
641
    printf("OpenBLAS : munmap failed\n");
642
  }
643
}
644
645
646
647
#ifdef NO_WARMUP
648
649
static void *alloc_mmap(void *address){
650
  void *map_address;
651
652
  if (address){
653
    map_address = mmap(address,
654
                       allocation_block_size,
655
                       MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
656
  } else {
657
    map_address = mmap(address,
658
                       allocation_block_size,
659
                       MMAP_ACCESS, MMAP_POLICY, -1, 0);
660
  }
661
662
  STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
663
664
#ifdef OS_LINUX
665
  my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
666
#endif
667
668
  return map_address;
669
}
670
671
#else
672
673
#define BENCH_ITERATION 4
674
#define SCALING         2
675
676
/*
 * Times a pointer chase through the region [address, address + size).
 *
 * The word at (address + size - PAGESIZE) is temporarily pointed back at
 * `address`; then, BENCH_ITERATION times, the chase starts at `address` and
 * follows size / PAGESIZE stored pointers, timed with rpcc(). The saved word
 * is restored afterwards. Returns the smallest rpcc() difference observed.
 *
 * The region must already contain valid "next" pointers at the visited
 * addresses; this function only rewrites the one word described above and
 * the word 8 bytes after it.
 */
static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {

  BLASULONG *const tail = (BLASULONG *)(address + size - PAGESIZE);
  const BLASULONG saved = *tail;
  const int hops = size / PAGESIZE;
  BLASULONG fastest = (BLASULONG)-1;
  BLASULONG *cursor = NULL;
  int round;

  /* Close the chain so the chase stays inside the probed window. */
  *tail = (BLASULONG)address;

  for (round = 0; round < BENCH_ITERATION; round ++ ) {
    BLASULONG t0, t1;
    int hop;

    cursor = (BLASULONG *)address;

    t0 = rpcc();

    for (hop = 0; hop < hops; hop ++) {
      cursor = (BLASULONG *)(*cursor);
    }

    t1 = rpcc();

    if (t1 - t0 < fastest) fastest = t1 - t0;
  }

  *tail = saved;
  /* Stores the final pointer next to the restored word -- presumably so the
     chase cannot be optimised away; TODO confirm. */
  *(BLASULONG *)(address + size - PAGESIZE +  8) = (BLASULONG)cursor;

  return fastest;
}
710
711
static void *alloc_mmap(void *address){
712
  void *map_address, *best_address;
713
  BLASULONG best, start, current, original;
714
  BLASULONG allocsize;
715
716
  if (address){
717
    /* Just give up use advanced operation */
718
    map_address = mmap(address, allocation_block_size, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
719
720
#ifdef OS_LINUX
721
    my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
722
#endif
723
724
  } else {
725
#if defined(OS_LINUX) && !defined(NO_WARMUP)
726
    if (hot_alloc == 0) {
727
      map_address = mmap(NULL, allocation_block_size, MMAP_ACCESS, MMAP_POLICY, -1, 0);
728
729
#ifdef OS_LINUX
730
      my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
731
#endif
732
733
    } else {
734
#endif
735
736
      map_address = mmap(NULL, allocation_block_size * SCALING,
737
                         MMAP_ACCESS, MMAP_POLICY, -1, 0);
738
739
      if (map_address != (void *)-1) {
740
741
#ifdef OS_LINUX
742
#ifdef DEBUG
743
        int ret=0;
744
        ret=my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
745
        if(ret==-1){
746
                int errsv=errno;
747
                perror("OpenBLAS alloc_mmap:");
748
                printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
749
        }
750
751
#else
752
        my_mbind(map_address, allocation_block_size * SCALING, MPOL_PREFERRED, NULL, 0, 0);
753
#endif
754
#endif
755
756
757
        allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
758
759
        start   = (BLASULONG)map_address;
760
        current = (SCALING - 1) * allocation_block_size;
761
        original = current;
762
763
        while(current > 0 && current <= original) {
764
          *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
765
          start += PAGESIZE;
766
          current -= PAGESIZE;
767
        }
768
769
        *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
770
771
        start = (BLASULONG)map_address;
772
773
        best = (BLASULONG)-1;
774
        best_address = map_address;
775
776
        while ((start + allocsize  < (BLASULONG)map_address + (SCALING - 1) * allocation_block_size)) {
777
778
          current = run_bench(start, allocsize);
779
780
          if (best > current) {
781
            best = current;
782
            best_address = (void *)start;
783
          }
784
785
          start += PAGESIZE;
786
787
        }
788
789
      if ((BLASULONG)best_address > (BLASULONG)map_address)
790
        munmap(map_address,  (BLASULONG)best_address - (BLASULONG)map_address);
791
792
      munmap((void *)((BLASULONG)best_address + allocation_block_size), (SCALING - 1) * allocation_block_size + (BLASULONG)map_address - (BLASULONG)best_address);
793
794
      map_address = best_address;
795
796
#if defined(OS_LINUX) && !defined(NO_WARMUP)
797
      hot_alloc = 2;
798
#endif
799
      }
800
    }
801
#if defined(OS_LINUX) && !defined(NO_WARMUP)
802
  }
803
#endif
804
805
  STORE_RELEASE_FUNC(map_address, alloc_mmap_free);
806
807
  return map_address;
808
}
809
810
#endif
811
812
#endif
813
814
815
#ifdef ALLOC_MALLOC
816
817
/* Release routine for alloc_malloc(): hands the whole block back to free(). */
static void alloc_malloc_free(struct alloc_t *alloc_info){
  free(alloc_info);
}
822
823
static void *alloc_malloc(void *address){
824
825
  void *map_address;
826
827
  map_address = (void *)malloc(allocation_block_size + FIXED_PAGESIZE);
828
829
  if (map_address == (void *)NULL) map_address = (void *)-1;
830
831
  STORE_RELEASE_FUNC(map_address, alloc_malloc_free);
832
833
  return map_address;
834
835
}
836
837
#endif
838
839
#ifdef ALLOC_QALLOC
840
841
void *qalloc(int flags, size_t bytes);
842
void *qfree (void *address);
843
844
#define QNONCACHE 0x1
845
#define QCOMMS    0x2
846
#define QFAST     0x4
847
848
/* Release routine for alloc_qalloc(): passes the block to qfree(). */
static void alloc_qalloc_free(struct alloc_t *alloc_info){
  qfree(alloc_info);
}
853
854
static void *alloc_qalloc(void *address){
855
  void *map_address;
856
857
  map_address = (void *)qalloc(QCOMMS | QFAST, allocation_block_size + FIXED_PAGESIZE);
858
859
  if (map_address == (void *)NULL) map_address = (void *)-1;
860
861
  STORE_RELEASE_FUNC(map_address, alloc_qalloc_free);
862
863
  return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
864
}
865
866
#endif
867
868
#ifdef ALLOC_WINDOWS
869
870
static void alloc_windows_free(struct alloc_t *alloc_info){
871
872
  VirtualFree(alloc_info, 0, MEM_RELEASE);
873
874
}
875
876
static void *alloc_windows(void *address){
877
  void *map_address;
878
879
  map_address  = VirtualAlloc(address,
880
                              allocation_block_size,
881
                              MEM_RESERVE | MEM_COMMIT,
882
                              PAGE_READWRITE);
883
884
  if (map_address == (void *)NULL) map_address = (void *)-1;
885
886
  STORE_RELEASE_FUNC(map_address, alloc_windows_free);
887
888
  return map_address;
889
}
890
891
#endif
892
893
#ifdef ALLOC_DEVICEDRIVER
894
#ifndef DEVICEDRIVER_NAME
895
#define DEVICEDRIVER_NAME "/dev/mapper"
896
#endif
897
898
static void alloc_devicedirver_free(struct alloc_t *alloc_info){
899
900
  int attr = alloc_info -> attr;
901
  if (munmap(address, allocation_block_size)) {
902
    printf("OpenBLAS : Bugphysarea unmap failed.\n");
903
  }
904
905
  if (close(attr)) {
906
    printf("OpenBLAS : Bugphysarea close failed.\n");
907
  }
908
909
}
910
911
static void *alloc_devicedirver(void *address){
912
913
  int fd;
914
  void *map_address;
915
916
  if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
917
918
    return (void *)-1;
919
920
  }
921
922
  map_address = mmap(address, allocation_block_size,
923
                     PROT_READ | PROT_WRITE,
924
                     MAP_FILE | MAP_SHARED,
925
                     fd, 0);
926
927
  STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_devicedirver_free, fd);
928
929
  return map_address;
930
}
931
932
#endif
933
934
#ifdef ALLOC_SHM
935
936
/*
 * Release routine for alloc_shm(): detaches the shared-memory segment with
 * shmdt() and prints a message if that fails.
 */
static void alloc_shm_free(struct alloc_t *alloc_info){
  const int rc = shmdt(alloc_info);

  if (rc != 0) {
    printf("OpenBLAS : Shared memory unmap failed.\n");
  }
}
942
943
static void *alloc_shm(void *address){
944
  void *map_address;
945
  int shmid;
946
947
  shmid = shmget(IPC_PRIVATE, allocation_block_size,IPC_CREAT | 0600);
948
949
  map_address = (void *)shmat(shmid, address, 0);
950
951
  if (map_address != (void *)-1){
952
953
#ifdef OS_LINUX
954
    my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
955
#endif
956
957
    shmctl(shmid, IPC_RMID, 0);
958
959
    struct alloc_t *alloc_info = (struct alloc_t *)map_address;
960
    alloc_info->release_func = alloc_shm_free;
961
    alloc_info->attr = shmid;
962
  }
963
964
  return map_address;
965
}
966
967
#endif
968
969
#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))
970
971
/*
 * Release routine for alloc_hugetlb(). The action depends on the platform:
 *   - Linux/AIX: shmdt(), with a message printed on failure;
 *   - Solaris (__sun__): munmap() of allocation_block_size bytes;
 *   - Windows: VirtualFree(..., 0, MEM_LARGE_PAGES | MEM_RELEASE).
 *
 * NOTE(review): the Solaris allocation path obtains the block from
 * memalign(), yet it is released here with munmap() -- confirm.
 * NOTE(review): MEM_LARGE_PAGES is not among the documented free types of
 * VirtualFree() -- confirm the call succeeds as written.
 */
static void alloc_hugetlb_free(struct alloc_t *alloc_info){

#if defined(OS_LINUX) || defined(OS_AIX)
  {
    const int rc = shmdt(alloc_info);

    if (rc != 0) {
      printf("OpenBLAS : Hugepage unmap failed.\n");
    }
  }
#endif

#ifdef __sun__
  munmap(alloc_info, allocation_block_size);
#endif

#ifdef OS_WINDOWS
  VirtualFree(alloc_info, 0, MEM_LARGE_PAGES | MEM_RELEASE);
#endif

}
992
993
static void *alloc_hugetlb(void *address){
994
995
  void *map_address = (void *)-1;
996
997
#if defined(OS_LINUX) || defined(OS_AIX)
998
  int shmid;
999
1000
  shmid = shmget(IPC_PRIVATE, allocation_block_size,
1001
#ifdef OS_LINUX
1002
                 SHM_HUGETLB |
1003
#endif
1004
#ifdef OS_AIX
1005
                 SHM_LGPAGE | SHM_PIN |
1006
#endif
1007
                 IPC_CREAT | SHM_R | SHM_W);
1008
1009
  if (shmid != -1) {
1010
    map_address = (void *)shmat(shmid, address, SHM_RND);
1011
1012
#ifdef OS_LINUX
1013
    my_mbind(map_address, allocation_block_size, MPOL_PREFERRED, NULL, 0, 0);
1014
#endif
1015
1016
    if (map_address != (void *)-1){
1017
      shmctl(shmid, IPC_RMID, 0);
1018
    }
1019
  }
1020
#endif
1021
1022
#ifdef __sun__
1023
  struct memcntl_mha mha;
1024
1025
  mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
1026
  mha.mha_flags = 0;
1027
  mha.mha_pagesize = HUGE_PAGESIZE;
1028
  memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
1029
1030
  map_address = (BLASULONG)memalign(HUGE_PAGESIZE, allocation_block_size);
1031
#endif
1032
1033
#ifdef OS_WINDOWS
1034
1035
  HANDLE hToken;
1036
  TOKEN_PRIVILEGES tp;
1037
1038
  if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
1039
1040
  tp.PrivilegeCount = 1;
1041
  tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
1042
1043
  if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
1044
      CloseHandle(hToken);
1045
      return (void*)-1;
1046
  }
1047
1048
  if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
1049
      CloseHandle(hToken);
1050
      return (void*)-1;
1051
  }
1052
1053
  map_address  = (void *)VirtualAlloc(address,
1054
                                      allocation_block_size,
1055
                                      MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
1056
                                      PAGE_READWRITE);
1057
1058
  tp.Privileges[0].Attributes = 0;
1059
  AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
1060
1061
  if (map_address == (void *)NULL) map_address = (void *)-1;
1062
1063
#endif
1064
1065
  STORE_RELEASE_FUNC(map_address, alloc_hugetlb_free);
1066
1067
  return map_address;
1068
}
1069
#endif
1070
1071
1072
1073
1074
#ifdef  ALLOC_HUGETLBFILE
1075
1076
static int hugetlb_pid = 0;
1077
1078
static void alloc_hugetlbfile_free(struct alloc_t *alloc_info){
1079
1080
  int attr = alloc_info -> attr;
1081
  if (munmap(alloc_info, allocation_block_size)) {
1082
    printf("OpenBLAS : HugeTLBfs unmap failed.\n");
1083
  }
1084
1085
  if (close(attr)) {
1086
    printf("OpenBLAS : HugeTLBfs close failed.\n");
1087
  }
1088
}
1089
1090
static void *alloc_hugetlbfile(void *address){
1091
1092
  void *map_address = (void *)-1;
1093
  int fd;
1094
  char filename[64];
1095
1096
  if (!hugetlb_pid) hugetlb_pid = getpid();
1097
1098
  sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
1099
1100
  if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
1101
    return (void *)-1;
1102
  }
1103
1104
  unlink(filename);
1105
1106
  map_address = mmap(address, allocation_block_size,
1107
                     PROT_READ | PROT_WRITE,
1108
                     MAP_SHARED,
1109
                     fd, 0);
1110
1111
  STORE_RELEASE_FUNC_WITH_ATTR(map_address, alloc_hugetlbfile_free, fd);
1112
1113
  return map_address;
1114
}
1115
#endif
1116
1117
1118
#ifdef SEEK_ADDRESS
1119
static BLASULONG base_address      = 0UL;
1120
#else
1121
static BLASULONG base_address      = BASE_ADDRESS;
1122
#endif
1123
1124
#ifdef HAVE_C11
1125
static _Atomic int memory_initialized = 0;
1126
#else
1127
static volatile int memory_initialized = 0;
1128
#endif
1129
1130
/*       Memory allocation routine           */
1131
/* procpos ... indicates where it comes from */
1132
/*                0 : Level 3 functions      */
1133
/*                1 : Level 2 functions      */
1134
/*                2 : Thread                 */
1135
1136
static void blas_memory_cleanup(void* ptr){
1137
  if (ptr) {
1138
    struct alloc_t ** table = (struct alloc_t **)ptr;
1139
    int pos;
1140
    for (pos = 0; pos < NUM_BUFFERS; pos ++){
1141
      struct alloc_t *alloc_info = table[pos];
1142
      if (alloc_info) {
1143
        alloc_info->release_func(alloc_info);
1144
        table[pos] = (void *)0;
1145
      }
1146
    }
1147
    free(table);
1148
  }
1149
}
1150
1151
/* Create the thread-local storage key used for per-thread buffer tables.
 * Does nothing in builds without SMP. */
static void blas_memory_init(void){
#if defined(SMP) && defined(OS_WINDOWS)
  local_storage_key = TlsAlloc();
#elif defined(SMP)
  /* blas_memory_cleanup is registered as the key's destructor. */
  pthread_key_create(&local_storage_key, blas_memory_cleanup);
#endif
}
1160
1161
void *blas_memory_alloc(int procpos){
1162
1163
  int position;
1164
1165
  void *map_address;
1166
1167
  void *(*memoryalloc[])(void *address) = {
1168
#ifdef ALLOC_DEVICEDRIVER
1169
    alloc_devicedirver,
1170
#endif
1171
#ifdef ALLOC_SHM && !defined(ALLOC_HUGETLB)
1172
    alloc_shm,
1173
#endif
1174
#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))
1175
    alloc_hugetlb,
1176
#endif
1177
#ifdef ALLOC_MMAP
1178
    alloc_mmap,
1179
#endif
1180
#ifdef ALLOC_QALLOC
1181
    alloc_qalloc,
1182
#endif
1183
#ifdef ALLOC_WINDOWS
1184
    alloc_windows,
1185
#endif
1186
#ifdef ALLOC_MALLOC
1187
    alloc_malloc,
1188
#endif
1189
    NULL,
1190
  };
1191
  void *(**func)(void *address);
1192
  struct alloc_t * alloc_info;
1193
  struct alloc_t ** alloc_table;
1194
1195
#if defined(SMP) && !defined(USE_OPENMP)
1196
int mi;
1197
LOCK_COMMAND(&alloc_lock);
1198
mi=memory_initialized;
1199
UNLOCK_COMMAND(&alloc_lock);
1200
  if (!LIKELY_ONE(mi)) {
1201
#else
1202
  if (!LIKELY_ONE(memory_initialized)) {
1203
#endif
1204
#if defined(SMP) && !defined(USE_OPENMP)
1205
    /* Only allow a single thread to initialize memory system */
1206
    LOCK_COMMAND(&alloc_lock);
1207
1208
    if (!memory_initialized) {
1209
#endif
1210
      blas_memory_init();
1211
#ifdef DYNAMIC_ARCH
1212
      gotoblas_dynamic_init();
1213
#endif
1214
1215
#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
1216
      gotoblas_affinity_init();
1217
#endif
1218
1219
#ifdef SMP
1220
      if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
1221
#endif
1222
1223
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) || defined(ARCH_LOONGARCH64)
1224
#ifndef DYNAMIC_ARCH
1225
      blas_set_parameter();
1226
#endif
1227
#endif
1228
1229
      memory_initialized = 1;
1230
1231
#if defined(SMP) && !defined(USE_OPENMP)
1232
    }
1233
    UNLOCK_COMMAND(&alloc_lock);
1234
#endif
1235
  }
1236
1237
#ifdef DEBUG
1238
  printf("Alloc Start ...\n");
1239
#endif
1240
1241
  position = 0;
1242
  alloc_table = get_memory_table();
1243
  do {
1244
      if (!alloc_table[position] || !alloc_table[position]->used) goto allocation;
1245
    position ++;
1246
1247
  } while (position < NUM_BUFFERS);
1248
1249
  goto error;
1250
1251
  allocation :
1252
1253
#ifdef DEBUG
1254
  printf("  Position -> %d\n", position);
1255
#endif
1256
1257
  alloc_info = alloc_table[position];
1258
  if (!alloc_info) {
1259
    do {
1260
#ifdef DEBUG
1261
      printf("Allocation Start : %lx\n", base_address);
1262
#endif
1263
1264
      map_address = (void *)-1;
1265
1266
      func = &memoryalloc[0];
1267
1268
      while ((*func != NULL) && (map_address == (void *) -1)) {
1269
1270
        map_address = (*func)((void *)base_address);
1271
1272
#ifdef ALLOC_DEVICEDRIVER
1273
        if ((*func ==  alloc_devicedirver) && (map_address == (void *)-1)) {
1274
            fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation failed.\n");
1275
        }
1276
#endif
1277
1278
#ifdef ALLOC_HUGETLBFILE
1279
        if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
1280
#ifndef OS_WINDOWS
1281
            fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation failed.\n");
1282
#endif
1283
        }
1284
#endif
1285
1286
#if (defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
1287
        if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
1288
#endif
1289
1290
        func ++;
1291
      }
1292
1293
#ifdef DEBUG
1294
      printf("  Success -> %08lx\n", map_address);
1295
#endif
1296
      if (((BLASLONG) map_address) == -1) base_address = 0UL;
1297
1298
      if (base_address) base_address += allocation_block_size + FIXED_PAGESIZE;
1299
1300
    } while ((BLASLONG)map_address == -1);
1301
1302
    alloc_table[position] = alloc_info = map_address;
1303
1304
#ifdef DEBUG
1305
    printf("  Mapping Succeeded. %p(%d)\n", (void *)alloc_info, position);
1306
#endif
1307
  }
1308
1309
#ifdef DEBUG
1310
  printf("Mapped   : %p  %3d\n\n", (void *)alloc_info, position);
1311
#endif
1312
1313
  alloc_info->used = 1;
1314
1315
  return (void *)(((char *)alloc_info) + sizeof(struct alloc_t));
1316
1317
 error:
1318
  printf("OpenBLAS : Program will terminate because you tried to allocate too many TLS memory regions.\n");
1319
  printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
1320
  printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
1321
  printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
1322
  printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
1323
  printf("cpu cores than what OpenBLAS was configured to handle.\n"); 
1324
1325
  return NULL;
1326
}
1327
1328
void blas_memory_free(void *buffer){
1329
#ifdef DEBUG
1330
  int position;
1331
  struct alloc_t ** alloc_table;
1332
#endif
1333
  /* Since we passed an offset pointer to the caller, get back to the actual allocation */
1334
  struct alloc_t *alloc_info = (void *)(((char *)buffer) - sizeof(struct alloc_t));
1335
1336
#ifdef DEBUG
1337
  printf("Unmapped Start : %p ...\n", alloc_info);
1338
#endif
1339
1340
  alloc_info->used = 0;
1341
1342
#ifdef DEBUG
1343
  printf("Unmap Succeeded.\n\n");
1344
#endif
1345
1346
  return;
1347
1348
#ifdef DEBUG
1349
  alloc_table = get_memory_table();
1350
  for (position = 0; position < NUM_BUFFERS; position++){
1351
    if (alloc_table[position]) {
1352
      printf("%4ld  %p : %d\n", position, alloc_table[position], alloc_table[position]->used);
1353
    }
1354
  }
1355
#endif
1356
  return;
1357
}
1358
1359
void *blas_memory_alloc_nolock(int unused) {
1360
  void *map_address;
1361
  map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
1362
  return map_address;
1363
}
1364
1365
/* Release a pointer with free(); takes no lock. */
void blas_memory_free_nolock(void * map_address)
{
  free(map_address);
}
1368
1369
#ifdef SMP
/* Release every buffer in the table returned by get_memory_table(), and
 * the table itself. */
void blas_thread_memory_cleanup(void) {
    struct alloc_t ** table = get_memory_table();

    blas_memory_cleanup((void *)table);
}
#endif
1374
1375
1376
void blas_shutdown(void){
1377
#ifdef SMP
1378
  BLASFUNC(blas_thread_shutdown)();
1379
#endif
1380
1381
#ifdef SMP
1382
  /* Only cleanupIf we were built for threading and TLS was initialized */
1383
  if (local_storage_key)
1384
#endif
1385
    blas_thread_memory_cleanup();
1386
1387
#ifdef SEEK_ADDRESS
1388
  base_address      = 0UL;
1389
#else
1390
  base_address      = BASE_ADDRESS;
1391
#endif
1392
1393
  return;
1394
}
1395
1396
#if defined(OS_LINUX) && !defined(NO_WARMUP)
1397
1398
#ifdef SMP
1399
#if   defined(USE_PTHREAD_LOCK)
1400
static pthread_mutex_t    init_lock = PTHREAD_MUTEX_INITIALIZER;
1401
#elif defined(USE_PTHREAD_SPINLOCK)
1402
static pthread_spinlock_t init_lock = 0;
1403
#else
1404
static BLASULONG   init_lock = 0UL;
1405
#endif
1406
#endif
1407
1408
/*
 * Warm-up routine with the signature of a BLAS queue routine.
 *
 * Writes one int per PAGESIZE across (allocation_block_size - PAGESIZE)
 * bytes starting at sa + GEMM_OFFSET_A, then one int per 64 bytes across
 * the first MIN(that size, L2_SIZE) bytes.  Skipped entirely when
 * hot_alloc == 2, and compiled out on POWER and SPARC.
 *
 * Only sa is used; arg, range_m, range_n, sb and pos are ignored.
 */
static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
                          void *sa, void *sb, BLASLONG pos) {

#if !defined(ARCH_POWER) && !defined(ARCH_SPARC)

  size_t remaining;
  BLASULONG cursor;

#if defined(OS_LINUX) && !defined(NO_WARMUP)
  if (hot_alloc == 2) return;
#endif

  /* First pass: one write per page, serialised by init_lock in SMP builds. */
#ifdef SMP
  LOCK_COMMAND(&init_lock);
#endif

  for (remaining = allocation_block_size - PAGESIZE,
       cursor    = (BLASULONG)sa + GEMM_OFFSET_A;
       remaining > 0;
       cursor += PAGESIZE, remaining -= PAGESIZE) {
    *(int *)cursor = remaining;
  }

#ifdef SMP
  UNLOCK_COMMAND(&init_lock);
#endif

  /* Second pass: one write every 64 bytes over at most L2_SIZE bytes. */
  for (remaining = MIN((allocation_block_size - PAGESIZE), L2_SIZE),
       cursor    = (BLASULONG)sa + GEMM_OFFSET_A;
       remaining > 0;
       cursor += 64, remaining -= 64) {
    *(int *)cursor = remaining;
  }

#endif
}
1452
1453
#ifdef SMP

/*
 * Build a chain of blas_num_threads queue entries that each run
 * _touch_memory, attach buffer to the first entry, and hand the chain to
 * exec_blas().
 */
static void _init_thread_memory(void *buffer) {

  blas_queue_t queue[MAX_CPU_NUMBER];
  int num_cpu = 0;

  while (num_cpu < blas_num_threads) {
    blas_queue_init(&queue[num_cpu]);
    queue[num_cpu].mode    = BLAS_DOUBLE | BLAS_REAL;
    queue[num_cpu].routine = &_touch_memory;
    queue[num_cpu].args    = NULL;
    queue[num_cpu].next    = &queue[num_cpu + 1];
    num_cpu++;
  }

  /* Terminate the chain at the last entry that was filled in. */
  queue[num_cpu - 1].next = NULL;
  queue[0].sa = buffer;

  exec_blas(num_cpu, queue);
}
#endif
1476
1477
static void gotoblas_memory_init(void) {
1478
1479
  void *buffer;
1480
1481
  hot_alloc = 1;
1482
1483
  buffer = (void *)blas_memory_alloc(0);
1484
1485
#ifdef SMP
1486
  if (blas_cpu_number == 0) blas_get_cpu_number();
1487
#ifdef SMP_SERVER
1488
  if (blas_server_avail == 0) blas_thread_init();
1489
#endif
1490
1491
  _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
1492
1493
#else
1494
1495
  _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
1496
1497
#endif
1498
1499
  blas_memory_free(buffer);
1500
}
1501
#endif
1502
1503
/* Initialization for all function; this function should be called before main */
1504
1505
static int gotoblas_initialized = 0;
1506
extern void openblas_read_env(void);
1507
1508
/*
 * Library constructor.  Runs the one-time set-up steps in a fixed order and
 * then sets gotoblas_initialized; further calls return immediately.
 */
void CONSTRUCTOR gotoblas_init(void) {

  if (gotoblas_initialized != 0) return;

#ifdef SMP
  openblas_fork_handler();
#endif

  openblas_read_env();

  /* Profiling is switched off while initialisation runs. */
#ifdef PROFILE
  moncontrol (0);
#endif

#ifdef DYNAMIC_ARCH
  gotoblas_dynamic_init();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  gotoblas_affinity_init();
#endif

#if defined(OS_LINUX) && !defined(NO_WARMUP)
  gotoblas_memory_init();
#endif

#ifdef SMP
  if (blas_cpu_number == 0) blas_get_cpu_number();
#endif

#if defined(SMP) && defined(SMP_SERVER)
  if (blas_server_avail == 0) blas_thread_init();
#endif

#ifdef FUNCTION_PROFILE
  gotoblas_profile_init();
#endif

  gotoblas_initialized = 1;

#ifdef PROFILE
  moncontrol (1);
#endif

}
1565
1566
/*
 * Library destructor.  Does nothing unless gotoblas_init() completed;
 * otherwise shuts the library down, deletes the TLS key (SMP builds), runs
 * the optional profile / affinity / dynamic-arch teardown hooks and clears
 * gotoblas_initialized.
 */
void DESTRUCTOR gotoblas_quit(void) {

  if (!gotoblas_initialized) return;

  blas_shutdown();

#if defined(SMP) && defined(OS_WINDOWS)
  TlsFree(local_storage_key);
#elif defined(SMP)
  pthread_key_delete(local_storage_key);
#endif

#ifdef PROFILE
  moncontrol (0);
#endif

#ifdef FUNCTION_PROFILE
  gotoblas_profile_quit();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  gotoblas_affinity_quit();
#endif

#ifdef DYNAMIC_ARCH
  gotoblas_dynamic_quit();
#endif

  gotoblas_initialized = 0;

#ifdef PROFILE
  moncontrol (1);
#endif
}
1602
1603
#if defined(_MSC_VER) && !defined(__clang__)
1604
/*
 * DLL entry point: initialise on process attach, shut down on process
 * detach, and (SMP builds) release the buffer table on thread detach.
 * Every other reason is ignored.  Always returns TRUE.
 */
BOOL APIENTRY DllMain(HMODULE hModule, DWORD  ul_reason_for_call, LPVOID lpReserved)
{
  if (ul_reason_for_call == DLL_PROCESS_ATTACH) {
    gotoblas_init();
  } else if (ul_reason_for_call == DLL_PROCESS_DETACH) {
    gotoblas_quit();
  }
#if defined(SMP)
  else if (ul_reason_for_call == DLL_THREAD_DETACH) {
    blas_thread_memory_cleanup();
  }
#endif

  return TRUE;
}
1626
1627
/*
  Support for static linking: the callbacks below are placed in CRT
  sections so they run without a DllMain being invoked by the loader.
  Code adapted from Google performance tools:
  https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
  Reference:
  https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
  http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
*/
/* Process-termination hook: shuts the library down and reports success. */
static int on_process_term(void)
{
  gotoblas_quit();

  return 0;
}
1640
#ifdef _WIN64
1641
#pragma comment(linker, "/INCLUDE:_tls_used")
1642
#else
1643
#pragma comment(linker, "/INCLUDE:__tls_used")
1644
#endif
1645
1646
#ifdef _WIN64
1647
#pragma const_seg(".CRT$XLB")
1648
#else
1649
#pragma data_seg(".CRT$XLB")
1650
#endif
1651
1652
#ifdef _WIN64
1653
static const PIMAGE_TLS_CALLBACK dll_callback(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
1654
#pragma const_seg()
1655
#else
1656
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
1657
#pragma data_seg()
1658
#endif
1659
1660
#ifdef _WIN64
1661
#pragma const_seg(".CRT$XTU")
1662
#else
1663
#pragma data_seg(".CRT$XTU")
1664
#endif
1665
1666
#ifdef _WIN64
1667
static const int(*p_process_term)(void) = on_process_term;
1668
#pragma const_seg()
1669
#else
1670
static int(*p_process_term)(void) = on_process_term;
1671
#pragma data_seg()
1672
#endif
1673
#endif
1674
1675
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
1676
/* Do not call this function; it exists only as a workaround for a
 * PGI / Sun bug.  For PGI releases before 19 it additionally emits calls to
 * gotoblas_init / gotoblas_quit into the .init / .fini sections. */
void gotoblas_dummy_for_PGI(void) {

  gotoblas_init();
  gotoblas_quit();

#if __PGIC__ < 19
  asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
  asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
}
1692
#endif
1693
1694
#else
1695
/* USE_TLS / COMPILE_TLS not set */
1696
1697
#include <errno.h>
1698
1699
#if defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)
1700
#define ALLOC_WINDOWS
1701
#ifndef MEM_LARGE_PAGES
1702
#define MEM_LARGE_PAGES  0x20000000
1703
#endif
1704
#elif !defined(OS_EMBEDDED)
1705
#define ALLOC_MMAP
1706
#define ALLOC_MALLOC
1707
#else
1708
#define ALLOC_MALLOC
1709
1710
inline int puts(const char *str) { return 0; }
1711
inline int printf(const char *format, ...) { return 0; }
1712
inline char *getenv(const char *name) { return ""; }
1713
inline int atoi(const char *str) { return 0; }
1714
#endif
1715
1716
#include <stdlib.h>
1717
#include <stdio.h>
1718
#include <fcntl.h>
1719
1720
#if (!defined(OS_WINDOWS) || defined(OS_CYGWIN_NT)) && !defined(OS_EMBEDDED)
1721
#include <sys/mman.h>
1722
#ifndef NO_SYSV_IPC
1723
#include <sys/shm.h>
1724
#endif
1725
#include <sys/ipc.h>
1726
#endif
1727
1728
#include <sys/types.h>
1729
1730
#ifdef OS_LINUX
1731
#include <sys/sysinfo.h>
1732
#include <sched.h>
1733
#include <errno.h>
1734
#include <sys/syscall.h>
1735
#include <sys/time.h>
1736
#include <sys/resource.h>
1737
#endif
1738
1739
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN)
1740
#include <sys/sysctl.h>
1741
#include <sys/resource.h>
1742
#endif
1743
1744
#if defined(OS_WINDOWS) && (defined(__MINGW32__) || defined(__MINGW64__))
1745
#include <conio.h>
1746
#undef  printf
1747
#define printf _cprintf
1748
#endif
1749
1750
#ifdef OS_LINUX
1751
1752
#ifndef MPOL_PREFERRED
1753
#define MPOL_PREFERRED  1
1754
#endif
1755
1756
#endif
1757
1758
#if (defined(PPC440) || !defined(OS_LINUX) || defined(HPL)) && !defined(NO_WARMUP)
1759
#define NO_WARMUP
1760
#endif
1761
1762
#ifndef SHM_HUGETLB
1763
#define SHM_HUGETLB 04000
1764
#endif
1765
1766
#ifndef FIXED_PAGESIZE
1767
0
#define FIXED_PAGESIZE 4096
1768
#endif
1769
1770
#define BITMASK(a, b, c) ((((a) >> (b)) & (c)))
1771
1772
#if defined(_MSC_VER) && !defined(__clang__)
1773
#define CONSTRUCTOR __cdecl
1774
#define DESTRUCTOR __cdecl
1775
#elif (defined(OS_DARWIN) || defined(OS_SUNOS)) && defined(C_GCC)
1776
#define CONSTRUCTOR __attribute__ ((constructor))
1777
#define DESTRUCTOR  __attribute__ ((destructor))
1778
#elif __GNUC__ && INIT_PRIORITY && ((GCC_VERSION >= 40300) || (CLANG_VERSION >= 20900))
1779
#define CONSTRUCTOR __attribute__ ((constructor(101)))
1780
#define DESTRUCTOR  __attribute__ ((destructor(101)))
1781
#else
1782
#define CONSTRUCTOR __attribute__ ((constructor))
1783
#define DESTRUCTOR  __attribute__ ((destructor))
1784
#endif
1785
1786
#ifdef DYNAMIC_ARCH
1787
gotoblas_t *gotoblas = NULL;
1788
#endif
1789
extern void openblas_warning(int verbose, const char * msg);
1790
1791
#ifndef SMP
1792
1793
#define blas_cpu_number 1
1794
#define blas_num_threads 1
1795
1796
/* Stand-ins for the threading API in builds without SMP: the processor
 * count is always 1 and requests to change the thread count are ignored. */
int  goto_get_num_procs  (void) { return 1; }
void goto_set_num_threads(int num_threads) { (void)num_threads; }
1799
1800
#else
1801
1802
#if defined(OS_LINUX) || defined(OS_SUNOS)
1803
#ifndef NO_AFFINITY
1804
int get_num_procs(void);
1805
#else
1806
9
int get_num_procs(void) {
1807
1808
9
  static int nums = 0;
1809
9
  int ret;
1810
  
1811
9
#if defined(__GLIBC_PREREQ)
1812
9
  cpu_set_t cpuset,*cpusetp;
1813
9
  size_t size;
1814
1815
#if !__GLIBC_PREREQ(2, 7)
1816
  int i;
1817
#if !__GLIBC_PREREQ(2, 6)
1818
  int n;
1819
#endif
1820
#endif
1821
9
#endif
1822
1823
9
  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
1824
1825
9
#if defined(USE_OPENMP)
1826
/*  if (omp_get_proc_bind() != omp_proc_bind_false) */
1827
9
#if _OPENMP >= 201511
1828
9
    int i,n;
1829
9
    n = 0;
1830
9
    ret = omp_get_num_places();
1831
18
    if (ret > 0) for (i=0;i<ret;i++) n+= omp_get_place_num_procs(i);
1832
9
    if (n > 0) nums = n;
1833
9
#endif
1834
9
    return (nums > 0 ? nums :2);
1835
0
#endif
1836
1837
#if !defined(OS_LINUX)
1838
  return (nums > 0 ? nums :2);
1839
#endif
1840
  
1841
#if !defined(__GLIBC_PREREQ)
1842
  return (nums > 0 ? nums :2);
1843
#else
1844
 #if !__GLIBC_PREREQ(2, 3)
1845
  return (nums > 0 ? nums :2);
1846
 #endif
1847
1848
 #if !__GLIBC_PREREQ(2, 7)
1849
  ret = sched_getaffinity(0,sizeof(cpuset), &cpuset);
1850
  if (ret!=0) return (nums > 0 ? nums :2);
1851
  n=0;
1852
  #if !__GLIBC_PREREQ(2, 6)
1853
  for (i=0;i<(nums > 0 ? nums :2);i++)
1854
     if (CPU_ISSET(i,&cpuset)) n++;
1855
  nums=n;
1856
  #else
1857
  nums = CPU_COUNT(sizeof(cpuset),&cpuset);
1858
  #endif
1859
  return (nums > 0 ? nums :2);
1860
 #else
1861
0
  if (nums >= CPU_SETSIZE) {
1862
0
    cpusetp = CPU_ALLOC(nums);
1863
0
      if (cpusetp == NULL) {
1864
0
        return (nums > 0 ? nums :2);
1865
0
      }
1866
0
    size = CPU_ALLOC_SIZE(nums);
1867
0
    ret = sched_getaffinity(0,size,cpusetp);
1868
0
    if (ret!=0) {
1869
0
      CPU_FREE(cpusetp);
1870
0
      return (nums > 0 ? nums :2);
1871
0
    }
1872
0
    ret = CPU_COUNT_S(size,cpusetp);
1873
0
    if (ret > 0 && ret < nums) nums = ret;
1874
0
    CPU_FREE(cpusetp);
1875
0
    return (nums > 0 ? nums :2);
1876
0
  } else {
1877
0
    ret = sched_getaffinity(0,sizeof(cpuset),&cpuset);
1878
0
    if (ret!=0) {
1879
0
      return (nums > 0 ? nums :2);
1880
0
    }
1881
0
    ret = CPU_COUNT(&cpuset);
1882
0
    if (ret > 0 && ret < nums) nums = ret;
1883
0
    return (nums > 0 ? nums :2);
1884
0
  }
1885
0
 #endif
1886
0
#endif
1887
0
}
1888
#endif
1889
#endif
1890
1891
#ifdef OS_ANDROID
1892
/* Configured processor count on Android, cached after the first call.
 * If sysconf() does not yield a positive value, 2 is returned, matching
 * the fallback of the Linux variant of this function. */
int get_num_procs(void) {
  static int nums = 0;
  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
  return (nums > 0 ? nums : 2);
}
1897
#endif
1898
1899
#ifdef OS_HAIKU
1900
/* Configured processor count on Haiku, cached after the first call.
 * If sysconf() does not yield a positive value, 2 is returned, matching
 * the fallback of the Linux variant of this function. */
int get_num_procs(void) {
  static int nums = 0;
  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
  return (nums > 0 ? nums : 2);
}
1905
#endif
1906
1907
#ifdef OS_AIX
1908
/* Configured processor count on AIX, cached after the first call.
 * If sysconf() does not yield a positive value, 2 is returned, matching
 * the fallback of the Linux variant of this function. */
int get_num_procs(void) {
  static int nums = 0;
  if (!nums) nums = sysconf(_SC_NPROCESSORS_CONF);
  return (nums > 0 ? nums : 2);
}
1913
#endif
1914
1915
#ifdef OS_WINDOWS
1916
1917
int get_num_procs(void) {
1918
1919
  static int nums = 0;
1920
1921
  if (nums == 0) {
1922
1923
    SYSTEM_INFO sysinfo;
1924
1925
    GetSystemInfo(&sysinfo);
1926
1927
    nums = sysinfo.dwNumberOfProcessors;
1928
  }
1929
1930
  return nums;
1931
}
1932
1933
#endif
1934
1935
#if defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY)
1936
1937
int get_num_procs(void) {
1938
1939
  static int nums = 0;
1940
1941
  int m[2];
1942
  size_t len;
1943
1944
  if (nums == 0) {
1945
    m[0] = CTL_HW;
1946
    m[1] = HW_NCPU;
1947
    len = sizeof(int);
1948
    sysctl(m, 2, &nums, &len, NULL, 0);
1949
  }
1950
1951
  return nums;
1952
}
1953
1954
#endif
1955
1956
#if defined(OS_DARWIN)
1957
/*
 * Processor count on Darwin, read from the "hw.physicalcpu" sysctl and
 * cached once a value has been obtained.  When the query fails or yields
 * nothing positive, 2 is returned (the same fallback the Linux variant
 * uses) and the query is retried on the next call.
 */
int get_num_procs(void) {
  static int nums = 0;

  if (nums == 0){
    size_t len = sizeof(int);

    /* A failed query must not leave a partial value in the cache. */
    if (sysctlbyname("hw.physicalcpu", &nums, &len, NULL, 0) != 0) nums = 0;
  }

  return (nums > 0 ? nums : 2);
}
1966
/*
1967
void set_stack_limit(int limitMB){
1968
  int result=0;
1969
  struct rlimit rl;
1970
  rlim_t StackSize;
1971
1972
  StackSize=limitMB*1024*1024;
1973
  result=getrlimit(RLIMIT_STACK, &rl);
1974
  if(result==0){
1975
    if(rl.rlim_cur < StackSize){
1976
      rl.rlim_cur=StackSize;
1977
      result=setrlimit(RLIMIT_STACK, &rl);
1978
      if(result !=0){
1979
        fprintf(stderr, "OpenBLAS: set stack limit error =%d\n", result);
1980
      }
1981
    }
1982
  }
1983
}
1984
*/
1985
#endif
1986
1987
1988
/*
1989
OpenBLAS uses the number of CPU cores for multithreading.
1990
It can be set by openblas_set_num_threads(int num_threads);
1991
*/
1992
int blas_cpu_number  = 0;
1993
/*
1994
The number of threads in the thread pool.
1995
This value is equal to or larger than blas_cpu_number, which means some threads may be sleeping.
1996
*/
1997
int blas_num_threads = 0;
1998
1999
0
int  goto_get_num_procs  (void) {
2000
0
  return blas_cpu_number;
2001
0
}
2002
2003
/*
 * Register blas_thread_shutdown as the pre-fork handler so the
 * OpenBLAS-managed pthread pool is shut down before a fork().  Compiled to
 * an empty function on native Windows, on Android, and without SMP_SERVER.
 */
void openblas_fork_handler(void)
{
  // This handler shuts down the OpenBLAS-managed PTHREAD pool when OpenBLAS is
  // built with "make USE_OPENMP=0".
  // Hanging can still happen when OpenBLAS is built against the libgomp
  // implementation of OpenMP. The problem is tracked at:
  //   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=60035
  // In the mean time build with USE_OPENMP=0 or link against another
  // implementation of OpenMP.
#if !((defined(OS_WINDOWS) && !defined(OS_CYGWIN_NT)) || defined(OS_ANDROID)) && defined(SMP_SERVER)
  if (pthread_atfork ((void (*)(void)) BLASFUNC(blas_thread_shutdown), NULL, NULL) != 0) {
    openblas_warning(0, "OpenBLAS Warning ... cannot install fork handler. You may meet hang after fork.\n");
  }
#endif
}
2019
2020
extern int openblas_num_threads_env(void);
2021
extern int openblas_goto_num_threads_env(void);
2022
extern int openblas_omp_num_threads_env(void);
2023
2024
18
int blas_get_cpu_number(void){
2025
18
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
2026
18
  int max_num;
2027
18
#endif
2028
18
  int blas_goto_num   = 0;
2029
18
  int blas_omp_num    = 0;
2030
2031
18
  if (blas_num_threads) return blas_num_threads;
2032
2033
9
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
2034
9
  max_num = get_num_procs();
2035
9
#endif
2036
2037
  // blas_goto_num = 0;
2038
#ifndef USE_OPENMP
2039
  blas_goto_num=openblas_num_threads_env();
2040
  if (blas_goto_num < 0) blas_goto_num = 0;
2041
2042
  if (blas_goto_num == 0) {
2043
    blas_goto_num=openblas_goto_num_threads_env();
2044
    if (blas_goto_num < 0) blas_goto_num = 0;
2045
  }
2046
2047
#endif
2048
2049
  // blas_omp_num = 0;
2050
9
  blas_omp_num=openblas_omp_num_threads_env();
2051
9
  if (blas_omp_num < 0) blas_omp_num = 0;
2052
2053
9
  if (blas_goto_num > 0) blas_num_threads = blas_goto_num;
2054
9
  else if (blas_omp_num > 0) blas_num_threads = blas_omp_num;
2055
9
  else blas_num_threads = MAX_CPU_NUMBER;
2056
2057
9
#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_OPENBSD) || defined(OS_NETBSD) || defined(OS_DRAGONFLY) || defined(OS_DARWIN) || defined(OS_ANDROID) || defined(OS_HAIKU)
2058
9
  if (blas_num_threads > max_num) blas_num_threads = max_num;
2059
9
#endif
2060
2061
9
  if (blas_num_threads > MAX_CPU_NUMBER) blas_num_threads = MAX_CPU_NUMBER;
2062
2063
#ifdef DEBUG
2064
  printf( "Adjusted number of threads : %3d\n", blas_num_threads);
2065
#endif
2066
2067
9
  blas_cpu_number = blas_num_threads;
2068
2069
9
  return blas_num_threads;
2070
18
}
2071
#endif
2072
2073
2074
0
/* Processor count: get_num_procs() in SMP builds, otherwise always 1. */
int openblas_get_num_procs(void) {
#ifdef SMP
  return get_num_procs();
#else
  return 1;
#endif
}
2081
2082
0
/* Thread count in use: always 1 without SMP; otherwise blas_cpu_number,
 * after giving blas_get_cpu_number() the chance to initialise it. */
int openblas_get_num_threads(void) {
#ifdef SMP
  // init blas_cpu_number if needed
  blas_get_cpu_number();
  return blas_cpu_number;
#else
  return 1;
#endif
}
2091
2092
/* One entry in the table of mappings that must be released later. */
struct release_t {
  void *address;                     /* start of the mapping */
  void (*func)(struct release_t *);  /* routine that releases this entry */
  long  attr;                        /* extra per-entry value; its use is not visible here - TODO confirm */
};
2097
2098
int hugetlb_allocated = 0;
2099
2100
static struct release_t release_info[NUM_BUFFERS];
2101
static struct release_t *new_release_info;
2102
static int release_pos = 0;
2103
2104
#if defined(OS_LINUX) && !defined(NO_WARMUP)
2105
static int hot_alloc = 0;
2106
#endif
2107
2108
/* Global lock for memory allocation */
2109
2110
#if   defined(USE_PTHREAD_LOCK)
2111
static pthread_mutex_t    alloc_lock = PTHREAD_MUTEX_INITIALIZER;
2112
#elif defined(USE_PTHREAD_SPINLOCK)
2113
static pthread_spinlock_t alloc_lock = 0;
2114
#else
2115
static BLASULONG  alloc_lock = 0UL;
2116
#endif
2117
2118
#ifdef ALLOC_MMAP
2119
2120
0
static void alloc_mmap_free(struct release_t *release){
2121
2122
0
if (!release->address) return;
2123
2124
0
  if (munmap(release -> address, BUFFER_SIZE)) {
2125
0
      int errsv=errno;
2126
0
       perror("OpenBLAS : munmap failed:");
2127
0
       printf("error code=%d,\trelease->address=%p\n",errsv,release->address);
2128
0
  }
2129
0
}
2130
2131
2132
2133
#ifdef NO_WARMUP
2134
2135
76
static void *alloc_mmap(void *address){
2136
76
  void *map_address;
2137
2138
76
  if (address){
2139
0
    map_address = mmap(address,
2140
0
                       BUFFER_SIZE,
2141
0
                       MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
2142
76
  } else {
2143
76
    map_address = mmap(address,
2144
76
                       BUFFER_SIZE,
2145
76
                       MMAP_ACCESS, MMAP_POLICY, -1, 0);
2146
76
  }
2147
2148
76
  if (map_address != (void *)-1) {
2149
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
2150
    LOCK_COMMAND(&alloc_lock);
2151
#endif
2152
76
    if (likely(release_pos < NUM_BUFFERS)) {
2153
76
    release_info[release_pos].address = map_address;
2154
76
    release_info[release_pos].func    = alloc_mmap_free;
2155
76
    } else {
2156
0
    new_release_info[release_pos-NUM_BUFFERS].address = map_address;
2157
0
    new_release_info[release_pos-NUM_BUFFERS].func    = alloc_mmap_free;
2158
0
    }
2159
76
    release_pos ++;
2160
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
2161
    UNLOCK_COMMAND(&alloc_lock);
2162
#endif
2163
76
  } else {
2164
#ifdef DEBUG
2165
        int errsv=errno;
2166
       perror("OpenBLAS : mmap failed:");
2167
       printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
2168
#endif
2169
0
  }
2170
2171
76
#ifdef OS_LINUX
2172
76
  my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
2173
76
#endif
2174
2175
76
  return map_address;
2176
76
}
2177
2178
#else
2179
2180
#define BENCH_ITERATION 4
2181
#define SCALING         2
2182
2183
/*
 * Time a pointer chase through the region [address, address + size).
 *
 * The word in the region's last page is temporarily pointed back at
 * address, then size / PAGESIZE dependent loads are timed with rpcc(),
 * BENCH_ITERATION times over.  The overwritten word is restored before
 * returning, and the final chase pointer is stored 8 bytes after it.
 *
 * Returns the smallest elapsed rpcc() difference observed.
 */
static inline BLASULONG run_bench(BLASULONG address, BLASULONG size) {

  const BLASULONG tail = address + size - PAGESIZE;
  BLASULONG saved, *walk;
  BLASULONG t0, t1, fastest;
  int round, step, hops;

  fastest = (BLASULONG)-1;

  saved = *(BLASULONG *)tail;
  *(BLASULONG *)tail = (BLASULONG)address;

  hops = size / PAGESIZE;

  for (round = 0; round < BENCH_ITERATION; round ++ ) {

    walk = (BLASULONG *)address;

    t0 = rpcc();

    /* Each load yields the address of the next one. */
    for (step = 0; step < hops; step ++) {
      walk = (BLASULONG *)(*walk);
    }

    t1 = rpcc();

    if (fastest > t1 - t0) fastest = t1 - t0;
  }

  *(BLASULONG *)(tail +  0) = saved;
  *(BLASULONG *)(tail +  8) = (BLASULONG)walk;

  return fastest;
}
2217
2218
static void *alloc_mmap(void *address){
2219
  void *map_address, *best_address;
2220
  BLASULONG best, start, current;
2221
  BLASULONG allocsize;
2222
2223
  if (address){
2224
    /* Just give up use advanced operation */
2225
    map_address = mmap(address, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY | MAP_FIXED, -1, 0);
2226
2227
#ifdef OS_LINUX
2228
    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
2229
#endif
2230
2231
  } else {
2232
#if defined(OS_LINUX) && !defined(NO_WARMUP)
2233
    if (hot_alloc == 0) {
2234
      map_address = mmap(NULL, BUFFER_SIZE, MMAP_ACCESS, MMAP_POLICY, -1, 0);
2235
2236
#ifdef OS_LINUX
2237
      my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
2238
#endif
2239
2240
    } else {
2241
#endif
2242
2243
      map_address = mmap(NULL, BUFFER_SIZE * SCALING,
2244
                         MMAP_ACCESS, MMAP_POLICY, -1, 0);
2245
2246
      if (map_address != (void *)-1) {
2247
2248
#ifdef OS_LINUX
2249
#ifdef DEBUG
2250
        int ret=0;
2251
        ret=my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
2252
        if(ret==-1){
2253
                int errsv=errno;
2254
                perror("OpenBLAS alloc_mmap:");
2255
                printf("error code=%d,\tmap_address=%lx\n",errsv,map_address);
2256
        }
2257
2258
#else
2259
        my_mbind(map_address, BUFFER_SIZE * SCALING, MPOL_PREFERRED, NULL, 0, 0);
2260
#endif
2261
#endif
2262
2263
#ifdef BUILD_DOUBLE 
2264
  allocsize = DGEMM_P * DGEMM_Q * sizeof(double);
2265
#elif defined(BUILD_COMPLEX16)
2266
  allocsize = ZGEMM_P * ZGEMM_Q * sizeof(double);
2267
#elif defined(BUILD_COMPLEX)
2268
  allocsize = CGEMM_P * CGEMM_Q * sizeof(double);
2269
#else
2270
  allocsize = SGEMM_P * SGEMM_Q * sizeof(double);
2271
#endif
2272
  start   = (BLASULONG)map_address;
2273
  current = (SCALING - 1) * BUFFER_SIZE;
2274
2275
        while(current > 0) {
2276
          *(BLASLONG *)start = (BLASLONG)start + PAGESIZE;
2277
          start += PAGESIZE;
2278
          current -= PAGESIZE;
2279
        }
2280
2281
        *(BLASLONG *)(start - PAGESIZE) = (BLASULONG)map_address;
2282
2283
        start = (BLASULONG)map_address;
2284
2285
        best = (BLASULONG)-1;
2286
        best_address = map_address;
2287
2288
        while ((start + allocsize  < (BLASULONG)map_address + (SCALING - 1) * BUFFER_SIZE)) {
2289
2290
          current = run_bench(start, allocsize);
2291
2292
          if (best > current) {
2293
            best = current;
2294
            best_address = (void *)start;
2295
          }
2296
2297
          start += PAGESIZE;
2298
2299
        }
2300
2301
      if ((BLASULONG)best_address > (BLASULONG)map_address)
2302
        munmap(map_address,  (BLASULONG)best_address - (BLASULONG)map_address);
2303
2304
      munmap((void *)((BLASULONG)best_address + BUFFER_SIZE), (SCALING - 1) * BUFFER_SIZE + (BLASULONG)map_address - (BLASULONG)best_address);
2305
2306
      map_address = best_address;
2307
2308
#if defined(OS_LINUX) && !defined(NO_WARMUP)
2309
      hot_alloc = 2;
2310
#endif
2311
      }
2312
    }
2313
#if defined(OS_LINUX) && !defined(NO_WARMUP)
2314
  }
2315
#endif
2316
2317
  if (map_address != (void *)-1) {
2318
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
2319
    LOCK_COMMAND(&alloc_lock);
2320
#endif
2321
    if (likely(release_pos < NUM_BUFFERS)) {
2322
    release_info[release_pos].address = map_address;
2323
    release_info[release_pos].func    = alloc_mmap_free;
2324
    } else {
2325
    new_release_info[release_pos-NUM_BUFFERS].address = map_address;
2326
    new_release_info[release_pos-NUM_BUFFERS].func    = alloc_mmap_free;
2327
    }
2328
    release_pos ++;
2329
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
2330
    UNLOCK_COMMAND(&alloc_lock);
2331
#endif
2332
  }
2333
2334
  return map_address;
2335
}
2336
2337
#endif
2338
2339
#endif
2340
2341
2342
#ifdef ALLOC_MALLOC
2343
2344
0
static void alloc_malloc_free(struct release_t *release){
2345
2346
0
  free(release -> address);
2347
2348
0
}
2349
2350
0
static void *alloc_malloc(void *address){
2351
2352
0
  void *map_address;
2353
2354
0
  map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
2355
2356
0
  if (map_address == (void *)NULL) map_address = (void *)-1;
2357
2358
0
  if (map_address != (void *)-1) {
2359
0
    if (likely(release_pos < NUM_BUFFERS)) {
2360
0
    release_info[release_pos].address = map_address;
2361
0
    release_info[release_pos].func    = alloc_malloc_free;
2362
0
    } else {
2363
0
    new_release_info[release_pos-NUM_BUFFERS].address = map_address;
2364
0
    new_release_info[release_pos-NUM_BUFFERS].func    = alloc_malloc_free;
2365
0
    }
2366
0
    release_pos ++;
2367
0
  }
2368
2369
0
  return map_address;
2370
2371
0
}
2372
2373
#endif
2374
2375
#ifdef ALLOC_QALLOC
2376
2377
void *qalloc(int flags, size_t bytes);
2378
void *qfree (void *address);
2379
2380
#define QNONCACHE 0x1
2381
#define QCOMMS    0x2
2382
#define QFAST     0x4
2383
2384
static void alloc_qalloc_free(struct release_t *release){
2385
2386
  qfree(release -> address);
2387
2388
}
2389
2390
static void *alloc_qalloc(void *address){
2391
  void *map_address;
2392
2393
  map_address = (void *)qalloc(QCOMMS | QFAST, BUFFER_SIZE + FIXED_PAGESIZE);
2394
2395
  if (map_address == (void *)NULL) map_address = (void *)-1;
2396
2397
  if (map_address != (void *)-1) {
2398
    if (likely(release_pos < NUM_BUFFERS)) {
2399
    release_info[release_pos].address = map_address;
2400
    release_info[release_pos].func    = alloc_qalloc_free;
2401
    } else {
2402
    new_release_info[release_pos-NUM_BUFFERS].address = map_address;
2403
    new_release_info[release_pos-NUM_BUFFERS].func    = alloc_qalloc_free;
2404
    }
2405
    release_pos ++;
2406
  }
2407
2408
  return (void *)(((BLASULONG)map_address + FIXED_PAGESIZE - 1) & ~(FIXED_PAGESIZE - 1));
2409
}
2410
2411
#endif
2412
2413
#ifdef ALLOC_WINDOWS
2414
2415
static void alloc_windows_free(struct release_t *release){
2416
2417
  VirtualFree(release -> address, 0, MEM_RELEASE);
2418
2419
}
2420
2421
static void *alloc_windows(void *address){
2422
  void *map_address;
2423
2424
  map_address  = VirtualAlloc(address,
2425
                              BUFFER_SIZE,
2426
                              MEM_RESERVE | MEM_COMMIT,
2427
                              PAGE_READWRITE);
2428
2429
  if (map_address == (void *)NULL) map_address = (void *)-1;
2430
2431
  if (map_address != (void *)-1) {
2432
    if (likely(release_pos < NUM_BUFFERS)) {
2433
    release_info[release_pos].address = map_address;
2434
    release_info[release_pos].func    = alloc_windows_free;
2435
    } else {
2436
    new_release_info[release_pos-NUM_BUFFERS].address = map_address;
2437
    new_release_info[release_pos-NUM_BUFFERS].func    = alloc_windows_free;
2438
    }
2439
    release_pos ++;
2440
  }
2441
2442
  return map_address;
2443
}
2444
2445
#endif
2446
2447
#ifdef ALLOC_DEVICEDRIVER
2448
#ifndef DEVICEDRIVER_NAME
2449
#define DEVICEDRIVER_NAME "/dev/mapper"
2450
#endif
2451
2452
static void alloc_devicedirver_free(struct release_t *release){
2453
2454
  if (munmap(release -> address, BUFFER_SIZE)) {
2455
    printf("OpenBLAS : Bugphysarea unmap failed.\n");
2456
  }
2457
2458
  if (close(release -> attr)) {
2459
    printf("OpenBLAS : Bugphysarea close failed.\n");
2460
  }
2461
2462
}
2463
2464
static void *alloc_devicedirver(void *address){
2465
2466
  int fd;
2467
  void *map_address;
2468
2469
  if ((fd = open(DEVICEDRIVER_NAME, O_RDWR | O_SYNC)) < 0) {
2470
2471
    return (void *)-1;
2472
2473
  }
2474
2475
  map_address = mmap(address, BUFFER_SIZE,
2476
                     PROT_READ | PROT_WRITE,
2477
                     MAP_FILE | MAP_SHARED,
2478
                     fd, 0);
2479
2480
  if (map_address != (void *)-1) {
2481
    if (likely(release_pos < NUM_BUFFERS)) {
2482
    release_info[release_pos].address = map_address;
2483
    release_info[release_pos].attr    = fd;
2484
    release_info[release_pos].func    = alloc_devicedirver_free;
2485
    } else {
2486
    new_release_info[release_pos-NUM_BUFFERS].address = map_address;
2487
    new_release_info[release_pos-NUM_BUFFERS].attr    = fd;
2488
    new_release_info[release_pos-NUM_BUFFERS].func    = alloc_devicedirver_free;
2489
    }
2490
    release_pos ++;
2491
  }
2492
2493
  return map_address;
2494
}
2495
2496
#endif
2497
2498
#if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB)
2499
2500
static void alloc_shm_free(struct release_t *release){
2501
2502
  if (shmdt(release -> address)) {
2503
    printf("OpenBLAS : Shared memory unmap failed.\n");
2504
    }
2505
}
2506
2507
static void *alloc_shm(void *address){
2508
  void *map_address;
2509
  int shmid;
2510
#ifdef DEBUG
2511
 fprintf(stderr,"alloc_shm got called\n");
2512
#endif
2513
  shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,IPC_CREAT | 0600);
2514
2515
  map_address = (void *)shmat(shmid, address, 0);
2516
2517
  if (map_address != (void *)-1){
2518
2519
#ifdef OS_LINUX
2520
    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
2521
#endif
2522
2523
    shmctl(shmid, IPC_RMID, 0);
2524
2525
    if (likely(release_pos < NUM_BUFFERS)) {
2526
    release_info[release_pos].address = map_address;
2527
    release_info[release_pos].attr    = shmid;
2528
    release_info[release_pos].func    = alloc_shm_free;
2529
    } else {
2530
    new_release_info[release_pos-NUM_BUFFERS].address = map_address;
2531
    new_release_info[release_pos-NUM_BUFFERS].attr    = shmid;
2532
    new_release_info[release_pos-NUM_BUFFERS].func    = alloc_shm_free;
2533
    }
2534
    release_pos ++;
2535
  }
2536
2537
  return map_address;
2538
}
2539
#endif
2540
2541
#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))
2542
2543
/*
 * Release callback for alloc_hugetlb().  The release call matches the
 * platform branch that created the buffer: shmdt() on Linux/AIX, munmap()
 * on Solaris, VirtualFree() on Windows.
 */
static void alloc_hugetlb_free(struct release_t *release){

#if defined(OS_LINUX) || defined(OS_AIX)
  if (shmdt(release -> address)) {
    printf("OpenBLAS : Hugepage unmap failed.\n");
  }
#endif

#ifdef __sun__

  /* NOTE(review): alloc_hugetlb() obtains this buffer with memalign();
     confirm that munmap() is the intended way to give it back. */
  munmap(release -> address, BUFFER_SIZE);

#endif

#ifdef OS_WINDOWS

  /* MEM_LARGE_PAGES is an allocation-type flag for VirtualAlloc(); it is
     not among the values VirtualFree() documents for dwFreeType, so only
     MEM_RELEASE is passed here. */
  VirtualFree(release -> address, 0, MEM_RELEASE);

#endif

}
2564
2565
static void *alloc_hugetlb(void *address){
2566
2567
  void *map_address = (void *)-1;
2568
2569
#ifdef DEBUG
2570
fprintf(stderr,"alloc_hugetlb got called\n");
2571
#endif
2572
2573
#if defined(OS_LINUX) || defined(OS_AIX)
2574
  int shmid;
2575
2576
  shmid = shmget(IPC_PRIVATE, BUFFER_SIZE,
2577
#ifdef OS_LINUX
2578
                 SHM_HUGETLB |
2579
#endif
2580
#ifdef OS_AIX
2581
                 SHM_LGPAGE | SHM_PIN |
2582
#endif
2583
                 IPC_CREAT | SHM_R | SHM_W);
2584
2585
  if (shmid != -1) {
2586
    map_address = (void *)shmat(shmid, address, SHM_RND);
2587
2588
#ifdef OS_LINUX
2589
    my_mbind(map_address, BUFFER_SIZE, MPOL_PREFERRED, NULL, 0, 0);
2590
#endif
2591
2592
    if (map_address != (void *)-1){
2593
      shmctl(shmid, IPC_RMID, 0);
2594
    }else printf("alloc_hugetlb failed\n");
2595
  }
2596
#endif
2597
2598
#ifdef __sun__
2599
  struct memcntl_mha mha;
2600
2601
  mha.mha_cmd = MHA_MAPSIZE_BSSBRK;
2602
  mha.mha_flags = 0;
2603
  mha.mha_pagesize = HUGE_PAGESIZE;
2604
  memcntl(NULL, 0, MC_HAT_ADVISE, (char *)&mha, 0, 0);
2605
2606
  map_address = (BLASULONG)memalign(HUGE_PAGESIZE, BUFFER_SIZE);
2607
#endif
2608
2609
#ifdef OS_WINDOWS
2610
2611
  HANDLE hToken;
2612
  TOKEN_PRIVILEGES tp;
2613
2614
  if (OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES, &hToken) != TRUE) return (void *) -1;
2615
2616
  tp.PrivilegeCount = 1;
2617
  tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED;
2618
2619
  if (LookupPrivilegeValue(NULL, SE_LOCK_MEMORY_NAME, &tp.Privileges[0].Luid) != TRUE) {
2620
      CloseHandle(hToken);
2621
      return (void*)-1;
2622
  }
2623
2624
  if (AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL) != TRUE) {
2625
      CloseHandle(hToken);
2626
      return (void*)-1;
2627
  }
2628
2629
  map_address  = (void *)VirtualAlloc(address,
2630
                                      BUFFER_SIZE,
2631
                                      MEM_LARGE_PAGES | MEM_RESERVE | MEM_COMMIT,
2632
                                      PAGE_READWRITE);
2633
2634
  tp.Privileges[0].Attributes = 0;
2635
  AdjustTokenPrivileges(hToken, FALSE, &tp, 0, NULL, NULL);
2636
2637
  if (map_address == (void *)NULL) map_address = (void *)-1;
2638
2639
#endif
2640
2641
  if (map_address != (void *)-1){
2642
    if (likely(release_pos < NUM_BUFFERS)) {
2643
    release_info[release_pos].address = map_address;
2644
    release_info[release_pos].func    = alloc_hugetlb_free;
2645
    } else {
2646
    new_release_info[release_pos-NUM_BUFFERS].address = map_address;
2647
    new_release_info[release_pos-NUM_BUFFERS].func    = alloc_hugetlb_free;
2648
    }
2649
    release_pos ++;
2650
  }
2651
2652
  return map_address;
2653
}
2654
#endif
2655
2656
2657
#ifdef  ALLOC_HUGETLBFILE
2658
2659
static int hugetlb_pid = 0;
2660
2661
static void alloc_hugetlbfile_free(struct release_t *release){
2662
2663
  if (munmap(release -> address, BUFFER_SIZE)) {
2664
    printf("OpenBLAS : HugeTLBfs unmap failed.\n");
2665
  }
2666
2667
  if (close(release -> attr)) {
2668
    printf("OpenBLAS : HugeTLBfs close failed.\n");
2669
  }
2670
}
2671
2672
static void *alloc_hugetlbfile(void *address){
2673
2674
  void *map_address = (void *)-1;
2675
  int fd;
2676
  char filename[64];
2677
2678
  if (!hugetlb_pid) hugetlb_pid = getpid();
2679
2680
  sprintf(filename, "%s/gotoblas.%d", HUGETLB_FILE_NAME, hugetlb_pid);
2681
2682
  if ((fd = open(filename, O_RDWR | O_CREAT, 0700)) < 0) {
2683
    return (void *)-1;
2684
  }
2685
2686
  unlink(filename);
2687
2688
  map_address = mmap(address, BUFFER_SIZE,
2689
                     PROT_READ | PROT_WRITE,
2690
                     MAP_SHARED,
2691
                     fd, 0);
2692
2693
  if (map_address != (void *)-1) {
2694
    if (likely(release_pos < NUM_BUFFERS)) {
2695
    release_info[release_pos].address = map_address;
2696
    release_info[release_pos].attr    = fd;
2697
    release_info[release_pos].func    = alloc_hugetlbfile_free;
2698
    } else {
2699
    new_release_info[release_pos-NUM_BUFFERS].address = map_address;
2700
    new_release_info[release_pos-NUM_BUFFERS].attr    = fd;
2701
    new_release_info[release_pos-NUM_BUFFERS].func    = alloc_hugetlbfile_free;
2702
    }
2703
    release_pos ++;
2704
  }
2705
2706
  return map_address;
2707
}
2708
#endif
2709
2710
2711
#ifdef SEEK_ADDRESS
2712
static BLASULONG base_address      = 0UL;
2713
#else
2714
static BLASULONG base_address      = BASE_ADDRESS;
2715
#endif
2716
2717
/*
 * Primary table of buffer slots managed by blas_memory_alloc() and
 * blas_memory_free(): one entry per buffer, NUM_BUFFERS entries in total.
 */
static volatile struct {
2718
  BLASULONG lock;   /* per-slot lock word, taken with blas_lock() in the USE_OPENMP build */
2719
  void *addr;       /* address of the buffer mapped for this slot; 0 until one is mapped */
2720
#if defined(WHEREAMI) && !defined(USE_OPENMP)
2721
  int   pos;        /* -1 until blas_memory_alloc() stores its mypos value here */
2722
#endif
2723
  int used;         /* 1 while the slot is handed out, 0 when it is free */
2724
#ifndef __64BIT__
2725
  char dummy[48];   /* never accessed; presumably padding to a fixed entry size -- TODO confirm */
2726
#else
2727
  char dummy[40];   /* never accessed; presumably padding to a fixed entry size -- TODO confirm */
2728
#endif
2729
2730
} memory[NUM_BUFFERS];
2731
2732
/*
 * Entry type of the auxiliary slot table (newmemory) that
 * blas_memory_alloc() creates with malloc() once all NUM_BUFFERS primary
 * slots are in use.  The fields mirror those of the memory[] entries.
 */
struct newmemstruct 
2733
{
2734
  BLASULONG lock;   /* per-slot lock word, taken with blas_lock() in the USE_OPENMP build */
2735
  void *addr;       /* address of the buffer mapped for this slot; 0 until one is mapped */
2736
#if defined(WHEREAMI) && !defined(USE_OPENMP)
2737
  int   pos;        /* -1 until blas_memory_alloc() stores its mypos value here */
2738
#endif
2739
  int used;         /* 1 while the slot is handed out, 0 when it is free */
2740
#ifndef __64BIT__
2741
  char dummy[48];   /* never accessed; presumably padding to a fixed entry size -- TODO confirm */
2742
#else
2743
  char dummy[40];   /* never accessed; presumably padding to a fixed entry size -- TODO confirm */
2744
#endif
2745
2746
};
2747
static volatile struct newmemstruct *newmemory;
2748
2749
static volatile int memory_initialized = 0;
2750
static int memory_overflowed = 0;
2751
/*       Memory allocation routine           */
2752
/* procpos ... indicates where it comes from */
2753
/*                0 : Level 3 functions      */
2754
/*                1 : Level 2 functions      */
2755
/*                2 : Thread                 */
2756
2757
380
void *blas_memory_alloc(int procpos){
2758
2759
380
  int i;
2760
  
2761
380
  int position;
2762
#if defined(WHEREAMI) && !defined(USE_OPENMP)
2763
  int mypos = 0;
2764
#endif
2765
2766
380
  void *map_address;
2767
2768
380
  void *(*memoryalloc[])(void *address) = {
2769
#ifdef ALLOC_DEVICEDRIVER
2770
    alloc_devicedirver,
2771
#endif
2772
#if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB)
2773
    alloc_shm,
2774
#endif
2775
#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))
2776
    alloc_hugetlb,
2777
#endif
2778
380
#ifdef ALLOC_MMAP
2779
380
    alloc_mmap,
2780
380
#endif
2781
#ifdef ALLOC_QALLOC
2782
    alloc_qalloc,
2783
#endif
2784
#ifdef ALLOC_WINDOWS
2785
    alloc_windows,
2786
#endif
2787
380
#ifdef ALLOC_MALLOC
2788
380
    alloc_malloc,
2789
380
#endif
2790
380
    NULL,
2791
380
  };
2792
380
  void *(**func)(void *address);
2793
2794
380
  if (!memory_initialized) {
2795
#if defined(SMP) && !defined(USE_OPENMP)
2796
    LOCK_COMMAND(&alloc_lock);
2797
    if (!memory_initialized) {
2798
#endif
2799
2800
#if defined(WHEREAMI) && !defined(USE_OPENMP)
2801
    for (position = 0; position < NUM_BUFFERS; position ++){
2802
      memory[position].addr   = (void *)0;
2803
      memory[position].pos    = -1;
2804
      memory[position].used   = 0;
2805
      memory[position].lock   = 0;
2806
    }
2807
#endif
2808
2809
9
#ifdef DYNAMIC_ARCH
2810
9
    gotoblas_dynamic_init();
2811
9
#endif
2812
2813
#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
2814
    gotoblas_affinity_init();
2815
#endif
2816
2817
9
#ifdef SMP
2818
9
    if (!blas_num_threads) blas_cpu_number = blas_get_cpu_number();
2819
9
#endif
2820
2821
9
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) || defined(ARCH_LOONGARCH64)
2822
#ifndef DYNAMIC_ARCH
2823
    blas_set_parameter();
2824
#endif
2825
9
#endif
2826
2827
9
    memory_initialized = 1;
2828
9
    WMB;
2829
#if defined(SMP) && !defined(USE_OPENMP)
2830
  }
2831
  UNLOCK_COMMAND(&alloc_lock);
2832
#endif
2833
9
}
2834
2835
#ifdef DEBUG
2836
  printf("Alloc Start ...\n");
2837
#endif
2838
2839
/* #if defined(WHEREAMI) && !defined(USE_OPENMP)
2840
2841
  mypos = WhereAmI();
2842
2843
  position = mypos;
2844
  while (position >= NUM_BUFFERS) position >>= 1;
2845
2846
  do {
2847
    if (!memory[position].used && (memory[position].pos == mypos)) {
2848
#if defined(SMP) && !defined(USE_OPENMP)
2849
      LOCK_COMMAND(&alloc_lock);
2850
#else
2851
      blas_lock(&memory[position].lock);
2852
#endif
2853
      if (!memory[position].used) goto allocation;
2854
#if defined(SMP) && !defined(USE_OPENMP)
2855
      UNLOCK_COMMAND(&alloc_lock);
2856
#else
2857
      blas_unlock(&memory[position].lock);
2858
#endif
2859
    }
2860
2861
    position ++;
2862
2863
  } while (position < NUM_BUFFERS);
2864
2865
2866
#endif */
2867
2868
380
  position = 0;
2869
2870
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
2871
  LOCK_COMMAND(&alloc_lock);
2872
#endif
2873
670
  do {
2874
670
    RMB;
2875
670
#if defined(USE_OPENMP)
2876
670
    if (!memory[position].used) {
2877
380
      blas_lock(&memory[position].lock);
2878
380
#endif
2879
380
      if (!memory[position].used) goto allocation;
2880
2881
0
#if defined(USE_OPENMP)
2882
0
      blas_unlock(&memory[position].lock);
2883
0
    }
2884
290
#endif
2885
290
    position ++;
2886
2887
290
  } while (position < NUM_BUFFERS);
2888
2889
0
  if (memory_overflowed) {
2890
2891
0
    do {
2892
0
      RMB;
2893
0
#if defined(USE_OPENMP)
2894
0
      if (!newmemory[position-NUM_BUFFERS].used) {
2895
0
        blas_lock(&newmemory[position-NUM_BUFFERS].lock);
2896
0
#endif
2897
0
        if (!newmemory[position-NUM_BUFFERS].used) goto allocation2;
2898
2899
0
#if defined(USE_OPENMP)
2900
0
        blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
2901
0
      }
2902
0
#endif
2903
0
      position ++;
2904
2905
0
    } while (position < NEW_BUFFERS + NUM_BUFFERS);
2906
0
  }
2907
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
2908
  UNLOCK_COMMAND(&alloc_lock);
2909
#endif
2910
0
  goto error;
2911
2912
380
  allocation :
2913
2914
#ifdef DEBUG
2915
  printf("  Position -> %d\n", position);
2916
#endif
2917
2918
380
  memory[position].used = 1;
2919
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
2920
  UNLOCK_COMMAND(&alloc_lock);
2921
#else
2922
380
  blas_unlock(&memory[position].lock);
2923
380
#endif
2924
380
  if (!memory[position].addr) {
2925
76
    do {
2926
#ifdef DEBUG
2927
      printf("Allocation Start : %lx\n", base_address);
2928
#endif
2929
2930
76
      map_address = (void *)-1;
2931
2932
76
      func = &memoryalloc[0];
2933
2934
152
      while ((*func != NULL) && (map_address == (void *) -1)) {
2935
2936
76
        map_address = (*func)((void *)base_address);
2937
2938
#ifdef ALLOC_DEVICEDRIVER
2939
        if ((*func ==  alloc_devicedirver) && (map_address == (void *)-1)) {
2940
            fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
2941
        }
2942
#endif
2943
2944
#ifdef ALLOC_HUGETLBFILE
2945
        if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
2946
#ifndef OS_WINDOWS
2947
            fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
2948
#endif
2949
        }
2950
#endif
2951
2952
#if (defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
2953
        if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
2954
#ifdef DEBUG
2955
  if (hugetlb_allocated) printf("allocating via shared memory with large page support (hugetlb)\n");
2956
#endif
2957
#endif
2958
2959
#if (defined ALLOC_SHM) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
2960
#ifdef DEBUG
2961
  printf("allocating via shared memory\n");
2962
#endif
2963
        if ((*func == alloc_shm) && (map_address == (void *)-1)) {
2964
#ifndef OS_WINDOWS
2965
            fprintf(stderr, "OpenBLAS Warning ... shared memory allocation was failed.\n");
2966
#endif
2967
  }
2968
#endif
2969
2970
76
        func ++;
2971
76
      }
2972
2973
#ifdef DEBUG
2974
      printf("  Success -> %08lx\n", map_address);
2975
#endif
2976
76
      if (((BLASLONG) map_address) == -1) base_address = 0UL;
2977
2978
76
      if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
2979
2980
76
    } while ((BLASLONG)map_address == -1);
2981
2982
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
2983
    LOCK_COMMAND(&alloc_lock);
2984
#endif
2985
76
    memory[position].addr = map_address;
2986
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
2987
    UNLOCK_COMMAND(&alloc_lock);
2988
#endif
2989
2990
#ifdef DEBUG
2991
    printf("  Mapping Succeeded. %p(%d)\n", (void *)memory[position].addr, position);
2992
#endif
2993
76
  }
2994
2995
#if defined(WHEREAMI) && !defined(USE_OPENMP)
2996
2997
  if (memory[position].pos == -1) memory[position].pos = mypos;
2998
2999
#endif
3000
3001
380
#ifdef DYNAMIC_ARCH
3002
3003
380
  if (memory_initialized == 1) {
3004
3005
9
    LOCK_COMMAND(&alloc_lock);
3006
3007
9
    if (memory_initialized == 1) {
3008
3009
9
      if (!gotoblas) gotoblas_dynamic_init();
3010
3011
9
      memory_initialized = 2;
3012
9
    }
3013
3014
9
    UNLOCK_COMMAND(&alloc_lock);
3015
3016
9
  }
3017
380
#endif
3018
3019
3020
#ifdef DEBUG
3021
  printf("Mapped   : %p  %3d\n\n",
3022
          (void *)memory[position].addr, position);
3023
#endif
3024
3025
380
  return (void *)memory[position].addr;
3026
3027
0
 error:
3028
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
3029
  LOCK_COMMAND(&alloc_lock);
3030
#endif
3031
0
 if (memory_overflowed) goto terminate;
3032
0
  fprintf(stderr,"OpenBLAS warning: precompiled NUM_THREADS exceeded, adding auxiliary array for thread metadata.\n");
3033
0
  fprintf(stderr,"To avoid this warning, please rebuild your copy of OpenBLAS with a larger NUM_THREADS setting\n");
3034
0
  fprintf(stderr,"or set the environment variable OPENBLAS_NUM_THREADS to %d or lower\n", MAX_CPU_NUMBER);
3035
0
  memory_overflowed=1;
3036
0
  MB;
3037
0
  new_release_info = (struct release_t*) malloc(NEW_BUFFERS * sizeof(struct release_t));
3038
0
  newmemory = (struct newmemstruct*) malloc(NEW_BUFFERS * sizeof(struct newmemstruct));
3039
0
  for (i = 0; i < NEW_BUFFERS; i++) {
3040
0
  newmemory[i].addr   = (void *)0;
3041
#if defined(WHEREAMI) && !defined(USE_OPENMP)
3042
  newmemory[i].pos    = -1;
3043
#endif
3044
0
  newmemory[i].used   = 0;
3045
0
  newmemory[i].lock   = 0;
3046
0
}
3047
  
3048
0
allocation2:
3049
0
  newmemory[position-NUM_BUFFERS].used = 1;
3050
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
3051
  UNLOCK_COMMAND(&alloc_lock);
3052
#else
3053
0
  blas_unlock(&newmemory[position-NUM_BUFFERS].lock);
3054
0
#endif
3055
0
    do {
3056
#ifdef DEBUG
3057
      printf("Allocation Start : %lx\n", base_address);
3058
#endif
3059
3060
0
      map_address = (void *)-1;
3061
3062
0
      func = &memoryalloc[0];
3063
3064
0
      while ((*func != NULL) && (map_address == (void *) -1)) {
3065
3066
0
        map_address = (*func)((void *)base_address);
3067
3068
#ifdef ALLOC_DEVICEDRIVER
3069
        if ((*func ==  alloc_devicedirver) && (map_address == (void *)-1)) {
3070
            fprintf(stderr, "OpenBLAS Warning ... Physically contiguous allocation was failed.\n");
3071
        }
3072
#endif
3073
3074
#ifdef ALLOC_HUGETLBFILE
3075
        if ((*func == alloc_hugetlbfile) && (map_address == (void *)-1)) {
3076
#ifndef OS_WINDOWS
3077
            fprintf(stderr, "OpenBLAS Warning ... HugeTLB(File) allocation was failed.\n");
3078
#endif
3079
        }
3080
#endif
3081
3082
#if (defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
3083
#ifdef DEBUG
3084
  fprintf(stderr,"OpenBLAS: allocating via shared memory with large page support (hugetlb)\n");
3085
#endif
3086
        if ((*func == alloc_hugetlb) && (map_address != (void *)-1)) hugetlb_allocated = 1;
3087
#endif
3088
3089
#if (defined ALLOC_SHM) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS)
3090
#ifdef DEBUG
3091
  fprintf(stderr,"allocating via shared memory\n");
3092
#endif
3093
        if ((*func == alloc_shm) && (map_address == (void *)-1)) {
3094
#ifndef OS_WINDOWS
3095
            fprintf(stderr, "OpenBLAS Warning ... shared memory allocation was failed.\n");
3096
#endif
3097
  }
3098
#endif
3099
0
        func ++;
3100
0
      }
3101
3102
#ifdef DEBUG
3103
      printf("  Success -> %08lx\n", map_address);
3104
#endif
3105
0
      if (((BLASLONG) map_address) == -1) base_address = 0UL;
3106
3107
0
      if (base_address) base_address += BUFFER_SIZE + FIXED_PAGESIZE;
3108
3109
0
    } while ((BLASLONG)map_address == -1);
3110
3111
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
3112
    LOCK_COMMAND(&alloc_lock);
3113
#endif
3114
0
    newmemory[position-NUM_BUFFERS].addr = map_address;
3115
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
3116
    UNLOCK_COMMAND(&alloc_lock);
3117
#endif
3118
3119
#ifdef DEBUG
3120
    printf("  Mapping Succeeded. %p(%d)\n", (void *)newmemory[position-NUM_BUFFERS].addr, position);
3121
#endif
3122
3123
#if defined(WHEREAMI) && !defined(USE_OPENMP)
3124
3125
  if (newmemory[position-NUM_BUFFERS].pos == -1) newmemory[position-NUM_BUFFERS].pos = mypos;
3126
3127
#endif
3128
0
  return (void *)newmemory[position-NUM_BUFFERS].addr;
3129
3130
0
terminate:
3131
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
3132
    UNLOCK_COMMAND(&alloc_lock);
3133
#endif
3134
0
  printf("OpenBLAS : Program is Terminated. Because you tried to allocate too many memory regions.\n");
3135
0
  printf("This library was built to support a maximum of %d threads - either rebuild OpenBLAS\n", NUM_BUFFERS);
3136
0
  printf("with a larger NUM_THREADS value or set the environment variable OPENBLAS_NUM_THREADS to\n");
3137
0
  printf("a sufficiently small number. This error typically occurs when the software that relies on\n");
3138
0
  printf("OpenBLAS calls BLAS functions from many threads in parallel, or when your computer has more\n");
3139
0
  printf("cpu cores than what OpenBLAS was configured to handle.\n"); 
3140
0
  return NULL;
3141
0
}
3142
3143
340
void blas_memory_free(void *free_area){
3144
3145
340
  int position;
3146
3147
#ifdef DEBUG
3148
  printf("Unmapped Start : %p ...\n", free_area);
3149
#endif
3150
3151
340
  position = 0;
3152
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
3153
  LOCK_COMMAND(&alloc_lock);
3154
#endif
3155
490
  while ((position < NUM_BUFFERS) && (memory[position].addr != free_area))
3156
150
    position++;
3157
3158
340
  if (position >= NUM_BUFFERS && !memory_overflowed) goto error;
3159
3160
#ifdef DEBUG
3161
  if (memory[position].addr != free_area) goto error;
3162
  printf("  Position : %d\n", position);
3163
#endif
3164
340
  if (unlikely(memory_overflowed && position >= NUM_BUFFERS)) {
3165
0
    while ((position < NUM_BUFFERS+NEW_BUFFERS) && (newmemory[position-NUM_BUFFERS].addr != free_area))
3166
0
      position++;
3167
  // arm: ensure all writes are finished before other thread takes this memory
3168
0
  WMB;
3169
0
if (position - NUM_BUFFERS >= NEW_BUFFERS) goto error;
3170
0
  newmemory[position-NUM_BUFFERS].used = 0;
3171
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
3172
  UNLOCK_COMMAND(&alloc_lock);
3173
#endif
3174
3175
#ifdef DEBUG
3176
  printf("Unmap from overflow area succeeded.\n\n");
3177
#endif
3178
0
  return;
3179
340
} else {
3180
  // arm: ensure all writes are finished before other thread takes this memory
3181
340
  WMB;
3182
3183
340
  memory[position].used = 0;
3184
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
3185
  UNLOCK_COMMAND(&alloc_lock);
3186
#endif
3187
3188
#ifdef DEBUG
3189
  printf("Unmap Succeeded.\n\n");
3190
#endif
3191
3192
340
  return;
3193
340
}
3194
0
 error:
3195
0
  printf("BLAS : Bad memory unallocation! : %4d  %p\n", position,  free_area);
3196
3197
#ifdef DEBUG
3198
  for (position = 0; position < NUM_BUFFERS; position++)
3199
    printf("%4ld  %p : %d\n", position, memory[position].addr, memory[position].used);
3200
#endif
3201
#if (defined(SMP) || defined(USE_LOCKING)) && !defined(USE_OPENMP)
3202
  UNLOCK_COMMAND(&alloc_lock);
3203
#endif
3204
0
  return;
3205
340
}
3206
3207
0
void *blas_memory_alloc_nolock(int unused) {
3208
0
  void *map_address;
3209
0
  map_address = (void *)malloc(BUFFER_SIZE + FIXED_PAGESIZE);
3210
0
  return map_address;
3211
0
}
3212
3213
0
/*
 * Counterpart of blas_memory_alloc_nolock(): passes the pointer directly
 * to free().
 */
void blas_memory_free_nolock(void * map_address) {
  void *chunk = map_address;

  free(chunk);
}
3216
3217
0
void blas_shutdown(void){
3218
3219
0
  int pos;
3220
3221
0
#ifdef SMP
3222
0
  BLASFUNC(blas_thread_shutdown)();
3223
0
#endif
3224
3225
0
  LOCK_COMMAND(&alloc_lock);
3226
3227
0
  for (pos = 0; pos < release_pos; pos ++) {
3228
0
    if (likely(pos < NUM_BUFFERS))
3229
0
    release_info[pos].func(&release_info[pos]);
3230
0
    else
3231
0
    new_release_info[pos-NUM_BUFFERS].func(&new_release_info[pos-NUM_BUFFERS]);
3232
0
  }
3233
3234
0
#ifdef SEEK_ADDRESS
3235
0
  base_address      = 0UL;
3236
#else
3237
  base_address      = BASE_ADDRESS;
3238
#endif
3239
3240
0
  for (pos = 0; pos < NUM_BUFFERS; pos ++){
3241
0
    memory[pos].addr   = (void *)0;
3242
0
    memory[pos].used   = 0;
3243
#if defined(WHEREAMI) && !defined(USE_OPENMP)
3244
    memory[pos].pos    = -1;
3245
#endif
3246
0
    memory[pos].lock   = 0;
3247
0
  }
3248
0
  if (memory_overflowed) {
3249
0
    for (pos = 0; pos < NEW_BUFFERS; pos ++){
3250
0
      newmemory[pos].addr   = (void *)0;
3251
0
      newmemory[pos].used   = 0;
3252
#if defined(WHEREAMI) && !defined(USE_OPENMP)
3253
      newmemory[pos].pos    = -1;
3254
#endif
3255
0
      newmemory[pos].lock   = 0;
3256
0
    }
3257
0
    free((void*)newmemory);
3258
0
    newmemory = NULL;
3259
0
    memory_overflowed = 0;  
3260
0
  }
3261
3262
0
  UNLOCK_COMMAND(&alloc_lock);
3263
3264
0
  return;
3265
0
}
3266
3267
#if defined(OS_LINUX) && !defined(NO_WARMUP)
3268
3269
#if defined(SMP) || defined(USE_LOCKING)
3270
#if   defined(USE_PTHREAD_LOCK)
3271
static pthread_mutex_t    init_lock = PTHREAD_MUTEX_INITIALIZER;
3272
#elif defined(USE_PTHREAD_SPINLOCK)
3273
static pthread_spinlock_t init_lock = 0;
3274
#else
3275
static BLASULONG   init_lock = 0UL;
3276
#endif
3277
#endif
3278
3279
static void _touch_memory(blas_arg_t *arg, BLASLONG *range_m, BLASLONG *range_n,
3280
                          void *sa, void *sb, BLASLONG pos) {
3281
3282
#if !defined(ARCH_POWER) && !defined(ARCH_SPARC)
3283
3284
  size_t size;
3285
  BLASULONG buffer;
3286
3287
  size   = BUFFER_SIZE - PAGESIZE;
3288
  buffer = (BLASULONG)sa + GEMM_OFFSET_A;
3289
3290
#if defined(OS_LINUX) && !defined(NO_WARMUP)
3291
    if (hot_alloc != 2) {
3292
#endif
3293
3294
#if defined(SMP) || defined(USE_LOCKING)
3295
  LOCK_COMMAND(&init_lock);
3296
#endif
3297
3298
  while (size > 0) {
3299
    *(int *)buffer = size;
3300
    buffer  += PAGESIZE;
3301
    size    -= PAGESIZE;
3302
  }
3303
3304
#if defined(SMP) || defined(USE_LOCKING)
3305
  UNLOCK_COMMAND(&init_lock);
3306
#endif
3307
3308
  size = MIN((BUFFER_SIZE - PAGESIZE), L2_SIZE);
3309
  buffer = (BLASULONG)sa + GEMM_OFFSET_A;
3310
3311
  while (size > 0) {
3312
    *(int *)buffer = size;
3313
    buffer  += 64;
3314
    size    -= 64;
3315
  }
3316
3317
#if defined(OS_LINUX) && !defined(NO_WARMUP)
3318
    }
3319
#endif
3320
3321
#endif
3322
}
3323
3324
#ifdef SMP

/*
 * Build a chain of blas_num_threads queue entries that all run
 * _touch_memory and submit it through exec_blas().
 *
 * Only the first entry receives `buffer` as its `sa`; the `sa` of the other
 * entries is left as blas_queue_init() set it.
 */
static void _init_thread_memory(void *buffer) {

  blas_queue_t queue[MAX_CPU_NUMBER];
  int count = 0;

  while (count < blas_num_threads) {
    blas_queue_t *entry = &queue[count];

    blas_queue_init(entry);
    entry->mode    = BLAS_DOUBLE | BLAS_REAL;
    entry->routine = &_touch_memory;
    entry->args    = NULL;
    entry->next    = entry + 1;   /* provisional link, fixed up for the tail below */

    count++;
  }

  /* Terminate the chain at the last entry that was filled in. */
  queue[count - 1].next = NULL;
  queue[0].sa = buffer;

  exec_blas(count, queue);
}
#endif
3347
3348
static void gotoblas_memory_init(void) {
3349
3350
  void *buffer;
3351
3352
  hot_alloc = 1;
3353
3354
  buffer = (void *)blas_memory_alloc(0);
3355
3356
#ifdef SMP
3357
  if (blas_cpu_number == 0) blas_get_cpu_number();
3358
#ifdef SMP_SERVER
3359
  if (blas_server_avail == 0) blas_thread_init();
3360
#endif
3361
3362
  _init_thread_memory((void *)((BLASULONG)buffer + GEMM_OFFSET_A));
3363
3364
#else
3365
3366
  _touch_memory(NULL, NULL, NULL, (void *)((BLASULONG)buffer + GEMM_OFFSET_A), NULL, 0);
3367
3368
#endif
3369
3370
  blas_memory_free(buffer);
3371
}
3372
#endif
3373
3374
/* Library-wide initialization; this function must be called before main(). */
3375
3376
static int gotoblas_initialized = 0;
3377
extern void openblas_read_env(void);
3378
3379
9
void CONSTRUCTOR gotoblas_init(void) {
3380
3381
9
  if (gotoblas_initialized) return;
3382
3383
9
#ifdef SMP
3384
9
  openblas_fork_handler();
3385
9
#endif
3386
3387
9
  openblas_read_env();
3388
3389
#ifdef PROFILE
3390
   moncontrol (0);
3391
#endif
3392
3393
9
#ifdef DYNAMIC_ARCH
3394
9
   gotoblas_dynamic_init();
3395
9
#endif
3396
3397
#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
3398
   gotoblas_affinity_init();
3399
#endif
3400
3401
#if defined(OS_LINUX) && !defined(NO_WARMUP)
3402
   gotoblas_memory_init();
3403
#endif
3404
3405
//#if defined(OS_LINUX)
3406
#if 0
3407
  struct rlimit curlimit;
3408
  if ( getrlimit(RLIMIT_STACK, &curlimit ) == 0 )
3409
  {
3410
    if ( curlimit.rlim_cur != curlimit.rlim_max )
3411
    {
3412
      curlimit.rlim_cur = curlimit.rlim_max;
3413
      setrlimit(RLIMIT_STACK, &curlimit);
3414
    }
3415
  }
3416
#endif
3417
3418
9
#ifdef SMP
3419
9
  if (blas_cpu_number == 0) blas_get_cpu_number();
3420
9
#ifdef SMP_SERVER
3421
9
  if (blas_server_avail == 0) blas_thread_init();
3422
9
#endif
3423
9
#endif
3424
3425
#ifdef FUNCTION_PROFILE
3426
   gotoblas_profile_init();
3427
#endif
3428
3429
9
   gotoblas_initialized = 1;
3430
3431
#ifdef PROFILE
3432
   moncontrol (1);
3433
#endif
3434
3435
9
}
3436
3437
0
/*
 * Library tear-down, registered through the DESTRUCTOR attribute macro.
 *
 * Does nothing unless gotoblas_initialized is set.  Otherwise it calls
 * blas_shutdown(), runs the quit hooks selected by the build configuration
 * and clears gotoblas_initialized so that gotoblas_init() may run again.
 */
void DESTRUCTOR gotoblas_quit(void) {

  if (!gotoblas_initialized) {
    return;
  }

  blas_shutdown();

#ifdef PROFILE
  moncontrol(0);
#endif

#ifdef FUNCTION_PROFILE
  gotoblas_profile_quit();
#endif

#if defined(SMP) && defined(OS_LINUX) && !defined(NO_AFFINITY)
  gotoblas_affinity_quit();
#endif

#ifdef DYNAMIC_ARCH
  gotoblas_dynamic_quit();
#endif

  gotoblas_initialized = 0;

#ifdef PROFILE
  moncontrol(1);
#endif
}
3465
3466
#if defined(_MSC_VER) && !defined(__clang__)
3467
/*
 * DLL entry point for MSVC builds.
 *
 * Runs gotoblas_init() on process attach and gotoblas_quit() on process
 * detach; thread attach/detach and any other reason code are ignored.
 * Always returns TRUE.
 */
BOOL APIENTRY DllMain(HMODULE hModule, DWORD  ul_reason_for_call, LPVOID lpReserved)
{
  if (ul_reason_for_call == DLL_PROCESS_ATTACH) {
    gotoblas_init();
  } else if (ul_reason_for_call == DLL_PROCESS_DETACH) {
    gotoblas_quit();
  }

  return TRUE;
}
3486
3487
/*
  The declarations that follow allow static linking: they arrange for the
  init / quit routines to run even when no DllMain is invoked by the loader.

  Code adapted from Google performance tools:
  https://gperftools.googlecode.com/git-history/perftools-1.0/src/windows/port.cc
  Reference:
  https://sourceware.org/ml/pthreads-win32/2008/msg00028.html
  http://ci.boost.org/svn-trac/browser/trunk/libs/thread/src/win32/tss_pe.cpp
*/

/* Process-termination hook: shuts the library down and reports success (0). */
static int on_process_term(void)
{
  gotoblas_quit();

  return 0;
}
3500
#ifdef _WIN64
3501
#pragma comment(linker, "/INCLUDE:_tls_used")
3502
#else
3503
#pragma comment(linker, "/INCLUDE:__tls_used")
3504
#endif
3505
3506
#ifdef _WIN64
3507
#pragma const_seg(".CRT$XLB")
3508
#else
3509
#pragma data_seg(".CRT$XLB")
3510
#endif
3511
static void (APIENTRY *dll_callback)(HINSTANCE h, DWORD ul_reason_for_call, PVOID pv) = DllMain;
3512
#ifdef _WIN64
3513
#pragma const_seg()
3514
#else
3515
#pragma data_seg()
3516
#endif
3517
3518
#ifdef _WIN64
3519
#pragma const_seg(".CRT$XTU")
3520
#else
3521
#pragma data_seg(".CRT$XTU")
3522
#endif
3523
static int(*p_process_term)(void) = on_process_term;
3524
#ifdef _WIN64
3525
#pragma const_seg()
3526
#else
3527
#pragma data_seg()
3528
#endif
3529
#endif
3530
3531
#if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64))
/*
 * Work-around for a PGI / Sun compiler bug - never call this function.
 *
 * Its body references gotoblas_init() and gotoblas_quit() and, for
 * __PGIC__ < 19, emits assembler directives that place calls to both into
 * the .init / .fini sections.
 */
void gotoblas_dummy_for_PGI(void) {

  gotoblas_init();
  gotoblas_quit();

#if __PGIC__ < 19
#if 0
  /* Disabled alternative: register the routines through .ctors / .dtors. */
  asm ("\t.section\t.ctors,\"aw\",@progbits; .align 8; .quad gotoblas_init; .section .text");
  asm ("\t.section\t.dtors,\"aw\",@progbits; .align 8; .quad gotoblas_quit; .section .text");
#else
  asm (".section .init,\"ax\"; call gotoblas_init@PLT; .section .text");
  asm (".section .fini,\"ax\"; call gotoblas_quit@PLT; .section .text");
#endif
#endif
}
#endif
3548
3549
#endif