YAKL_atomics.h

#pragma once
// Included by YAKL.h

namespace yakl {

  // These are YAKL's atomic operations: atomicAdd, atomicMin, and atomicMax.
  // Where possible, hardware atomics are used. Where that's not possible, compare-and-swap
  // (CAS) implementations are used.
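  //
  // Minimal usage sketch (illustrative only; the names "vals", "sum", and "n" are
  // assumptions for this example, not part of this header):
  //
  //   yakl::Array<float,1,yakl::memDevice> vals("vals",n), sum("sum",1);
  //   yakl::memset( sum , 0.f );
  //   yakl::c::parallel_for( n , YAKL_LAMBDA (int i) {
  //     yakl::atomicAdd( sum(0) , vals(i) );  // many threads accumulate safely into sum(0)
  //   });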

  template <class T> inline void atomicAdd_host(T &update, T value) {
    update += value;
  }
  template <class T> inline void atomicMin_host(T &update, T value) {
    update = update < value ? update : value;
  }
  template <class T> inline void atomicMax_host(T &update, T value) {
    update = update > value ? update : value;
  }


  #ifdef YAKL_ARCH_CUDA

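  // CUDA has no native atomicMin/atomicMax for float or double, so they are emulated with
  // an atomicCAS loop: reinterpret the value's bits as an integer type, compute the desired
  // result, and retry the swap until no other thread has modified the word in between.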
  __device__ __forceinline__ void atomicMin_device(float &update , float value) {
    int oldval, newval, readback;
    oldval = __float_as_int(update);
    newval = __float_as_int( __int_as_float(oldval) < value ? __int_as_float(oldval) : value );
    while ( ( readback = atomicCAS( (int *) &update , oldval , newval ) ) != oldval ) {
      oldval = readback;
      newval = __float_as_int( __int_as_float(oldval) < value ? __int_as_float(oldval) : value );
    }
  }
  __device__ __forceinline__ void atomicMin_device(double &update , double value) {
    unsigned long long oldval, newval, readback;
    oldval = __double_as_longlong(update);
    newval = __double_as_longlong( __longlong_as_double(oldval) < value ? __longlong_as_double(oldval) : value );
    while ( ( readback = atomicCAS( (unsigned long long *) &update , oldval , newval ) ) != oldval ) {
      oldval = readback;
      newval = __double_as_longlong( __longlong_as_double(oldval) < value ? __longlong_as_double(oldval) : value );
    }
  }
  __device__ __forceinline__ void atomicMin_device(int &update , int value) {
    ::atomicMin( &update , value );
  }
  __device__ __forceinline__ void atomicMin_device(unsigned int &update , unsigned int value) {
    ::atomicMin( &update , value );
  }
  __device__ __forceinline__ void atomicMin_device(unsigned long long int &update , unsigned long long int value) {
    // 64-bit integer atomicMin requires compute capability 3.5 or later
    #if ( defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) ) || ( defined(__NVCOMPILER_CUDA_ARCH__) && (__NVCOMPILER_CUDA_ARCH__ >= 350) )
      ::atomicMin( &update , value );
    #else
      yakl_throw("ERROR: atomicMin not implemented for unsigned long long int for this CUDA architecture");
    #endif
  }
  template <class T> __host__ __device__ __forceinline__ void atomicMin(T &update , T value) {
    // Safe from double execution: with a CUDA backend, only one of these expands to active code
    YAKL_EXECUTE_ON_DEVICE_ONLY( atomicMin_device(update,value); )
    YAKL_EXECUTE_ON_HOST_ONLY  ( atomicMin_host  (update,value); )
  }

  __device__ __forceinline__ void atomicMax_device(float &update , float value) {
    int oldval, newval, readback;
    oldval = __float_as_int(update);
    newval = __float_as_int( __int_as_float(oldval) > value ? __int_as_float(oldval) : value );
    while ( ( readback = atomicCAS( (int *) &update , oldval , newval ) ) != oldval ) {
      oldval = readback;
      newval = __float_as_int( __int_as_float(oldval) > value ? __int_as_float(oldval) : value );
    }
  }
  __device__ __forceinline__ void atomicMax_device(double &update , double value) {
    unsigned long long oldval, newval, readback;
    oldval = __double_as_longlong(update);
    newval = __double_as_longlong( __longlong_as_double(oldval) > value ? __longlong_as_double(oldval) : value );
    while ( ( readback = atomicCAS( (unsigned long long *) &update , oldval , newval ) ) != oldval ) {
      oldval = readback;
      newval = __double_as_longlong( __longlong_as_double(oldval) > value ? __longlong_as_double(oldval) : value );
    }
  }
  __device__ __forceinline__ void atomicMax_device(int &update , int value) {
    ::atomicMax( &update , value );
  }
  __device__ __forceinline__ void atomicMax_device(unsigned int &update , unsigned int value) {
    ::atomicMax( &update , value );
  }
  __device__ __forceinline__ void atomicMax_device(unsigned long long int &update , unsigned long long int value) {
    // 64-bit integer atomicMax requires compute capability 3.5 or later
    #if ( defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) ) || ( defined(__NVCOMPILER_CUDA_ARCH__) && (__NVCOMPILER_CUDA_ARCH__ >= 350) )
      ::atomicMax( &update , value );
    #else
      yakl_throw("ERROR: atomicMax not implemented for unsigned long long int for this CUDA architecture");
    #endif
  }
  template <class T> __host__ __device__ __forceinline__ void atomicMax(T &update , T value) {
    // Safe from double execution: with a CUDA backend, only one of these expands to active code
    YAKL_EXECUTE_ON_DEVICE_ONLY( atomicMax_device(update,value); )
    YAKL_EXECUTE_ON_HOST_ONLY  ( atomicMax_host  (update,value); )
  }

  __device__ __forceinline__ void atomicAdd_device(float &update , float value) {
    ::atomicAdd( &update , value );
  }
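  // Native double-precision atomicAdd requires compute capability 6.0 or later; older
  // architectures fall back to an atomicCAS loop on the value's 64-bit integer bits.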
  __device__ __forceinline__ void atomicAdd_device(double &update , double value) {
    #if ( defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600) ) || ( defined(__NVCOMPILER_CUDA_ARCH__) && (__NVCOMPILER_CUDA_ARCH__ >= 600) )
      ::atomicAdd( &update , value );
    #else
      unsigned long long oldval, newval, readback;
      oldval = __double_as_longlong(update);
      newval = __double_as_longlong( __longlong_as_double(oldval) + value );
      while ( ( readback = atomicCAS( (unsigned long long *) &update , oldval , newval ) ) != oldval ) {
        oldval = readback;
        newval = __double_as_longlong( __longlong_as_double(oldval) + value );
      }
    #endif
  }
  __device__ __forceinline__ void atomicAdd_device(int &update , int value) {
    ::atomicAdd( &update , value );
  }
  __device__ __forceinline__ void atomicAdd_device(unsigned int &update , unsigned int value) {
    ::atomicAdd( &update , value );
  }
  __device__ __forceinline__ void atomicAdd_device(unsigned long long int &update , unsigned long long int value) {
    ::atomicAdd( &update , value );
  }
  template <class T> __host__ __device__ __forceinline__ void atomicAdd(T &update , T value) {
    // Safe from double execution: with a CUDA backend, only one of these expands to active code
    YAKL_EXECUTE_ON_DEVICE_ONLY( atomicAdd_device(update,value); )
    YAKL_EXECUTE_ON_HOST_ONLY  ( atomicAdd_host  (update,value); )
  }

  #elif defined(YAKL_ARCH_SYCL)

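  // SYCL expresses atomics through sycl::atomic_ref. A relaxed memory order is sufficient
  // here: these updates need atomicity only, not ordering with surrounding memory accesses.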
  template <typename T, sycl::access::address_space addressSpace =
                        sycl::access::address_space::global_space>
  using relaxed_atomic_ref =
      sycl::atomic_ref< T,
                        sycl::memory_order::relaxed,
                        sycl::memory_scope::device,
                        addressSpace>;

  template <typename T, sycl::access::address_space addressSpace =
                        sycl::access::address_space::global_space>
  __inline__ __attribute__((always_inline)) void atomicMin(T &update , T value) {
    relaxed_atomic_ref<T, addressSpace>( update ).fetch_min( value );
  }

  template <typename T, sycl::access::address_space addressSpace =
                        sycl::access::address_space::global_space>
  __inline__ __attribute__((always_inline)) void atomicMax(T &update , T value) {
    relaxed_atomic_ref<T, addressSpace>( update ).fetch_max( value );
  }

  template <typename T, sycl::access::address_space addressSpace =
                        sycl::access::address_space::global_space>
  __inline__ __attribute__((always_inline)) void atomicAdd(T &update , T value) {
    relaxed_atomic_ref<T, addressSpace>( update ).fetch_add( value );
  }

  #elif defined(YAKL_ARCH_HIP)

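  // The HIP implementations mirror the CUDA ones: native atomics where ROCm provides them,
  // and atomicCAS loops for float and double min/max.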
  __device__ __forceinline__ void atomicMin_device(float &update , float value) {
    int oldval, newval, readback;
    oldval = __float_as_int(update);
    newval = __float_as_int( __int_as_float(oldval) < value ? __int_as_float(oldval) : value );
    while ( ( readback = atomicCAS( (int *) &update , oldval , newval ) ) != oldval ) {
      oldval = readback;
      newval = __float_as_int( __int_as_float(oldval) < value ? __int_as_float(oldval) : value );
    }
  }
  __device__ __forceinline__ void atomicMin_device(double &update , double value) {
    unsigned long long oldval, newval, readback;
    oldval = __double_as_longlong(update);
    newval = __double_as_longlong( __longlong_as_double(oldval) < value ? __longlong_as_double(oldval) : value );
    while ( ( readback = atomicCAS( (unsigned long long *) &update , oldval , newval ) ) != oldval ) {
      oldval = readback;
      newval = __double_as_longlong( __longlong_as_double(oldval) < value ? __longlong_as_double(oldval) : value );
    }
  }
  __device__ __forceinline__ void atomicMin_device(int &update , int value) {
    ::atomicMin( &update , value );
  }
  __device__ __forceinline__ void atomicMin_device(unsigned int &update , unsigned int value) {
    ::atomicMin( &update , value );
  }
  __device__ __forceinline__ void atomicMin_device(unsigned long long int &update , unsigned long long int value) {
    ::atomicMin( &update , value );
  }
  template <class T> __host__ __device__ __forceinline__ void atomicMin(T &update , T value) {
    // Safe from double execution: with a HIP backend, only one of these expands to active code
    YAKL_EXECUTE_ON_DEVICE_ONLY( atomicMin_device(update,value); )
    YAKL_EXECUTE_ON_HOST_ONLY  ( atomicMin_host  (update,value); )
  }

  __device__ __forceinline__ void atomicMax_device(float &update , float value) {
    int oldval, newval, readback;
    oldval = __float_as_int(update);
    newval = __float_as_int( __int_as_float(oldval) > value ? __int_as_float(oldval) : value );
    while ( ( readback = atomicCAS( (int *) &update , oldval , newval ) ) != oldval ) {
      oldval = readback;
      newval = __float_as_int( __int_as_float(oldval) > value ? __int_as_float(oldval) : value );
    }
  }
  __device__ __forceinline__ void atomicMax_device(double &update , double value) {
    unsigned long long oldval, newval, readback;
    oldval = __double_as_longlong(update);
    newval = __double_as_longlong( __longlong_as_double(oldval) > value ? __longlong_as_double(oldval) : value );
    while ( ( readback = atomicCAS( (unsigned long long *) &update , oldval , newval ) ) != oldval ) {
      oldval = readback;
      newval = __double_as_longlong( __longlong_as_double(oldval) > value ? __longlong_as_double(oldval) : value );
    }
  }
  __device__ __forceinline__ void atomicMax_device(int &update , int value) {
    ::atomicMax( &update , value );
  }
  __device__ __forceinline__ void atomicMax_device(unsigned int &update , unsigned int value) {
    ::atomicMax( &update , value );
  }
  __device__ __forceinline__ void atomicMax_device(unsigned long long int &update , unsigned long long int value) {
    ::atomicMax( &update , value );
  }
  template <class T> __host__ __device__ __forceinline__ void atomicMax(T &update , T value) {
    // Safe from double execution: with a HIP backend, only one of these expands to active code
    YAKL_EXECUTE_ON_DEVICE_ONLY( atomicMax_device(update,value); )
    YAKL_EXECUTE_ON_HOST_ONLY  ( atomicMax_host  (update,value); )
  }

  __device__ __forceinline__ void atomicAdd_device(float &update , float value) {
    ::atomicAdd( &update , value );
  }
  __device__ __forceinline__ void atomicAdd_device(double &update , double value) {
    ::atomicAdd( &update , value );
  }
  __device__ __forceinline__ void atomicAdd_device(int &update , int value) {
    ::atomicAdd( &update , value );
  }
  __device__ __forceinline__ void atomicAdd_device(unsigned int &update , unsigned int value) {
    ::atomicAdd( &update , value );
  }
  __device__ __forceinline__ void atomicAdd_device(unsigned long long int &update , unsigned long long int value) {
    ::atomicAdd( &update , value );
  }
  template <class T> __host__ __device__ __forceinline__ void atomicAdd(T &update , T value) {
    // Safe from double execution: with a HIP backend, only one of these expands to active code
    YAKL_EXECUTE_ON_DEVICE_ONLY( atomicAdd_device(update,value); )
    YAKL_EXECUTE_ON_HOST_ONLY  ( atomicAdd_host  (update,value); )
  }

  #elif defined(YAKL_ARCH_OPENMP)

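  // C/C++ "#pragma omp atomic" cannot express min or max (prior to OpenMP 5.1's compare
  // clause), so those fall back to a critical section; atomicAdd can use the cheaper
  // atomic update directive.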
  template <class T> inline void atomicMin(T &update, T value) {
    #pragma omp critical
    { update = value < update ? value : update; }
  }

  template <class T> inline void atomicMax(T &update, T value) {
    #pragma omp critical
    { update = value > update ? value : update; }
  }

  template <class T> inline void atomicAdd(T &update, T value) {
    #pragma omp atomic update
    update += value;
  }

  #else

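  // No device backend: dispatch straight to the host implementations. These are plain,
  // non-atomic updates, which is appropriate when execution is serial on the host.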
  template <class T> YAKL_INLINE void atomicMin(T &update, T value) { atomicMin_host(update,value); }

  template <class T> YAKL_INLINE void atomicMax(T &update, T value) { atomicMax_host(update,value); }

  template <class T> YAKL_INLINE void atomicAdd(T &update, T value) { atomicAdd_host(update,value); }

  #endif

}