// Plain host-side implementations
template <class T> inline void atomicAdd_host(T &update, T value) {
  update += value;
}

template <class T> inline void atomicMin_host(T &update, T value) {
  update = update < value ? update : value;
}

template <class T> inline void atomicMax_host(T &update, T value) {
  update = update > value ? update : value;
}
#ifdef YAKL_ARCH_CUDA

// CUDA has no built-in atomicMin/atomicMax for floating-point types, so they are
// emulated by looping on atomicCAS over the raw bit patterns.
__device__ __forceinline__ void atomicMin_device(float &update , float value) {
  int oldval, newval, readback;
  oldval = __float_as_int(update);
  newval = __float_as_int( __int_as_float(oldval) < value ? __int_as_float(oldval) : value );
  while ( ( readback = atomicCAS( (int *) &update , oldval , newval ) ) != oldval ) {
    oldval = readback;
    newval = __float_as_int( __int_as_float(oldval) < value ? __int_as_float(oldval) : value );
  }
}
__device__ __forceinline__ void atomicMin_device(double &update , double value) {
  unsigned long long oldval, newval, readback;
  oldval = __double_as_longlong(update);
  newval = __double_as_longlong( __longlong_as_double(oldval) < value ? __longlong_as_double(oldval) : value );
  while ( ( readback = atomicCAS( (unsigned long long *) &update , oldval , newval ) ) != oldval ) {
    oldval = readback;
    newval = __double_as_longlong( __longlong_as_double(oldval) < value ? __longlong_as_double(oldval) : value );
  }
}
// Integer types map directly onto the built-in CUDA atomics.
__device__ __forceinline__ void atomicMin_device(int &update , int value) {
  ::atomicMin( &update , value );
}
__device__ __forceinline__ void atomicMin_device(unsigned int &update , unsigned int value) {
  ::atomicMin( &update , value );
}
__device__ __forceinline__ void atomicMin_device(unsigned long long int &update , unsigned long long int value) {
  // 64-bit integer atomicMin requires compute capability 3.5 or newer.
  #if ( defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) ) || ( defined(__NVCOMPILER_CUDA_ARCH__) && (__NVCOMPILER_CUDA_ARCH__ >= 350) )
    ::atomicMin( &update , value );
  #else
    yakl_throw("ERROR: atomicMin not implemented for unsigned long long int for this CUDA architecture");
  #endif
}
// Generic front end: dispatch to the device implementation during the device
// compilation pass and to the host implementation otherwise.
template <class T> __host__ __device__ __forceinline__ void atomicMin(T &update , T value) {
  #if defined(__CUDA_ARCH__) || defined(__NVCOMPILER_CUDA_ARCH__)
    atomicMin_device(update,value);
  #else
    atomicMin_host(update,value);
  #endif
}
__device__ __forceinline__ void atomicMax_device(float &update , float value) {
  int oldval, newval, readback;
  oldval = __float_as_int(update);
  newval = __float_as_int( __int_as_float(oldval) > value ? __int_as_float(oldval) : value );
  while ( ( readback = atomicCAS( (int *) &update , oldval , newval ) ) != oldval ) {
    oldval = readback;
    newval = __float_as_int( __int_as_float(oldval) > value ? __int_as_float(oldval) : value );
  }
}
__device__ __forceinline__ void atomicMax_device(double &update , double value) {
  unsigned long long oldval, newval, readback;
  oldval = __double_as_longlong(update);
  newval = __double_as_longlong( __longlong_as_double(oldval) > value ? __longlong_as_double(oldval) : value );
  while ( ( readback = atomicCAS( (unsigned long long *) &update , oldval , newval ) ) != oldval ) {
    oldval = readback;
    newval = __double_as_longlong( __longlong_as_double(oldval) > value ? __longlong_as_double(oldval) : value );
  }
}
__device__ __forceinline__ void atomicMax_device(int &update , int value) {
  ::atomicMax( &update , value );
}
__device__ __forceinline__ void atomicMax_device(unsigned int &update , unsigned int value) {
  ::atomicMax( &update , value );
}
__device__ __forceinline__ void atomicMax_device(unsigned long long int &update , unsigned long long int value) {
  // 64-bit integer atomicMax requires compute capability 3.5 or newer.
  #if ( defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350) ) || ( defined(__NVCOMPILER_CUDA_ARCH__) && (__NVCOMPILER_CUDA_ARCH__ >= 350) )
    ::atomicMax( &update , value );
  #else
    yakl_throw("ERROR: atomicMax not implemented for unsigned long long int for this CUDA architecture");
  #endif
}
template <class T> __host__ __device__ __forceinline__ void atomicMax(T &update , T value) {
  #if defined(__CUDA_ARCH__) || defined(__NVCOMPILER_CUDA_ARCH__)
    atomicMax_device(update,value);
  #else
    atomicMax_host(update,value);
  #endif
}
__device__ __forceinline__ void atomicAdd_device(float &update , float value) {
  ::atomicAdd( &update , value );
}
__device__ __forceinline__ void atomicAdd_device(double &update , double value) {
  // Native double-precision atomicAdd exists from compute capability 6.0 onward;
  // older architectures fall back to an atomicCAS loop.
  #if ( defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 600) ) || ( defined(__NVCOMPILER_CUDA_ARCH__) && (__NVCOMPILER_CUDA_ARCH__ >= 600) )
    ::atomicAdd( &update , value );
  #else
    unsigned long long oldval, newval, readback;
    oldval = __double_as_longlong(update);
    newval = __double_as_longlong( __longlong_as_double(oldval) + value );
    while ( ( readback = atomicCAS( (unsigned long long *) &update , oldval , newval ) ) != oldval ) {
      oldval = readback;
      newval = __double_as_longlong( __longlong_as_double(oldval) + value );
    }
  #endif
}
__device__ __forceinline__ void atomicAdd_device(int &update , int value) {
  ::atomicAdd( &update , value );
}
__device__ __forceinline__ void atomicAdd_device(unsigned int &update , unsigned int value) {
  ::atomicAdd( &update , value );
}
__device__ __forceinline__ void atomicAdd_device(unsigned long long int &update , unsigned long long int value) {
  ::atomicAdd( &update , value );
}
template <class T> __host__ __device__ __forceinline__ void atomicAdd(T &update , T value) {
  #if defined(__CUDA_ARCH__) || defined(__NVCOMPILER_CUDA_ARCH__)
    atomicAdd_device(update,value);
  #else
    atomicAdd_host(update,value);
  #endif
}
#elif defined(YAKL_ARCH_SYCL)
// A sycl::atomic_ref with relaxed ordering and device scope, defaulting to the
// global address space.
template <typename T, sycl::access::address_space addressSpace =
                      sycl::access::address_space::global_space>
using relaxed_atomic_ref =
    sycl::atomic_ref< T,
                      sycl::memory_order::relaxed,
                      sycl::memory_scope::device,
                      addressSpace>;

template <typename T, sycl::access::address_space addressSpace =
                      sycl::access::address_space::global_space>
__inline__ __attribute__((always_inline))
void atomicMin(T &update , T value) {
  relaxed_atomic_ref<T, addressSpace>( update ).fetch_min( value );
}

template <typename T, sycl::access::address_space addressSpace =
                      sycl::access::address_space::global_space>
__inline__ __attribute__((always_inline))
void atomicMax(T &update , T value) {
  relaxed_atomic_ref<T, addressSpace>( update ).fetch_max( value );
}

template <typename T, sycl::access::address_space addressSpace =
                      sycl::access::address_space::global_space>
__inline__ __attribute__((always_inline))
void atomicAdd(T &update , T value) {
  relaxed_atomic_ref<T, addressSpace>( update ).fetch_add( value );
}
#elif defined(YAKL_ARCH_HIP)
// As with CUDA, floating-point min/max are emulated with atomicCAS loops.
__device__ __forceinline__ void atomicMin_device(float &update , float value) {
  int oldval, newval, readback;
  oldval = __float_as_int(update);
  newval = __float_as_int( __int_as_float(oldval) < value ? __int_as_float(oldval) : value );
  while ( ( readback = atomicCAS( (int *) &update , oldval , newval ) ) != oldval ) {
    oldval = readback;
    newval = __float_as_int( __int_as_float(oldval) < value ? __int_as_float(oldval) : value );
  }
}
__device__ __forceinline__ void atomicMin_device(double &update , double value) {
  unsigned long long oldval, newval, readback;
  oldval = __double_as_longlong(update);
  newval = __double_as_longlong( __longlong_as_double(oldval) < value ? __longlong_as_double(oldval) : value );
  while ( ( readback = atomicCAS( (unsigned long long *) &update , oldval , newval ) ) != oldval ) {
    oldval = readback;
    newval = __double_as_longlong( __longlong_as_double(oldval) < value ? __longlong_as_double(oldval) : value );
  }
}
__device__ __forceinline__ void atomicMin_device(int &update , int value) {
  ::atomicMin( &update , value );
}
__device__ __forceinline__ void atomicMin_device(unsigned int &update , unsigned int value) {
  ::atomicMin( &update , value );
}
__device__ __forceinline__ void atomicMin_device(unsigned long long int &update , unsigned long long int value) {
  ::atomicMin( &update , value );
}
template <class T> __host__ __device__ __forceinline__ void atomicMin(T &update , T value) {
  #ifdef __HIP_DEVICE_COMPILE__
    atomicMin_device(update,value);
  #else
    atomicMin_host(update,value);
  #endif
}
__device__ __forceinline__ void atomicMax_device(float &update , float value) {
  int oldval, newval, readback;
  oldval = __float_as_int(update);
  newval = __float_as_int( __int_as_float(oldval) > value ? __int_as_float(oldval) : value );
  while ( ( readback = atomicCAS( (int *) &update , oldval , newval ) ) != oldval ) {
    oldval = readback;
    newval = __float_as_int( __int_as_float(oldval) > value ? __int_as_float(oldval) : value );
  }
}
__device__ __forceinline__ void atomicMax_device(double &update , double value) {
  unsigned long long oldval, newval, readback;
  oldval = __double_as_longlong(update);
  newval = __double_as_longlong( __longlong_as_double(oldval) > value ? __longlong_as_double(oldval) : value );
  while ( ( readback = atomicCAS( (unsigned long long *) &update , oldval , newval ) ) != oldval ) {
    oldval = readback;
    newval = __double_as_longlong( __longlong_as_double(oldval) > value ? __longlong_as_double(oldval) : value );
  }
}
__device__ __forceinline__ void atomicMax_device(int &update , int value) {
  ::atomicMax( &update , value );
}
__device__ __forceinline__ void atomicMax_device(unsigned int &update , unsigned int value) {
  ::atomicMax( &update , value );
}
__device__ __forceinline__ void atomicMax_device(unsigned long long int &update , unsigned long long int value) {
  ::atomicMax( &update , value );
}
template <class T> __host__ __device__ __forceinline__ void atomicMax(T &update , T value) {
  #ifdef __HIP_DEVICE_COMPILE__
    atomicMax_device(update,value);
  #else
    atomicMax_host(update,value);
  #endif
}
// HIP provides built-in atomicAdd overloads for each of these types.
__device__ __forceinline__ void atomicAdd_device(float &update , float value) {
  ::atomicAdd( &update , value );
}
__device__ __forceinline__ void atomicAdd_device(double &update , double value) {
  ::atomicAdd( &update , value );
}
__device__ __forceinline__ void atomicAdd_device(int &update , int value) {
  ::atomicAdd( &update , value );
}
__device__ __forceinline__ void atomicAdd_device(unsigned int &update , unsigned int value) {
  ::atomicAdd( &update , value );
}
__device__ __forceinline__ void atomicAdd_device(unsigned long long int &update , unsigned long long int value) {
  ::atomicAdd( &update , value );
}
template <class T> __host__ __device__ __forceinline__ void atomicAdd(T &update , T value) {
  #ifdef __HIP_DEVICE_COMPILE__
    atomicAdd_device(update,value);
  #else
    atomicAdd_host(update,value);
  #endif
}
#elif defined(YAKL_ARCH_OPENMP)
// min/max updates are guarded with a critical section; addition can use a true
// OpenMP atomic update.
template <class T> inline void atomicMin(T &update, T value) {
  #pragma omp critical
  { update = value < update ? value : update; }
}

template <class T> inline void atomicMax(T &update, T value) {
  #pragma omp critical
  { update = value > update ? value : update; }
}

template <class T> inline void atomicAdd(T &update, T value) {
  #pragma omp atomic update
  update += value;
}