YAKL
YAKL_memset.h
Go to the documentation of this file.
1 
7 #pragma once
8 // Included by YAKL.h
9 
11 namespace yakl {
12 
14  template <class T, int rank, int myMem, int myStyle, class I>
15  void memset( Array<T,rank,myMem,myStyle> const &arr , I val , Stream stream = Stream() ) {
16  #ifdef YAKL_DEBUG
17  if (! arr.initialized()) {
18  yakl_throw("ERROR: calling memset on an array that is not allocated");
19  }
20  #endif
21 
22  // Use memset for zero values when possible
23  if (myMem == memDevice && val == 0) {
24  #if defined(YAKL_ARCH_CUDA)
25  cudaMemsetAsync( arr.data() , 0 , sizeof(T)*arr.totElems() , stream.get_real_stream() );
26  #elif defined(YAKL_ARCH_HIP)
27  hipMemsetAsync ( arr.data() , 0 , sizeof(T)*arr.totElems() , stream.get_real_stream() );
28  #elif defined(YAKL_ARCH_SYCL)
29  stream.get_real_stream().memset( arr.data() , 0 , sizeof(T)*arr.totElems() );
30  #else
31  c::parallel_for( "YAKL_internal_memset" , arr.totElems() , YAKL_LAMBDA (int i) {
32  arr.data()[i] = 0;
33  } , DefaultLaunchConfig().set_stream(stream) );
34  #endif
35  } else {
36  // SYCL has a fill routine, but CUDA and HIP do not
37  if (myMem == memDevice) {
38  #if defined(YAKL_ARCH_SYCL)
39  stream.get_real_stream().fill<T>( arr.data() , val , arr.totElems() );
40  #else
41  c::parallel_for( "YAKL_internal_memset" , arr.totElems() , YAKL_LAMBDA (int i) {
42  arr.data()[i] = val;
43  } , DefaultLaunchConfig().set_stream(stream) );
44  #endif
45  } else if (myMem == memHost) {
46  std::fill( arr.data(), arr.data()+arr.totElems(), val );
47  }
48  }
49  #if defined(YAKL_AUTO_FENCE)
50  fence();
51  #endif
52  }
53 
54 
56  template <class T, int rank, class B0, class B1, class B2, class B3, class I>
57  YAKL_INLINE void memset( FSArray<T,rank,B0,B1,B2,B3> &arr , I val ) {
58  for (index_t i = 0; i < arr.totElems(); i++) {
59  arr.data()[i] = val;
60  }
61  }
62 
63 
65  template <class T, int rank, unsigned D0, unsigned D1, unsigned D2, unsigned D3, class I>
66  YAKL_INLINE void memset( SArray<T,rank,D0,D1,D2,D3> &arr , I val ) {
67  for (index_t i = 0; i < arr.totElems(); i++) {
68  arr.data()[i] = val;
69  }
70  }
71 
72 }
74 
75 
yakl::memDevice
constexpr int memDevice
Specifies a device memory address space for a yakl::Array object.
Definition: YAKL_memory_spaces.h:13
yakl::c::parallel_for
void parallel_for(char const *str, Bounds< N, simple > const &bounds, F const &f, LaunchConfig< VecLen, B4B > config=LaunchConfig<>())
[ASYNCHRONOUS] Launch the passed functor in parallel.
__YAKL_NAMESPACE_WRAPPER_END__
#define __YAKL_NAMESPACE_WRAPPER_END__
Definition: YAKL.h:20
__YAKL_NAMESPACE_WRAPPER_BEGIN__
#define __YAKL_NAMESPACE_WRAPPER_BEGIN__
Definition: YAKL.h:19
yakl::DefaultLaunchConfig
LaunchConfig<> DefaultLaunchConfig
This launch configuration sets vector length to the device default and B4B to false.
Definition: YAKL_LaunchConfig.h:77
YAKL_INLINE
#define YAKL_INLINE
Used to decorate functions called from kernels (parallel_for and parallel_outer) or from CPU function...
Definition: YAKL_defines.h:140
yakl::fence
void fence()
Block the host code until all device code has completed.
Definition: YAKL_fence.h:16
yakl::index_t
unsigned int index_t
Definition: YAKL.h:41
yakl::yakl_throw
YAKL_INLINE void yakl_throw(const char *msg)
Throw an error message. Works from the host or device.
Definition: YAKL_error.h:17
yakl
YAKL_LAMBDA
#define YAKL_LAMBDA
Used to create C++ lambda expressions passed to parallel_for and parallel_outer
Definition: YAKL_defines.h:128
yakl::memHost
constexpr int memHost
Specifies a host memory address space for a yakl::Array object.
Definition: YAKL_memory_spaces.h:15