YAKL
YAKL_mem_transfers.h
Go to the documentation of this file.
1 
6 #pragma once
7 // Included by YAKL.h
8 
10 namespace yakl {
11 
12  // Your one-stop shop for memory transfers to / from host / device
13 
17  template <class T1, class T2,
18  typename std::enable_if< std::is_same< typename std::remove_cv<T1>::type ,
19  typename std::remove_cv<T2>::type >::value , int >::type = 0>
20  inline void memcpy_host_to_host(T1 *dst , T2 *src , index_t elems) {
21  #ifdef YAKL_AUTO_PROFILE
22  timer_start("YAKL_internal_memcpy_host_to_host");
23  #endif
24  for (index_t i=0; i<elems; i++) { dst[i] = src[i]; }
25  #ifdef YAKL_AUTO_PROFILE
26  timer_stop("YAKL_internal_memcpy_host_to_host");
27  #endif
28  }
29 
30 
/**
 * @brief [USE AT YOUR OWN RISK]: memcpy the specified number of bytes on the host
 *
 * @param dst   Destination pointer
 * @param src   Source pointer
 * @param bytes Number of bytes to copy; a value of zero performs no copy
 */
inline void memcpy_host_to_host_void(void *dst , void *src , size_t bytes) {
  #ifdef YAKL_AUTO_PROFILE
    timer_start("YAKL_internal_memcpy_host_to_host");
  #endif
  // memcpy has undefined behavior for null / invalid pointers even when the
  // byte count is zero, so skip the call entirely for empty transfers.
  if (bytes > 0) {
    memcpy( dst , src , bytes );
  }
  #ifdef YAKL_AUTO_PROFILE
    timer_stop("YAKL_internal_memcpy_host_to_host");
  #endif
}
43 
44 
48  template <class T1, class T2,
49  typename std::enable_if< std::is_same< typename std::remove_cv<T1>::type ,
50  typename std::remove_cv<T2>::type >::value , int >::type = 0>
51  inline void memcpy_device_to_host(T1 *dst , T2 *src , index_t elems , Stream stream = Stream() ) {
52  #ifdef YAKL_AUTO_PROFILE
53  timer_start("YAKL_internal_memcpy_device_to_host");
54  #endif
55  #ifdef YAKL_ARCH_CUDA
56  cudaMemcpyAsync(dst,src,elems*sizeof(T1),cudaMemcpyDeviceToHost,stream.get_real_stream());
58  #elif defined(YAKL_ARCH_HIP)
59  hipMemcpyAsync(dst,src,elems*sizeof(T1),hipMemcpyDeviceToHost,stream.get_real_stream());
61  #elif defined (YAKL_ARCH_SYCL)
62  stream.get_real_stream().memcpy(dst, src, elems*sizeof(T1));
64  #elif defined(YAKL_ARCH_OPENMP)
65  #pragma omp parallel for
66  for (index_t i=0; i<elems; i++) { dst[i] = src[i]; }
67  #else
68  for (index_t i=0; i<elems; i++) { dst[i] = src[i]; }
69  #endif
70  #if defined(YAKL_AUTO_FENCE)
71  fence();
72  #endif
73  #ifdef YAKL_AUTO_PROFILE
74  timer_stop("YAKL_internal_memcpy_device_to_host");
75  #endif
76  }
77 
78 
82  template <class T1, class T2,
83  typename std::enable_if< std::is_same< typename std::remove_cv<T1>::type ,
84  typename std::remove_cv<T2>::type >::value , int >::type = 0>
85  inline void memcpy_host_to_device(T1 *dst , T2 *src , index_t elems , Stream stream = Stream() ) {
86  #ifdef YAKL_AUTO_PROFILE
87  timer_start("YAKL_internal_memcpy_host_to_device");
88  #endif
89  #ifdef YAKL_ARCH_CUDA
90  cudaMemcpyAsync(dst,src,elems*sizeof(T1),cudaMemcpyHostToDevice,stream.get_real_stream());
92  #elif defined(YAKL_ARCH_HIP)
93  hipMemcpyAsync(dst,src,elems*sizeof(T1),hipMemcpyHostToDevice,stream.get_real_stream());
95  #elif defined (YAKL_ARCH_SYCL)
96  stream.get_real_stream().memcpy(dst, src, elems*sizeof(T1));
98  #elif defined(YAKL_ARCH_OPENMP)
99  #pragma omp parallel for
100  for (index_t i=0; i<elems; i++) { dst[i] = src[i]; }
101  #else
102  for (index_t i=0; i<elems; i++) { dst[i] = src[i]; }
103  #endif
104  #if defined(YAKL_AUTO_FENCE)
105  fence();
106  #endif
107  #ifdef YAKL_AUTO_PROFILE
108  timer_stop("YAKL_internal_memcpy_host_to_device");
109  #endif
110  }
111 
112 
116  template <class T1, class T2,
117  typename std::enable_if< std::is_same< typename std::remove_cv<T1>::type ,
118  typename std::remove_cv<T2>::type >::value , int >::type = 0>
119  inline void memcpy_device_to_device(T1 *dst , T2 *src , index_t elems , Stream stream = Stream() ) {
120  #ifdef YAKL_AUTO_PROFILE
121  timer_start("YAKL_internal_memcpy_device_to_device");
122  #endif
123  #ifdef YAKL_ARCH_CUDA
124  cudaMemcpyAsync(dst,src,elems*sizeof(T1),cudaMemcpyDeviceToDevice,stream.get_real_stream());
126  #elif defined(YAKL_ARCH_HIP)
127  hipMemcpyAsync(dst,src,elems*sizeof(T1),hipMemcpyDeviceToDevice,stream.get_real_stream());
129  #elif defined (YAKL_ARCH_SYCL)
130  stream.get_real_stream().memcpy(dst, src, elems*sizeof(T1));
132  #elif defined(YAKL_ARCH_OPENMP)
133  #pragma omp parallel for
134  for (index_t i=0; i<elems; i++) { dst[i] = src[i]; }
135  #else
136  for (index_t i=0; i<elems; i++) { dst[i] = src[i]; }
137  #endif
138  #if defined(YAKL_AUTO_FENCE)
139  fence();
140  #endif
141  #ifdef YAKL_AUTO_PROFILE
142  timer_stop("YAKL_internal_memcpy_device_to_device");
143  #endif
144  }
145 
146 
150  inline void memcpy_device_to_device_void(void *dst , void *src , size_t bytes , Stream stream = Stream() ) {
151  #ifdef YAKL_AUTO_PROFILE
152  timer_start("YAKL_internal_memcpy_device_to_device");
153  #endif
154  #ifdef YAKL_ARCH_CUDA
155  cudaMemcpyAsync(dst,src,bytes,cudaMemcpyDeviceToDevice,stream.get_real_stream());
157  #elif defined(YAKL_ARCH_HIP)
158  hipMemcpyAsync(dst,src,bytes,hipMemcpyDeviceToDevice,stream.get_real_stream());
160  #elif defined (YAKL_ARCH_SYCL)
161  stream.get_real_stream().memcpy(dst, src, bytes);
163  #else
164  memcpy( dst , src , bytes );
165  #endif
166  #if defined(YAKL_AUTO_FENCE)
167  fence();
168  #endif
169  #ifdef YAKL_AUTO_PROFILE
170  timer_stop("YAKL_internal_memcpy_device_to_device");
171  #endif
172  }
173 
174 }
176 
177 
yakl::timer_stop
void timer_stop(char const *lab)
Stop a timer with the given string label. NOTE: Timers must be perfectly nested.
Definition: YAKL_timers.h:26
yakl::memcpy_device_to_device_void
void memcpy_device_to_device_void(void *dst, void *src, size_t bytes, Stream stream=Stream())
[USE AT YOUR OWN RISK]: memcpy the specified number of bytes on the device
Definition: YAKL_mem_transfers.h:150
yakl::Stream
Implements the functionality of a stream for parallel kernel execution. If the Stream::create() method is not called on this object, then it is the default stream.
Definition: YAKL_streams_events.h:394
__YAKL_NAMESPACE_WRAPPER_END__
#define __YAKL_NAMESPACE_WRAPPER_END__
Definition: YAKL.h:20
yakl::memcpy_host_to_device
void memcpy_host_to_device(T1 *dst, T2 *src, index_t elems, Stream stream=Stream())
[USE AT YOUR OWN RISK]: memcpy the specified number of elements from host to device
Definition: YAKL_mem_transfers.h:85
yakl::memcpy_host_to_host_void
void memcpy_host_to_host_void(void *dst, void *src, size_t bytes)
[USE AT YOUR OWN RISK]: memcpy the specified number of bytes on the host
Definition: YAKL_mem_transfers.h:34
__YAKL_NAMESPACE_WRAPPER_BEGIN__
#define __YAKL_NAMESPACE_WRAPPER_BEGIN__
Definition: YAKL.h:19
yakl::fence
void fence()
Block the host code until all device code has completed.
Definition: YAKL_fence.h:16
yakl::index_t
unsigned int index_t
Definition: YAKL.h:41
yakl::memcpy_host_to_host
void memcpy_host_to_host(T1 *dst, T2 *src, index_t elems)
[USE AT YOUR OWN RISK]: memcpy the specified number of elements on the host
Definition: YAKL_mem_transfers.h:20
yakl::memcpy_device_to_device
void memcpy_device_to_device(T1 *dst, T2 *src, index_t elems, Stream stream=Stream())
[USE AT YOUR OWN RISK]: memcpy the specified number of elements on the device
Definition: YAKL_mem_transfers.h:119
yakl
yakl::memcpy_device_to_host
void memcpy_device_to_host(T1 *dst, T2 *src, index_t elems, Stream stream=Stream())
[USE AT YOUR OWN RISK]: memcpy the specified number of elements from device to host
Definition: YAKL_mem_transfers.h:51
yakl::timer_start
void timer_start(char const *lab)
Start a timer with the given string label. NOTE: Timers must be perfectly nested.
Definition: YAKL_timers.h:23
yakl::check_last_error
void check_last_error()
Checks to see if an error has occurred on the device.
Definition: YAKL_error.h:45