17 template <
class T1,
class T2,
18 typename std::enable_if< std::is_same< typename std::remove_cv<T1>::type ,
19 typename std::remove_cv<T2>::type >::value ,
int >::type = 0>
21 #ifdef YAKL_AUTO_PROFILE
24 for (
index_t i=0; i<elems; i++) { dst[i] = src[i]; }
25 #ifdef YAKL_AUTO_PROFILE
26 timer_stop(
"YAKL_internal_memcpy_host_to_host");
35 #ifdef YAKL_AUTO_PROFILE
38 memcpy( dst , src , bytes );
39 #ifdef YAKL_AUTO_PROFILE
40 timer_stop(
"YAKL_internal_memcpy_host_to_host");
48 template <
class T1,
class T2,
49 typename std::enable_if< std::is_same< typename std::remove_cv<T1>::type ,
50 typename std::remove_cv<T2>::type >::value ,
int >::type = 0>
52 #ifdef YAKL_AUTO_PROFILE
56 cudaMemcpyAsync(dst,src,elems*
sizeof(T1),cudaMemcpyDeviceToHost,stream.get_real_stream());
58 #elif defined(YAKL_ARCH_HIP)
59 hipMemcpyAsync(dst,src,elems*
sizeof(T1),hipMemcpyDeviceToHost,stream.get_real_stream());
61 #elif defined (YAKL_ARCH_SYCL)
62 stream.get_real_stream().memcpy(dst, src, elems*
sizeof(T1));
64 #elif defined(YAKL_ARCH_OPENMP)
65 #pragma omp parallel for
66 for (
index_t i=0; i<elems; i++) { dst[i] = src[i]; }
68 for (
index_t i=0; i<elems; i++) { dst[i] = src[i]; }
70 #if defined(YAKL_AUTO_FENCE)
73 #ifdef YAKL_AUTO_PROFILE
74 timer_stop(
"YAKL_internal_memcpy_device_to_host");
82 template <
class T1,
class T2,
83 typename std::enable_if< std::is_same< typename std::remove_cv<T1>::type ,
84 typename std::remove_cv<T2>::type >::value ,
int >::type = 0>
86 #ifdef YAKL_AUTO_PROFILE
90 cudaMemcpyAsync(dst,src,elems*
sizeof(T1),cudaMemcpyHostToDevice,stream.get_real_stream());
92 #elif defined(YAKL_ARCH_HIP)
93 hipMemcpyAsync(dst,src,elems*
sizeof(T1),hipMemcpyHostToDevice,stream.get_real_stream());
95 #elif defined (YAKL_ARCH_SYCL)
96 stream.get_real_stream().memcpy(dst, src, elems*
sizeof(T1));
98 #elif defined(YAKL_ARCH_OPENMP)
99 #pragma omp parallel for
100 for (
index_t i=0; i<elems; i++) { dst[i] = src[i]; }
102 for (
index_t i=0; i<elems; i++) { dst[i] = src[i]; }
104 #if defined(YAKL_AUTO_FENCE)
107 #ifdef YAKL_AUTO_PROFILE
108 timer_stop(
"YAKL_internal_memcpy_host_to_device");
116 template <
class T1,
class T2,
117 typename std::enable_if< std::is_same< typename std::remove_cv<T1>::type ,
118 typename std::remove_cv<T2>::type >::value ,
int >::type = 0>
120 #ifdef YAKL_AUTO_PROFILE
121 timer_start(
"YAKL_internal_memcpy_device_to_device");
123 #ifdef YAKL_ARCH_CUDA
124 cudaMemcpyAsync(dst,src,elems*
sizeof(T1),cudaMemcpyDeviceToDevice,stream.get_real_stream());
126 #elif defined(YAKL_ARCH_HIP)
127 hipMemcpyAsync(dst,src,elems*
sizeof(T1),hipMemcpyDeviceToDevice,stream.get_real_stream());
129 #elif defined (YAKL_ARCH_SYCL)
130 stream.get_real_stream().memcpy(dst, src, elems*
sizeof(T1));
132 #elif defined(YAKL_ARCH_OPENMP)
133 #pragma omp parallel for
134 for (
index_t i=0; i<elems; i++) { dst[i] = src[i]; }
136 for (
index_t i=0; i<elems; i++) { dst[i] = src[i]; }
138 #if defined(YAKL_AUTO_FENCE)
141 #ifdef YAKL_AUTO_PROFILE
142 timer_stop(
"YAKL_internal_memcpy_device_to_device");
151 #ifdef YAKL_AUTO_PROFILE
152 timer_start(
"YAKL_internal_memcpy_device_to_device");
154 #ifdef YAKL_ARCH_CUDA
155 cudaMemcpyAsync(dst,src,bytes,cudaMemcpyDeviceToDevice,stream.get_real_stream());
157 #elif defined(YAKL_ARCH_HIP)
158 hipMemcpyAsync(dst,src,bytes,hipMemcpyDeviceToDevice,stream.get_real_stream());
160 #elif defined (YAKL_ARCH_SYCL)
161 stream.get_real_stream().memcpy(dst, src, bytes);
164 memcpy( dst , src , bytes );
166 #if defined(YAKL_AUTO_FENCE)
169 #ifdef YAKL_AUTO_PROFILE
170 timer_stop(
"YAKL_internal_memcpy_device_to_device");