YAKL
YAKL_fft.h
Go to the documentation of this file.
1 
2 #pragma once
3 
4 #include "YAKL.h"
5 #if !defined(YAKL_ARCH_CUDA) && !defined(YAKL_ARCH_HIP) && !defined(YAKL_ARCH_SYCL)
6  #define POCKETFFT_CACHE_SIZE 2
7  #define POCKETFFT_NO_MULTITHREADING
8  #include "pocketfft_hdronly.h"
9 #endif
10 
12 namespace yakl {
13 
49  template <class T> class RealFFT1D {
50  public:
52  int batch_size;
54  int transform_size;
56  int trdim;
57  #if defined(YAKL_ARCH_CUDA)
58  cufftHandle plan_forward;
59  cufftHandle plan_inverse;
60  #define CHECK(func) { int myloc = func; if (myloc != CUFFT_SUCCESS) { std::cerr << "ERROR: YAKL CUFFT: " << __FILE__ << ": " <<__LINE__ << std::endl; yakl_throw(""); } }
61  #elif defined(YAKL_ARCH_HIP)
62  rocfft_plan plan_forward;
63  rocfft_plan plan_inverse;
64  #define CHECK(func) { int myloc = func; if (myloc != rocfft_status_success) { std::cerr << "ERROR: YAKL ROCFFT: " << __FILE__ << ": " <<__LINE__ << std::endl; yakl_throw(""); } }
65  #elif defined(YAKL_ARCH_SYCL)
66  typedef oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL> desc_single_t;
67  typedef oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::REAL> desc_double_t;
68  void *plan_forward, *plan_inverse;
69  #define CHECK(func) { \
70  try { \
71  func; \
72  } \
73  catch (oneapi::mkl::exception const &ex) { \
74  std::cerr << "ERROR: YAKL ONEMKL-FFT: " << __FILE__ << " : " << __LINE__ \
75  << std::endl; yakl_throw(""); \
76  } \
77  }
78  #endif
79 
80  void nullify() {batch_size = -1; transform_size = -1; trdim = -1;}
81 
82  RealFFT1D() { nullify(); }
83  ~RealFFT1D() { cleanup(); }
84 
86  void cleanup() {
87  if (transform_size != -1) {
88  #if defined(YAKL_ARCH_CUDA)
89  CHECK( cufftDestroy( plan_forward ) );
90  CHECK( cufftDestroy( plan_inverse ) );
91  #elif defined(YAKL_ARCH_HIP)
92  CHECK( rocfft_plan_destroy( plan_forward ) );
93  CHECK( rocfft_plan_destroy( plan_inverse ) );
94  #endif
95  }
96  nullify();
97  }
98 
101  template <int N> void init( Array<T,N,memDevice,styleC> &arr , int trdim , int tr_size ) {
102  int rank = 1;
103  int n = tr_size;
104  int istride = 1;
105  int ostride = 1;
106  int idist = arr.extent(trdim);
107  int odist = idist / 2;
108  int batch = arr.totElems() / arr.extent(trdim);
109  int *inembed = nullptr;
110  int *onembed = nullptr;
111 
112  #if defined(YAKL_ARCH_CUDA)
113  if constexpr (std::is_same<T,float >::value) {
114  CHECK( cufftPlanMany(&plan_forward, rank, &n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch) );
115  CHECK( cufftPlanMany(&plan_inverse, rank, &n, onembed, ostride, odist, inembed, istride, idist, CUFFT_C2R, batch) );
116  } else if constexpr (std::is_same<T,double>::value) {
117  CHECK( cufftPlanMany(&plan_forward, rank, &n, inembed, istride, idist, onembed, ostride, odist, CUFFT_D2Z, batch) );
118  CHECK( cufftPlanMany(&plan_inverse, rank, &n, onembed, ostride, odist, inembed, istride, idist, CUFFT_Z2D, batch) );
119  }
120  #elif defined(YAKL_ARCH_HIP)
121  size_t len = tr_size;
122  size_t const roc_istride = istride;
123  size_t const roc_ostride = ostride;
124  size_t const roc_idist = idist;
125  size_t const roc_odist = odist;
126  size_t const roc_off = 0;
127  rocfft_plan_description desc;
128  if constexpr (std::is_same<T,float >::value) {
129  CHECK( rocfft_plan_description_create( &desc ) );
130  CHECK( rocfft_plan_description_set_data_layout( desc, rocfft_array_type_real, rocfft_array_type_hermitian_interleaved, &roc_off, &roc_off, (size_t) 1, &roc_istride, roc_idist, (size_t) 1, &roc_ostride, roc_odist ) );
131  CHECK( rocfft_plan_create(&plan_forward, rocfft_placement_inplace, rocfft_transform_type_real_forward, rocfft_precision_single, (size_t) 1, &len, (size_t) batch, desc) );
132  CHECK( rocfft_plan_description_destroy( desc ) );
133  CHECK( rocfft_plan_description_create( &desc ) );
134  CHECK( rocfft_plan_description_set_data_layout( desc, rocfft_array_type_hermitian_interleaved, rocfft_array_type_real, &roc_off, &roc_off, (size_t) 1, &roc_ostride, roc_odist, (size_t) 1, &roc_istride, roc_idist ) );
135  CHECK( rocfft_plan_create(&plan_inverse, rocfft_placement_inplace, rocfft_transform_type_real_inverse, rocfft_precision_single, (size_t) 1, &len, (size_t) batch, desc) );
136  CHECK( rocfft_plan_description_destroy( desc ) );
137  } else if constexpr (std::is_same<T,double>::value) {
138  CHECK( rocfft_plan_description_create( &desc ) );
139  CHECK( rocfft_plan_description_set_data_layout( desc, rocfft_array_type_real, rocfft_array_type_hermitian_interleaved, &roc_off, &roc_off, (size_t) 1, &roc_istride, roc_idist, (size_t) 1, &roc_ostride, roc_odist ) );
140  CHECK( rocfft_plan_create(&plan_forward, rocfft_placement_inplace, rocfft_transform_type_real_forward, rocfft_precision_double, (size_t) 1, &len, (size_t) batch, desc) );
141  CHECK( rocfft_plan_description_destroy( desc ) );
142  CHECK( rocfft_plan_description_create( &desc ) );
143  CHECK( rocfft_plan_description_set_data_layout( desc, rocfft_array_type_hermitian_interleaved, rocfft_array_type_real, &roc_off, &roc_off, (size_t) 1, &roc_ostride, roc_odist, (size_t) 1, &roc_istride, roc_idist ) );
144  CHECK( rocfft_plan_create(&plan_inverse, rocfft_placement_inplace, rocfft_transform_type_real_inverse, rocfft_precision_double, (size_t) 1, &len, (size_t) batch, desc) );
145  CHECK( rocfft_plan_description_destroy( desc ) );
146  }
147  #elif defined(YAKL_ARCH_SYCL)
148  if constexpr (std::is_same_v<T,float>) {
149  plan_forward = new desc_single_t(n);
150  plan_inverse = new desc_single_t(n);
151 
152  CHECK( static_cast<desc_single_t*>(plan_forward)->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batch) );
153  CHECK( static_cast<desc_single_t*>(plan_forward)->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, idist) );
154  CHECK( static_cast<desc_single_t*>(plan_forward)->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, odist) );
155 
156  CHECK( static_cast<desc_single_t*>(plan_inverse)->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batch) );
157  CHECK( static_cast<desc_single_t*>(plan_inverse)->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, odist) );
158  CHECK( static_cast<desc_single_t*>(plan_inverse)->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, idist) );
159 
160  CHECK( static_cast<desc_single_t*>(plan_forward)->commit( sycl_default_stream() ) );
161  CHECK( static_cast<desc_single_t*>(plan_inverse)->commit( sycl_default_stream() ) );
162  } else if constexpr (std::is_same_v<T,double>) {
163  plan_forward = new desc_double_t(n);
164  plan_inverse = new desc_double_t(n);
165 
166  CHECK( static_cast<desc_double_t*>(plan_forward)->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batch) );
167  CHECK( static_cast<desc_double_t*>(plan_forward)->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, idist) );
168  CHECK( static_cast<desc_double_t*>(plan_forward)->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, odist) );
169 
170  CHECK( static_cast<desc_double_t*>(plan_inverse)->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batch) );
171  CHECK( static_cast<desc_double_t*>(plan_inverse)->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE, idist) );
172  CHECK( static_cast<desc_double_t*>(plan_inverse)->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE, odist) );
173 
174  CHECK( static_cast<desc_double_t*>(plan_forward)->commit( sycl_default_stream() ) );
175  CHECK( static_cast<desc_double_t*>(plan_inverse)->commit( sycl_default_stream() ) );
176  }
177 
178  #endif
179 
180  this->batch_size = batch ;
181  this->transform_size = tr_size;
182  this->trdim = trdim ;
183  }
184 
185 
189  template <int N> void forward_real( Array<T,N,memDevice,styleC> &arr , int trdim_in = -1 , int transform_size_in = -1 ) {
190  if constexpr (streams_enabled) fence();
191  // Test if it's been initialized at all
192  if (trdim < 0 || transform_size < 0 || batch_size < 0) {
193  if (trdim_in < 0 || transform_size_in < 0) yakl_throw("ERROR: Using forward_real before calling init without "
194  "specifying both trdim_in and transform_size_in");
195  init( arr , trdim_in , transform_size_in );
196  }
197  // Test if the apparent size of the transform has changed or the batch size has changed
198  if ( ( (transform_size%2==1 ? transform_size+1 : transform_size+2) != arr.extent(trdim) ) ||
199  ( transform_size_in > 0 && transform_size_in != transform_size ) ||
200  ( arr.totElems() / arr.extent(trdim) != batch_size ) ) {
201  if (trdim_in < 0 || transform_size_in < 0) yakl_throw("ERROR: Changing transform size or batch sizewithout "
202  "specifying both trdim_in and transform_size_in");
203  cleanup();
204  init( arr , trdim_in , transform_size_in );
205  }
206  auto dims = arr.get_dimensions();
207  int d2 = 1; for (int i=N-1; i > trdim; i--) { d2 *= dims(i); } // Fastest varying
208  int d1 = dims(trdim); // Transform dimension
209  int d0 = arr.totElems() / d2 / d1; // Slowest varying
211  if (trdim == N-1) {
212  copy = arr.reshape(d0,d2,d1);
213  } else {
214  auto in = arr.reshape(d0,d1,d2);
215  copy = arr.createDeviceCopy().reshape(d0,d2,d1);
216  c::parallel_for( c::SimpleBounds<3>(d0,d1,d2) , YAKL_LAMBDA (int i0, int i1, int i2) { copy(i0,i2,i1) = in(i0,i1,i2); });
217  }
218  // Perform the FFT
219  #if defined(YAKL_ARCH_CUDA)
220  if constexpr (std::is_same<T,float >::value) {
221  CHECK( cufftExecR2C(plan_forward, (cufftReal *) copy.data(), (cufftComplex *) copy.data()) );
222  } else if constexpr (std::is_same<T,double>::value) {
223  CHECK( cufftExecD2Z(plan_forward, (cufftDoubleReal *) copy.data(), (cufftDoubleComplex *) copy.data()) );
224  }
225  #elif defined(YAKL_ARCH_HIP)
226  std::array<double *,1> ibuf( {copy.data()} );
227  std::array<double *,1> obuf( {copy.data()} );
228  rocfft_execution_info info;
229  CHECK( rocfft_execution_info_create( &info ) );
230  CHECK( rocfft_execute(plan_forward, (void **) ibuf.data(), (void **) obuf.data(), info) );
231  CHECK( rocfft_execution_info_destroy( info ) );
232  #elif defined(YAKL_ARCH_SYCL)
233  if constexpr (std::is_same<T,float >::value) {
234  CHECK( oneapi::mkl::dft::compute_forward(*(static_cast<desc_single_t*>(plan_forward)), static_cast<T*>(copy.data())) );
235  } else if constexpr (std::is_same<T,double>::value) {
236  CHECK( oneapi::mkl::dft::compute_forward(*(static_cast<desc_double_t*>(plan_forward)), static_cast<T*>(copy.data())) );
237  }
238  #else
239  Array<T ,3,memHost,styleC> pfft_in ("pfft_in" ,d0,d2, transform_size );
240  Array<std::complex<T>,3,memHost,styleC> pfft_out("pfft_out",d0,d2,(transform_size+2)/2);
241  auto copy_host = copy.createHostCopy();
242  for (int i0 = 0; i0 < d0; i0++) {
243  for (int i2 = 0; i2 < d2; i2++) {
244  for (int i1 = 0; i1 < transform_size; i1++) {
245  pfft_in(i0,i2,i1) = copy_host(i0,i2,i1);
246  }
247  }
248  }
249  using pocketfft::detail::shape_t;
250  using pocketfft::detail::stride_t;
251  shape_t shape_in (3); for (int i=0; i < 3; i++) { shape_in[i] = pfft_in.extent(i); }
252  stride_t stride_in (3);
253  stride_t stride_out(3);
254  stride_in [0] = d2* transform_size *sizeof( T );
255  stride_in [1] = transform_size *sizeof( T );
256  stride_in [2] = sizeof( T );
257  stride_out[0] = d2*((transform_size+2)/2)*sizeof(std::complex<T>);
258  stride_out[1] = ((transform_size+2)/2)*sizeof(std::complex<T>);
259  stride_out[2] = sizeof(std::complex<T>);
260  pocketfft::r2c<T>(shape_in, stride_in, stride_out, (size_t) 2, true, pfft_in.data(), pfft_out.data(), (T) 1);
261  for (int i0 = 0; i0 < d0; i0++) {
262  for (int i2 = 0; i2 < d2; i2++) {
263  for (int i1 = 0; i1 < (transform_size+2)/2; i1++) {
264  copy_host(i0,i2,2*i1 ) = pfft_out(i0,i2,i1).real();
265  copy_host(i0,i2,2*i1+1) = pfft_out(i0,i2,i1).imag();
266  }
267  }
268  }
269  copy_host.deep_copy_to( copy );
270  fence();
271  #endif
272  if (trdim != N-1) {
273  auto out = arr.reshape(d0,d1,d2);
274  c::parallel_for( c::SimpleBounds<3>(d0,d1,d2) , YAKL_LAMBDA (int i0, int i1, int i2) { out(i0,i1,i2) = copy(i0,i2,i1); });
275  }
276  if constexpr (streams_enabled) fence();
277  }
278 
279 
283  template <int N> void inverse_real( Array<T,N,memDevice,styleC> &arr , int trdim_in = -1 , int transform_size_in = -1 ) {
284  if constexpr (streams_enabled) fence();
285  // Test if it's been initialized at all
286  if (trdim < 0 || transform_size < 0 || batch_size < 0) {
287  if (trdim_in < 0 || transform_size_in < 0) yakl_throw("ERROR: Using forward_real before calling init without "
288  "specifying both trdim_in and transform_size_in");
289  init( arr , trdim_in , transform_size_in );
290  }
291  // Test if the apparent size of the transform has changed or the batch size has changed
292  if ( ( (transform_size%2==1 ? transform_size+1 : transform_size+2) != arr.extent(trdim) ) ||
293  ( transform_size_in > 0 && transform_size_in != transform_size ) ||
294  ( arr.totElems() / arr.extent(trdim) != batch_size ) ) {
295  if (trdim_in < 0 || transform_size_in < 0) yakl_throw("ERROR: Changing transform size or batch sizewithout "
296  "specifying both trdim_in and transform_size_in");
297  cleanup();
298  init( arr , trdim_in , transform_size_in );
299  }
300  auto dims = arr.get_dimensions();
301  int d2 = 1; for (int i=N-1; i > trdim; i--) { d2 *= dims(i); } // Fastest varying
302  int d1 = dims(trdim); // Transform dimension
303  int d0 = arr.totElems() / d2 / d1; // Slowest varying
305  if (trdim == N-1) {
306  copy = arr.reshape(d0,d2,d1);
307  } else {
308  auto in = arr.reshape(d0,d1,d2);
309  copy = arr.createDeviceCopy().reshape(d0,d2,d1);
310  c::parallel_for( c::SimpleBounds<3>(d0,d1,d2) , YAKL_LAMBDA (int i0, int i1, int i2) { copy(i0,i2,i1) = in(i0,i1,i2); });
311  }
312  // Perform the FFT
313  #if defined(YAKL_ARCH_CUDA)
314  if constexpr (std::is_same<T,float >::value) {
315  CHECK( cufftExecC2R(plan_inverse, (cufftComplex *) copy.data(), (cufftReal *) copy.data()) );
316  } else if constexpr (std::is_same<T,double>::value) {
317  CHECK( cufftExecZ2D(plan_inverse, (cufftDoubleComplex *) copy.data(), (cufftDoubleReal *) copy.data()) );
318  }
319  #elif defined(YAKL_ARCH_HIP)
320  std::array<double *,1> ibuf( {copy.data()} );
321  std::array<double *,1> obuf( {copy.data()} );
322  rocfft_execution_info info;
323  CHECK( rocfft_execution_info_create( &info ) );
324  CHECK( rocfft_execute(plan_inverse, (void **) ibuf.data(), (void **) obuf.data(), info) );
325  CHECK( rocfft_execution_info_destroy( info ) );
326  #elif defined(YAKL_ARCH_SYCL)
327  if constexpr (std::is_same<T,float >::value) {
328  CHECK( oneapi::mkl::dft::compute_backward(*(static_cast<desc_single_t*>(plan_inverse)), static_cast<T*>(copy.data())) );
329  } else if constexpr (std::is_same<T,double>::value) {
330  CHECK( oneapi::mkl::dft::compute_backward(*(static_cast<desc_double_t*>(plan_inverse)), static_cast<T*>(copy.data())) );
331  }
332  #else
333  Array<std::complex<T>,3,memHost,styleC> pfft_in ("pfft_in" ,d0,d2,(transform_size+2)/2);
334  Array<T ,3,memHost,styleC> pfft_out("pfft_out",d0,d2, transform_size );
335  auto copy_host = copy.createHostCopy();
336  for (int i0 = 0; i0 < d0; i0++) {
337  for (int i2 = 0; i2 < d2; i2++) {
338  for (int i1 = 0; i1 < (transform_size+2)/2; i1++) {
339  pfft_in(i0,i2,i1) = std::complex<T>( copy_host(i0,i2,2*i1 ) , copy_host(i0,i2,2*i1+1) );
340  }
341  }
342  }
343  using pocketfft::detail::shape_t;
344  using pocketfft::detail::stride_t;
345  shape_t shape_out (3); for (int i=0; i < 3; i++) { shape_out [i] = pfft_out.extent(i); }
346  stride_t stride_in (3);
347  stride_t stride_out(3);
348  stride_in [0] = d2*((transform_size+2)/2)*sizeof(std::complex<T>);
349  stride_in [1] = ((transform_size+2)/2)*sizeof(std::complex<T>);
350  stride_in [2] = sizeof(std::complex<T>);
351  stride_out[0] = d2* transform_size *sizeof( T );
352  stride_out[1] = transform_size *sizeof( T );
353  stride_out[2] = sizeof( T );
354  pocketfft::c2r<T>(shape_out, stride_in, stride_out, (size_t) 2, false, pfft_in.data() , pfft_out.data() , (T) 1 );
355  for (int i0 = 0; i0 < d0; i0++) {
356  for (int i2 = 0; i2 < d2; i2++) {
357  for (int i1 = 0; i1 < transform_size; i1++) {
358  copy_host(i0,i2,i1) = pfft_out(i0,i2,i1);
359  }
360  }
361  }
362  copy_host.deep_copy_to( copy );
363  fence();
364  #endif
365  auto out = arr.reshape(d0,d1,d2);
366  YAKL_SCOPE( transform_size , this->transform_size );
367  c::parallel_for( c::SimpleBounds<3>(d0,d1,d2) , YAKL_LAMBDA (int i0, int i1, int i2) { out(i0,i1,i2) = copy(i0,i2,i1) / transform_size; });
368  if constexpr (streams_enabled) fence();
369  }
370 
371  };
372 
373 }
375 
yakl::RealFFT1D::~RealFFT1D
~RealFFT1D()
Definition: YAKL_fft.h:83
yakl::RealFFT1D::forward_real
void forward_real(Array< T, N, memDevice, styleC > &arr, int trdim_in=-1, int transform_size_in=-1)
Perform a forward transform (real-to-complex)
Definition: YAKL_fft.h:189
yakl::RealFFT1D::init
void init(Array< T, N, memDevice, styleC > &arr, int trdim, int tr_size)
Setup FFT plans, allocate, compute needed data.
Definition: YAKL_fft.h:101
YAKL.h
yakl::RealFFT1D::RealFFT1D
RealFFT1D()
Definition: YAKL_fft.h:82
YAKL_SCOPE
#define YAKL_SCOPE(a, b)
Used to bring non-local data into local scope (e.g., this->data or namespace::data)....
Definition: YAKL_defines.h:148
yakl::c::parallel_for
void parallel_for(char const *str, Bounds< N, simple > const &bounds, F const &f, LaunchConfig< VecLen, B4B > config=LaunchConfig<>())
[ASYNCHRONOUS] Launch the passed functor in parallel.
__YAKL_NAMESPACE_WRAPPER_END__
#define __YAKL_NAMESPACE_WRAPPER_END__
Definition: YAKL.h:20
yakl::streams_enabled
constexpr bool streams_enabled
If the CPP Macro YAKL_ENABLE_STREAMS is defined, then this bool is set to true
Definition: YAKL_streams_events.h:11
__YAKL_NAMESPACE_WRAPPER_BEGIN__
#define __YAKL_NAMESPACE_WRAPPER_BEGIN__
Definition: YAKL.h:19
yakl::fence
void fence()
Block the host code until all device code has completed.
Definition: YAKL_fence.h:16
yakl::yakl_throw
YAKL_INLINE void yakl_throw(const char *msg)
Throw an error message. Works from the host or device.
Definition: YAKL_error.h:17
yakl::styleC
constexpr int styleC
Template parameter for yakl::Array that specifies it should follow C-style behavior.
Definition: YAKL_Array.h:20
yakl::Array
This declares the yakl::Array class. Please see the yakl::styleC and yakl::styleFortran template spec...
Definition: YAKL_Array.h:40
yakl::RealFFT1D::nullify
void nullify()
Definition: YAKL_fft.h:80
yakl::RealFFT1D
Compute batched real-to-complex forward and inverse FFTs on yakl::Array objects using vendor librarie...
Definition: YAKL_fft.h:49
yakl::c::Bounds< N, true >
Describes a set of C-style tightly-nested loops where all loops have lower bounds of 0 strides of 1.
Definition: YAKL_Bounds_c.h:97
yakl
YAKL_LAMBDA
#define YAKL_LAMBDA
Used to create C++ lambda expressions passed to parallel_for and parallel_outer
Definition: YAKL_defines.h:128
yakl::RealFFT1D::inverse_real
void inverse_real(Array< T, N, memDevice, styleC > &arr, int trdim_in=-1, int transform_size_in=-1)
Perform an inverse transform (complex-to-real)
Definition: YAKL_fft.h:283
yakl::memHost
constexpr int memHost
Specifies a device memory address space for a yakl::Array object.
Definition: YAKL_memory_spaces.h:15