5 #if !defined(YAKL_ARCH_CUDA) && !defined(YAKL_ARCH_HIP) && !defined(YAKL_ARCH_SYCL) 
    6   #define POCKETFFT_CACHE_SIZE 2 
    7   #define POCKETFFT_NO_MULTITHREADING 
    8   #include "pocketfft_hdronly.h" 
   57     #if   defined(YAKL_ARCH_CUDA) 
   58       cufftHandle plan_forward;
 
   59       cufftHandle plan_inverse;
 
   60       #define CHECK(func) { int myloc = func; if (myloc != CUFFT_SUCCESS) { std::cerr << "ERROR: YAKL CUFFT: " << __FILE__ << ": " <<__LINE__ << std::endl; yakl_throw(""); } } 
   61     #elif defined(YAKL_ARCH_HIP) 
   62       rocfft_plan plan_forward;
 
   63       rocfft_plan plan_inverse;
 
   64       #define CHECK(func) { int myloc = func; if (myloc != rocfft_status_success) { std::cerr << "ERROR: YAKL ROCFFT: " << __FILE__ << ": " <<__LINE__ << std::endl; yakl_throw(""); } } 
   65     #elif defined(YAKL_ARCH_SYCL) 
   66       typedef oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::SINGLE, oneapi::mkl::dft::domain::REAL> desc_single_t;
 
   67       typedef oneapi::mkl::dft::descriptor<oneapi::mkl::dft::precision::DOUBLE, oneapi::mkl::dft::domain::REAL> desc_double_t;
 
   68       void *plan_forward, *plan_inverse;
 
   69       #define CHECK(func) {                                                        \ 
   73         catch (oneapi::mkl::exception const &ex) {                                 \ 
   74           std::cerr << "ERROR: YAKL ONEMKL-FFT: " << __FILE__ << " : " << __LINE__ \ 
   75           << std::endl; yakl_throw("");                                            \ 
   80     void nullify() {batch_size = -1;  transform_size = -1;  trdim = -1;}
 
   87       if (transform_size != -1) {
 
   88         #if   defined(YAKL_ARCH_CUDA) 
   89           CHECK( cufftDestroy( plan_forward ) );
 
   90           CHECK( cufftDestroy( plan_inverse ) );
 
   91         #elif defined(YAKL_ARCH_HIP) 
   92           CHECK( rocfft_plan_destroy( plan_forward ) );
 
   93           CHECK( rocfft_plan_destroy( plan_inverse ) );
 
  106       int idist   = arr.extent(trdim);
 
  107       int odist   = idist / 2;
 
  108       int batch   = arr.totElems() / arr.extent(trdim);
 
  109       int *inembed = 
nullptr;
 
  110       int *onembed = 
nullptr;
 
  112       #if   defined(YAKL_ARCH_CUDA) 
  113         if        constexpr (std::is_same<T,float >::value) {
 
  114           CHECK( cufftPlanMany(&plan_forward, rank, &n, inembed, istride, idist, onembed, ostride, odist, CUFFT_R2C, batch) );
 
  115           CHECK( cufftPlanMany(&plan_inverse, rank, &n, onembed, ostride, odist, inembed, istride, idist, CUFFT_C2R, batch) );
 
  116         } 
else if constexpr (std::is_same<T,double>::value) {
 
  117           CHECK( cufftPlanMany(&plan_forward, rank, &n, inembed, istride, idist, onembed, ostride, odist, CUFFT_D2Z, batch) );
 
  118           CHECK( cufftPlanMany(&plan_inverse, rank, &n, onembed, ostride, odist, inembed, istride, idist, CUFFT_Z2D, batch) );
 
  120       #elif defined(YAKL_ARCH_HIP) 
  121         size_t len = tr_size;
 
  122         size_t const roc_istride = istride;
 
  123         size_t const roc_ostride = ostride;
 
  124         size_t const roc_idist = idist;
 
  125         size_t const roc_odist = odist;
 
  126         size_t const roc_off = 0;
 
  127         rocfft_plan_description desc;
 
  128         if        constexpr (std::is_same<T,float >::value) {
 
  129           CHECK( rocfft_plan_description_create( &desc ) );
 
  130           CHECK( rocfft_plan_description_set_data_layout( desc, rocfft_array_type_real, rocfft_array_type_hermitian_interleaved, &roc_off, &roc_off, (
size_t) 1, &roc_istride, roc_idist, (
size_t) 1, &roc_ostride, roc_odist ) );
 
  131           CHECK( rocfft_plan_create(&plan_forward, rocfft_placement_inplace, rocfft_transform_type_real_forward, rocfft_precision_single, (
size_t) 1, &len, (
size_t) batch, desc) );
 
  132           CHECK( rocfft_plan_description_destroy( desc ) );
 
  133           CHECK( rocfft_plan_description_create( &desc ) );
 
  134           CHECK( rocfft_plan_description_set_data_layout( desc, rocfft_array_type_hermitian_interleaved, rocfft_array_type_real, &roc_off, &roc_off, (
size_t) 1, &roc_ostride, roc_odist, (
size_t) 1, &roc_istride, roc_idist ) );
 
  135           CHECK( rocfft_plan_create(&plan_inverse, rocfft_placement_inplace, rocfft_transform_type_real_inverse, rocfft_precision_single, (
size_t) 1, &len, (
size_t) batch, desc) );
 
  136           CHECK( rocfft_plan_description_destroy( desc ) );
 
  137         } 
else if constexpr (std::is_same<T,double>::value) {
 
  138           CHECK( rocfft_plan_description_create( &desc ) );
 
  139           CHECK( rocfft_plan_description_set_data_layout( desc, rocfft_array_type_real, rocfft_array_type_hermitian_interleaved, &roc_off, &roc_off, (
size_t) 1, &roc_istride, roc_idist, (
size_t) 1, &roc_ostride, roc_odist ) );
 
  140           CHECK( rocfft_plan_create(&plan_forward, rocfft_placement_inplace, rocfft_transform_type_real_forward, rocfft_precision_double, (
size_t) 1, &len, (
size_t) batch, desc) );
 
  141           CHECK( rocfft_plan_description_destroy( desc ) );
 
  142           CHECK( rocfft_plan_description_create( &desc ) );
 
  143           CHECK( rocfft_plan_description_set_data_layout( desc, rocfft_array_type_hermitian_interleaved, rocfft_array_type_real, &roc_off, &roc_off, (
size_t) 1, &roc_ostride, roc_odist, (
size_t) 1, &roc_istride, roc_idist ) );
 
  144           CHECK( rocfft_plan_create(&plan_inverse, rocfft_placement_inplace, rocfft_transform_type_real_inverse, rocfft_precision_double, (
size_t) 1, &len, (
size_t) batch, desc) );
 
  145           CHECK( rocfft_plan_description_destroy( desc ) );
 
  147       #elif defined(YAKL_ARCH_SYCL) 
  148         if constexpr (std::is_same_v<T,float>) {
 
  149           plan_forward = 
new desc_single_t(n);
 
  150           plan_inverse = 
new desc_single_t(n);
 
  152           CHECK( 
static_cast<desc_single_t*
>(plan_forward)->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batch) );
 
  153           CHECK( 
static_cast<desc_single_t*
>(plan_forward)->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,         idist) );
 
  154           CHECK( 
static_cast<desc_single_t*
>(plan_forward)->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,         odist) );
 
  156           CHECK( 
static_cast<desc_single_t*
>(plan_inverse)->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batch) );
 
  157           CHECK( 
static_cast<desc_single_t*
>(plan_inverse)->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,         odist) );
 
  158           CHECK( 
static_cast<desc_single_t*
>(plan_inverse)->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,         idist) );
 
  160           CHECK( 
static_cast<desc_single_t*
>(plan_forward)->commit( sycl_default_stream() ) );
 
  161           CHECK( 
static_cast<desc_single_t*
>(plan_inverse)->commit( sycl_default_stream() ) );
 
  162         } 
else if constexpr (std::is_same_v<T,double>) {
 
  163           plan_forward = 
new desc_double_t(n);
 
  164           plan_inverse = 
new desc_double_t(n);
 
  166           CHECK( 
static_cast<desc_double_t*
>(plan_forward)->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batch) );
 
  167           CHECK( 
static_cast<desc_double_t*
>(plan_forward)->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,         idist) );
 
  168           CHECK( 
static_cast<desc_double_t*
>(plan_forward)->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,         odist) );
 
  170           CHECK( 
static_cast<desc_double_t*
>(plan_inverse)->set_value(oneapi::mkl::dft::config_param::NUMBER_OF_TRANSFORMS, batch) );
 
  171           CHECK( 
static_cast<desc_double_t*
>(plan_inverse)->set_value(oneapi::mkl::dft::config_param::FWD_DISTANCE,         idist) );
 
  172           CHECK( 
static_cast<desc_double_t*
>(plan_inverse)->set_value(oneapi::mkl::dft::config_param::BWD_DISTANCE,         odist) );
 
  174           CHECK( 
static_cast<desc_double_t*
>(plan_forward)->commit( sycl_default_stream() ) );
 
  175           CHECK( 
static_cast<desc_double_t*
>(plan_inverse)->commit( sycl_default_stream() ) );
 
  180       this->batch_size     = batch  ;
 
  181       this->transform_size = tr_size;
 
  182       this->trdim          = trdim  ;
 
  192       if (trdim < 0 || transform_size < 0 || batch_size < 0) {
 
  193         if (trdim_in < 0 || transform_size_in < 0) 
yakl_throw(
"ERROR: Using forward_real before calling init without " 
  194                                                               "specifying both trdim_in and transform_size_in");
 
  195         init( arr , trdim_in , transform_size_in );
 
  198       if ( ( (transform_size%2==1 ? transform_size+1 : transform_size+2) != arr.extent(trdim) ) ||
 
  199            ( transform_size_in > 0 && transform_size_in != transform_size ) ||
 
  200            ( arr.totElems() / arr.extent(trdim) != batch_size ) ) {
 
  201         if (trdim_in < 0 || transform_size_in < 0) 
yakl_throw(
"ERROR: Changing transform size  or batch sizewithout " 
  202                                                               "specifying both trdim_in and transform_size_in");
 
  204         init( arr , trdim_in , transform_size_in );
 
  206       auto dims = arr.get_dimensions();
 
  207       int d2 = 1;   
for (
int i=N-1; i > trdim; i--) { d2 *= dims(i); } 
 
  208       int d1 = dims(trdim);                                            
 
  209       int d0 = arr.totElems() / d2 / d1;                               
 
  212         copy = arr.reshape(d0,d2,d1);
 
  214         auto in = arr.reshape(d0,d1,d2);
 
  215         copy = arr.createDeviceCopy().reshape(d0,d2,d1);
 
  219       #if   defined(YAKL_ARCH_CUDA) 
  220         if        constexpr (std::is_same<T,float >::value) {
 
  221           CHECK( cufftExecR2C(plan_forward, (cufftReal       *) copy.data(), (cufftComplex       *) copy.data()) );
 
  222         } 
else if constexpr (std::is_same<T,double>::value) {
 
  223           CHECK( cufftExecD2Z(plan_forward, (cufftDoubleReal *) copy.data(), (cufftDoubleComplex *) copy.data()) );
 
  225       #elif defined(YAKL_ARCH_HIP) 
  226         std::array<double *,1> ibuf( {copy.data()} );
 
  227         std::array<double *,1> obuf( {copy.data()} );
 
  228         rocfft_execution_info info;
 
  229         CHECK( rocfft_execution_info_create( &info ) );
 
  230         CHECK( rocfft_execute(plan_forward, (
void **) ibuf.data(), (
void **) obuf.data(), info) );
 
  231         CHECK( rocfft_execution_info_destroy( info ) );
 
  232       #elif defined(YAKL_ARCH_SYCL) 
  233         if        constexpr (std::is_same<T,float >::value) {
 
  234           CHECK( oneapi::mkl::dft::compute_forward(*(
static_cast<desc_single_t*
>(plan_forward)), 
static_cast<T*
>(copy.data())) );
 
  235         } 
else if constexpr (std::is_same<T,double>::value) {
 
  236           CHECK( oneapi::mkl::dft::compute_forward(*(
static_cast<desc_double_t*
>(plan_forward)), 
static_cast<T*
>(copy.data())) );
 
  241         auto copy_host = copy.createHostCopy();
 
  242         for (
int i0 = 0; i0 < d0; i0++) {
 
  243           for (
int i2 = 0; i2 < d2; i2++) {
 
  244             for (
int i1 = 0; i1 < transform_size; i1++) {
 
  245               pfft_in(i0,i2,i1) = copy_host(i0,i2,i1);
 
  249         using pocketfft::detail::shape_t;
 
  250         using pocketfft::detail::stride_t;
 
  251         shape_t  shape_in  (3);   
for (
int i=0; i < 3; i++) { shape_in[i] = pfft_in.extent(i); }
 
  252         stride_t stride_in (3);
 
  253         stride_t stride_out(3);
 
  254         stride_in [0] = d2*  transform_size      *
sizeof(             T );
 
  255         stride_in [1] =      transform_size      *
sizeof(             T );
 
  256         stride_in [2] =                           
sizeof(             T );
 
  257         stride_out[0] = d2*((transform_size+2)/2)*
sizeof(std::complex<T>);
 
  258         stride_out[1] =    ((transform_size+2)/2)*
sizeof(std::complex<T>);
 
  259         stride_out[2] =                           
sizeof(std::complex<T>);
 
  260         pocketfft::r2c<T>(shape_in, stride_in, stride_out, (
size_t) 2, 
true, pfft_in.data(), pfft_out.data(), (T) 1);
 
  261         for (
int i0 = 0; i0 < d0; i0++) {
 
  262           for (
int i2 = 0; i2 < d2; i2++) {
 
  263             for (
int i1 = 0; i1 < (transform_size+2)/2; i1++) {
 
  264               copy_host(i0,i2,2*i1  ) = pfft_out(i0,i2,i1).real();
 
  265               copy_host(i0,i2,2*i1+1) = pfft_out(i0,i2,i1).imag();
 
  269         copy_host.deep_copy_to( copy );
 
  273         auto out = arr.reshape(d0,d1,d2);
 
  286       if (trdim < 0 || transform_size < 0 || batch_size < 0) {
 
  287         if (trdim_in < 0 || transform_size_in < 0) 
yakl_throw(
"ERROR: Using forward_real before calling init without " 
  288                                                               "specifying both trdim_in and transform_size_in");
 
  289         init( arr , trdim_in , transform_size_in );
 
  292       if ( ( (transform_size%2==1 ? transform_size+1 : transform_size+2) != arr.extent(trdim) ) ||
 
  293            ( transform_size_in > 0 && transform_size_in != transform_size ) ||
 
  294            ( arr.totElems() / arr.extent(trdim) != batch_size ) ) {
 
  295         if (trdim_in < 0 || transform_size_in < 0) 
yakl_throw(
"ERROR: Changing transform size  or batch sizewithout " 
  296                                                               "specifying both trdim_in and transform_size_in");
 
  298         init( arr , trdim_in , transform_size_in );
 
  300       auto dims = arr.get_dimensions();
 
  301       int d2 = 1;   
for (
int i=N-1; i > trdim; i--) { d2 *= dims(i); } 
 
  302       int d1 = dims(trdim);                                            
 
  303       int d0 = arr.totElems() / d2 / d1;                               
 
  306         copy = arr.reshape(d0,d2,d1);
 
  308         auto in = arr.reshape(d0,d1,d2);
 
  309         copy = arr.createDeviceCopy().reshape(d0,d2,d1);
 
  313       #if   defined(YAKL_ARCH_CUDA) 
  314         if        constexpr (std::is_same<T,float >::value) {
 
  315           CHECK( cufftExecC2R(plan_inverse, (cufftComplex       *) copy.data(), (cufftReal       *) copy.data()) );
 
  316         } 
else if constexpr (std::is_same<T,double>::value) {
 
  317           CHECK( cufftExecZ2D(plan_inverse, (cufftDoubleComplex *) copy.data(), (cufftDoubleReal *) copy.data()) );
 
  319       #elif defined(YAKL_ARCH_HIP) 
  320         std::array<double *,1> ibuf( {copy.data()} );
 
  321         std::array<double *,1> obuf( {copy.data()} );
 
  322         rocfft_execution_info info;
 
  323         CHECK( rocfft_execution_info_create( &info ) );
 
  324         CHECK( rocfft_execute(plan_inverse, (
void **) ibuf.data(), (
void **) obuf.data(), info) );
 
  325         CHECK( rocfft_execution_info_destroy( info ) );
 
  326       #elif defined(YAKL_ARCH_SYCL) 
  327         if        constexpr (std::is_same<T,float >::value) {
 
  328           CHECK( oneapi::mkl::dft::compute_backward(*(
static_cast<desc_single_t*
>(plan_inverse)), 
static_cast<T*
>(copy.data())) );
 
  329         } 
else if constexpr (std::is_same<T,double>::value) {
 
  330           CHECK( oneapi::mkl::dft::compute_backward(*(
static_cast<desc_double_t*
>(plan_inverse)), 
static_cast<T*
>(copy.data())) );
 
  335         auto copy_host = copy.createHostCopy();
 
  336         for (
int i0 = 0; i0 < d0; i0++) {
 
  337           for (
int i2 = 0; i2 < d2; i2++) {
 
  338             for (
int i1 = 0; i1 < (transform_size+2)/2; i1++) {
 
  339               pfft_in(i0,i2,i1) = std::complex<T>( copy_host(i0,i2,2*i1  ) , copy_host(i0,i2,2*i1+1) );
 
  343         using pocketfft::detail::shape_t;
 
  344         using pocketfft::detail::stride_t;
 
  345         shape_t  shape_out (3);   
for (
int i=0; i < 3; i++) { shape_out [i] = pfft_out.extent(i); }
 
  346         stride_t stride_in (3);
 
  347         stride_t stride_out(3);
 
  348         stride_in [0] = d2*((transform_size+2)/2)*
sizeof(std::complex<T>);
 
  349         stride_in [1] =    ((transform_size+2)/2)*
sizeof(std::complex<T>);
 
  350         stride_in [2] =                           
sizeof(std::complex<T>);
 
  351         stride_out[0] = d2*  transform_size      *
sizeof(             T );
 
  352         stride_out[1] =      transform_size      *
sizeof(             T );
 
  353         stride_out[2] =                           
sizeof(             T );
 
  354         pocketfft::c2r<T>(shape_out, stride_in, stride_out, (
size_t) 2, 
false, pfft_in.data() , pfft_out.data() , (T) 1 );
 
  355         for (
int i0 = 0; i0 < d0; i0++) {
 
  356           for (
int i2 = 0; i2 < d2; i2++) {
 
  357             for (
int i1 = 0; i1 < transform_size; i1++) {
 
  358               copy_host(i0,i2,i1) = pfft_out(i0,i2,i1);
 
  362         copy_host.deep_copy_to( copy );
 
  365       auto out = arr.reshape(d0,d1,d2);
 
  366       YAKL_SCOPE( transform_size , this->transform_size );