Go to the documentation of this file.
// [ASYNCHRONOUS] Launch the passed functor in parallel.
// Forward declaration only; the definition is not part of this listing.
//
// Template parameters:
//   F       - type of the functor `f`.
//   N       - first template argument of the `Bounds` parameter.
//   simple  - second template argument of the `Bounds` parameter.
//   VecLen  - first template argument of `LaunchConfig`; defaults to YAKL_DEFAULT_VECTOR_LEN.
//   B4B     - second template argument of `LaunchConfig`; defaults to false.
//
// Parameters:
//   str     - C string passed with the launch.
//             NOTE(review): presumably a label identifying the kernel -- confirm.
//   bounds  - `Bounds<N,simple>` object, taken by const reference.
//             NOTE(review): presumably describes the loop extents -- confirm.
//   f       - functor to launch, taken by const reference.
//   config  - launch configuration, taken by value; defaults to `LaunchConfig<>()`.
34 template <
class F,
int N,
bool simple,
int VecLen=YAKL_DEFAULT_VECTOR_LEN ,
bool B4B = false>
35 inline void parallel_for(
char const * str , Bounds<N,simple>
const &bounds , F
const &f ,
36 LaunchConfig<VecLen,B4B> config = LaunchConfig<>() );
// Overload of parallel_for that takes no leading `str` argument.
// Forward declaration only; the definition is not part of this listing.
// NOTE(review): presumably behaves like the overload that accepts `str`
// (launches the functor in parallel) -- confirm against the definition.
//
// Template parameters:
//   F       - type of the functor `f`.
//   N       - first template argument of the `Bounds` parameter.
//   simple  - second template argument of the `Bounds` parameter.
//   VecLen  - first template argument of `LaunchConfig`; defaults to YAKL_DEFAULT_VECTOR_LEN.
//   B4B     - second template argument of `LaunchConfig`; defaults to false.
//
// Parameters:
//   bounds  - `Bounds<N,simple>` object, taken by const reference.
//   f       - functor to launch, taken by const reference.
//   config  - launch configuration, taken by value; defaults to `LaunchConfig<>()`.
43 template <
class F,
int N,
bool simple,
int VecLen=YAKL_DEFAULT_VECTOR_LEN ,
bool B4B = false>
44 inline void parallel_for( Bounds<N,simple>
const &bounds , F
const &f ,
45 LaunchConfig<VecLen,B4B> config = LaunchConfig<>() );
// [ASYNCHRONOUS] Launch the passed functor in parallel in the coarsest-level
// parallelism on the device.
// Forward declaration only; the definition is not part of this listing.
//
// Template parameters:
//   F       - type of the functor `f`.
//   N       - first template argument of the `Bounds` parameter.
//   simple  - second template argument of the `Bounds` parameter.
//   VecLen  - first template argument of `LaunchConfig`; defaults to YAKL_DEFAULT_VECTOR_LEN.
//   B4B     - second template argument of `LaunchConfig`; defaults to false.
//
// Parameters:
//   str     - C string passed with the launch.
//             NOTE(review): presumably a label identifying the kernel -- confirm.
//   bounds  - `Bounds<N,simple>` object, taken by const reference.
//             NOTE(review): presumably describes the outer loop extents -- confirm.
//   f       - functor to launch, taken by const reference.
//   config  - launch configuration, taken by value; defaults to `LaunchConfig<>()`.
96 template <
class F,
int N,
bool simple,
int VecLen=YAKL_DEFAULT_VECTOR_LEN,
bool B4B = false>
97 inline void parallel_outer(
char const * str , Bounds<N,simple>
const &bounds , F
const &f ,
98 LaunchConfig<VecLen,B4B> config = LaunchConfig<>() );
// Overload of parallel_outer that takes no leading `str` argument.
// Forward declaration only; the definition is not part of this listing.
// NOTE(review): presumably behaves like the overload that accepts `str`
// (launches the functor at the coarsest level of parallelism) -- confirm
// against the definition.
//
// Template parameters:
//   F       - type of the functor `f`.
//   N       - first template argument of the `Bounds` parameter.
//   simple  - second template argument of the `Bounds` parameter.
//   VecLen  - first template argument of `LaunchConfig`; defaults to YAKL_DEFAULT_VECTOR_LEN.
//   B4B     - second template argument of `LaunchConfig`; defaults to false.
//
// Parameters:
//   bounds  - `Bounds<N,simple>` object, taken by const reference.
//   f       - functor to launch, taken by const reference.
//   config  - launch configuration, taken by value; defaults to `LaunchConfig<>()`.
105 template <
class F,
int N,
bool simple,
int VecLen=YAKL_DEFAULT_VECTOR_LEN,
bool B4B = false>
106 inline void parallel_outer( Bounds<N,simple>
const &bounds , F
const &f ,
107 LaunchConfig<VecLen,B4B> config = LaunchConfig<>() );
132 template <
class F,
int N,
bool simple>
struct yakl::InnerHandlerEmpty InnerHandler
This class is necessary for coordination of two-level parallelism.
void parallel_outer(char const *str, Bounds< N, simple > const &bounds, F const &f, LaunchConfig< VecLen, B4B > config=LaunchConfig<>())
[ASYNCHRONOUS] Launch the passed functor in parallel in the coarsest-level parallelism on the device.
void parallel_for(char const *str, Bounds< N, simple > const &bounds, F const &f, LaunchConfig< VecLen, B4B > config=LaunchConfig<>())
[ASYNCHRONOUS] Launch the passed functor in parallel.
#define YAKL_INLINE
Used to decorate functions called from kernels (parallel_for and parallel_outer) or from CPU function...
Definition: YAKL_defines.h:140
YAKL_INLINE void single_inner(F const &f, InnerHandler handler)
Launch the passed functor to only use one of the inner threads (still parallel over outer threads).
YAKL_INLINE void parallel_inner(Bounds< N, simple > const &bounds, F const &f, InnerHandler handler)
Launch the passed functor in parallel in the finest-level parallelism on the device.