Go to the documentation of this file.
// Forward declaration of parallel_for, overload taking a leading `char const * str`.
// Per this listing's own description of this signature: "[ASYNCHRONOUS] Launch the
// passed functor in parallel."
//
// Template parameters:
//   F         - type of the functor `f`.
//   N, simple - parameters of the Bounds<N,simple> argument.
//   VecLen    - first LaunchConfig parameter; defaults to YAKL_DEFAULT_VECTOR_LEN.
//   B4B       - second LaunchConfig parameter; defaults to false.
// Function parameters:
//   str    - C string; presumably a label for the launch -- TODO confirm against the definition.
//   bounds - Bounds<N,simple> object, taken by const reference.
//   f      - the functor to launch, taken by const reference.
//   config - LaunchConfig<VecLen,B4B> taken by value; defaults to LaunchConfig<>().
// NOTE(review): the leading numerals (33, 34, 35) look like line numbers carried over
// from the rendered source listing rather than C++ tokens -- confirm before compiling.
33 template <
class F,
int N,
bool simple,
int VecLen=YAKL_DEFAULT_VECTOR_LEN ,
bool B4B = false>
34 inline void parallel_for(
char const * str , Bounds<N,simple>
const &bounds , F
const &f ,
35 LaunchConfig<VecLen,B4B> config = LaunchConfig<>() );
// Forward declaration of parallel_for, overload without the leading `char const *`
// argument. Presumably launches the functor the same way as the overload that takes
// the string -- TODO confirm against the definition.
//
// Template parameters:
//   F         - type of the functor `f`.
//   N, simple - parameters of the Bounds<N,simple> argument.
//   VecLen    - first LaunchConfig parameter; defaults to YAKL_DEFAULT_VECTOR_LEN.
//   B4B       - second LaunchConfig parameter; defaults to false.
// Function parameters:
//   bounds - Bounds<N,simple> object, taken by const reference.
//   f      - the functor to launch, taken by const reference.
//   config - LaunchConfig<VecLen,B4B> taken by value; defaults to LaunchConfig<>().
// NOTE(review): the leading numerals (42, 43, 44) look like line numbers carried over
// from the rendered source listing rather than C++ tokens -- confirm before compiling.
42 template <
class F,
int N,
bool simple,
int VecLen=YAKL_DEFAULT_VECTOR_LEN ,
bool B4B = false>
43 inline void parallel_for( Bounds<N,simple>
const &bounds , F
const &f ,
44 LaunchConfig<VecLen,B4B> config = LaunchConfig<>() );
// Forward declaration of parallel_outer, overload taking a leading `char const * str`.
// Per this listing's own description of this signature: "[ASYNCHRONOUS] Launch the
// passed functor in parallel in the coarsest-level parallelism on the device."
//
// Template parameters:
//   F         - type of the functor `f`.
//   N, simple - parameters of the Bounds<N,simple> argument.
//   VecLen    - first LaunchConfig parameter; defaults to YAKL_DEFAULT_VECTOR_LEN.
//   B4B       - second LaunchConfig parameter; defaults to false.
// Function parameters:
//   str    - C string; presumably a label for the launch -- TODO confirm against the definition.
//   bounds - Bounds<N,simple> object, taken by const reference.
//   f      - the functor to launch, taken by const reference.
//   config - LaunchConfig<VecLen,B4B> taken by value; defaults to LaunchConfig<>().
// NOTE(review): the leading numerals (95, 96, 97) look like line numbers carried over
// from the rendered source listing rather than C++ tokens -- confirm before compiling.
95 template <
class F,
int N,
bool simple,
int VecLen=YAKL_DEFAULT_VECTOR_LEN,
bool B4B = false>
96 inline void parallel_outer(
char const * str , Bounds<N,simple>
const &bounds , F
const &f ,
97 LaunchConfig<VecLen,B4B> config = LaunchConfig<>() );
// Forward declaration of parallel_outer, overload without the leading `char const *`
// argument. Presumably launches the functor the same way as the overload that takes
// the string -- TODO confirm against the definition.
//
// Template parameters:
//   F         - type of the functor `f`.
//   N, simple - parameters of the Bounds<N,simple> argument.
//   VecLen    - first LaunchConfig parameter; defaults to YAKL_DEFAULT_VECTOR_LEN.
//   B4B       - second LaunchConfig parameter; defaults to false.
// Function parameters:
//   bounds - Bounds<N,simple> object, taken by const reference.
//   f      - the functor to launch, taken by const reference.
//   config - LaunchConfig<VecLen,B4B> taken by value; defaults to LaunchConfig<>().
// NOTE(review): the leading numerals (104, 105, 106) look like line numbers carried over
// from the rendered source listing rather than C++ tokens -- confirm before compiling.
104 template <
class F,
int N,
bool simple,
int VecLen=YAKL_DEFAULT_VECTOR_LEN,
bool B4B = false>
105 inline void parallel_outer( Bounds<N,simple>
const &bounds , F
const &f ,
106 LaunchConfig<VecLen,B4B> config = LaunchConfig<>() );
131 template <
class F,
int N,
bool simple>
struct yakl::InnerHandlerEmpty InnerHandler
This class is necessary for coordination of two-level parallelism.
void parallel_for(char const *str, Bounds< N, simple > const &bounds, F const &f, LaunchConfig< VecLen, B4B > config=LaunchConfig<>())
[ASYNCHRONOUS] Launch the passed functor in parallel.
#define YAKL_INLINE
Used to decorate functions called from kernels (parallel_for and parallel_outer) or from CPU function...
Definition: YAKL_defines.h:140
YAKL_INLINE void single_inner(F const &f, InnerHandler handler)
Launch the passed functor to only use one of the inner threads (still parallel over outer threads)
void parallel_outer(char const *str, Bounds< N, simple > const &bounds, F const &f, LaunchConfig< VecLen, B4B > config=LaunchConfig<>())
[ASYNCHRONOUS] Launch the passed functor in parallel in the coarsest-level parallelism on the device.
YAKL_INLINE void parallel_inner(Bounds< N, simple > const &bounds, F const &f, InnerHandler handler)
Launch the passed functor in parallel in the finest-level parallelism on the device.