3.1
The Original Application
/* Column-major linear index into a flat array with leading dimension ld. */
#define _(row,col,ld) ((row)+(col)*(ld))

/*
 * 5-point stencil kernel: replace the centre cell *xy with the mean of
 * itself and its four neighbouring cells.
 */
void stencil5_cpu(double *xy, double *xm1y, double *xp1y, double *xym1, double *xyp1)
{
	/* Accumulate in the same left-to-right order as the original sum. */
	double acc = *xy;
	acc += *xm1y;
	acc += *xp1y;
	acc += *xym1;
	acc += *xyp1;
	*xy = acc / 5.0;
}
/*
 * Driver for the sequential stencil: read the grid size n and iteration
 * count niter from the command line, then apply the 5-point stencil to
 * every cell of an n x n torus for niter sweeps.
 *
 * Fixes: check the calloc() result before use, compute the element count
 * in size_t to avoid int overflow for large n, and free the grid before
 * returning (it was previously leaked).
 */
int main(int argc, char **argv)
{
	int niter, n;
	int x, y, loop;

	/* Problem parameters come from the command line (project helper). */
	read_params(argc, argv, &n, &niter);

	/* Zero-initialised n x n grid; (size_t) cast avoids int overflow in n*n. */
	double *A = calloc((size_t)n * n, sizeof(*A));
	if (A == NULL)
		return 1; /* allocation failed */
	fill(A, n, n);

	for (loop = 0; loop < niter; loop++)
	{
		for (x = 0; x < n; x++)
		{
			for (y = 0; y < n; y++)
			{
				/* Torus topology: neighbour indices wrap at the edges. */
				int xm1 = (x==0) ? n-1 : x-1;
				int xp1 = (x==n-1) ? 0 : x+1;
				int ym1 = (y==0) ? n-1 : y-1;
				int yp1 = (y==n-1) ? 0 : y+1;
				stencil5_cpu(&A[_(x,y,n)],
				             &A[_(xm1,y,n)], &A[_(xp1,y,n)],
				             &A[_(x,ym1,n)], &A[_(x,yp1,n)]);
			}
		}
	}
	free(A); /* fix: the grid was never released */
	return 0;
}
3.2
The StarPU Application
The computation function must be defined through a codelet.
#define _(row,col,ld) ((row)+(col)*(ld))
/*
 * StarPU codelet implementation of the 5-point stencil kernel.
 * descr[] carries the five registered data interfaces; _args is unused.
 *
 * NOTE(review): the lines extracting the raw pointers from descr[]
 * (presumably via STARPU_VARIABLE_GET_PTR) are missing from this listing,
 * so xy, xm1y, xp1y, xym1 and xyp1 are undeclared here -- confirm against
 * the original StarPU tutorial source.
 */
void stencil5_cpu(void *descr[], void *_args)
{
(void)_args;
*xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
}
{
.nbuffers = 5,
};
starpu_cpu_func_t cpu_funcs[STARPU_MAXIMPLEMENTATIONS]
Definition starpu_task.h:414
Definition starpu_task.h:338
#define STARPU_VARIABLE_GET_PTR(interface)
Definition starpu_data_interfaces.h:2211
@ STARPU_RW
Definition starpu_data.h:60
@ STARPU_R
Definition starpu_data.h:58
Data must be registered with StarPU.
data_handles = malloc(n*n*sizeof(*data_handles));
for(x = 0; x < n; x++)
{
for (y = 0; y < n; y++)
{
(uintptr_t)&(A[_(x,y,n)]), sizeof(double));
}
}
#define STARPU_MAIN_RAM
Definition starpu_task.h:144
void starpu_variable_data_register(starpu_data_handle_t *handle, int home_node, uintptr_t ptr, size_t size)
Instead of directly calling the function, a StarPU task must be created.
int xm1 = (x==0) ? n-1 : x-1;
int xp1 = (x==n-1) ? 0 : x+1;
int ym1 = (y==0) ? n-1 : y-1;
int yp1 = (y==n-1) ? 0 : y+1;
0);
int starpu_task_insert(struct starpu_codelet *cl,...)
Finally, the data must be released from StarPU.
for(x = 0; x < n; x++)
{
for (y = 0; y < n; y++)
{
}
}
void starpu_data_unregister(starpu_data_handle_t handle)
The whole StarPU application looks as follows.
#define _(row,col,ld) ((row)+(col)*(ld))
/*
 * StarPU codelet implementation of the 5-point stencil kernel
 * (repeated here as part of the full-application listing).
 * descr[] carries the five registered data interfaces; _args is unused.
 *
 * NOTE(review): the STARPU_VARIABLE_GET_PTR lines that should define
 * xy, xm1y, xp1y, xym1 and xyp1 from descr[] appear to have been lost
 * in extraction -- verify against the original tutorial source.
 */
void stencil5_cpu(void *descr[], void *_args)
{
(void)_args;
*xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
}
{
.nbuffers = 5,
};
/*
 * StarPU version of the stencil driver.
 *
 * NOTE(review): this listing is incomplete -- `verbose` and `data_handles`
 * are used but never declared, and the starpu_init()/starpu_shutdown(),
 * starpu_variable_data_register(), starpu_task_insert() and
 * starpu_data_unregister() calls referenced by the surrounding text are
 * truncated (only trailing argument fragments remain). Confirm against the
 * original StarPU tutorial source before reuse.
 */
int main(int argc, char **argv)
{
int ret;
int niter, n;
int x, y, loop;
read_params(argc, argv, &verbose, &n, &niter);
double *A = calloc(n*n, sizeof(*A));
fill(A, n, n);
/* One StarPU handle per grid element; the registration call itself is
   truncated below (only its last two arguments survive). */
data_handles = malloc(n*n*sizeof(*data_handles));
for(x = 0; x < n; x++)
{
for (y = 0; y < n; y++)
{
(uintptr_t)&(A[_(x,y,n)]), sizeof(double));
}
}
for(loop=0 ; loop<niter; loop++)
{
for (x = 0; x < n; x++)
{
for (y = 0; y < n; y++)
{
/* Torus topology: neighbour indices wrap at the edges. */
int xm1 = (x==0) ? n-1 : x-1;
int xp1 = (x==n-1) ? 0 : x+1;
int ym1 = (y==0) ? n-1 : y-1;
int yp1 = (y==n-1) ? 0 : y+1;
/* NOTE(review): the starpu_task_insert() call is truncated;
   only its terminating 0 sentinel argument remains. */
0);
}
}
}
/* NOTE(review): the per-handle starpu_data_unregister() call inside this
   loop is missing from the listing. */
for(x = 0; x < n; x++)
{
for (y = 0; y < n; y++)
{
}
}
return 0;
}
int starpu_task_wait_for_all(void)
struct _starpu_data_state * starpu_data_handle_t
Definition starpu_data.h:45
void starpu_shutdown(void)
int starpu_init(struct starpu_conf *conf)
3.3
The StarPU MPI Application
The initialisation for StarPU-MPI is as follows.
int starpu_mpi_comm_rank(MPI_Comm comm, int *rank)
int starpu_mpi_comm_size(MPI_Comm comm, int *size)
int starpu_mpi_init_conf(int *argc, char ***argv, int initialize_mpi, MPI_Comm comm, struct starpu_conf *conf)
An additional call to starpu_mpi_data_register() is necessary.
(uintptr_t)&(A[_(x,y,n)]), sizeof(double));
int mpi_rank = my_distrib(x, y, size);
#define starpu_mpi_data_register(data_handle, data_tag, rank)
Definition starpu_mpi.h:539
And to insert a task, the function starpu_mpi_task_insert() must be used.
0);
int starpu_mpi_task_insert(MPI_Comm comm, struct starpu_codelet *codelet,...)
The whole StarPU-MPI application looks as follows.
#define _(row,col,ld) ((row)+(col)*(ld))
void stencil5_cpu(void *descr[], void *_args);
int my_distrib(int x, int y, int nb_nodes)
{
return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
}
/*
 * StarPU-MPI version of the stencil driver.
 *
 * NOTE(review): this listing is incomplete -- `my_rank` and `size` are
 * declared but never initialised (the starpu_mpi_init_conf() /
 * starpu_mpi_comm_rank() / starpu_mpi_comm_size() calls referenced by the
 * surrounding text are missing), `data_handles` is used without a
 * declaration, and the registration / task-insert / unregister calls are
 * truncated. Confirm against the original StarPU tutorial source.
 */
int main(int argc, char **argv)
{
int niter, n;
int my_rank, size, x, y, loop;
read_params(argc, argv, &n, &niter);
double *A = calloc(n*n, sizeof(*A));
fill(A, n, n);
/* One StarPU handle per grid element; the registration and
   starpu_mpi_data_register() calls are truncated below. */
data_handles = malloc(n*n*sizeof(*data_handles));
for(x = 0; x < n; x++)
{
for (y = 0; y < n; y++)
{
(uintptr_t)&(A[_(x,y,n)]), sizeof(double));
int mpi_rank = my_distrib(x, y, size);
}
}
for(loop=0 ; loop<niter; loop++)
{
for (x = 0; x < n; x++)
{
for (y = 0; y < n; y++)
{
/* Torus topology: neighbour indices wrap at the edges. */
int xm1 = (x==0) ? n-1 : x-1;
int xp1 = (x==n-1) ? 0 : x+1;
int ym1 = (y==0) ? n-1 : y-1;
int yp1 = (y==n-1) ? 0 : y+1;
/* NOTE(review): the starpu_mpi_task_insert() call is truncated;
   only its terminating 0 sentinel argument remains. */
0);
}
}
}
/* NOTE(review): the per-handle starpu_data_unregister() call inside this
   loop is missing from the listing. */
for(x = 0; x < n; x++)
{
for (y = 0; y < n; y++)
{
}
}
return 0;
}
void starpu_mpi_data_migrate(MPI_Comm comm, starpu_data_handle_t handle, int new_rank)
int starpu_mpi_shutdown(void)
3.4
Running the application
$ docker run -it registry.gitlab.inria.fr/starpu/starpu-docker/starpu:latest
If your machine has GPU devices, you can use the following command to enable the GPU devices within the docker image.
$ docker run -it --gpus all registry.gitlab.inria.fr/starpu/starpu-docker/starpu:latest
From your docker image, you can then call the following commands.
$ git clone https://gitlab.inria.fr/starpu/starpu-applications.git
$ cd starpu-applications/stencil5
$ make
To run the non-StarPU application
$ ./stencil5 -v
To run the sequential StarPU application
$ ./stencil5_starpu -v
To run the StarPU MPI application. Setting the variable STARPU_COMM_STATS to 1 will display the amount of communication between the different MPI processes.
$ STARPU_COMM_STATS=1 mpirun -np 4 ./stencil5_starpu_mpi -v 4 3