StarPU Handbook
16. A Stencil Application

16.1 The Original Application

#define _(row,col,ld) ((row)+(col)*(ld))

void stencil5_cpu(double *xy, double *xm1y, double *xp1y, double *xym1, double *xyp1)
{
    *xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
}

int main(int argc, char **argv)
{
    int niter, n;
    int x, y, loop;

    read_params(argc, argv, &n, &niter);

    double *A = calloc(n*n, sizeof(*A));
    fill(A, n, n);

    for(loop=0 ; loop<niter; loop++)
    {
        for (x = 0; x < n; x++)
        {
            for (y = 0; y < n; y++)
            {
                int xm1 = (x==0) ? n-1 : x-1;
                int xp1 = (x==n-1) ? 0 : x+1;
                int ym1 = (y==0) ? n-1 : y-1;
                int yp1 = (y==n-1) ? 0 : y+1;
                stencil5_cpu(&A[_(x,y,n)],
                             &A[_(xm1,y,n)], &A[_(xp1,y,n)],
                             &A[_(x,ym1,n)], &A[_(x,yp1,n)]);
            }
        }
    }
    return 0;
}
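
Each call to stencil5_cpu() replaces A(x,y) with the average of itself and its four neighbours, A(x,y) = (A(x,y) + A(x-1,y) + A(x+1,y) + A(x,y-1) + A(x,y+1)) / 5, with periodic (wrap-around) boundaries. The helpers read_params() and fill() are not shown above; the following is only a minimal sketch of what they could look like, assuming <stdlib.h> is included, a default problem size, and simple index-based initial values (the actual versions in the starpu-applications repository may differ).

static void read_params(int argc, char **argv, int *n, int *niter)
{
    /* Default problem size and iteration count, optionally overridden
       from the command line (hypothetical argument handling). */
    *n = 10;
    *niter = 4;
    if (argc > 1) *n = atoi(argv[1]);
    if (argc > 2) *niter = atoi(argv[2]);
}

static void fill(double *A, int rows, int cols)
{
    /* Arbitrary initial values, one per matrix element. */
    int i;
    for (i = 0; i < rows*cols; i++)
        A[i] = (double)i;
}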

16.2 The StarPU Application

The computation function must be defined through a codelet.

#define _(row,col,ld) ((row)+(col)*(ld))

void stencil5_cpu(void *descr[], void *_args)
{
    (void)_args;
    double *xy = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
    double *xm1y = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
    double *xp1y = (double *)STARPU_VARIABLE_GET_PTR(descr[2]);
    double *xym1 = (double *)STARPU_VARIABLE_GET_PTR(descr[3]);
    double *xyp1 = (double *)STARPU_VARIABLE_GET_PTR(descr[4]);
    *xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
}

struct starpu_codelet stencil5_cl =
{
    .cpu_funcs = {stencil5_cpu},
    .nbuffers = 5,
    .modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R},
    .model = &starpu_perfmodel_nop,
};

Data must be registered to StarPU.

data_handles = malloc(n*n*sizeof(*data_handles));
for(x = 0; x < n; x++)
{
    for (y = 0; y < n; y++)
    {
        starpu_variable_data_register(&data_handles[_(x,y,n)], STARPU_MAIN_RAM,
                                      (uintptr_t)&(A[_(x,y,n)]), sizeof(double));
    }
}

Instead of calling the computation function directly, the application inserts a StarPU task.

int xm1 = (x==0) ? n-1 : x-1;
int xp1 = (x==n-1) ? 0 : x+1;
int ym1 = (y==0) ? n-1 : y-1;
int yp1 = (y==n-1) ? 0 : y+1;
starpu_task_insert(&stencil5_cl,
                   STARPU_RW, data_handles[_(x,y,n)],
                   STARPU_R, data_handles[_(xm1,y,n)],
                   STARPU_R, data_handles[_(xp1,y,n)],
                   STARPU_R, data_handles[_(x,ym1,n)],
                   STARPU_R, data_handles[_(x,yp1,n)],
                   0);
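
For reference, starpu_task_insert() is a convenience wrapper; the same task could be built and submitted explicitly, as in the sketch below (the buffer access modes then come from the .modes field of the codelet).

/* Sketch: explicit task construction, equivalent to the starpu_task_insert() call above. */
struct starpu_task *task = starpu_task_create();
task->cl = &stencil5_cl;
task->handles[0] = data_handles[_(x,y,n)];
task->handles[1] = data_handles[_(xm1,y,n)];
task->handles[2] = data_handles[_(xp1,y,n)];
task->handles[3] = data_handles[_(x,ym1,n)];
task->handles[4] = data_handles[_(x,yp1,n)];
int ret = starpu_task_submit(task);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");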

Finally, the data must be unregistered from StarPU.

for(x = 0; x < n; x++)
{
    for (y = 0; y < n; y++)
    {
        starpu_data_unregister(data_handles[_(x,y,n)]);
    }
}
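
If the application needs to read a value while its handle is still registered (for instance to print intermediate results), it must first acquire the handle in read mode; a short sketch:

/* Wait until pending tasks on this handle are done, then read it safely. */
starpu_data_acquire(data_handles[_(x,y,n)], STARPU_R);
printf("A[%d,%d] = %f\n", x, y, A[_(x,y,n)]);
starpu_data_release(data_handles[_(x,y,n)]);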

The whole StarPU application looks as follows.

#define _(row,col,ld) ((row)+(col)*(ld))

void stencil5_cpu(void *descr[], void *_args)
{
    (void)_args;
    double *xy = (double *)STARPU_VARIABLE_GET_PTR(descr[0]);
    double *xm1y = (double *)STARPU_VARIABLE_GET_PTR(descr[1]);
    double *xp1y = (double *)STARPU_VARIABLE_GET_PTR(descr[2]);
    double *xym1 = (double *)STARPU_VARIABLE_GET_PTR(descr[3]);
    double *xyp1 = (double *)STARPU_VARIABLE_GET_PTR(descr[4]);
    *xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
}

struct starpu_codelet stencil5_cl =
{
    .cpu_funcs = {stencil5_cpu},
    .nbuffers = 5,
    .modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R},
    .model = &starpu_perfmodel_nop,
};

int main(int argc, char **argv)
{
    starpu_data_handle_t *data_handles;
    int ret;
    int niter, n;
    int x, y, loop;

    ret = starpu_init(NULL);
    STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

    read_params(argc, argv, &n, &niter);

    double *A = calloc(n*n, sizeof(*A));
    fill(A, n, n);

    data_handles = malloc(n*n*sizeof(*data_handles));
    for(x = 0; x < n; x++)
    {
        for (y = 0; y < n; y++)
        {
            starpu_variable_data_register(&data_handles[_(x,y,n)], STARPU_MAIN_RAM,
                                          (uintptr_t)&(A[_(x,y,n)]), sizeof(double));
        }
    }

    for(loop=0 ; loop<niter; loop++)
    {
        for (x = 0; x < n; x++)
        {
            for (y = 0; y < n; y++)
            {
                int xm1 = (x==0) ? n-1 : x-1;
                int xp1 = (x==n-1) ? 0 : x+1;
                int ym1 = (y==0) ? n-1 : y-1;
                int yp1 = (y==n-1) ? 0 : y+1;
                starpu_task_insert(&stencil5_cl,
                                   STARPU_RW, data_handles[_(x,y,n)],
                                   STARPU_R, data_handles[_(xm1,y,n)],
                                   STARPU_R, data_handles[_(xp1,y,n)],
                                   STARPU_R, data_handles[_(x,ym1,n)],
                                   STARPU_R, data_handles[_(x,yp1,n)],
                                   0);
            }
        }
    }
    starpu_task_wait_for_all();

    for(x = 0; x < n; x++)
    {
        for (y = 0; y < n; y++)
        {
            starpu_data_unregister(data_handles[_(x,y,n)]);
        }
    }

    starpu_shutdown();
    return 0;
}
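
The Makefile shipped with the example (see Section 16.4) takes care of the build, but the program can also be compiled by hand. Assuming StarPU is visible to pkg-config (the module name depends on the installed release, for instance starpu-1.4) and the source file is called stencil5_starpu.c, a typical command line would be:

$ gcc -o stencil5_starpu stencil5_starpu.c $(pkg-config --cflags --libs starpu-1.4)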

16.3 The StarPU MPI Application

The initialisation for StarPU-MPI is as follows.

int ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
starpu_mpi_comm_rank(MPI_COMM_WORLD, &my_rank);
starpu_mpi_comm_size(MPI_COMM_WORLD, &size);

An additional call to starpu_mpi_data_register() is necessary to tell StarPU which MPI node owns each piece of data and which MPI tag to use for its transfers.

starpu_variable_data_register(&data_handles[_(x,y,n)], STARPU_MAIN_RAM,
                              (uintptr_t)&(A[_(x,y,n)]), sizeof(double));
int mpi_rank = my_distrib(x, y, size);
starpu_mpi_data_register(data_handles[_(x,y,n)], (y*n)+x, mpi_rank);

To insert a task, the function starpu_mpi_task_insert() must be used instead of starpu_task_insert().

starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl,
                       STARPU_RW, data_handles[_(x,y,n)],
                       STARPU_R, data_handles[_(xm1,y,n)],
                       STARPU_R, data_handles[_(xp1,y,n)],
                       STARPU_R, data_handles[_(x,ym1,n)],
                       STARPU_R, data_handles[_(x,yp1,n)],
                       0);
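
Every MPI process runs the same loop and calls starpu_mpi_task_insert() with the same arguments; based on the ownership declared with starpu_mpi_data_register(), StarPU decides on which node each task actually executes and automatically posts the sends and receives needed for the read-only neighbour data.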

The whole StarPU-MPI application looks as follows.

#define _(row,col,ld) ((row)+(col)*(ld))

void stencil5_cpu(void *descr[], void *_args); // Same as in the sequential StarPU version
struct starpu_codelet stencil5_cl;             // Same as in the sequential StarPU version

/* Returns the MPI node that owns the data element at index (x, y) */
int my_distrib(int x, int y, int nb_nodes)
{
    return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
}

int main(int argc, char **argv)
{
    starpu_data_handle_t *data_handles;
    int niter, n;
    int my_rank, size, x, y, loop;

    int ret = starpu_mpi_init_conf(&argc, &argv, 1, MPI_COMM_WORLD, NULL);
    STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
    starpu_mpi_comm_rank(MPI_COMM_WORLD, &my_rank);
    starpu_mpi_comm_size(MPI_COMM_WORLD, &size);

    read_params(argc, argv, &n, &niter);

    double *A = calloc(n*n, sizeof(*A));
    fill(A, n, n);

    data_handles = malloc(n*n*sizeof(*data_handles));
    for(x = 0; x < n; x++)
    {
        for (y = 0; y < n; y++)
        {
            starpu_variable_data_register(&data_handles[_(x,y,n)], STARPU_MAIN_RAM,
                                          (uintptr_t)&(A[_(x,y,n)]), sizeof(double));
            int mpi_rank = my_distrib(x, y, size);
            starpu_mpi_data_register(data_handles[_(x,y,n)], (y*n)+x, mpi_rank);
        }
    }

    for(loop=0 ; loop<niter; loop++)
    {
        for (x = 0; x < n; x++)
        {
            for (y = 0; y < n; y++)
            {
                int xm1 = (x==0) ? n-1 : x-1;
                int xp1 = (x==n-1) ? 0 : x+1;
                int ym1 = (y==0) ? n-1 : y-1;
                int yp1 = (y==n-1) ? 0 : y+1;
                starpu_mpi_task_insert(MPI_COMM_WORLD, &stencil5_cl,
                                       STARPU_RW, data_handles[_(x,y,n)],
                                       STARPU_R, data_handles[_(xm1,y,n)],
                                       STARPU_R, data_handles[_(xp1,y,n)],
                                       STARPU_R, data_handles[_(x,ym1,n)],
                                       STARPU_R, data_handles[_(x,yp1,n)],
                                       0);
            }
        }
    }

    /* Bring the data back to node 0 and unregister it */
    for(x = 0; x < n; x++)
    {
        for (y = 0; y < n; y++)
        {
            starpu_mpi_data_migrate(MPI_COMM_WORLD, data_handles[_(x,y,n)], 0);
            starpu_data_unregister(data_handles[_(x,y,n)]);
        }
    }

    starpu_mpi_shutdown();
    return 0;
}
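
After the migration loop, every element has been moved back to node 0 before being unregistered, so the up-to-date values end up in A on that rank only. A sketch of how rank 0 could then inspect the result (the print layout is just an illustration):

if (my_rank == 0)
{
    /* Only rank 0 holds the gathered result at this point. */
    for (y = 0; y < n; y++)
    {
        for (x = 0; x < n; x++)
            printf("%8.2f ", A[_(x,y,n)]);
        printf("\n");
    }
}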

16.4 Running the Applications

The applications can be built and run from the StarPU docker image.

$ docker run -it registry.gitlab.inria.fr/starpu/starpu-docker/starpu:latest

If your machine has GPU devices, the following command makes them available inside the docker image.

$ docker run -it --gpus all registry.gitlab.inria.fr/starpu/starpu-docker/starpu:latest

From within the docker container, you can then run the following commands.

$ git clone https://gitlab.inria.fr/starpu/starpu-applications.git
$ cd starpu-applications/stencil5
$ make

To run the non-StarPU application:

$ ./stencil5 -v

To run the sequential StarPU application:

$ ./stencil5_starpu -v
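
The number of CPU workers used by StarPU can be controlled with the STARPU_NCPU environment variable, for example:

$ STARPU_NCPU=2 ./stencil5_starpu -v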

To run the StarPU MPI application, use the command below. Setting the environment variable STARPU_COMM_STATS to 1 displays the amount of communication between the different MPI processes:

$ STARPU_COMM_STATS=1 mpirun -np 4 ./stencil5_starpu_mpi -v 4 3