diff --git a/openmp/Makefile b/openmp/Makefile index 136f146..4047fb1 100644 --- a/openmp/Makefile +++ b/openmp/Makefile @@ -1,11 +1,11 @@ -CC = cc -LD = cc +CC = mpicc +LD = mpicc CPPFLAGS = -I. CFLAGS = -O3 -fopenmp LDFLAGS = LDLIBS = -lm -lgomp -EXEC = ma.x +EXEC = miniAMR.x OBJS = block.o check_sum.o comm_block.o comm.o comm_parent.o comm_refine.o \ comm_util.o driver.o init.o main.o move.o pack.o plot.o profile.o \ @@ -18,7 +18,7 @@ $(EXEC): $(OBJS) $(CC) $(CPPFLAGS) $(CFLAGS) -c $< clean: - rm *.o ma.x + rm *.o $(EXEC) # need dependencies diff --git a/openmp/block.c b/openmp/block.c index ce89647..13e49c3 100644 --- a/openmp/block.c +++ b/openmp/block.c @@ -149,19 +149,23 @@ void split_blocks(void) i1 *= x_block_half; j1 *= y_block_half; k1 *= z_block_half; - for (v = 0; v < num_vars; v++) + for (v = 0; v < num_vars; v++) { + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; + block3D_t array_from = (block3D_t)&bp->array[v*block3D_size]; + block3D_t array_to = (block3D_t)&bp1->array[v*block3D_size]; for (i2 = i = 1; i <= x_block_half; i++, i2+=2) for (j2 = j = 1; j <= y_block_half; j++, j2+=2) for (k2 = k = 1; k <= z_block_half; k++, k2+=2) - bp1->array[v][i2 ][j2 ][k2 ] = - bp1->array[v][i2+1][j2 ][k2 ] = - bp1->array[v][i2 ][j2+1][k2 ] = - bp1->array[v][i2+1][j2+1][k2 ] = - bp1->array[v][i2 ][j2 ][k2+1] = - bp1->array[v][i2+1][j2 ][k2+1] = - bp1->array[v][i2 ][j2+1][k2+1] = - bp1->array[v][i2+1][j2+1][k2+1] = - bp->array[v][i+i1][j+j1][k+k1]/8.0; + array_to[i2 ][j2 ][k2 ] = + array_to[i2+1][j2 ][k2 ] = + array_to[i2 ][j2+1][k2 ] = + array_to[i2+1][j2+1][k2 ] = + array_to[i2 ][j2 ][k2+1] = + array_to[i2+1][j2 ][k2+1] = + array_to[i2 ][j2+1][k2+1] = + array_to[i2+1][j2+1][k2+1] = + array_from[i+i1][j+j1][k+k1]/8.0; + } } // children all defined - connect children & disconnect parent @@ -403,19 +407,23 @@ void consolidate_blocks(void) i1 = (o%2)*x_block_half; j1 = ((o/2)%2)*y_block_half; k1 = (o/4)*z_block_half; - for (v = 0; v < num_vars; v++) + for (v = 0; v < num_vars; v++) { + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; + block3D_t array_from = (block3D_t)&bp1->array[v*block3D_size]; + block3D_t array_to = (block3D_t)&bp->array[v*block3D_size]; for (i2 = i = 1; i <= x_block_half; i++, i2+=2) for (j2 = j = 1; j <= y_block_half; j++, j2+=2) for (k2 = k = 1; k <= z_block_half; k++, k2+=2) - bp->array[v][i+i1][j+j1][k+k1] = - bp1->array[v][i2 ][j2 ][k2 ] + - bp1->array[v][i2+1][j2 ][k2 ] + - bp1->array[v][i2 ][j2+1][k2 ] + - bp1->array[v][i2+1][j2+1][k2 ] + - bp1->array[v][i2 ][j2 ][k2+1] + - bp1->array[v][i2+1][j2 ][k2+1] + - bp1->array[v][i2 ][j2+1][k2+1] + - bp1->array[v][i2+1][j2+1][k2+1]; + array_to[i+i1][j+j1][k+k1] = + array_from[i2 ][j2 ][k2 ] + + array_from[i2+1][j2 ][k2 ] + + array_from[i2 ][j2+1][k2 ] + + array_from[i2+1][j2+1][k2 ] + + array_from[i2 ][j2 ][k2+1] + + array_from[i2+1][j2 ][k2+1] + + array_from[i2 ][j2+1][k2+1] + + array_from[i2+1][j2+1][k2+1]; + } } // now figure out communication for (c = 0; c < 6; c++) { diff --git a/openmp/block.h b/openmp/block.h index 5b73921..970607d 100644 --- a/openmp/block.h +++ b/openmp/block.h @@ -41,7 +41,7 @@ typedef struct { int nei_level[6]; /* 0 to 5 = W, E, S, N, D, U; use -2 for boundary */ int nei[6][2][2]; /* negative if off processor (-1 - proc) */ int cen[3]; - double ****array; + double * array; } block; block *blocks; @@ -73,6 +73,7 @@ int max_num_blocks; int num_refine; int uniform_refine; int x_block_size, y_block_size, z_block_size; +int block3D_size; int num_cells; int num_vars; int mat; diff --git a/openmp/check_sum.c b/openmp/check_sum.c index f50d66c..e461356 100644 --- a/openmp/check_sum.c +++ b/openmp/check_sum.c @@ -46,10 +46,12 @@ double check_sum(int var) for (in = 0; in < sorted_index[num_refine+1]; in++) { bp = &blocks[sorted_list[in].n]; block_sum = 0.0; + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; + block3D_t array = (block3D_t)&bp->array[var*block3D_size]; for (i = 1; i <= x_block_size; i++) for (j = 1; j <= y_block_size; j++) for (k = 1; k <= z_block_size; k++) - block_sum += bp->array[var][i][j][k]; + block_sum += array[i][j][k]; //if (!my_pe) printf("cs in %d block %d sum %lf\n", in, sorted_list[in].n, block_sum); sum += block_sum; } diff --git a/openmp/comm.c b/openmp/comm.c index 1d2b8aa..57867fd 100644 --- a/openmp/comm.c +++ b/openmp/comm.c @@ -33,6 +33,8 @@ #include "timer.h" #include "proto.h" +#include + // The routines in this file are used in the communication of ghost values // between blocks, both on processor and off processor. @@ -98,6 +100,7 @@ void comm(int start, int num_comm, int stage) // processor. Also apply boundary conditions for boundary of domain. time1 = time2 = time3 = 0.0; c1 = c2 = c3 = 0; + #pragma omp parallel for private (n, bp, l, m, i, j, k, t2, time1, time2, \ time3) reduction (+: c1, c2, c3) for (in = 0; in < sorted_index[num_refine+1]; in++) { @@ -184,6 +187,8 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, bp = &blocks[block_num]; + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; + if (!code) { if (dir == 0) { /* X - East, West */ @@ -197,19 +202,23 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, } else /* -X - West */ i = 1; if (face_case < 2) { /* whole face -> whole face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (j = 1; j <= y_block_size; j++) for (k = 1; k <= z_block_size; k++, n++) - send_buf[n] = bp->array[m][i][j][k]; + send_buf[n] = array[i][j][k]; + } } else if (face_case >= 2 && face_case <= 5) { /* whole face -> quarter face - case does not matter */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (j = 1; j < y_block_size; j += 2) for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i][j ][k ] + - bp->array[m][i][j ][k+1] + - bp->array[m][i][j+1][k ] + - bp->array[m][i][j+1][k+1]; + send_buf[n] = array[i][j ][k ] + + array[i][j ][k+1] + + array[i][j+1][k ] + + array[i][j+1][k+1]; + } } else { /* quarter face -> whole face */ /* four cases - figure out which quarter of face to send */ if (face_case%2 == 0) { @@ -226,10 +235,12 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, ks = z_block_half + 1; ke = z_block_size; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (j = js; j <= je; j++) for (k = ks; k <= ke; k++, n++) - send_buf[n] = bp->array[m][i][j][k]/4.0; + send_buf[n] = array[i][j][k]/4.0; + } } } else if (dir == 1) { /* Y - North, South */ @@ -241,25 +252,32 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -Y - South */ j = 1; + if (face_case == 0) { /* whole face -> whole face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 1; i <= x_block_size; i++) for (k = 1; k <= z_block_size; k++, n++) - send_buf[n] = bp->array[m][i][j][k]; + send_buf[n] = array[i][j][k]; + } } else if (face_case == 1) { - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (k = 1; k <= z_block_size; k++, n++) - send_buf[n] = bp->array[m][i][j][k]; + send_buf[n] = array[i][j][k]; + } } else if (face_case >= 2 && face_case <= 5) { /* whole face -> quarter face - case does not matter */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 1; i < x_block_size; i += 2) for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i ][j][k ] + - bp->array[m][i ][j][k+1] + - bp->array[m][i+1][j][k ] + - bp->array[m][i+1][j][k+1]; + send_buf[n] = array[i ][j][k ] + + array[i ][j][k+1] + + array[i+1][j][k ] + + array[i+1][j][k+1]; + } } else { /* quarter face -> whole face */ /* four cases - figure out which quarter of face to send */ if (face_case%2 == 0) { @@ -276,10 +294,12 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, ks = z_block_half + 1; ke = z_block_size; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = is; i <= ie; i++) for (k = ks; k <= ke; k++, n++) - send_buf[n] = bp->array[m][i][j][k]/4.0; + send_buf[n] = array[i][j][k]/4.0; + } } } else { /* Z - Up, Down */ @@ -291,25 +311,32 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -Z - Down */ k = 1; + if (face_case == 0) { /* whole face -> whole face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 1; i <= x_block_size; i++) for (j = 1; j <= y_block_size; j++, n++) - send_buf[n] = bp->array[m][i][j][k]; + send_buf[n] = array[i][j][k]; + } } else if (face_case == 1) { - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (j = 0; j <= y_block_size+1; j++, n++) - send_buf[n] = bp->array[m][i][j][k]; + send_buf[n] = array[i][j][k]; + } } else if (face_case >= 2 && face_case <= 5) { /* whole face -> quarter face - case does not matter */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 1; i < x_block_size; i += 2) for (j = 1; j < y_block_size; j += 2, n++) - send_buf[n] = bp->array[m][i ][j ][k] + - bp->array[m][i ][j+1][k] + - bp->array[m][i+1][j ][k] + - bp->array[m][i+1][j+1][k]; + send_buf[n] = array[i ][j ][k] + + array[i ][j+1][k] + + array[i+1][j ][k] + + array[i+1][j+1][k]; + } } else { /* quarter face -> whole face */ /* four cases - figure out which quarter of face to send */ if (face_case%2 == 0) { @@ -326,10 +353,12 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, js = y_block_half + 1; je = y_block_size; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = is; i <= ie; i++) for (j = js; j <= je; j++, n++) - send_buf[n] = bp->array[m][i][j][k]/4.0; + send_buf[n] = array[i][j][k]/4.0; + } } } @@ -342,57 +371,61 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -X - West */ i = 1; + if (face_case < 2) { /* whole face -> whole face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (j = 0; j <= y_block_size+1; j++) for (k = 0; k <= z_block_size+1; k++, n++) - send_buf[n] = bp->array[m][i][j][k]; + send_buf[n] = array[i][j][k]; + } } else if (face_case >= 2 && face_case <= 5) { /* whole face -> quarter face */ for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; if (face_case%2 == 0) { j = 0; if ((face_case/2)%2 == 1) { k = 0; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i][j][k ] + - bp->array[m][i][j][k+1]; + send_buf[n] = array[i][j][k ] + + array[i][j][k+1]; if ((face_case/2)%2 == 0) { k = z_block_size + 1; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } } for (j = 1; j < y_block_size; j += 2) { if ((face_case/2)%2 == 1) { k = 0; - send_buf[n++] = bp->array[m][i][j ][k] + - bp->array[m][i][j+1][k]; + send_buf[n++] = array[i][j ][k] + + array[i][j+1][k]; } for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i][j ][k ] + - bp->array[m][i][j ][k+1] + - bp->array[m][i][j+1][k ] + - bp->array[m][i][j+1][k+1]; + send_buf[n] = array[i][j ][k ] + + array[i][j ][k+1] + + array[i][j+1][k ] + + array[i][j+1][k+1]; if ((face_case/2)%2 == 0) { k = z_block_size + 1; - send_buf[n++] = bp->array[m][i][j ][k] + - bp->array[m][i][j+1][k]; + send_buf[n++] = array[i][j ][k] + + array[i][j+1][k]; } } if (face_case%2 == 1) { j = y_block_size + 1; if ((face_case/2)%2 == 1) { k = 0; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i][j][k ] + - bp->array[m][i][j][k+1]; + send_buf[n] = array[i][j][k ] + + array[i][j][k+1]; if ((face_case/2)%2 == 0) { k = z_block_size + 1; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } } } @@ -412,10 +445,12 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, ks = z_block_half; ke = z_block_size + 1; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (j = js; j <= je; j++) for (k = ks; k <= ke; k++, n++) - send_buf[n] = bp->array[m][i][j][k]/4.0; + send_buf[n] = array[i][j][k]/4.0; + } } } else if (dir == 1) { /* Y - North, South */ @@ -425,57 +460,61 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -Y - South */ j = 1; + if (face_case < 2) { /* whole face -> whole face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (k = 0; k <= z_block_size+1; k++, n++) - send_buf[n] = bp->array[m][i][j][k]; + send_buf[n] = array[i][j][k]; + } } else if (face_case >= 2 && face_case <= 5) { /* whole face -> quarter face */ for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; if (face_case%2 == 0) { i = 0; if ((face_case/2)%2 == 1) { k = 0; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i][j][k ] + - bp->array[m][i][j][k+1]; + send_buf[n] = array[i][j][k ] + + array[i][j][k+1]; if ((face_case/2)%2 == 0) { k = z_block_size + 1; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } } for (i = 1; i < x_block_size; i += 2) { if ((face_case/2)%2 == 1) { k = 0; - send_buf[n++] = bp->array[m][i ][j][k] + - bp->array[m][i+1][j][k]; + send_buf[n++] = array[i ][j][k] + + array[i+1][j][k]; } for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i ][j][k ] + - bp->array[m][i ][j][k+1] + - bp->array[m][i+1][j][k ] + - bp->array[m][i+1][j][k+1]; + send_buf[n] = array[i ][j][k ] + + array[i ][j][k+1] + + array[i+1][j][k ] + + array[i+1][j][k+1]; if ((face_case/2)%2 == 0) { k = z_block_size + 1; - send_buf[n++] = bp->array[m][i ][j][k] + - bp->array[m][i+1][j][k]; + send_buf[n++] = array[i ][j][k] + + array[i+1][j][k]; } } if (face_case%2 == 1) { i = x_block_size + 1; if ((face_case/2)%2 == 1) { k = 0; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i][j][k ] + - bp->array[m][i][j][k+1]; + send_buf[n] = array[i][j][k ] + + array[i][j][k+1]; if ((face_case/2)%2 == 0) { k = z_block_size + 1; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } } } @@ -495,10 +534,12 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, ks = z_block_half; ke = z_block_size + 1; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = is; i <= ie; i++) for (k = ks; k <= ke; k++, n++) - send_buf[n] = bp->array[m][i][j][k]/4.0; + send_buf[n] = array[i][j][k]/4.0; + } } } else { /* Z - Up, Down */ @@ -510,57 +551,61 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -Z - Down */ k = 1; + if (face_case < 2) { /* whole face -> whole face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (j = 0; j <= y_block_size+1; j++, n++) - send_buf[n] = bp->array[m][i][j][k]; + send_buf[n] = array[i][j][k]; + } } else if (face_case >= 2 && face_case <= 5) { /* whole face -> quarter face - case does not matter */ for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; if (face_case%2 == 0) { i = 0; if ((face_case/2)%2 == 1) { j = 0; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } for (j = 1; j < y_block_size; j += 2, n++) - send_buf[n] = bp->array[m][i][j ][k] + - bp->array[m][i][j+1][k]; + send_buf[n] = array[i][j ][k] + + array[i][j+1][k]; if ((face_case/2)%2 == 0) { j = y_block_size + 1; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } } for (i = 1; i < x_block_size; i += 2) { if ((face_case/2)%2 == 1) { j = 0; - send_buf[n++] = bp->array[m][i ][j][k] + - bp->array[m][i+1][j][k]; + send_buf[n++] = array[i ][j][k] + + array[i+1][j][k]; } for (j = 1; j < y_block_size; j += 2, n++) - send_buf[n] = bp->array[m][i ][j ][k] + - bp->array[m][i ][j+1][k] + - bp->array[m][i+1][j ][k] + - bp->array[m][i+1][j+1][k]; + send_buf[n] = array[i ][j ][k] + + array[i ][j+1][k] + + array[i+1][j ][k] + + array[i+1][j+1][k]; if ((face_case/2)%2 == 0) { j = y_block_size + 1; - send_buf[n++] = bp->array[m][i ][j][k] + - bp->array[m][i+1][j][k]; + send_buf[n++] = array[i ][j][k] + + array[i+1][j][k]; } } if (face_case%2 == 1) { i = x_block_size + 1; if ((face_case/2)%2 == 1) { j = 0; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } for (j = 1; j < y_block_size; j += 2, n++) - send_buf[n] = bp->array[m][i][j ][k] + - bp->array[m][i][j+1][k]; + send_buf[n] = array[i][j ][k] + + array[i][j+1][k]; if ((face_case/2)%2 == 0) { j = y_block_size + 1; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } } } @@ -580,10 +625,12 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, js = y_block_half; je = y_block_size + 1; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = is; i <= ie; i++) for (j = js; j <= je; j++, n++) - send_buf[n] = bp->array[m][i][j][k]/4.0; + send_buf[n] = array[i][j][k]/4.0; + } } } @@ -596,57 +643,61 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -X - West */ i = 1; + if (face_case < 2) { /* whole face -> whole face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (j = 0; j <= y_block_size+1; j++) for (k = 0; k <= z_block_size+1; k++, n++) - send_buf[n] = bp->array[m][i][j][k]; + send_buf[n] = array[i][j][k]; + } } else if (face_case >= 2 && face_case <= 5) { /* whole face -> quarter face */ for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; if (face_case%2 == 0) { j = 0; if ((face_case/2)%2 == 1) { k = 0; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i][j][k ] + - bp->array[m][i][j][k+1]; + send_buf[n] = array[i][j][k ] + + array[i][j][k+1]; if ((face_case/2)%2 == 0) { k = z_block_size + 1; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } } for (j = 1; j < y_block_size; j += 2) { if ((face_case/2)%2 == 1) { k = 0; - send_buf[n++] = bp->array[m][i][j ][k] + - bp->array[m][i][j+1][k]; + send_buf[n++] = array[i][j ][k] + + array[i][j+1][k]; } for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i][j ][k ] + - bp->array[m][i][j ][k+1] + - bp->array[m][i][j+1][k ] + - bp->array[m][i][j+1][k+1]; + send_buf[n] = array[i][j ][k ] + + array[i][j ][k+1] + + array[i][j+1][k ] + + array[i][j+1][k+1]; if ((face_case/2)%2 == 0) { k = z_block_size + 1; - send_buf[n++] = bp->array[m][i][j ][k] + - bp->array[m][i][j+1][k]; + send_buf[n++] = array[i][j ][k] + + array[i][j+1][k]; } } if (face_case%2 == 1) { j = y_block_size + 1; if ((face_case/2)%2 == 1) { k = 0; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i][j][k ] + - bp->array[m][i][j][k+1]; + send_buf[n] = array[i][j][k ] + + array[i][j][k+1]; if ((face_case/2)%2 == 0) { k = z_block_size + 1; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } } } @@ -667,34 +718,35 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, ke = z_block_size; } for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; j = js - 1; k = ks - 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (k = ks; k <= ke; k++, n+=2) - send_buf[n] = send_buf[n+1] = bp->array[m][i][j][k]/4.0; + send_buf[n] = send_buf[n+1] = array[i][j][k]/4.0; k = ke + 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (j = js; j <= je; j++) { k = ks - 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (k = ks; k <= ke; k++, n+=2) - send_buf[n] = send_buf[n+1] = bp->array[m][i][j][k]/4.0; + send_buf[n] = send_buf[n+1] = array[i][j][k]/4.0; k = ke + 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; k = ks - 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (k = ks; k <= ke; k++, n+=2) - send_buf[n] = send_buf[n+1] = bp->array[m][i][j][k]/4.0; + send_buf[n] = send_buf[n+1] = array[i][j][k]/4.0; k = ke + 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; } j = je + 1; k = ks - 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (k = ks; k <= ke; k++, n+=2) - send_buf[n] = send_buf[n+1] = bp->array[m][i][j][k]/4.0; + send_buf[n] = send_buf[n+1] = array[i][j][k]/4.0; k = ke + 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; } } @@ -705,57 +757,61 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -Y - South */ j = 1; + if (face_case < 2) { /* whole face -> whole face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (k = 0; k <= z_block_size+1; k++, n++) - send_buf[n] = bp->array[m][i][j][k]; + send_buf[n] = array[i][j][k]; + } } else if (face_case >= 2 && face_case <= 5) { /* whole face -> quarter face */ for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; if (face_case%2 == 0) { i = 0; if ((face_case/2)%2 == 1) { k = 0; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i][j][k ] + - bp->array[m][i][j][k+1]; + send_buf[n] = array[i][j][k ] + + array[i][j][k+1]; if ((face_case/2)%2 == 0) { k = z_block_size + 1; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } } for (i = 1; i < x_block_size; i += 2) { if ((face_case/2)%2 == 1) { k = 0; - send_buf[n++] = bp->array[m][i ][j][k] + - bp->array[m][i+1][j][k]; + send_buf[n++] = array[i ][j][k] + + array[i+1][j][k]; } for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i ][j][k ] + - bp->array[m][i ][j][k+1] + - bp->array[m][i+1][j][k ] + - bp->array[m][i+1][j][k+1]; + send_buf[n] = array[i ][j][k ] + + array[i ][j][k+1] + + array[i+1][j][k ] + + array[i+1][j][k+1]; if ((face_case/2)%2 == 0) { k = z_block_size + 1; - send_buf[n++] = bp->array[m][i ][j][k] + - bp->array[m][i+1][j][k]; + send_buf[n++] = array[i ][j][k] + + array[i+1][j][k]; } } if (face_case%2 == 1) { i = x_block_size + 1; if ((face_case/2)%2 == 1) { k = 0; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } for (k = 1; k < z_block_size; k += 2, n++) - send_buf[n] = bp->array[m][i][j][k ] + - bp->array[m][i][j][k+1]; + send_buf[n] = array[i][j][k ] + + array[i][j][k+1]; if ((face_case/2)%2 == 0) { k = z_block_size + 1; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } } } @@ -776,34 +832,35 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, ke = z_block_size; } for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; i = is - 1; k = ks - 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (k = ks; k <= ke; k++, n+=2) - send_buf[n] = send_buf[n+1] = bp->array[m][i][j][k]/4.0; + send_buf[n] = send_buf[n+1] = array[i][j][k]/4.0; k = ke + 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (i = is; i <= ie; i++) { k = ks - 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (k = ks; k <= ke; k++, n+=2) - send_buf[n] = send_buf[n+1] = bp->array[m][i][j][k]/4.0; + send_buf[n] = send_buf[n+1] = array[i][j][k]/4.0; k = ke + 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; k = ks - 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (k = ks; k <= ke; k++, n+=2) - send_buf[n] = send_buf[n+1] = bp->array[m][i][j][k]/4.0; + send_buf[n] = send_buf[n+1] = array[i][j][k]/4.0; k = ke + 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; } i = ie + 1; k = ks - 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (k = ks; k <= ke; k++, n+=2) - send_buf[n] = send_buf[n+1] = bp->array[m][i][j][k]/4.0; + send_buf[n] = send_buf[n+1] = array[i][j][k]/4.0; k = ke + 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; } } @@ -816,57 +873,61 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -Z - Down */ k = 1; + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; if (face_case < 2) { /* whole face -> whole face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (j = 0; j <= y_block_size+1; j++, n++) - send_buf[n] = bp->array[m][i][j][k]; + send_buf[n] = array[i][j][k]; + } } else if (face_case >= 2 && face_case <= 5) { /* whole face -> quarter face - case does not matter */ for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; if (face_case%2 == 0) { i = 0; if ((face_case/2)%2 == 1) { j = 0; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } for (j = 1; j < y_block_size; j += 2, n++) - send_buf[n] = bp->array[m][i][j ][k] + - bp->array[m][i][j+1][k]; + send_buf[n] = array[i][j ][k] + + array[i][j+1][k]; if ((face_case/2)%2 == 0) { j = y_block_size + 1; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } } for (i = 1; i < x_block_size; i += 2) { if ((face_case/2)%2 == 1) { j = 0; - send_buf[n++] = bp->array[m][i ][j][k] + - bp->array[m][i+1][j][k]; + send_buf[n++] = array[i ][j][k] + + array[i+1][j][k]; } for (j = 1; j < y_block_size; j += 2, n++) - send_buf[n] = bp->array[m][i ][j ][k] + - bp->array[m][i ][j+1][k] + - bp->array[m][i+1][j ][k] + - bp->array[m][i+1][j+1][k]; + send_buf[n] = array[i ][j ][k] + + array[i ][j+1][k] + + array[i+1][j ][k] + + array[i+1][j+1][k]; if ((face_case/2)%2 == 0) { j = y_block_size + 1; - send_buf[n++] = bp->array[m][i ][j][k] + - bp->array[m][i+1][j][k]; + send_buf[n++] = array[i ][j][k] + + array[i+1][j][k]; } } if (face_case%2 == 1) { i = x_block_size + 1; if ((face_case/2)%2 == 1) { j = 0; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } for (j = 1; j < y_block_size; j += 2, n++) - send_buf[n] = bp->array[m][i][j ][k] + - bp->array[m][i][j+1][k]; + send_buf[n] = array[i][j ][k] + + array[i][j+1][k]; if ((face_case/2)%2 == 0) { j = y_block_size + 1; - send_buf[n++] = bp->array[m][i][j][k]; + send_buf[n++] = array[i][j][k]; } } } @@ -887,34 +948,35 @@ void pack_face(double *send_buf, int block_num, int face_case, int dir, je = y_block_size; } for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; i = is - 1; j = js - 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (j = js; j <= je; j++, n+=2) - send_buf[n] = send_buf[n+1] = bp->array[m][i][j][k]/4.0; + send_buf[n] = send_buf[n+1] = array[i][j][k]/4.0; j = je + 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (i = is; i <= ie; i++) { j = js - 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (j = js; j <= je; j++, n+=2) - send_buf[n] = send_buf[n+1] = bp->array[m][i][j][k]/4.0; + send_buf[n] = send_buf[n+1] = array[i][j][k]/4.0; j = je + 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; j = js - 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (j = js; j <= je; j++, n+=2) - send_buf[n] = send_buf[n+1] = bp->array[m][i][j][k]/4.0; + send_buf[n] = send_buf[n+1] = array[i][j][k]/4.0; j = je + 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; } i = ie + 1; j = js - 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; for (j = js; j <= je; j++, n+=2) - send_buf[n] = send_buf[n+1] = bp->array[m][i][j][k]/4.0; + send_buf[n] = send_buf[n+1] = array[i][j][k]/4.0; j = je + 1; - send_buf[n++] = bp->array[m][i][j][k]/4.0; + send_buf[n++] = array[i][j][k]/4.0; } } } @@ -932,6 +994,8 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, bp = &blocks[block_num]; + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; + if (!code) { if (dir == 0) { /* X - East, West */ @@ -945,20 +1009,25 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -X - from West */ i = 0; + if (face_case < 2) { /* whole face -> whole face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (j = 1; j <= y_block_size; j++) for (k = 1; k <= z_block_size; k++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } else if (face_case >= 2 && face_case <= 5) { /* whole face -> quarter face - one case */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (j = 1; j < y_block_size; j += 2) for (k = 1; k < z_block_size; k += 2, n++) - bp->array[m][i][j ][k ] = - bp->array[m][i][j ][k+1] = - bp->array[m][i][j+1][k ] = - bp->array[m][i][j+1][k+1] = recv_buf[n]; + array[i][j ][k ] = + array[i][j ][k+1] = + array[i][j+1][k ] = + array[i][j+1][k+1] = recv_buf[n]; + } } else { /* quarter face -> whole face */ /* four cases - figure out which quarter of face to recv */ if (face_case%2 == 0) { @@ -975,10 +1044,12 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, ks = z_block_half + 1; ke = z_block_size; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (j = js; j <= je; j++) for (k = ks; k <= ke; k++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } } else if (dir == 1) { /* Y - North, South */ @@ -990,25 +1061,32 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -Y - from South */ j = 0; + if (face_case == 0) { /* whole face -> whole face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 1; i <= x_block_size; i++) for (k = 1; k <= z_block_size; k++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } else if (face_case == 1) { - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (k = 1; k <= z_block_size; k++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } else if (face_case >= 2 && face_case <= 5) { /* one case - recv into 4 cells per cell sent */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 1; i < x_block_size; i += 2) for (k = 1; k < z_block_size; k += 2, n++) - bp->array[m][i ][j][k ] = - bp->array[m][i ][j][k+1] = - bp->array[m][i+1][j][k ] = - bp->array[m][i+1][j][k+1] = recv_buf[n]; + array[i ][j][k ] = + array[i ][j][k+1] = + array[i+1][j][k ] = + array[i+1][j][k+1] = recv_buf[n]; + } } else { /* quarter face -> whole face */ /* whole face -> quarter face - determine case */ if (face_case%2 == 0) { @@ -1025,10 +1103,12 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, ks = z_block_half + 1; ke = z_block_size; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = is; i <= ie; i++) for (k = ks; k <= ke; k++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } } else { /* Z - Up, Down */ @@ -1040,25 +1120,32 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -Z - from Down */ k = 0; + if (face_case == 0) { /* whole face -> whole face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 1; i <= x_block_size; i++) for (j = 1; j <= y_block_size; j++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } else if (face_case == 1) { - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (j = 0; j <= y_block_size+1; j++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } else if (face_case >= 2 && face_case <= 5) { /* one case - receive into 4 cells */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 1; i < x_block_size; i += 2) for (j = 1; j < y_block_size; j += 2, n++) - bp->array[m][i ][j ][k] = - bp->array[m][i ][j+1][k] = - bp->array[m][i+1][j ][k] = - bp->array[m][i+1][j+1][k] = recv_buf[n]; + array[i ][j ][k] = + array[i ][j+1][k] = + array[i+1][j ][k] = + array[i+1][j+1][k] = recv_buf[n]; + } } else { /* quarter face -> whole face */ /* whole face -> quarter face - determine case */ if (face_case%2 == 0) { @@ -1075,10 +1162,12 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, js = y_block_half + 1; je = y_block_size; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = is; i <= ie; i++) for (j = js; j <= je; j++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } } @@ -1091,43 +1180,47 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -X - from West */ i = 0; + if (face_case < 2) { /* whole face -> whole */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (j = 0; j <= y_block_size+1; j++) for (k = 0; k <= z_block_size+1; k++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } else if (face_case >= 2 && face_case <= 5) { /* whole face -> quarter face */ for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; j = 0; k = 0; - bp->array[m][i][j][k] = recv_buf[n++]; + array[i][j][k] = recv_buf[n++]; for (k = 1; k < z_block_size; k += 2, n++) - bp->array[m][i][j][k ] = - bp->array[m][i][j][k+1] = recv_buf[n]; + array[i][j][k ] = + array[i][j][k+1] = recv_buf[n]; k = z_block_size + 1; - bp->array[m][i][j][k] = recv_buf[n++]; + array[i][j][k] = recv_buf[n++]; for (j = 1; j < y_block_size; j += 2) { k = 0; - bp->array[m][i][j ][k] = - bp->array[m][i][j+1][k] = recv_buf[n++]; + array[i][j ][k] = + array[i][j+1][k] = recv_buf[n++]; for (k = 1; k < z_block_size; k += 2, n++) - bp->array[m][i][j ][k ] = - bp->array[m][i][j ][k+1] = - bp->array[m][i][j+1][k ] = - bp->array[m][i][j+1][k+1] = recv_buf[n]; + array[i][j ][k ] = + array[i][j ][k+1] = + array[i][j+1][k ] = + array[i][j+1][k+1] = recv_buf[n]; k = z_block_size + 1; - bp->array[m][i][j ][k] = - bp->array[m][i][j+1][k] = recv_buf[n++]; + array[i][j ][k] = + array[i][j+1][k] = recv_buf[n++]; } j = y_block_size + 1; k = 0; - bp->array[m][i][j][k] = recv_buf[n++]; + array[i][j][k] = recv_buf[n++]; for (k = 1; k < z_block_size; k += 2, n++) - bp->array[m][i][j][k ] = - bp->array[m][i][j][k+1] = recv_buf[n]; + array[i][j][k ] = + array[i][j][k+1] = recv_buf[n]; k = z_block_size + 1; - bp->array[m][i][j][k] = recv_buf[n++]; + array[i][j][k] = recv_buf[n++]; } } else { /* quarter face -> whole face */ /* four cases - figure out which quarter of face to recv */ @@ -1145,10 +1238,12 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, ks = z_block_half + 1; ke = z_block_size + 1; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (j = js; j <= je; j++) for (k = ks; k <= ke; k++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } } else if (dir == 1) { /* Y - North, South */ @@ -1158,43 +1253,47 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -Y - from South */ j = 0; + if (face_case < 2) { /* whole face -> whole */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (k = 0; k <= z_block_size+1; k++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } else if (face_case >= 2 && face_case <= 5) { /* whole face -> quarter face */ for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; i = 0; k = 0; - bp->array[m][i][j][k] = recv_buf[n++]; + array[i][j][k] = recv_buf[n++]; for (k = 1; k < z_block_size; k += 2, n++) - bp->array[m][i][j][k ] = - bp->array[m][i][j][k+1] = recv_buf[n]; + array[i][j][k ] = + array[i][j][k+1] = recv_buf[n]; k = z_block_size + 1; - bp->array[m][i][j][k] = recv_buf[n++]; + array[i][j][k] = recv_buf[n++]; for (i = 1; i < x_block_size; i += 2) { k = 0; - bp->array[m][i ][j][k] = - bp->array[m][i+1][j][k] = recv_buf[n++]; + array[i ][j][k] = + array[i+1][j][k] = recv_buf[n++]; for (k = 1; k < z_block_size; k += 2, n++) - bp->array[m][i ][j][k ] = - bp->array[m][i ][j][k+1] = - bp->array[m][i+1][j][k ] = - bp->array[m][i+1][j][k+1] = recv_buf[n]; + array[i ][j][k ] = + array[i ][j][k+1] = + array[i+1][j][k ] = + array[i+1][j][k+1] = recv_buf[n]; k = z_block_size + 1; - bp->array[m][i ][j][k] = - bp->array[m][i+1][j][k] = recv_buf[n++]; + array[i ][j][k] = + array[i+1][j][k] = recv_buf[n++]; } i = x_block_size + 1; k = 0; - bp->array[m][i][j][k] = recv_buf[n++]; + array[i][j][k] = recv_buf[n++]; for (k = 1; k < z_block_size; k += 2, n++) - bp->array[m][i][j][k ] = - bp->array[m][i][j][k+1] = recv_buf[n]; + array[i][j][k ] = + array[i][j][k+1] = recv_buf[n]; k = z_block_size + 1; - bp->array[m][i][j][k] = recv_buf[n++]; + array[i][j][k] = recv_buf[n++]; } } else { /* quarter face -> whole face */ /* whole face -> quarter face - determine case */ @@ -1212,10 +1311,12 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, ks = z_block_half + 1; ke = z_block_size + 1; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = is; i <= ie; i++) for (k = ks; k <= ke; k++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } } else { /* Z - Up, Down */ @@ -1225,43 +1326,47 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -Z - from Down */ k = 0; + if (face_case < 2) { /* whole face -> whole face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (j = 0; j <= y_block_size+1; j++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } else if (face_case >= 2 && face_case <= 5) { /* whole face -> quarter face */ for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; i = 0; j = 0; - bp->array[m][i][j][k] = recv_buf[n++]; + array[i][j][k] = recv_buf[n++]; for (j = 1; j < y_block_size; j += 2, n++) - bp->array[m][i][j ][k] = - bp->array[m][i][j+1][k] = recv_buf[n]; + array[i][j ][k] = + array[i][j+1][k] = recv_buf[n]; j = y_block_size + 1; - bp->array[m][i][j][k] = recv_buf[n++]; + array[i][j][k] = recv_buf[n++]; for (i = 1; i < x_block_size; i += 2) { j = 0; - bp->array[m][i ][j][k] = - bp->array[m][i+1][j][k] = recv_buf[n++]; + array[i ][j][k] = + array[i+1][j][k] = recv_buf[n++]; for (j = 1; j < y_block_size; j += 2, n++) - bp->array[m][i ][j ][k] = - bp->array[m][i ][j+1][k] = - bp->array[m][i+1][j ][k] = - bp->array[m][i+1][j+1][k] = recv_buf[n]; + array[i ][j ][k] = + array[i ][j+1][k] = + array[i+1][j ][k] = + array[i+1][j+1][k] = recv_buf[n]; j = y_block_size + 1; - bp->array[m][i ][j][k] = - bp->array[m][i+1][j][k] = recv_buf[n++]; + array[i ][j][k] = + array[i+1][j][k] = recv_buf[n++]; } i = x_block_size + 1; j = 0; - bp->array[m][i][j][k] = recv_buf[n++]; + array[i][j][k] = recv_buf[n++]; for (j = 1; j < y_block_size; j += 2, n++) - bp->array[m][i][j ][k] = - bp->array[m][i][j+1][k] = recv_buf[n]; + array[i][j ][k] = + array[i][j+1][k] = recv_buf[n]; j = y_block_size + 1; - bp->array[m][i][j][k] = recv_buf[n++]; + array[i][j][k] = recv_buf[n++]; } } else { /* quarter face -> whole face */ /* whole face -> quarter face - determine case */ @@ -1279,10 +1384,12 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, js = y_block_half + 1; je = y_block_size + 1; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = is; i <= ie; i++) for (j = js; j <= je; j++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } } @@ -1295,11 +1402,14 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -X - from West */ i = 0; + if (face_case <= 5) { /* whole face -> whole or quarter face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (j = 0; j <= y_block_size+1; j++) for (k = 0; k <= z_block_size+1; k++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } else { /* quarter face -> whole face */ /* four cases - figure out which quarter of face to recv */ if (face_case%2 == 0) { @@ -1316,10 +1426,12 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, ks = z_block_half + 1; ke = z_block_size + 1; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (j = js; j <= je; j++) for (k = ks; k <= ke; k++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } } else if (dir == 1) { /* Y - North, South */ @@ -1329,11 +1441,14 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -Y - from South */ j = 0; + if (face_case <= 5) { /* whole face -> whole or quarter face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (k = 0; k <= z_block_size+1; k++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } else { /* quarter face -> whole face */ /* whole face -> quarter face - determine case */ if (face_case%2 == 0) { @@ -1350,10 +1465,12 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, ks = z_block_half + 1; ke = z_block_size + 1; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = is; i <= ie; i++) for (k = ks; k <= ke; k++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } } else { /* Z - Up, Down */ @@ -1363,11 +1480,14 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, face_case = face_case - 10; } else /* -Z - from Down */ k = 0; + if (face_case <= 5) { /* whole face -> whole or quarter face */ - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (j = 0; j <= y_block_size+1; j++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } else { /* quarter face -> whole face */ /* whole face -> quarter face - determine case */ if (face_case%2 == 0) { @@ -1384,10 +1504,12 @@ void unpack_face(double *recv_buf, int block_num, int face_case, int dir, js = y_block_half + 1; je = y_block_size + 1; } - for (n = 0, m = start; m < start+num_comm; m++) + for (n = 0, m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; for (i = is; i <= ie; i++) for (j = js; j <= je; j++, n++) - bp->array[m][i][j][k] = recv_buf[n]; + array[i][j][k] = recv_buf[n]; + } } } } @@ -1401,6 +1523,7 @@ void on_proc_comm(int n, int n1, int l, int start, int num_comm) int is, ie, js, je; block *bp, *bp1; + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; /* Determine direction and then exchange data across the face */ if (!code) { @@ -1412,12 +1535,16 @@ void on_proc_comm(int n, int n1, int l, int start, int num_comm) bp1 = &blocks[n]; bp = &blocks[n1]; } - for (m = start; m < start+num_comm; m++) + //#pragma omp task depend(in: array[1:y_block_size][1:z_block_size]) + for (m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; + block3D_t array1 = (block3D_t)&bp1->array[m*block3D_size]; for (j = 1; j <= y_block_size; j++) for (k = 1; k <= z_block_size; k++) { - bp1->array[m][x_block_size+1][j][k] = bp->array[m][1][j][k]; - bp->array[m][0][j][k] = bp1->array[m][x_block_size][j][k]; + array1[x_block_size+1][j][k] = array[1][j][k]; + array[0][j][k] = array1[x_block_size][j][k]; } + } } else if ((l/2) == 1) { /* South, North */ if ((l%2) == 0) { /* South */ bp = &blocks[n]; @@ -1433,12 +1560,15 @@ void on_proc_comm(int n, int n1, int l, int start, int num_comm) is = 0; ie = x_block_size + 1; } - for (m = start; m < start+num_comm; m++) + for (m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; + block3D_t array1 = (block3D_t)&bp1->array[m*block3D_size]; for (i = is; i <= ie; i++) for (k = 1; k <= z_block_size; k++) { - bp1->array[m][i][y_block_size+1][k] = bp->array[m][i][1][k]; - bp->array[m][i][0][k] = bp1->array[m][i][y_block_size][k]; + array1[i][y_block_size+1][k] = array[i][1][k]; + array[i][0][k] = array1[i][y_block_size][k]; } + } } else if ((l/2) == 2) { /* Down, Up */ if ((l%2) == 0) { /* Down */ bp = &blocks[n]; @@ -1458,12 +1588,15 @@ void on_proc_comm(int n, int n1, int l, int start, int num_comm) js = 0; je = y_block_size + 1; } - for (m = start; m < start+num_comm; m++) + for (m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; + block3D_t array1 = (block3D_t)&bp1->array[m*block3D_size]; for (i = is; i <= ie; i++) for (j = js; j <= je; j++) { - bp1->array[m][i][j][z_block_size+1] = bp->array[m][i][j][1]; - bp->array[m][i][j][0] = bp1->array[m][i][j][z_block_size]; + array1[i][j][z_block_size+1] = array[i][j][1]; + array[i][j][0] = array1[i][j][z_block_size]; } + } } } else { /* set all ghosts */ if ((l/2) == 0) { /* West, East */ @@ -1474,12 +1607,15 @@ void on_proc_comm(int n, int n1, int l, int start, int num_comm) bp1 = &blocks[n]; bp = &blocks[n1]; } - for (m = start; m < start+num_comm; m++) + for (m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; + block3D_t array1 = (block3D_t)&bp1->array[m*block3D_size]; for (j = 0; j <= y_block_size+1; j++) for (k = 0; k <= z_block_size+1; k++) { - bp1->array[m][x_block_size+1][j][k] = bp->array[m][1][j][k]; - bp->array[m][0][j][k] = bp1->array[m][x_block_size][j][k]; + array1[x_block_size+1][j][k] = array[1][j][k]; + array[0][j][k] = array1[x_block_size][j][k]; } + } } else if ((l/2) == 1) { /* South, North */ if ((l%2) == 0) { /* South */ bp = &blocks[n]; @@ -1488,12 +1624,15 @@ void on_proc_comm(int n, int n1, int l, int start, int num_comm) bp1 = &blocks[n]; bp = &blocks[n1]; } - for (m = start; m < start+num_comm; m++) + for (m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; + block3D_t array1 = (block3D_t)&bp1->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (k = 0; k <= z_block_size+1; k++) { - bp1->array[m][i][y_block_size+1][k] = bp->array[m][i][1][k]; - bp->array[m][i][0][k] = bp1->array[m][i][y_block_size][k]; + array1[i][y_block_size+1][k] = array[i][1][k]; + array[i][0][k] = array1[i][y_block_size][k]; } + } } else if ((l/2) == 2) { /* Down, Up */ if ((l%2) == 0) { /* Down */ bp = &blocks[n]; @@ -1502,12 +1641,15 @@ void on_proc_comm(int n, int n1, int l, int start, int num_comm) bp1 = &blocks[n]; bp = &blocks[n1]; } - for (m = start; m < start+num_comm; m++) + for (m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; + block3D_t array1 = (block3D_t)&bp1->array[m*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (j = 0; j <= y_block_size+1; j++) { - bp1->array[m][i][j][z_block_size+1] = bp->array[m][i][j][1]; - bp->array[m][i][j][0] = bp1->array[m][i][j][z_block_size]; + array1[i][j][z_block_size+1] = array[i][j][1]; + array[i][j][0] = array1[i][j][z_block_size]; } + } } } } @@ -1525,6 +1667,7 @@ void on_proc_comm_diff(int n, int n1, int l, int iq, int jq, bp = &blocks[n]; bp1 = &blocks[n1]; + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; /* (iq, jq) quarter face on block n to whole face on block n1 */ if (!code) { @@ -1548,20 +1691,23 @@ void on_proc_comm_diff(int n, int n1, int l, int iq, int jq, } j1 = jq*y_block_half; k1 = iq*z_block_half; - for (m = start; m < start+num_comm; m++) + for (m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; + block3D_t array1 = (block3D_t)&bp1->array[m*block3D_size]; for (j = 1; j <= y_block_half; j++) for (k = 1; k <= z_block_half; k++) { - bp1->array[m][i2][2*j-1][2*k-1] = - bp1->array[m][i2][2*j-1][2*k ] = - bp1->array[m][i2][2*j ][2*k-1] = - bp1->array[m][i2][2*j ][2*k ] = - bp->array[m][i1][j+j1][k+k1]/4.0; - bp->array[m][i0][j+j1][k+k1] = - bp1->array[m][i3][2*j-1][2*k-1] + - bp1->array[m][i3][2*j-1][2*k ] + - bp1->array[m][i3][2*j ][2*k-1] + - bp1->array[m][i3][2*j ][2*k ]; + array1[i2][2*j-1][2*k-1] = + array1[i2][2*j-1][2*k ] = + array1[i2][2*j ][2*k-1] = + array1[i2][2*j ][2*k ] = + array[i1][j+j1][k+k1]/4.0; + array[i0][j+j1][k+k1] = + array1[i3][2*j-1][2*k-1] + + array1[i3][2*j-1][2*k ] + + array1[i3][2*j ][2*k-1] + + array1[i3][2*j ][2*k ]; } + } } else if ((l/2) == 1) { if (l == 2) { /* South */ j0 = 0; @@ -1576,20 +1722,23 @@ void on_proc_comm_diff(int n, int n1, int l, int iq, int jq, } i1 = jq*x_block_half; k1 = iq*z_block_half; - for (m = start; m < start+num_comm; m++) + for (m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; + block3D_t array1 = (block3D_t)&bp1->array[m*block3D_size]; for (i = 1; i <= x_block_half; i++) for (k = 1; k <= z_block_half; k++) { - bp1->array[m][2*i-1][j2][2*k-1] = - bp1->array[m][2*i-1][j2][2*k ] = - bp1->array[m][2*i ][j2][2*k-1] = - bp1->array[m][2*i ][j2][2*k ] = - bp->array[m][i+i1][j1][k+k1]/4.0; - bp->array[m][i+i1][j0][k+k1] = - bp1->array[m][2*i-1][j3][2*k-1] + - bp1->array[m][2*i-1][j3][2*k ] + - bp1->array[m][2*i ][j3][2*k-1] + - bp1->array[m][2*i ][j3][2*k ]; + array1[2*i-1][j2][2*k-1] = + array1[2*i-1][j2][2*k ] = + array1[2*i ][j2][2*k-1] = + array1[2*i ][j2][2*k ] = + array[i+i1][j1][k+k1]/4.0; + array[i+i1][j0][k+k1] = + array1[2*i-1][j3][2*k-1] + + array1[2*i-1][j3][2*k ] + + array1[2*i ][j3][2*k-1] + + array1[2*i ][j3][2*k ]; } + } } else if ((l/2) == 2) { if (l == 4) { /* Down */ k0 = 0; @@ -1604,20 +1753,23 @@ void on_proc_comm_diff(int n, int n1, int l, int iq, int jq, } i1 = jq*x_block_half; j1 = iq*y_block_half; - for (m = start; m < start+num_comm; m++) + for (m = start; m < start+num_comm; m++) { + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; + block3D_t array1 = (block3D_t)&bp1->array[m*block3D_size]; for (i = 1; i <= x_block_half; i++) for (j = 1; j <= y_block_half; j++) { - bp1->array[m][2*i-1][2*j-1][k2] = - bp1->array[m][2*i-1][2*j ][k2] = - bp1->array[m][2*i ][2*j-1][k2] = - bp1->array[m][2*i ][2*j ][k2] = - bp->array[m][i+i1][j+j1][k1]/4.0; - bp->array[m][i+i1][j+j1][k0] = - bp1->array[m][2*i-1][2*j-1][k3] + - bp1->array[m][2*i-1][2*j ][k3] + - bp1->array[m][2*i ][2*j-1][k3] + - bp1->array[m][2*i ][2*j ][k3]; + array1[2*i-1][2*j-1][k2] = + array1[2*i-1][2*j ][k2] = + array1[2*i ][2*j-1][k2] = + array1[2*i ][2*j ][k2] = + array[i+i1][j+j1][k1]/4.0; + array[i+i1][j+j1][k0] = + array1[2*i-1][2*j-1][k3] + + array1[2*i-1][2*j ][k3] + + array1[2*i ][2*j-1][k3] + + array1[2*i ][2*j ][k3]; } + } } } else { /* transfer ghosts */ if ((l/2) == 0) { @@ -1639,57 +1791,59 @@ void on_proc_comm_diff(int n, int n1, int l, int iq, int jq, k2 = z_block_size + 1; k3 = z_block_half + 1; for (m = start; m < start+num_comm; m++) { - bp1->array[m][i2][0][0] = bp->array[m][i1][j1][k1]/4.0; + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; + block3D_t array1 = (block3D_t)&bp1->array[m*block3D_size]; + array1[i2][0][0] = array[i1][j1][k1]/4.0; for (k = 1; k <= z_block_half; k++) - bp1->array[m][i2][0][2*k-1] = - bp1->array[m][i2][0][2*k ] = bp->array[m][i1][j1][k+k1]/4.0; - bp1->array[m][i2][0][k2] = bp->array[m][i1][j1][k3+k1]/4.0; + array1[i2][0][2*k-1] = + array1[i2][0][2*k ] = array[i1][j1][k+k1]/4.0; + array1[i2][0][k2] = array[i1][j1][k3+k1]/4.0; if (jq == 0) { if (iq == 0) - bp->array[m][i0][0][0 ] = bp1->array[m][i3][0][0 ]; + array[i0][0][0 ] = array1[i3][0][0 ]; else - bp->array[m][i0][0][k2] = bp1->array[m][i3][0][k2]; + array[i0][0][k2] = array1[i3][0][k2]; for (k = 1; k <= z_block_half; k++) - bp->array[m][i0][0][k+k1] = (bp1->array[m][i3][0][2*k-1] + - bp1->array[m][i3][0][2*k ]); + array[i0][0][k+k1] = (array1[i3][0][2*k-1] + + array1[i3][0][2*k ]); } for (j = 1; j <= y_block_half; j++) { - bp1->array[m][i2][2*j-1][0] = - bp1->array[m][i2][2*j ][0] = bp->array[m][i1][j+j1][k1]/4.0; + array1[i2][2*j-1][0] = + array1[i2][2*j ][0] = array[i1][j+j1][k1]/4.0; if (iq == 0) - bp->array[m][i0][j+j1][0 ] = (bp1->array[m][i3][2*j-1][0 ] + - bp1->array[m][i3][2*j ][0 ]); + array[i0][j+j1][0 ] = (array1[i3][2*j-1][0 ] + + array1[i3][2*j ][0 ]); else - bp->array[m][i0][j+j1][k2] = (bp1->array[m][i3][2*j-1][k2] + - bp1->array[m][i3][2*j ][k2]); + array[i0][j+j1][k2] = (array1[i3][2*j-1][k2] + + array1[i3][2*j ][k2]); for (k = 1; k <= z_block_half; k++) { - bp1->array[m][i2][2*j-1][2*k-1] = - bp1->array[m][i2][2*j-1][2*k ] = - bp1->array[m][i2][2*j ][2*k-1] = - bp1->array[m][i2][2*j ][2*k ] = - bp->array[m][i1][j+j1][k+k1]/4.0; - bp->array[m][i0][j+j1][k+k1] = - bp1->array[m][i3][2*j-1][2*k-1] + - bp1->array[m][i3][2*j-1][2*k ] + - bp1->array[m][i3][2*j ][2*k-1] + - bp1->array[m][i3][2*j ][2*k ]; + array1[i2][2*j-1][2*k-1] = + array1[i2][2*j-1][2*k ] = + array1[i2][2*j ][2*k-1] = + array1[i2][2*j ][2*k ] = + array[i1][j+j1][k+k1]/4.0; + array[i0][j+j1][k+k1] = + array1[i3][2*j-1][2*k-1] + + array1[i3][2*j-1][2*k ] + + array1[i3][2*j ][2*k-1] + + array1[i3][2*j ][2*k ]; } - bp1->array[m][i2][2*j-1][k2] = - bp1->array[m][i2][2*j ][k2] = bp->array[m][i1][j+j1][k3+k1]/4.0; + array1[i2][2*j-1][k2] = + array1[i2][2*j ][k2] = array[i1][j+j1][k3+k1]/4.0; } - bp1->array[m][i2][j2][0] = bp->array[m][i1][j3+j1][k1]/4.0; + array1[i2][j2][0] = array[i1][j3+j1][k1]/4.0; for (k = 1; k <= z_block_half; k++) - bp1->array[m][i2][j2][2*k-1] = - bp1->array[m][i2][j2][2*k ] = bp->array[m][i1][j3+j1][k+k1]/4.0; - bp1->array[m][i2][j2][k2] = bp->array[m][i1][j3+j1][k3+k1]/4.0; + array1[i2][j2][2*k-1] = + array1[i2][j2][2*k ] = array[i1][j3+j1][k+k1]/4.0; + array1[i2][j2][k2] = array[i1][j3+j1][k3+k1]/4.0; if (jq == 1) { if (iq == 0) - bp->array[m][i0][j2][0 ] = bp1->array[m][i3][j2][0 ]; + array[i0][j2][0 ] = array1[i3][j2][0 ]; else - bp->array[m][i0][j2][k2] = bp1->array[m][i3][j2][k2]; + array[i0][j2][k2] = array1[i3][j2][k2]; for (k = 1; k <= z_block_half; k++) - bp->array[m][i0][j2][k+k1] = (bp1->array[m][i3][j2][2*k-1] + - bp1->array[m][i3][j2][2*k ]); + array[i0][j2][k+k1] = (array1[i3][j2][2*k-1] + + array1[i3][j2][2*k ]); } } } else if ((l/2) == 1) { @@ -1711,57 +1865,59 @@ void on_proc_comm_diff(int n, int n1, int l, int iq, int jq, k2 = z_block_size + 1; k3 = z_block_half + 1; for (m = start; m < start+num_comm; m++) { - bp1->array[m][0][j2][0 ] = bp->array[m][i1][j1][k1]/4.0; + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; + block3D_t array1 = (block3D_t)&bp1->array[m*block3D_size]; + array1[0][j2][0 ] = array[i1][j1][k1]/4.0; for (k = 1; k <= z_block_half; k++) - bp1->array[m][0][j2][2*k-1] = - bp1->array[m][0][j2][2*k ] = bp->array[m][i1][j1][k+k1]/4.0; - bp1->array[m][0][j2][k2] = bp->array[m][i1][j1][k3+k1]/4.0; + array1[0][j2][2*k-1] = + array1[0][j2][2*k ] = array[i1][j1][k+k1]/4.0; + array1[0][j2][k2] = array[i1][j1][k3+k1]/4.0; if (jq == 0) { if (iq == 0) - bp->array[m][0][j0][0 ] = bp1->array[m][0][j3][0 ]; + array[0][j0][0 ] = array1[0][j3][0 ]; else - bp->array[m][0][j0][k2] = bp1->array[m][0][j3][k2]; + array[0][j0][k2] = array1[0][j3][k2]; for (k = 1; k <= z_block_half; k++) - bp->array[m][0][j0][k+k1] = (bp1->array[m][0][j3][2*k-1] + - bp1->array[m][0][j3][2*k ]); + array[0][j0][k+k1] = (array1[0][j3][2*k-1] + + array1[0][j3][2*k ]); } for (i = 1; i <= x_block_half; i++) { - bp1->array[m][2*i-1][j2][0] = - bp1->array[m][2*i ][j2][0] = bp->array[m][i+i1][j1][k1]/4.0; + array1[2*i-1][j2][0] = + array1[2*i ][j2][0] = array[i+i1][j1][k1]/4.0; if (iq == 0) - bp->array[m][i+i1][j0][0 ] = (bp1->array[m][2*i-1][j3][0 ] + - bp1->array[m][2*i ][j3][0 ]); + array[i+i1][j0][0 ] = (array1[2*i-1][j3][0 ] + + array1[2*i ][j3][0 ]); else - bp->array[m][i+i1][j0][k2] = (bp1->array[m][2*i-1][j3][k2] + - bp1->array[m][2*i ][j3][k2]); + array[i+i1][j0][k2] = (array1[2*i-1][j3][k2] + + array1[2*i ][j3][k2]); for (k = 1; k <= z_block_half; k++) { - bp1->array[m][2*i-1][j2][2*k-1] = - bp1->array[m][2*i-1][j2][2*k ] = - bp1->array[m][2*i ][j2][2*k-1] = - bp1->array[m][2*i ][j2][2*k ] = - bp->array[m][i+i1][j1][k+k1]/4.0; - bp->array[m][i+i1][j0][k+k1] = - bp1->array[m][2*i-1][j3][2*k-1] + - bp1->array[m][2*i-1][j3][2*k ] + - bp1->array[m][2*i ][j3][2*k-1] + - bp1->array[m][2*i ][j3][2*k ]; + array1[2*i-1][j2][2*k-1] = + array1[2*i-1][j2][2*k ] = + array1[2*i ][j2][2*k-1] = + array1[2*i ][j2][2*k ] = + array[i+i1][j1][k+k1]/4.0; + array[i+i1][j0][k+k1] = + array1[2*i-1][j3][2*k-1] + + array1[2*i-1][j3][2*k ] + + array1[2*i ][j3][2*k-1] + + array1[2*i ][j3][2*k ]; } - bp1->array[m][2*i-1][j2][k2] = - bp1->array[m][2*i ][j2][k2] = bp->array[m][i+i1][j1][k3+k1]/4.0; + array1[2*i-1][j2][k2] = + array1[2*i ][j2][k2] = array[i+i1][j1][k3+k1]/4.0; } - bp1->array[m][i2][j2][0 ] = bp->array[m][i3+i1][j1][k1]/4.0; + array1[i2][j2][0 ] = array[i3+i1][j1][k1]/4.0; for (k = 1; k <= z_block_half; k++) - bp1->array[m][i2][j2][2*k-1] = - bp1->array[m][i2][j2][2*k ] = bp->array[m][i3+i1][j1][k+k1]/4.0; - bp1->array[m][i2][j2][k2] = bp->array[m][i3+i1][j1][k3+k1]/4.0; + array1[i2][j2][2*k-1] = + array1[i2][j2][2*k ] = array[i3+i1][j1][k+k1]/4.0; + array1[i2][j2][k2] = array[i3+i1][j1][k3+k1]/4.0; if (jq == 1) { if (iq == 0) - bp->array[m][i2][j0][0 ] = bp1->array[m][i2][j3][0 ]; + array[i2][j0][0 ] = array1[i2][j3][0 ]; else - bp->array[m][i2][j0][k2] = bp1->array[m][i2][j3][k2]; + array[i2][j0][k2] = array1[i2][j3][k2]; for (k = 1; k <= z_block_half; k++) - bp->array[m][i2][j0][k+k1] = (bp1->array[m][i2][j3][2*k-1] + - bp1->array[m][i2][j3][2*k ]); + array[i2][j0][k+k1] = (array1[i2][j3][2*k-1] + + array1[i2][j3][2*k ]); } } } else if ((l/2) == 2) { @@ -1783,57 +1939,59 @@ void on_proc_comm_diff(int n, int n1, int l, int iq, int jq, j2 = y_block_size + 1; j3 = y_block_half + 1; for (m = start; m < start+num_comm; m++) { - bp1->array[m][0][0 ][k2] = bp->array[m][i1][j1][k1]/4.0; + block3D_t array = (block3D_t)&bp->array[m*block3D_size]; + block3D_t array1 = (block3D_t)&bp1->array[m*block3D_size]; + array1[0][0 ][k2] = array[i1][j1][k1]/4.0; for (j = 1; j <= y_block_half; j++) - bp1->array[m][0][2*j-1][k2] = - bp1->array[m][0][2*j ][k2] = bp->array[m][i1][j+j1][k1]/4.0; - bp1->array[m][0][j2][k2] = bp->array[m][i1][j3+j1][k1]/4.0; + array1[0][2*j-1][k2] = + array1[0][2*j ][k2] = array[i1][j+j1][k1]/4.0; + array1[0][j2][k2] = array[i1][j3+j1][k1]/4.0; if (jq == 0) { if (iq == 0) - bp->array[m][0][0 ][k0] = bp1->array[m][0][0 ][k3]; + array[0][0 ][k0] = array1[0][0 ][k3]; else - bp->array[m][0][j2][k0] = bp1->array[m][0][j2][k3]; + array[0][j2][k0] = array1[0][j2][k3]; for (j = 1; j <= y_block_half; j++) - bp->array[m][0][j+j1][k0] = (bp1->array[m][0][2*j-1][k3] + - bp1->array[m][0][2*j ][k3]); + array[0][j+j1][k0] = (array1[0][2*j-1][k3] + + array1[0][2*j ][k3]); } for (i = 1; i <= x_block_half; i++) { - bp1->array[m][2*i-1][0][k2] = - bp1->array[m][2*i ][0][k2] = bp->array[m][i+i1][j1][k1]/4.0; + array1[2*i-1][0][k2] = + array1[2*i ][0][k2] = array[i+i1][j1][k1]/4.0; if (iq == 0) - bp->array[m][i+i1][0][k0] = (bp1->array[m][2*i-1][0][k3] + - bp1->array[m][2*i ][0][k3]); + array[i+i1][0][k0] = (array1[2*i-1][0][k3] + + array1[2*i ][0][k3]); else - bp->array[m][i+i1][j2][k0] = (bp1->array[m][2*i-1][j2][k3] + - bp1->array[m][2*i ][j2][k3]); + array[i+i1][j2][k0] = (array1[2*i-1][j2][k3] + + array1[2*i ][j2][k3]); for (j = 1; j <= y_block_half; j++) { - bp1->array[m][2*i-1][2*j-1][k2] = - bp1->array[m][2*i-1][2*j ][k2] = - bp1->array[m][2*i ][2*j-1][k2] = - bp1->array[m][2*i ][2*j ][k2] = - bp->array[m][i+i1][j+j1][k1]/4.0; - bp->array[m][i+i1][j+j1][k0] = - bp1->array[m][2*i-1][2*j-1][k3] + - bp1->array[m][2*i-1][2*j ][k3] + - bp1->array[m][2*i ][2*j-1][k3] + - bp1->array[m][2*i ][2*j ][k3]; + array1[2*i-1][2*j-1][k2] = + array1[2*i-1][2*j ][k2] = + array1[2*i ][2*j-1][k2] = + array1[2*i ][2*j ][k2] = + array[i+i1][j+j1][k1]/4.0; + array[i+i1][j+j1][k0] = + array1[2*i-1][2*j-1][k3] + + array1[2*i-1][2*j ][k3] + + array1[2*i ][2*j-1][k3] + + array1[2*i ][2*j ][k3]; } - bp1->array[m][2*i-1][j2][k2] = - bp1->array[m][2*i ][j2][k2] = bp->array[m][i+i1][j3+j1][k1]/4.0; + array1[2*i-1][j2][k2] = + array1[2*i ][j2][k2] = array[i+i1][j3+j1][k1]/4.0; } - bp1->array[m][i2][0 ][k2] = bp->array[m][i3+i1][j1][k1]/4.0; + array1[i2][0 ][k2] = array[i3+i1][j1][k1]/4.0; for (j = 1; j <= y_block_half; j++) - bp1->array[m][i2][2*j-1][k2] = - bp1->array[m][i2][2*j ][k2] = bp->array[m][i3+i1][j+j1][k1]/4.0; - bp1->array[m][i2][j2][k2] = bp->array[m][i3+i1][j3+j1][k1]/4.0; + array1[i2][2*j-1][k2] = + array1[i2][2*j ][k2] = array[i3+i1][j+j1][k1]/4.0; + array1[i2][j2][k2] = array[i3+i1][j3+j1][k1]/4.0; if (jq == 1) { if (iq == 0) - bp->array[m][i2][0 ][k0] = bp1->array[m][i2][0 ][k3]; + array[i2][0 ][k0] = array1[i2][0 ][k3]; else - bp->array[m][i2][j2][k0] = bp1->array[m][i2][j2][k3]; + array[i2][j2][k0] = array1[i2][j2][k3]; for (j = 1; j <= y_block_half; j++) - bp->array[m][i2][j+j1][k0] = (bp1->array[m][i2][2*j-1][k3] + - bp1->array[m][i2][2*j ][k3]); + array[i2][j+j1][k0] = (array1[i2][2*j-1][k3] + + array1[i2][2*j ][k3]); } } } @@ -1845,54 +2003,67 @@ void apply_bc(int l, block *bp, int start, int num_comm) { int var, i, j, k, f, t; + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; t = 0; f = 1; if (!code && stencil == 7) switch (l) { case 1: t = x_block_size + 1; f = x_block_size; - case 0: for (var = start; var < start+num_comm; var++) + case 0: for (var = start; var < start+num_comm; var++) { + block3D_t array = (block3D_t)&bp->array[var*block3D_size]; for (j = 1; j <= y_block_size; j++) for (k = 1; k <= z_block_size; k++) - bp->array[var][t][j][k] = bp->array[var][f][j][k]; + array[t][j][k] = array[f][j][k]; + } break; case 3: t = y_block_size + 1; f = y_block_size; - case 2: for (var = start; var < start+num_comm; var++) + case 2: for (var = start; var < start+num_comm; var++) { + block3D_t array = (block3D_t)&bp->array[var*block3D_size]; for (i = 1; i <= x_block_size; i++) for (k = 1; k <= z_block_size; k++) - bp->array[var][i][t][k] = bp->array[var][i][f][k]; + array[i][t][k] = array[i][f][k]; + } break; case 5: t = z_block_size + 1; f = z_block_size; - case 4: for (var = start; var < start+num_comm; var++) + case 4: for (var = start; var < start+num_comm; var++) { + block3D_t array = (block3D_t)&bp->array[var*block3D_size]; for (i = 1; i <= x_block_size; i++) for (j = 1; j <= y_block_size; j++) - bp->array[var][i][j][t] = bp->array[var][i][j][f]; + array[i][j][t] = array[i][j][f]; + } break; } else switch (l) { case 1: t = x_block_size + 1; f = x_block_size; - case 0: for (var = start; var < start+num_comm; var++) + case 0: for (var = start; var < start+num_comm; var++) { + block3D_t array = (block3D_t)&bp->array[var*block3D_size]; for (j = 0; j <= y_block_size+1; j++) for (k = 0; k <= z_block_size+1; k++) - bp->array[var][t][j][k] = bp->array[var][f][j][k]; + array[t][j][k] = array[f][j][k]; + } break; case 3: t = y_block_size + 1; f = y_block_size; - case 2: for (var = start; var < start+num_comm; var++) + case 2: for (var = start; var < start+num_comm; var++) { + block3D_t array = (block3D_t)&bp->array[var*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (k = 0; k <= z_block_size+1; k++) - bp->array[var][i][t][k] = bp->array[var][i][f][k]; + array[i][t][k] = array[i][f][k]; + } break; case 5: t = z_block_size + 1; f = z_block_size; - case 4: for (var = start; var < start+num_comm; var++) + case 4: for (var = start; var < start+num_comm; var++) { + block3D_t array = (block3D_t)&bp->array[var*block3D_size]; for (i = 0; i <= x_block_size+1; i++) for (j = 0; j <= y_block_size+1; j++) - bp->array[var][i][j][t] = bp->array[var][i][j][f]; + array[i][j][t] = array[i][j][f]; + } break; } } diff --git a/openmp/init.c b/openmp/init.c index b408ab6..281521f 100644 --- a/openmp/init.c +++ b/openmp/init.c @@ -213,12 +213,15 @@ void init(void) bp->cen[1] = j1*size + size/2; bp->cen[2] = k1*size + size/2; add_sorted_list(o, n, 0); - for (var = 0; var < num_vars; var++) + for (var = 0; var < num_vars; var++) { + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; + block3D_t array = (block3D_t)&bp->array[var*block3D_size]; for (ib = 1; ib <= x_block_size; ib++) for (jb = 1; jb <= y_block_size; jb++) for (kb = 1; kb <= z_block_size; kb++) - bp->array[var][ib][jb][kb] = + array[ib][jb][kb] = ((double) rand())/((double) RAND_MAX); + } if (i2 == 0) if (i == 0) { /* 0 boundary */ bp->nei_level[0] = -2; diff --git a/openmp/main.c b/openmp/main.c index 42a47fe..5393f9e 100644 --- a/openmp/main.c +++ b/openmp/main.c @@ -342,6 +342,8 @@ void allocate(void) { int i, j, k, m, n; + block3D_size = (x_block_size+2)*(y_block_size+2)*(z_block_size+2); + num_blocks = (num_sz *) ma_malloc((num_refine+1)*sizeof(num_sz), __FILE__, __LINE__); num_blocks[0] = num_pes*init_block_x*init_block_y*init_block_z; @@ -354,22 +356,22 @@ void allocate(void) for (n = 0; n < max_num_blocks; n++) { blocks[n].number = -1; - blocks[n].array = (double ****) ma_malloc(num_vars*sizeof(double ***), + blocks[n].array = (double *) ma_malloc(num_vars*block3D_size*sizeof(double), __FILE__, __LINE__); - for (m = 0; m < num_vars; m++) { - blocks[n].array[m] = (double ***) - ma_malloc((x_block_size+2)*sizeof(double **), - __FILE__, __LINE__); - for (i = 0; i < x_block_size+2; i++) { - blocks[n].array[m][i] = (double **) - ma_malloc((y_block_size+2)*sizeof(double *), - __FILE__, __LINE__); - for (j = 0; j < y_block_size+2; j++) - blocks[n].array[m][i][j] = (double *) - ma_malloc((z_block_size+2)*sizeof(double), - __FILE__, __LINE__); - } - } + //for (m = 0; m < num_vars; m++) { + // blocks[n].array[m] = (double ***) + // ma_malloc((x_block_size+2)*sizeof(double**), + // __FILE__, __LINE__); + // for (i = 0; i < x_block_size+2; i++) { + // blocks[n].array[m][i] = (double **) + // ma_malloc((y_block_size+2)*sizeof(double *), + // __FILE__, __LINE__); + // for (j = 0; j < y_block_size+2; j++) + // blocks[n].array[m][i][j] = (double *) + // ma_malloc((z_block_size+2)*sizeof(double), + // __FILE__, __LINE__); + // } + //} } sorted_list = (sorted_block *)ma_malloc(max_num_blocks*sizeof(sorted_block), @@ -491,8 +493,7 @@ void allocate(void) __FILE__, __LINE__); if (num_refine) { - s_buf_size = (int) (0.10*((double)max_num_blocks))*comm_vars* - (x_block_size+2)*(y_block_size+2)*(z_block_size+2); + s_buf_size = (int) (0.10*((double)max_num_blocks))*comm_vars*block3D_size; if (s_buf_size < (num_vars*x_block_size*y_block_size*z_block_size + 47)) s_buf_size = num_vars*x_block_size*y_block_size*z_block_size + 47; r_buf_size = 5*s_buf_size; @@ -528,14 +529,14 @@ void deallocate(void) int i, j, m, n; for (n = 0; n < max_num_blocks; n++) { - for (m = 0; m < num_vars; m++) { - for (i = 0; i < x_block_size+2; i++) { - for (j = 0; j < y_block_size+2; j++) - free(blocks[n].array[m][i][j]); - free(blocks[n].array[m][i]); - } - free(blocks[n].array[m]); - } + //for (m = 0; m < num_vars; m++) { + // for (i = 0; i < x_block_size+2; i++) { + // for (j = 0; j < y_block_size+2; j++) + // free(blocks[n].array[m][i][j]); + // free(blocks[n].array[m][i]); + // } + // free(blocks[n].array[m]); + //} free(blocks[n].array); } free(blocks); diff --git a/openmp/pack.c b/openmp/pack.c index 99d0cdd..1157d04 100644 --- a/openmp/pack.c +++ b/openmp/pack.c @@ -62,11 +62,14 @@ void pack_block(int n) for (i = 0; i < 3; i++) send_int[l++] = bp->cen[i]; - for (v = 0; v < num_vars; v++) + for (v = 0; v < num_vars; v++) { + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; + block3D_t array = (block3D_t)&bp->array[v*block3D_size]; for (i = 1; i <= x_block_size; i++) for (j = 1; j <= y_block_size; j++) for (k = 1; k <= z_block_size; k++) - send_buff[l++] = bp->array[v][i][j][k]; + send_buff[l++] = array[i][j][k]; + } } void unpack_block(int n) @@ -96,9 +99,12 @@ void unpack_block(int n) for (i = 0; i < 3; i++) bp->cen[i] = recv_int[l++]; - for (v = 0; v < num_vars; v++) + for (v = 0; v < num_vars; v++) { + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; + block3D_t array = (block3D_t)&bp->array[v*block3D_size]; for (i = 1; i <= x_block_size; i++) for (j = 1; j <= y_block_size; j++) for (k = 1; k <= z_block_size; k++) - bp->array[v][i][j][k] = recv_buff[l++]; + array[i][j][k] = recv_buff[l++]; + } } diff --git a/openmp/profile.c b/openmp/profile.c index 2558f8c..d0f86d7 100644 --- a/openmp/profile.c +++ b/openmp/profile.c @@ -44,7 +44,11 @@ void profile(void) char *version = "1.4? w/OpenMP"; FILE *fp; +#ifdef _OPENMP ompt = omp_get_max_threads(); +#else + ompt = 1; +#endif calculate_results(); total_fp_ops = average[128] + average[129] + average[130]; total_gflops = total_fp_ops/(average[38]*1024.0*1024.0*1024.0); diff --git a/openmp/stencil.c b/openmp/stencil.c index 7055bc3..d48c96a 100644 --- a/openmp/stencil.c +++ b/openmp/stencil.c @@ -29,6 +29,8 @@ #include #include +#include +#include #include "block.h" #include "comm.h" @@ -50,1047 +52,1041 @@ void stencil_driver(int var, int cacl_stage) if (stencil) stencil_calc(var, stencil); - else - if (!var) - stencil_calc(var, 7); - else if (var < 4*mat) { - switch (cacl_stage%6) { - case 0: - stencil_0(var); - break; - case 1: - stencil_x(var); - break; - case 2: - stencil_y(var); - break; - case 3: - stencil_z(var); - break; - case 4: - stencil_7(var); - break; - case 5: - stencil_27(var); - break; - } - stencil_check(var); - } else - stencil_calc(var, 7); + else { + assert(!"Supporting stencil=7 for now"); + //FIXME: extend refactoring to other stencils + //if (!var) + // stencil_calc(var, 7); + //else if (var < 4*mat) { + // switch (cacl_stage%6) { + // case 0: + // stencil_0(var); + // break; + // case 1: + // stencil_x(var); + // break; + // case 2: + // stencil_y(var); + // break; + // case 3: + // stencil_z(var); + // break; + // case 4: + // stencil_7(var); + // break; + // case 5: + // stencil_27(var); + // break; + // } + // stencil_check(var); + //} else + // stencil_calc(var, 7); + } } void stencil_calc(int var, int stencil_in) { - int i, j, k, in; - double sb, sm, sf, work[x_block_size+2][y_block_size+2][z_block_size+2]; - block *bp; - - int tid; if (stencil_in == 7) { -#pragma omp parallel default(shared) private(i, j, k, bp) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - work[i][j][k] = (bp->array[var][i-1][j ][k ] + - bp->array[var][i ][j-1][k ] + - bp->array[var][i ][j ][k-1] + - bp->array[var][i ][j ][k ] + - bp->array[var][i ][j ][k+1] + - bp->array[var][i ][j+1][k ] + - bp->array[var][i+1][j ][k ])/7.0; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] = work[i][j][k]; +#pragma omp parallel for default(shared) + for (int in = 0; in < sorted_index[num_refine+1]; in++) { + block *bp = &blocks[sorted_list[in].n]; + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; + block3D_t array = (block3D_t)&bp->array[var*block3D_size]; + double work[x_block_size+2][y_block_size+2][z_block_size+2]; + memcpy(work, array, sizeof(work)); + for (int i = 1; i <= x_block_size; i++) + for (int j = 1; j <= y_block_size; j++) + for (int k = 1; k <= z_block_size; k++) + array[i][j][k] = (work[i-1][j ][k ] + + work[i ][j-1][k ] + + work[i ][j ][k-1] + + work[i ][j ][k ] + + work[i ][j ][k+1] + + work[i ][j+1][k ] + + work[i+1][j ][k ])/7.0; } -} total_fp_divs += (double) num_active*num_cells; total_fp_adds += (double) 6*num_active*num_cells; } else { -#pragma omp parallel default(shared) private (i, j, k, bp, sb, sm, sf) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) { - sb = bp->array[var][i-1][j-1][k-1] + - bp->array[var][i-1][j-1][k ] + - bp->array[var][i-1][j-1][k+1] + - bp->array[var][i-1][j ][k-1] + - bp->array[var][i-1][j ][k ] + - bp->array[var][i-1][j ][k+1] + - bp->array[var][i-1][j+1][k-1] + - bp->array[var][i-1][j+1][k ] + - bp->array[var][i-1][j+1][k+1]; - sm = bp->array[var][i ][j-1][k-1] + - bp->array[var][i ][j-1][k ] + - bp->array[var][i ][j-1][k+1] + - bp->array[var][i ][j ][k-1] + - bp->array[var][i ][j ][k ] + - bp->array[var][i ][j ][k+1] + - bp->array[var][i ][j+1][k-1] + - bp->array[var][i ][j+1][k ] + - bp->array[var][i ][j+1][k+1]; - sf = bp->array[var][i+1][j-1][k-1] + - bp->array[var][i+1][j-1][k ] + - bp->array[var][i+1][j-1][k+1] + - bp->array[var][i+1][j ][k-1] + - bp->array[var][i+1][j ][k ] + - bp->array[var][i+1][j ][k+1] + - bp->array[var][i+1][j+1][k-1] + - bp->array[var][i+1][j+1][k ] + - bp->array[var][i+1][j+1][k+1]; - work[i][j][k] = (sb + sm + sf)/27.0; +#pragma omp parallel for default(shared) + for (int in = 0; in < sorted_index[num_refine+1]; in++) { + block *bp = &blocks[sorted_list[in].n]; + typedef double (*block3D_t)[y_block_size+2][z_block_size+2]; + block3D_t array = (block3D_t)&bp->array[var*block3D_size]; + double work[x_block_size+2][y_block_size+2][z_block_size+2]; + memcpy(work, array, sizeof(work)); + for (int i = 1; i <= x_block_size; i++) + for (int j = 1; j <= y_block_size; j++) + for (int k = 1; k <= z_block_size; k++) { + double sb = work[i-1][j-1][k-1] + + work[i-1][j-1][k ] + + work[i-1][j-1][k+1] + + work[i-1][j ][k-1] + + work[i-1][j ][k ] + + work[i-1][j ][k+1] + + work[i-1][j+1][k-1] + + work[i-1][j+1][k ] + + work[i-1][j+1][k+1]; + double sm = work[i ][j-1][k-1] + + work[i ][j-1][k ] + + work[i ][j-1][k+1] + + work[i ][j ][k-1] + + work[i ][j ][k ] + + work[i ][j ][k+1] + + work[i ][j+1][k-1] + + work[i ][j+1][k ] + + work[i ][j+1][k+1]; + double sf = work[i+1][j-1][k-1] + + work[i+1][j-1][k ] + + work[i+1][j-1][k+1] + + work[i+1][j ][k-1] + + work[i+1][j ][k ] + + work[i+1][j ][k+1] + + work[i+1][j+1][k-1] + + work[i+1][j+1][k ] + + work[i+1][j+1][k+1]; + array[i][j][k] = (sb + sm + sf)/27.0; } - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] = work[i][j][k]; } -} total_fp_divs += (double) num_active*num_cells; total_fp_adds += (double) 26*num_active*num_cells; } } -void stencil_0(int var) -{ - int in, i, j, k, v; - block *bp; - - if (var == 1) { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - for (v = mat; v < 2*mat; v++) - bp->array[var][i][j][k] += bp->array[v][i][j][k]* - bp->array[0][i][j][k]; - } -} - - total_fp_adds += (double) mat*num_active*num_cells; - total_fp_muls += (double) mat*num_active*num_cells; - } else if (var < mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] += bp->array[var][i][j][k]* - (bp->array[0][i][j][k] + - bp->array[1][i][j][k] - - beta*bp->array[var][i][j][k]); - } -} - - total_fp_adds += (double) 3*num_active*num_cells; - total_fp_muls += (double) 2*num_active*num_cells; - } else if (var < 2*mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] = bp->array[var][i][j][k]* - (bp->array[0][i][j][k] + - bp->array[var][i][j][k] + - beta*bp->array[var+mat][i][j][k] + - (1.0-beta)*bp->array[var+2*mat][i][j][k])/ - bp->array[1][i][j][k]; - } -} - - total_fp_adds += (double) 3*num_active*num_cells; - total_fp_muls += (double) 3*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < 3*mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] += bp->array[var-mat][i][j][k]* - (beta*bp->array[0][i][j][k] + - alpha[var-2*mat]*bp->array[var][i][j][k] + - (1.0-beta)*bp->array[var+mat][i][j][k])/ - bp->array[1][i][j][k]; - } -} - - total_fp_adds += (double) 4*num_active*num_cells; - total_fp_muls += (double) 3*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] += bp->array[var-2*mat][i][j][k]* - (beta*bp->array[0][i][j][k] + - alpha[var-3*mat]*bp->array[var][i][j][k] + - (1.0-alpha[var-3*mat])*bp->array[var-mat][i][j][k] + - (1.0-beta)*bp->array[var-2*mat][i][j][k])/ - (bp->array[1][i][j][k]* - bp->array[1][i][j][k]); - } -} - total_fp_adds += (double) 6*num_active*num_cells; - total_fp_muls += (double) 6*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } -} - -void stencil_x(int var) -{ - int in, i, j, k, v; - double tmp1, tmp2; - block *bp; - - if (var == 1) { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) { - for (v = 2; v < mat+2; v++) - bp->array[1][i][j][k] += bp->array[v][i][j][k]* - bp->array[0][i][j][k]; - bp->array[1][i][j][k] /= (beta + bp->array[1][i][j][k]); - } - } -} - total_fp_adds += (double) (mat+1)*num_active*num_cells; - total_fp_muls += (double) mat*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] += bp->array[var][i][j][k]* - (bp->array[0][i][j][k] + - bp->array[1][i][j][k] - - beta*bp->array[var][i][j][k])/ - (alpha[var] + bp->array[1][i][j][k]); - } -} - total_fp_adds += (double) 4*num_active*num_cells; - total_fp_muls += (double) 2*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < 2*mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - if ((tmp1 = fabs(bp->array[var][i][j][k] - - bp->array[var][i-1][j][k])) > - (tmp2 = fabs(bp->array[var][i][j][k] - - bp->array[var][i+1][j][k]))) - bp->array[var][i][j][k] = (tmp1*bp->array[var][i-1][j][k] + - (tmp1-tmp2)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i+1][j][k])/ - (beta + alpha[var-mat] + - bp->array[var][i-1][j][k] + - bp->array[var][i][j][k] + - bp->array[var][i+1][j][k] + - bp->array[0][i][j][k] + - bp->array[1][i][j][k]); - else - bp->array[var][i][j][k] = (tmp1*bp->array[var][i-1][j][k] + - (tmp2-tmp1)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i+1][j][k])/ - (beta + alpha[var-mat] + - bp->array[var][i-1][j][k] + - bp->array[var][i][j][k] + - bp->array[var][i+1][j][k] + - bp->array[0][i][j][k] + - bp->array[1][i][j][k]); - } -} - total_fp_adds += (double) 12*num_active*num_cells; - total_fp_muls += (double) 3*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < 3*mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - if ((tmp1 = fabs(bp->array[var-mat][i][j][k] - - bp->array[var-mat][i-1][j][k])) > - (tmp2 = fabs(bp->array[var-mat][i][j][k] - - bp->array[var-mat][i+1][j][k]))) - bp->array[var][i][j][k] = (tmp1*bp->array[var][i-1][j][k] + - (tmp1-tmp2)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i+1][j][k])/ - (beta + alpha[var-2*mat] + - bp->array[var-mat][i][j][k] + - bp->array[var+mat][i][j][k] + - bp->array[var][i-1][j][k] + - bp->array[var][i][j][k] + - bp->array[var][i+1][j][k]); - else - bp->array[var][i][j][k] = (tmp1*bp->array[var][i-1][j][k] + - (tmp2-tmp1)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i+1][j][k])/ - (beta + alpha[var-2*mat] + - bp->array[var-mat][i][j][k] + - bp->array[var+mat][i][j][k] + - bp->array[var][i-1][j][k] + - bp->array[var][i][j][k] + - bp->array[var][i+1][j][k]); - } -} - total_fp_adds += (double) 12*num_active*num_cells; - total_fp_muls += (double) 3*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else { -#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - if ((tmp1 = fabs(bp->array[var-2*mat][i][j][k] - - bp->array[var-2*mat][i-1][j][k])) > - (tmp2 = fabs(bp->array[var-2*mat][i][j][k] - - bp->array[var-2*mat][i+1][j][k]))) - bp->array[var][i][j][k] = (tmp1*bp->array[var][i-1][j][k] + - (tmp1-tmp2)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i+1][j][k])/ - (beta + alpha[var-3*mat] + - bp->array[var-mat][i][j][k] + - bp->array[var-2*mat][i][j][k] + - bp->array[var][i-1][j][k] + - bp->array[var][i][j][k] + - bp->array[var][i+1][j][k]); - else - bp->array[var][i][j][k] = (tmp1*bp->array[var][i-1][j][k] + - (tmp2-tmp1)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i+1][j][k])/ - (beta + alpha[var-3*mat] + - bp->array[var-mat][i][j][k] + - bp->array[var-2*mat][i][j][k] + - bp->array[var][i-1][j][k] + - bp->array[var][i][j][k] + - bp->array[var][i+1][j][k]); - } -} - total_fp_adds += (double) 12*num_active*num_cells; - total_fp_muls += (double) 3*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } -} - -void stencil_y(int var) -{ - int in, i, j, k, v; - double tmp1, tmp2; - block *bp; - - if (var == 1) { -#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; -#pragma omp for - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) { - for (v = 2; v < mat+2; v++) - bp->array[1][i][j][k] += bp->array[v][i][j][k]* - bp->array[0][i][j][k]; - bp->array[1][i][j][k] /= (beta + bp->array[1][i][j][k]); - } - } -} - total_fp_adds += (double) (mat+1)*num_active*num_cells; - total_fp_muls += (double) mat*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] += bp->array[var][i][j][k]* - (bp->array[0][i][j][k] + - bp->array[1][i][j][k] - - beta*bp->array[var][i][j][k])/ - (alpha[var] + bp->array[1][i][j][k]); - } -} - total_fp_adds += (double) 4*num_active*num_cells; - total_fp_muls += (double) 2*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < 2*mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - if ((tmp1 = fabs(bp->array[var][i][j][k] - - bp->array[var][i][j-1][k])) > - (tmp2 = fabs(bp->array[var][i][j][k] - - bp->array[var][i][j+1][k]))) - bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j-1][k] + - (tmp1-tmp2)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i][j+1][k])/ - (beta + alpha[var-mat] + - bp->array[var][i][j-1][k] + - bp->array[var][i][j][k] + - bp->array[var][i][j+1][k] + - bp->array[0][i][j][k] + - bp->array[1][i][j][k]); - else - bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j-1][k] + - (tmp2-tmp1)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i][j+1][k])/ - (beta + alpha[var-mat] + - bp->array[var][i][j-1][k] + - bp->array[var][i][j][k] + - bp->array[var][i][j+1][k] + - bp->array[0][i][j][k] + - bp->array[1][i][j][k]); - } -} - total_fp_adds += (double) 12*num_active*num_cells; - total_fp_muls += (double) 3*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < 3*mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - if ((tmp1 = fabs(bp->array[var-mat][i][j][k] - - bp->array[var-mat][i][j-1][k])) > - (tmp2 = fabs(bp->array[var-mat][i][j][k] - - bp->array[var-mat][i][j+1][k]))) - bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j-1][k] + - (tmp1-tmp2)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i][j+1][k])/ - (beta + alpha[var-2*mat] + - bp->array[var-mat][i][j][k] + - bp->array[var+mat][i][j][k] + - bp->array[var][i][j-1][k] + - bp->array[var][i][j][k] + - bp->array[var][i][j+1][k]); - else - bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j-1][k] + - (tmp2-tmp1)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i][j+1][k])/ - (beta + alpha[var-2*mat] + - bp->array[var-mat][i][j][k] + - bp->array[var+mat][i][j][k] + - bp->array[var][i][j-1][k] + - bp->array[var][i][j][k] + - bp->array[var][i][j+1][k]); - } -} - total_fp_adds += (double) 12*num_active*num_cells; - total_fp_muls += (double) 3*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else { -#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - if ((tmp1 = fabs(bp->array[var-2*mat][i][j][k] - - bp->array[var-2*mat][i][j-1][k])) > - (tmp2 = fabs(bp->array[var-2*mat][i][j][k] - - bp->array[var-2*mat][i][j+1][k]))) - bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j-1][k] + - (tmp1-tmp2)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i][j+1][k])/ - (beta + alpha[var-3*mat] + - bp->array[var-mat][i][j][k] + - bp->array[var-2*mat][i][j][k] + - bp->array[var][i][j-1][k] + - bp->array[var][i][j][k] + - bp->array[var][i][j+1][k]); - else - bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j-1][k] + - (tmp2-tmp1)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i][j+1][k])/ - (beta + alpha[var-3*mat] + - bp->array[var-mat][i][j][k] + - bp->array[var-2*mat][i][j][k] + - bp->array[var][i][j-1][k] + - bp->array[var][i][j][k] + - bp->array[var][i][j+1][k]); - } -} - total_fp_adds += (double) 12*num_active*num_cells; - total_fp_muls += (double) 3*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } -} - -void stencil_z(int var) -{ - int in, i, j, k, v; - double tmp1, tmp2; - block *bp; - - - if (var == 1) { -#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) { - for (v = 2; v < mat+2; v++) - bp->array[1][i][j][k] += bp->array[v][i][j][k]* - bp->array[0][i][j][k]; - bp->array[1][i][j][k] /= (beta + bp->array[1][i][j][k]); - } - } -} - - total_fp_adds += (double) (mat+1)*num_active*num_cells; - total_fp_muls += (double) mat*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] += bp->array[var][i][j][k]* - (bp->array[0][i][j][k] + - bp->array[1][i][j][k] - - beta*bp->array[var][i][j][k])/ - (alpha[var] + bp->array[1][i][j][k]); - } -} - total_fp_adds += (double) 4*num_active*num_cells; - total_fp_muls += (double) 2*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < 2*mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - if ((tmp1 = fabs(bp->array[var][i][j][k] - - bp->array[var][i][j][k-1])) > - (tmp2 = fabs(bp->array[var][i][j][k] - - bp->array[var][i][j][k+1]))) - bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j][k-1] + - (tmp1-tmp2)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i][j][k+1])/ - (beta + alpha[var-mat] + - bp->array[var][i][j][k-1] + - bp->array[var][i][j][k] + - bp->array[var][i][j][k+1] + - bp->array[0][i][j][k] + - bp->array[1][i][j][k]); - else - bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j][k-1] + - (tmp2-tmp1)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i][j][k+1])/ - (beta + alpha[var-mat] + - bp->array[var][i][j][k-1] + - bp->array[var][i][j][k] + - bp->array[var][i][j][k+1] + - bp->array[0][i][j][k] + - bp->array[1][i][j][k]); - } -} - total_fp_adds += (double) 12*num_active*num_cells; - total_fp_muls += (double) 3*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < 3*mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - if ((tmp1 = fabs(bp->array[var-mat][i][j][k] - - bp->array[var-mat][i][j][k-1])) > - (tmp2 = fabs(bp->array[var-mat][i][j][k] - - bp->array[var-mat][i][j][k+1]))) - bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j][k-1] + - (tmp1-tmp2)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i][j][k+1])/ - (beta + alpha[var-2*mat] + - bp->array[var-mat][i][j][k] + - bp->array[var+mat][i][j][k] + - bp->array[var][i][j][k-1] + - bp->array[var][i][j][k] + - bp->array[var][i][j][k+1]); - else - bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j][k-1] + - (tmp2-tmp1)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i][j][k+1])/ - (beta + alpha[var-2*mat] + - bp->array[var-mat][i][j][k] + - bp->array[var+mat][i][j][k] + - bp->array[var][i][j][k-1] + - bp->array[var][i][j][k] + - bp->array[var][i][j][k+1]); - } -} - total_fp_adds += (double) 12*num_active*num_cells; - total_fp_muls += (double) 3*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else { -#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - if ((tmp1 = fabs(bp->array[var-2*mat][i][j][k] - - bp->array[var-2*mat][i][j][k-1])) > - (tmp2 = fabs(bp->array[var-2*mat][i][j][k] - - bp->array[var-2*mat][i][j][k+1]))) - bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j][k-1] + - (tmp1-tmp2)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i][j][k+1])/ - (beta + alpha[var-3*mat] + - bp->array[var-mat][i][j][k] + - bp->array[var-2*mat][i][j][k] + - bp->array[var][i][j][k-1] + - bp->array[var][i][j][k] + - bp->array[var][i][j][k+1]); - else - bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j][k-1] + - (tmp2-tmp1)*(bp->array[var][i][j][k] + - bp->array[1][i][j][k]) + - tmp2*bp->array[var][i][j][k+1])/ - (beta + alpha[var-3*mat] + - bp->array[var-mat][i][j][k] + - bp->array[var-2*mat][i][j][k] + - bp->array[var][i][j][k-1] + - bp->array[var][i][j][k] + - bp->array[var][i][j][k+1]); - } -} - total_fp_adds += (double) 12*num_active*num_cells; - total_fp_muls += (double) 3*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } -} - -void stencil_7(int var) -{ - int in, i, j, k, v; - double work[x_block_size+2][y_block_size+2][z_block_size+2]; - block *bp; - - if (var < mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - work[i][j][k] = (bp->array[var ][i-1][j ][k ]* - bp->array[var+ mat][i-1][j ][k ] + - bp->array[var ][i ][j-1][k ]* - bp->array[var+2*mat][i ][j-1][k ] + - bp->array[var ][i ][j ][k-1]* - bp->array[var+3*mat][i ][j ][k-1] + - bp->array[var ][i ][j ][k ]* - bp->array[var ][i ][j ][k ] + - bp->array[var ][i ][j ][k+1]* - bp->array[var+3*mat][i ][j ][k+1] + - bp->array[var ][i ][j+1][k ]* - bp->array[var+2*mat][i ][j+1][k ] + - bp->array[var ][i+1][j ][k ]* - bp->array[var+ mat][i+1][j ][k ])/ - 7.0*(beta + bp->array[var][i][j][k]); - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] = work[i][j][k]; - } -} - total_fp_adds += (double) 7*num_active*num_cells; - total_fp_muls += (double) 8*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < 2*mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - work[i][j][k] = (bp->array[var ][i-1][j ][k ]* - bp->array[var+ mat][i-1][j ][k ] + - bp->array[var ][i ][j-1][k ]* - bp->array[var+2*mat][i ][j-1][k ] + - bp->array[var ][i ][j ][k-1]* - bp->array[var- mat][i ][j ][k-1] + - bp->array[var ][i ][j ][k ]* - bp->array[var ][i ][j ][k ] + - bp->array[var ][i ][j ][k+1]* - bp->array[var- mat][i ][j ][k+1] + - bp->array[var ][i ][j+1][k ]* - bp->array[var+2*mat][i ][j+1][k ] + - bp->array[var ][i+1][j ][k ]* - bp->array[var+ mat][i+1][j ][k ])/ - 7.0*(beta + bp->array[var][i][j][k]); - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] = work[i][j][k]; - } -} - total_fp_adds += (double) 7*num_active*num_cells; - total_fp_muls += (double) 8*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < 3*mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - work[i][j][k] = (bp->array[var ][i-1][j ][k ]* - bp->array[var+ mat][i-1][j ][k ] + - bp->array[var ][i ][j-1][k ]* - bp->array[var-2*mat][i ][j-1][k ] + - bp->array[var ][i ][j ][k-1]* - bp->array[var- mat][i ][j ][k-1] + - bp->array[var ][i ][j ][k ]* - bp->array[var ][i ][j ][k ] + - bp->array[var ][i ][j ][k+1]* - bp->array[var- mat][i ][j ][k+1] + - bp->array[var ][i ][j+1][k ]* - bp->array[var-2*mat][i ][j+1][k ] + - bp->array[var ][i+1][j ][k ]* - bp->array[var+ mat][i+1][j ][k ])/ - 7.0*(beta + bp->array[var][i][j][k]); - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] = work[i][j][k]; - } -} - total_fp_adds += (double) 7*num_active*num_cells; - total_fp_muls += (double) 8*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - work[i][j][k] = (bp->array[var ][i-1][j ][k ]* - bp->array[var-3*mat][i-1][j ][k ] + - bp->array[var ][i ][j-1][k ]* - bp->array[var-2*mat][i ][j-1][k ] + - bp->array[var ][i ][j ][k-1]* - bp->array[var- mat][i ][j ][k-1] + - bp->array[var ][i ][j ][k ]* - bp->array[var ][i ][j ][k ] + - bp->array[var ][i ][j ][k+1]* - bp->array[var- mat][i ][j ][k+1] + - bp->array[var ][i ][j+1][k ]* - bp->array[var-2*mat][i ][j+1][k ] + - bp->array[var ][i+1][j ][k ]* - bp->array[var-3*mat][i+1][j ][k ])/ - 7.0*(beta + bp->array[var][i][j][k]); - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] = work[i][j][k]; - } -} - total_fp_adds += (double) 7*num_active*num_cells; - total_fp_muls += (double) 8*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } -} - -void stencil_27(int var) -{ - int in, i, j, k, v; - double work[x_block_size+2][y_block_size+2][z_block_size+2]; - block *bp; - - if (var < mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; -#pragma omp for - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - work[i][j][k] = (bp->array[var+3*mat][i-1][j-1][k-1] + - bp->array[var+2*mat][i-1][j-1][k ] + - bp->array[var+3*mat][i-1][j-1][k+1] + - bp->array[var+2*mat][i-1][j ][k-1] + - bp->array[var+ mat][i-1][j ][k ] + - bp->array[var+2*mat][i-1][j ][k+1] + - bp->array[var+3*mat][i-1][j+1][k-1] + - bp->array[var+2*mat][i-1][j+1][k ] + - bp->array[var+3*mat][i-1][j+1][k+1] + - bp->array[var+2*mat][i ][j-1][k-1] + - bp->array[var+ mat][i ][j-1][k ] + - bp->array[var+2*mat][i ][j-1][k+1] + - bp->array[var+ mat][i ][j ][k-1] + - bp->array[var ][i ][j ][k ] + - bp->array[var+ mat][i ][j ][k+1] + - bp->array[var+2*mat][i ][j+1][k-1] + - bp->array[var+ mat][i ][j+1][k ] + - bp->array[var+2*mat][i ][j+1][k+1] + - bp->array[var+3*mat][i+1][j-1][k-1] + - bp->array[var+2*mat][i+1][j-1][k ] + - bp->array[var+3*mat][i+1][j-1][k+1] + - bp->array[var+2*mat][i+1][j ][k-1] + - bp->array[var+ mat][i+1][j ][k ] + - bp->array[var+2*mat][i+1][j ][k+1] + - bp->array[var+3*mat][i+1][j+1][k-1] + - bp->array[var+2*mat][i+1][j+1][k ] + - bp->array[var+3*mat][i+1][j+1][k+1])/ - (beta+27.0); - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] = work[i][j][k]; - } -} - total_fp_adds += (double) 27*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < 2*mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - work[i][j][k] = (bp->array[var- mat][i-1][j-1][k-1] + - bp->array[var+2*mat][i-1][j-1][k ] + - bp->array[var- mat][i-1][j-1][k+1] + - bp->array[var+2*mat][i-1][j ][k-1] + - bp->array[var+ mat][i-1][j ][k ] + - bp->array[var+2*mat][i-1][j ][k+1] + - bp->array[var- mat][i-1][j+1][k-1] + - bp->array[var+2*mat][i-1][j+1][k ] + - bp->array[var- mat][i-1][j+1][k+1] + - bp->array[var+2*mat][i ][j-1][k-1] + - bp->array[var+ mat][i ][j-1][k ] + - bp->array[var+2*mat][i ][j-1][k+1] + - bp->array[var+ mat][i ][j ][k-1] + - bp->array[var ][i ][j ][k ] + - bp->array[var+ mat][i ][j ][k+1] + - bp->array[var+2*mat][i ][j+1][k-1] + - bp->array[var+ mat][i ][j+1][k ] + - bp->array[var+2*mat][i ][j+1][k+1] + - bp->array[var- mat][i+1][j-1][k-1] + - bp->array[var+2*mat][i+1][j-1][k ] + - bp->array[var- mat][i+1][j-1][k+1] + - bp->array[var+2*mat][i+1][j ][k-1] + - bp->array[var+ mat][i+1][j ][k ] + - bp->array[var+2*mat][i+1][j ][k+1] + - bp->array[var- mat][i+1][j+1][k-1] + - bp->array[var+2*mat][i+1][j+1][k ] + - bp->array[var- mat][i+1][j+1][k+1])/ - (beta+27.0); - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] = work[i][j][k]; - } -} - - total_fp_adds += (double) 27*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else if (var < 3*mat) { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - work[i][j][k] = (bp->array[var- mat][i-1][j-1][k-1] + - bp->array[var-2*mat][i-1][j-1][k ] + - bp->array[var- mat][i-1][j-1][k+1] + - bp->array[var-2*mat][i-1][j ][k-1] + - bp->array[var+ mat][i-1][j ][k ] + - bp->array[var-2*mat][i-1][j ][k+1] + - bp->array[var- mat][i-1][j+1][k-1] + - bp->array[var-2*mat][i-1][j+1][k ] + - bp->array[var- mat][i-1][j+1][k+1] + - bp->array[var-2*mat][i ][j-1][k-1] + - bp->array[var+ mat][i ][j-1][k ] + - bp->array[var-2*mat][i ][j-1][k+1] + - bp->array[var+ mat][i ][j ][k-1] + - bp->array[var ][i ][j ][k ] + - bp->array[var+ mat][i ][j ][k+1] + - bp->array[var-2*mat][i ][j+1][k-1] + - bp->array[var+ mat][i ][j+1][k ] + - bp->array[var-2*mat][i ][j+1][k+1] + - bp->array[var- mat][i+1][j-1][k-1] + - bp->array[var-2*mat][i+1][j-1][k ] + - bp->array[var- mat][i+1][j-1][k+1] + - bp->array[var-2*mat][i+1][j ][k-1] + - bp->array[var+ mat][i+1][j ][k ] + - bp->array[var-2*mat][i+1][j ][k+1] + - bp->array[var- mat][i+1][j+1][k-1] + - bp->array[var-2*mat][i+1][j+1][k ] + - bp->array[var- mat][i+1][j+1][k+1])/ - (beta+27.0); - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] = work[i][j][k]; - } -} - total_fp_adds += (double) 27*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } else { -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - work[i][j][k] = (bp->array[var- mat][i-1][j-1][k-1] + - bp->array[var-2*mat][i-1][j-1][k ] + - bp->array[var- mat][i-1][j-1][k+1] + - bp->array[var-2*mat][i-1][j ][k-1] + - bp->array[var-3*mat][i-1][j ][k ] + - bp->array[var-2*mat][i-1][j ][k+1] + - bp->array[var- mat][i-1][j+1][k-1] + - bp->array[var-2*mat][i-1][j+1][k ] + - bp->array[var- mat][i-1][j+1][k+1] + - bp->array[var-2*mat][i ][j-1][k-1] + - bp->array[var-3*mat][i ][j-1][k ] + - bp->array[var-2*mat][i ][j-1][k+1] + - bp->array[var-3*mat][i ][j ][k-1] + - bp->array[var ][i ][j ][k ] + - bp->array[var-3*mat][i ][j ][k+1] + - bp->array[var-2*mat][i ][j+1][k-1] + - bp->array[var-3*mat][i ][j+1][k ] + - bp->array[var-2*mat][i ][j+1][k+1] + - bp->array[var- mat][i+1][j-1][k-1] + - bp->array[var-2*mat][i+1][j-1][k ] + - bp->array[var- mat][i+1][j-1][k+1] + - bp->array[var-2*mat][i+1][j ][k-1] + - bp->array[var-3*mat][i+1][j ][k ] + - bp->array[var-2*mat][i+1][j ][k+1] + - bp->array[var- mat][i+1][j+1][k-1] + - bp->array[var-2*mat][i+1][j+1][k ] + - bp->array[var- mat][i+1][j+1][k+1])/ - (beta+27.0); - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) - bp->array[var][i][j][k] = work[i][j][k]; - } -} - - total_fp_adds += (double) 27*num_active*num_cells; - total_fp_divs += (double) num_active*num_cells; - } -} - -void stencil_check(int var) -{ - int in, i, j, k, v; - double work[x_block_size+2][y_block_size+2][z_block_size+2]; - block *bp; - -#pragma omp parallel default(shared) private(i, j, k, bp, v) -{ - for (in = 0; in < sorted_index[num_refine+1]; in++) { - bp = &blocks[sorted_list[in].n]; - for (i = 1; i <= x_block_size; i++) - for (j = 1; j <= y_block_size; j++) - for (k = 1; k <= z_block_size; k++) { - bp->array[var][i][j][k] = fabs(bp->array[var][i][j][k]); - if (bp->array[var][i][j][k] >= 1.0) { - bp->array[var][i][j][k] /= (beta + alpha[0] + - bp->array[var][i][j][k]); - total_fp_divs += (double) 1; - total_fp_adds += (double) 2; - } - else if (bp->array[var][i][j][k] < 0.1) { - bp->array[var][i][j][k] *= 10.0 - beta; - total_fp_muls += (double) 1; - total_fp_adds += (double) 1; - } - } - } -} -} +//void stencil_0(int var) +//{ +// int in, i, j, k, v; +// block *bp; +// +// if (var == 1) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// for (v = mat; v < 2*mat; v++) +// bp->array[var][i][j][k] += bp->array[v][i][j][k]* +// bp->array[0][i][j][k]; +// } +//} +// +// total_fp_adds += (double) mat*num_active*num_cells; +// total_fp_muls += (double) mat*num_active*num_cells; +// } else if (var < mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] += bp->array[var][i][j][k]* +// (bp->array[0][i][j][k] + +// bp->array[1][i][j][k] - +// beta*bp->array[var][i][j][k]); +// } +//} +// +// total_fp_adds += (double) 3*num_active*num_cells; +// total_fp_muls += (double) 2*num_active*num_cells; +// } else if (var < 2*mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] = bp->array[var][i][j][k]* +// (bp->array[0][i][j][k] + +// bp->array[var][i][j][k] + +// beta*bp->array[var+mat][i][j][k] + +// (1.0-beta)*bp->array[var+2*mat][i][j][k])/ +// bp->array[1][i][j][k]; +// } +//} +// +// total_fp_adds += (double) 3*num_active*num_cells; +// total_fp_muls += (double) 3*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < 3*mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] += bp->array[var-mat][i][j][k]* +// (beta*bp->array[0][i][j][k] + +// alpha[var-2*mat]*bp->array[var][i][j][k] + +// (1.0-beta)*bp->array[var+mat][i][j][k])/ +// bp->array[1][i][j][k]; +// } +//} +// +// total_fp_adds += (double) 4*num_active*num_cells; +// total_fp_muls += (double) 3*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] += bp->array[var-2*mat][i][j][k]* +// (beta*bp->array[0][i][j][k] + +// alpha[var-3*mat]*bp->array[var][i][j][k] + +// (1.0-alpha[var-3*mat])*bp->array[var-mat][i][j][k] + +// (1.0-beta)*bp->array[var-2*mat][i][j][k])/ +// (bp->array[1][i][j][k]* +// bp->array[1][i][j][k]); +// } +//} +// total_fp_adds += (double) 6*num_active*num_cells; +// total_fp_muls += (double) 6*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } +//} +// +//void stencil_x(int var) +//{ +// int in, i, j, k, v; +// double tmp1, tmp2; +// block *bp; +// +// if (var == 1) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) { +// for (v = 2; v < mat+2; v++) +// bp->array[1][i][j][k] += bp->array[v][i][j][k]* +// bp->array[0][i][j][k]; +// bp->array[1][i][j][k] /= (beta + bp->array[1][i][j][k]); +// } +// } +//} +// total_fp_adds += (double) (mat+1)*num_active*num_cells; +// total_fp_muls += (double) mat*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] += bp->array[var][i][j][k]* +// (bp->array[0][i][j][k] + +// bp->array[1][i][j][k] - +// beta*bp->array[var][i][j][k])/ +// (alpha[var] + bp->array[1][i][j][k]); +// } +//} +// total_fp_adds += (double) 4*num_active*num_cells; +// total_fp_muls += (double) 2*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < 2*mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// if ((tmp1 = fabs(bp->array[var][i][j][k] - +// bp->array[var][i-1][j][k])) > +// (tmp2 = fabs(bp->array[var][i][j][k] - +// bp->array[var][i+1][j][k]))) +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i-1][j][k] + +// (tmp1-tmp2)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i+1][j][k])/ +// (beta + alpha[var-mat] + +// bp->array[var][i-1][j][k] + +// bp->array[var][i][j][k] + +// bp->array[var][i+1][j][k] + +// bp->array[0][i][j][k] + +// bp->array[1][i][j][k]); +// else +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i-1][j][k] + +// (tmp2-tmp1)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i+1][j][k])/ +// (beta + alpha[var-mat] + +// bp->array[var][i-1][j][k] + +// bp->array[var][i][j][k] + +// bp->array[var][i+1][j][k] + +// bp->array[0][i][j][k] + +// bp->array[1][i][j][k]); +// } +//} +// total_fp_adds += (double) 12*num_active*num_cells; +// total_fp_muls += (double) 3*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < 3*mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// if ((tmp1 = fabs(bp->array[var-mat][i][j][k] - +// bp->array[var-mat][i-1][j][k])) > +// (tmp2 = fabs(bp->array[var-mat][i][j][k] - +// bp->array[var-mat][i+1][j][k]))) +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i-1][j][k] + +// (tmp1-tmp2)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i+1][j][k])/ +// (beta + alpha[var-2*mat] + +// bp->array[var-mat][i][j][k] + +// bp->array[var+mat][i][j][k] + +// bp->array[var][i-1][j][k] + +// bp->array[var][i][j][k] + +// bp->array[var][i+1][j][k]); +// else +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i-1][j][k] + +// (tmp2-tmp1)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i+1][j][k])/ +// (beta + alpha[var-2*mat] + +// bp->array[var-mat][i][j][k] + +// bp->array[var+mat][i][j][k] + +// bp->array[var][i-1][j][k] + +// bp->array[var][i][j][k] + +// bp->array[var][i+1][j][k]); +// } +//} +// total_fp_adds += (double) 12*num_active*num_cells; +// total_fp_muls += (double) 3*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else { +//#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// if ((tmp1 = fabs(bp->array[var-2*mat][i][j][k] - +// bp->array[var-2*mat][i-1][j][k])) > +// (tmp2 = fabs(bp->array[var-2*mat][i][j][k] - +// bp->array[var-2*mat][i+1][j][k]))) +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i-1][j][k] + +// (tmp1-tmp2)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i+1][j][k])/ +// (beta + alpha[var-3*mat] + +// bp->array[var-mat][i][j][k] + +// bp->array[var-2*mat][i][j][k] + +// bp->array[var][i-1][j][k] + +// bp->array[var][i][j][k] + +// bp->array[var][i+1][j][k]); +// else +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i-1][j][k] + +// (tmp2-tmp1)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i+1][j][k])/ +// (beta + alpha[var-3*mat] + +// bp->array[var-mat][i][j][k] + +// bp->array[var-2*mat][i][j][k] + +// bp->array[var][i-1][j][k] + +// bp->array[var][i][j][k] + +// bp->array[var][i+1][j][k]); +// } +//} +// total_fp_adds += (double) 12*num_active*num_cells; +// total_fp_muls += (double) 3*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } +//} +// +//void stencil_y(int var) +//{ +// int in, i, j, k, v; +// double tmp1, tmp2; +// block *bp; +// +// if (var == 1) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +//#pragma omp for +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) { +// for (v = 2; v < mat+2; v++) +// bp->array[1][i][j][k] += bp->array[v][i][j][k]* +// bp->array[0][i][j][k]; +// bp->array[1][i][j][k] /= (beta + bp->array[1][i][j][k]); +// } +// } +//} +// total_fp_adds += (double) (mat+1)*num_active*num_cells; +// total_fp_muls += (double) mat*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] += bp->array[var][i][j][k]* +// (bp->array[0][i][j][k] + +// bp->array[1][i][j][k] - +// beta*bp->array[var][i][j][k])/ +// (alpha[var] + bp->array[1][i][j][k]); +// } +//} +// total_fp_adds += (double) 4*num_active*num_cells; +// total_fp_muls += (double) 2*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < 2*mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// if ((tmp1 = fabs(bp->array[var][i][j][k] - +// bp->array[var][i][j-1][k])) > +// (tmp2 = fabs(bp->array[var][i][j][k] - +// bp->array[var][i][j+1][k]))) +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j-1][k] + +// (tmp1-tmp2)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i][j+1][k])/ +// (beta + alpha[var-mat] + +// bp->array[var][i][j-1][k] + +// bp->array[var][i][j][k] + +// bp->array[var][i][j+1][k] + +// bp->array[0][i][j][k] + +// bp->array[1][i][j][k]); +// else +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j-1][k] + +// (tmp2-tmp1)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i][j+1][k])/ +// (beta + alpha[var-mat] + +// bp->array[var][i][j-1][k] + +// bp->array[var][i][j][k] + +// bp->array[var][i][j+1][k] + +// bp->array[0][i][j][k] + +// bp->array[1][i][j][k]); +// } +//} +// total_fp_adds += (double) 12*num_active*num_cells; +// total_fp_muls += (double) 3*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < 3*mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// if ((tmp1 = fabs(bp->array[var-mat][i][j][k] - +// bp->array[var-mat][i][j-1][k])) > +// (tmp2 = fabs(bp->array[var-mat][i][j][k] - +// bp->array[var-mat][i][j+1][k]))) +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j-1][k] + +// (tmp1-tmp2)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i][j+1][k])/ +// (beta + alpha[var-2*mat] + +// bp->array[var-mat][i][j][k] + +// bp->array[var+mat][i][j][k] + +// bp->array[var][i][j-1][k] + +// bp->array[var][i][j][k] + +// bp->array[var][i][j+1][k]); +// else +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j-1][k] + +// (tmp2-tmp1)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i][j+1][k])/ +// (beta + alpha[var-2*mat] + +// bp->array[var-mat][i][j][k] + +// bp->array[var+mat][i][j][k] + +// bp->array[var][i][j-1][k] + +// bp->array[var][i][j][k] + +// bp->array[var][i][j+1][k]); +// } +//} +// total_fp_adds += (double) 12*num_active*num_cells; +// total_fp_muls += (double) 3*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else { +//#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// if ((tmp1 = fabs(bp->array[var-2*mat][i][j][k] - +// bp->array[var-2*mat][i][j-1][k])) > +// (tmp2 = fabs(bp->array[var-2*mat][i][j][k] - +// bp->array[var-2*mat][i][j+1][k]))) +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j-1][k] + +// (tmp1-tmp2)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i][j+1][k])/ +// (beta + alpha[var-3*mat] + +// bp->array[var-mat][i][j][k] + +// bp->array[var-2*mat][i][j][k] + +// bp->array[var][i][j-1][k] + +// bp->array[var][i][j][k] + +// bp->array[var][i][j+1][k]); +// else +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j-1][k] + +// (tmp2-tmp1)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i][j+1][k])/ +// (beta + alpha[var-3*mat] + +// bp->array[var-mat][i][j][k] + +// bp->array[var-2*mat][i][j][k] + +// bp->array[var][i][j-1][k] + +// bp->array[var][i][j][k] + +// bp->array[var][i][j+1][k]); +// } +//} +// total_fp_adds += (double) 12*num_active*num_cells; +// total_fp_muls += (double) 3*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } +//} +// +//void stencil_z(int var) +//{ +// int in, i, j, k, v; +// double tmp1, tmp2; +// block *bp; +// +// +// if (var == 1) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) { +// for (v = 2; v < mat+2; v++) +// bp->array[1][i][j][k] += bp->array[v][i][j][k]* +// bp->array[0][i][j][k]; +// bp->array[1][i][j][k] /= (beta + bp->array[1][i][j][k]); +// } +// } +//} +// +// total_fp_adds += (double) (mat+1)*num_active*num_cells; +// total_fp_muls += (double) mat*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] += bp->array[var][i][j][k]* +// (bp->array[0][i][j][k] + +// bp->array[1][i][j][k] - +// beta*bp->array[var][i][j][k])/ +// (alpha[var] + bp->array[1][i][j][k]); +// } +//} +// total_fp_adds += (double) 4*num_active*num_cells; +// total_fp_muls += (double) 2*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < 2*mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// if ((tmp1 = fabs(bp->array[var][i][j][k] - +// bp->array[var][i][j][k-1])) > +// (tmp2 = fabs(bp->array[var][i][j][k] - +// bp->array[var][i][j][k+1]))) +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j][k-1] + +// (tmp1-tmp2)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i][j][k+1])/ +// (beta + alpha[var-mat] + +// bp->array[var][i][j][k-1] + +// bp->array[var][i][j][k] + +// bp->array[var][i][j][k+1] + +// bp->array[0][i][j][k] + +// bp->array[1][i][j][k]); +// else +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j][k-1] + +// (tmp2-tmp1)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i][j][k+1])/ +// (beta + alpha[var-mat] + +// bp->array[var][i][j][k-1] + +// bp->array[var][i][j][k] + +// bp->array[var][i][j][k+1] + +// bp->array[0][i][j][k] + +// bp->array[1][i][j][k]); +// } +//} +// total_fp_adds += (double) 12*num_active*num_cells; +// total_fp_muls += (double) 3*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < 3*mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// if ((tmp1 = fabs(bp->array[var-mat][i][j][k] - +// bp->array[var-mat][i][j][k-1])) > +// (tmp2 = fabs(bp->array[var-mat][i][j][k] - +// bp->array[var-mat][i][j][k+1]))) +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j][k-1] + +// (tmp1-tmp2)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i][j][k+1])/ +// (beta + alpha[var-2*mat] + +// bp->array[var-mat][i][j][k] + +// bp->array[var+mat][i][j][k] + +// bp->array[var][i][j][k-1] + +// bp->array[var][i][j][k] + +// bp->array[var][i][j][k+1]); +// else +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j][k-1] + +// (tmp2-tmp1)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i][j][k+1])/ +// (beta + alpha[var-2*mat] + +// bp->array[var-mat][i][j][k] + +// bp->array[var+mat][i][j][k] + +// bp->array[var][i][j][k-1] + +// bp->array[var][i][j][k] + +// bp->array[var][i][j][k+1]); +// } +//} +// total_fp_adds += (double) 12*num_active*num_cells; +// total_fp_muls += (double) 3*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else { +//#pragma omp parallel default(shared) private(i, j, k, bp, v, tmp1, tmp2) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// if ((tmp1 = fabs(bp->array[var-2*mat][i][j][k] - +// bp->array[var-2*mat][i][j][k-1])) > +// (tmp2 = fabs(bp->array[var-2*mat][i][j][k] - +// bp->array[var-2*mat][i][j][k+1]))) +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j][k-1] + +// (tmp1-tmp2)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i][j][k+1])/ +// (beta + alpha[var-3*mat] + +// bp->array[var-mat][i][j][k] + +// bp->array[var-2*mat][i][j][k] + +// bp->array[var][i][j][k-1] + +// bp->array[var][i][j][k] + +// bp->array[var][i][j][k+1]); +// else +// bp->array[var][i][j][k] = (tmp1*bp->array[var][i][j][k-1] + +// (tmp2-tmp1)*(bp->array[var][i][j][k] + +// bp->array[1][i][j][k]) + +// tmp2*bp->array[var][i][j][k+1])/ +// (beta + alpha[var-3*mat] + +// bp->array[var-mat][i][j][k] + +// bp->array[var-2*mat][i][j][k] + +// bp->array[var][i][j][k-1] + +// bp->array[var][i][j][k] + +// bp->array[var][i][j][k+1]); +// } +//} +// total_fp_adds += (double) 12*num_active*num_cells; +// total_fp_muls += (double) 3*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } +//} +// +//void stencil_7(int var) +//{ +// int in, i, j, k, v; +// double work[x_block_size+2][y_block_size+2][z_block_size+2]; +// block *bp; +// +// if (var < mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// work[i][j][k] = (bp->array[var ][i-1][j ][k ]* +// bp->array[var+ mat][i-1][j ][k ] + +// bp->array[var ][i ][j-1][k ]* +// bp->array[var+2*mat][i ][j-1][k ] + +// bp->array[var ][i ][j ][k-1]* +// bp->array[var+3*mat][i ][j ][k-1] + +// bp->array[var ][i ][j ][k ]* +// bp->array[var ][i ][j ][k ] + +// bp->array[var ][i ][j ][k+1]* +// bp->array[var+3*mat][i ][j ][k+1] + +// bp->array[var ][i ][j+1][k ]* +// bp->array[var+2*mat][i ][j+1][k ] + +// bp->array[var ][i+1][j ][k ]* +// bp->array[var+ mat][i+1][j ][k ])/ +// 7.0*(beta + bp->array[var][i][j][k]); +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] = work[i][j][k]; +// } +//} +// total_fp_adds += (double) 7*num_active*num_cells; +// total_fp_muls += (double) 8*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < 2*mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// work[i][j][k] = (bp->array[var ][i-1][j ][k ]* +// bp->array[var+ mat][i-1][j ][k ] + +// bp->array[var ][i ][j-1][k ]* +// bp->array[var+2*mat][i ][j-1][k ] + +// bp->array[var ][i ][j ][k-1]* +// bp->array[var- mat][i ][j ][k-1] + +// bp->array[var ][i ][j ][k ]* +// bp->array[var ][i ][j ][k ] + +// bp->array[var ][i ][j ][k+1]* +// bp->array[var- mat][i ][j ][k+1] + +// bp->array[var ][i ][j+1][k ]* +// bp->array[var+2*mat][i ][j+1][k ] + +// bp->array[var ][i+1][j ][k ]* +// bp->array[var+ mat][i+1][j ][k ])/ +// 7.0*(beta + bp->array[var][i][j][k]); +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] = work[i][j][k]; +// } +//} +// total_fp_adds += (double) 7*num_active*num_cells; +// total_fp_muls += (double) 8*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < 3*mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// work[i][j][k] = (bp->array[var ][i-1][j ][k ]* +// bp->array[var+ mat][i-1][j ][k ] + +// bp->array[var ][i ][j-1][k ]* +// bp->array[var-2*mat][i ][j-1][k ] + +// bp->array[var ][i ][j ][k-1]* +// bp->array[var- mat][i ][j ][k-1] + +// bp->array[var ][i ][j ][k ]* +// bp->array[var ][i ][j ][k ] + +// bp->array[var ][i ][j ][k+1]* +// bp->array[var- mat][i ][j ][k+1] + +// bp->array[var ][i ][j+1][k ]* +// bp->array[var-2*mat][i ][j+1][k ] + +// bp->array[var ][i+1][j ][k ]* +// bp->array[var+ mat][i+1][j ][k ])/ +// 7.0*(beta + bp->array[var][i][j][k]); +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] = work[i][j][k]; +// } +//} +// total_fp_adds += (double) 7*num_active*num_cells; +// total_fp_muls += (double) 8*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// work[i][j][k] = (bp->array[var ][i-1][j ][k ]* +// bp->array[var-3*mat][i-1][j ][k ] + +// bp->array[var ][i ][j-1][k ]* +// bp->array[var-2*mat][i ][j-1][k ] + +// bp->array[var ][i ][j ][k-1]* +// bp->array[var- mat][i ][j ][k-1] + +// bp->array[var ][i ][j ][k ]* +// bp->array[var ][i ][j ][k ] + +// bp->array[var ][i ][j ][k+1]* +// bp->array[var- mat][i ][j ][k+1] + +// bp->array[var ][i ][j+1][k ]* +// bp->array[var-2*mat][i ][j+1][k ] + +// bp->array[var ][i+1][j ][k ]* +// bp->array[var-3*mat][i+1][j ][k ])/ +// 7.0*(beta + bp->array[var][i][j][k]); +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] = work[i][j][k]; +// } +//} +// total_fp_adds += (double) 7*num_active*num_cells; +// total_fp_muls += (double) 8*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } +//} +// +//void stencil_27(int var) +//{ +// int in, i, j, k, v; +// double work[x_block_size+2][y_block_size+2][z_block_size+2]; +// block *bp; +// +// if (var < mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +//#pragma omp for +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// work[i][j][k] = (bp->array[var+3*mat][i-1][j-1][k-1] + +// bp->array[var+2*mat][i-1][j-1][k ] + +// bp->array[var+3*mat][i-1][j-1][k+1] + +// bp->array[var+2*mat][i-1][j ][k-1] + +// bp->array[var+ mat][i-1][j ][k ] + +// bp->array[var+2*mat][i-1][j ][k+1] + +// bp->array[var+3*mat][i-1][j+1][k-1] + +// bp->array[var+2*mat][i-1][j+1][k ] + +// bp->array[var+3*mat][i-1][j+1][k+1] + +// bp->array[var+2*mat][i ][j-1][k-1] + +// bp->array[var+ mat][i ][j-1][k ] + +// bp->array[var+2*mat][i ][j-1][k+1] + +// bp->array[var+ mat][i ][j ][k-1] + +// bp->array[var ][i ][j ][k ] + +// bp->array[var+ mat][i ][j ][k+1] + +// bp->array[var+2*mat][i ][j+1][k-1] + +// bp->array[var+ mat][i ][j+1][k ] + +// bp->array[var+2*mat][i ][j+1][k+1] + +// bp->array[var+3*mat][i+1][j-1][k-1] + +// bp->array[var+2*mat][i+1][j-1][k ] + +// bp->array[var+3*mat][i+1][j-1][k+1] + +// bp->array[var+2*mat][i+1][j ][k-1] + +// bp->array[var+ mat][i+1][j ][k ] + +// bp->array[var+2*mat][i+1][j ][k+1] + +// bp->array[var+3*mat][i+1][j+1][k-1] + +// bp->array[var+2*mat][i+1][j+1][k ] + +// bp->array[var+3*mat][i+1][j+1][k+1])/ +// (beta+27.0); +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] = work[i][j][k]; +// } +//} +// total_fp_adds += (double) 27*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < 2*mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// work[i][j][k] = (bp->array[var- mat][i-1][j-1][k-1] + +// bp->array[var+2*mat][i-1][j-1][k ] + +// bp->array[var- mat][i-1][j-1][k+1] + +// bp->array[var+2*mat][i-1][j ][k-1] + +// bp->array[var+ mat][i-1][j ][k ] + +// bp->array[var+2*mat][i-1][j ][k+1] + +// bp->array[var- mat][i-1][j+1][k-1] + +// bp->array[var+2*mat][i-1][j+1][k ] + +// bp->array[var- mat][i-1][j+1][k+1] + +// bp->array[var+2*mat][i ][j-1][k-1] + +// bp->array[var+ mat][i ][j-1][k ] + +// bp->array[var+2*mat][i ][j-1][k+1] + +// bp->array[var+ mat][i ][j ][k-1] + +// bp->array[var ][i ][j ][k ] + +// bp->array[var+ mat][i ][j ][k+1] + +// bp->array[var+2*mat][i ][j+1][k-1] + +// bp->array[var+ mat][i ][j+1][k ] + +// bp->array[var+2*mat][i ][j+1][k+1] + +// bp->array[var- mat][i+1][j-1][k-1] + +// bp->array[var+2*mat][i+1][j-1][k ] + +// bp->array[var- mat][i+1][j-1][k+1] + +// bp->array[var+2*mat][i+1][j ][k-1] + +// bp->array[var+ mat][i+1][j ][k ] + +// bp->array[var+2*mat][i+1][j ][k+1] + +// bp->array[var- mat][i+1][j+1][k-1] + +// bp->array[var+2*mat][i+1][j+1][k ] + +// bp->array[var- mat][i+1][j+1][k+1])/ +// (beta+27.0); +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] = work[i][j][k]; +// } +//} +// +// total_fp_adds += (double) 27*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else if (var < 3*mat) { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// work[i][j][k] = (bp->array[var- mat][i-1][j-1][k-1] + +// bp->array[var-2*mat][i-1][j-1][k ] + +// bp->array[var- mat][i-1][j-1][k+1] + +// bp->array[var-2*mat][i-1][j ][k-1] + +// bp->array[var+ mat][i-1][j ][k ] + +// bp->array[var-2*mat][i-1][j ][k+1] + +// bp->array[var- mat][i-1][j+1][k-1] + +// bp->array[var-2*mat][i-1][j+1][k ] + +// bp->array[var- mat][i-1][j+1][k+1] + +// bp->array[var-2*mat][i ][j-1][k-1] + +// bp->array[var+ mat][i ][j-1][k ] + +// bp->array[var-2*mat][i ][j-1][k+1] + +// bp->array[var+ mat][i ][j ][k-1] + +// bp->array[var ][i ][j ][k ] + +// bp->array[var+ mat][i ][j ][k+1] + +// bp->array[var-2*mat][i ][j+1][k-1] + +// bp->array[var+ mat][i ][j+1][k ] + +// bp->array[var-2*mat][i ][j+1][k+1] + +// bp->array[var- mat][i+1][j-1][k-1] + +// bp->array[var-2*mat][i+1][j-1][k ] + +// bp->array[var- mat][i+1][j-1][k+1] + +// bp->array[var-2*mat][i+1][j ][k-1] + +// bp->array[var+ mat][i+1][j ][k ] + +// bp->array[var-2*mat][i+1][j ][k+1] + +// bp->array[var- mat][i+1][j+1][k-1] + +// bp->array[var-2*mat][i+1][j+1][k ] + +// bp->array[var- mat][i+1][j+1][k+1])/ +// (beta+27.0); +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] = work[i][j][k]; +// } +//} +// total_fp_adds += (double) 27*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } else { +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// work[i][j][k] = (bp->array[var- mat][i-1][j-1][k-1] + +// bp->array[var-2*mat][i-1][j-1][k ] + +// bp->array[var- mat][i-1][j-1][k+1] + +// bp->array[var-2*mat][i-1][j ][k-1] + +// bp->array[var-3*mat][i-1][j ][k ] + +// bp->array[var-2*mat][i-1][j ][k+1] + +// bp->array[var- mat][i-1][j+1][k-1] + +// bp->array[var-2*mat][i-1][j+1][k ] + +// bp->array[var- mat][i-1][j+1][k+1] + +// bp->array[var-2*mat][i ][j-1][k-1] + +// bp->array[var-3*mat][i ][j-1][k ] + +// bp->array[var-2*mat][i ][j-1][k+1] + +// bp->array[var-3*mat][i ][j ][k-1] + +// bp->array[var ][i ][j ][k ] + +// bp->array[var-3*mat][i ][j ][k+1] + +// bp->array[var-2*mat][i ][j+1][k-1] + +// bp->array[var-3*mat][i ][j+1][k ] + +// bp->array[var-2*mat][i ][j+1][k+1] + +// bp->array[var- mat][i+1][j-1][k-1] + +// bp->array[var-2*mat][i+1][j-1][k ] + +// bp->array[var- mat][i+1][j-1][k+1] + +// bp->array[var-2*mat][i+1][j ][k-1] + +// bp->array[var-3*mat][i+1][j ][k ] + +// bp->array[var-2*mat][i+1][j ][k+1] + +// bp->array[var- mat][i+1][j+1][k-1] + +// bp->array[var-2*mat][i+1][j+1][k ] + +// bp->array[var- mat][i+1][j+1][k+1])/ +// (beta+27.0); +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) +// bp->array[var][i][j][k] = work[i][j][k]; +// } +//} +// +// total_fp_adds += (double) 27*num_active*num_cells; +// total_fp_divs += (double) num_active*num_cells; +// } +//} +// +//void stencil_check(int var) +//{ +// int in, i, j, k, v; +// double work[x_block_size+2][y_block_size+2][z_block_size+2]; +// block *bp; +// +//#pragma omp parallel default(shared) private(i, j, k, bp, v) +//{ +// for (in = 0; in < sorted_index[num_refine+1]; in++) { +// bp = &blocks[sorted_list[in].n]; +// for (i = 1; i <= x_block_size; i++) +// for (j = 1; j <= y_block_size; j++) +// for (k = 1; k <= z_block_size; k++) { +// bp->array[var][i][j][k] = fabs(bp->array[var][i][j][k]); +// if (bp->array[var][i][j][k] >= 1.0) { +// bp->array[var][i][j][k] /= (beta + alpha[0] + +// bp->array[var][i][j][k]); +// total_fp_divs += (double) 1; +// total_fp_adds += (double) 2; +// } +// else if (bp->array[var][i][j][k] < 0.1) { +// bp->array[var][i][j][k] *= 10.0 - beta; +// total_fp_muls += (double) 1; +// total_fp_adds += (double) 1; +// } +// } +// } +//} +//}