Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1319

Last change on this file since 1319 was 1319, checked in by raasch, 10 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 26.1 KB

Rev	Line
[1216]	1	SUBROUTINE resort_for_xy( f_in, f_inv )
[1]	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[1310]	17	! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[198]	22	!
[1319]	23	!
[198]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 1319 2014-03-17 15:08:44Z raasch $
	27	!
[1319]	28	! 1318 2014-03-17 13:35:16Z raasch
	29	! cpu_log_nowait parameter added to cpu measurements of the transpositions
	30	! required for solving the Poisson equation (poisfft),
	31	! module interfaces removed
	32	!
[1258]	33	! 1257 2013-11-08 15:18:40Z raasch
	34	! openacc loop and loop vector clauses removed
	35	!
[1217]	36	! 1216 2013-08-26 09:31:42Z raasch
	37	! re-sorting of the transposed / to be transposed arrays moved to separate
	38	! routines resort_for_...
	39	!
[1112]	40	! 1111 2013-03-08 23:54:10Z raasch
	41	! openACC directives added,
	42	! resorting data from/to work changed, work got 4 dimensions instead of 1
	43	!
[1107]	44	! 1106 2013-03-04 05:31:38Z raasch
	45	! preprocessor lines rearranged so that routines can also be used in serial
	46	! (non-parallel) mode
	47	!
[1093]	48	! 1092 2013-02-02 11:24:22Z raasch
	49	! unused variables removed
	50	!
[1037]	51	! 1036 2012-10-22 13:43:42Z raasch
	52	! code put under GPL (PALM 3.9)
	53	!
[1004]	54	! 1003 2012-09-14 14:35:53Z raasch
	55	! indices nxa, nya, etc. replaced by nx, ny, etc.
	56	!
[684]	57	! 683 2011-02-09 14:25:15Z raasch
	58	! openMP parallelization of transpositions for 2d-domain-decomposition
	59	!
[623]	60	! 622 2010-12-10 08:08:13Z raasch
	61	! optional barriers included in order to speed up collective operations
	62	!
[198]	63	! 164 2008-05-15 08:46:15Z raasch
[164]	64	! f_inv changed from subroutine argument to automatic array in order to do
	65	! re-ordering from f_in to f_inv in one step, one array work is needed instead
	66	! of work1 and work2
[1]	67	!
[198]	68	! February 2007
[3]	69	! RCS Log replace by Id keyword, revision history cleaned up
	70	!
[1]	71	! Revision 1.2 2004/04/30 13:12:17 raasch
	72	! Switched from mpi_alltoallv to the simpler mpi_alltoall,
	73	! all former transpose-routine files collected in this file, enlarged
	74	! transposition arrays introduced
	75	!
	76	! Revision 1.1 2004/04/30 13:08:16 raasch
	77	! Initial revision (collection of former routines transpose_xy, transpose_xz,
	78	! transpose_yx, transpose_yz, transpose_zx, transpose_zy)
	79	!
	80	! Revision 1.1 1997/07/24 11:25:18 raasch
	81	! Initial revision
	82	!
[1216]	83	!------------------------------------------------------------------------------!
	84	! Description:
	85	! ------------
	86	! Resorting data for the transposition from x to y. The transposition itself
	87	! is carried out in transpose_xy
	88	!------------------------------------------------------------------------------!
	89
	90	USE indices
	91	USE transpose_indices
	92
	93	IMPLICIT NONE
	94
	95	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
	96	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
	97
	98
	99	INTEGER :: i, j, k
	100
[1]	101	!
[1216]	102	!-- Rearrange indices of input array in order to make data to be send
	103	!-- by MPI contiguous
	104	!$OMP PARALLEL PRIVATE ( i, j, k )
	105	!$OMP DO
	106	!$acc kernels present( f_in, f_inv )
	107	DO i = 0, nx
	108	DO k = nzb_x, nzt_x
	109	DO j = nys_x, nyn_x
	110	f_inv(j,k,i) = f_in(i,j,k)
	111	ENDDO
	112	ENDDO
	113	ENDDO
	114	!$acc end kernels
	115	!$OMP END PARALLEL
	116
	117	END SUBROUTINE resort_for_xy
	118
	119
	120	SUBROUTINE transpose_xy( f_inv, f_out )
	121
	122	!------------------------------------------------------------------------------!
[1]	123	! Description:
	124	! ------------
	125	! Transposition of input array (f_in) from x to y. For the input array, all
	126	! elements along x reside on the same PE, while after transposition, all
	127	! elements along y reside on the same PE.
	128	!------------------------------------------------------------------------------!
	129
	130	USE cpulog
	131	USE indices
	132	USE pegrid
	133	USE transpose_indices
	134
	135	IMPLICIT NONE
	136
[1111]	137	INTEGER :: i, j, k, l, ys
[1]	138
[1216]	139	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]	140
[1111]	141	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
	142
	143
[1106]	144	IF ( numprocs /= 1 ) THEN
	145
	146	#if defined( __parallel )
[1]	147	!
[1106]	148	!-- Transpose array
[1318]	149	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	150	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	151	!$acc update host( f_inv )
	152	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	153	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	154	comm1dy, ierr )
	155	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	156
	157	!
[1106]	158	!-- Reorder transposed array
[1111]	159	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	160	!$OMP DO
[1216]	161	!$acc data copyin( work )
[1106]	162	DO l = 0, pdims(2) - 1
	163	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	164	!$acc kernels present( f_out, work )
[1106]	165	DO i = nxl_y, nxr_y
	166	DO k = nzb_y, nzt_y
	167	DO j = ys, ys + nyn_x - nys_x
[1111]	168	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	169	ENDDO
[1]	170	ENDDO
	171	ENDDO
[1111]	172	!$acc end kernels
[1]	173	ENDDO
[1216]	174	!$acc end data
[683]	175	!$OMP END PARALLEL
[1]	176	#endif
	177
[1106]	178	ELSE
	179
	180	!
	181	!-- Reorder transposed array
	182	!$OMP PARALLEL PRIVATE ( i, j, k )
	183	!$OMP DO
[1216]	184	!$acc kernels present( f_inv, f_out )
[1106]	185	DO k = nzb_y, nzt_y
	186	DO i = nxl_y, nxr_y
	187	DO j = 0, ny
	188	f_out(j,i,k) = f_inv(j,k,i)
	189	ENDDO
	190	ENDDO
	191	ENDDO
[1111]	192	!$acc end kernels
[1106]	193	!$OMP END PARALLEL
	194
	195	ENDIF
	196
[1]	197	END SUBROUTINE transpose_xy
	198
	199
[1216]	200	SUBROUTINE resort_for_xz( f_inv, f_out )
[1]	201
	202	!------------------------------------------------------------------------------!
	203	! Description:
	204	! ------------
[1216]	205	! Resorting data after the transposition from x to z. The transposition itself
	206	! is carried out in transpose_xz
	207	!------------------------------------------------------------------------------!
	208
	209	USE indices
	210	USE transpose_indices
	211
	212	IMPLICIT NONE
	213
	214	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
	215	REAL :: f_out(1:nz,nys:nyn,nxl:nxr)
	216
	217
	218	INTEGER :: i, j, k
	219
	220	!
	221	!-- Rearrange indices of input array in order to make data to be send
	222	!-- by MPI contiguous.
	223	!-- In case of parallel fft/transposition, scattered store is faster in
	224	!-- backward direction!!!
	225	!$OMP PARALLEL PRIVATE ( i, j, k )
	226	!$OMP DO
	227	!$acc kernels present( f_inv, f_out )
	228	DO k = 1, nz
	229	DO i = nxl, nxr
	230	DO j = nys, nyn
	231	f_out(k,j,i) = f_inv(j,i,k)
	232	ENDDO
	233	ENDDO
	234	ENDDO
	235	!$acc end kernels
	236	!$OMP END PARALLEL
	237
	238	END SUBROUTINE resort_for_xz
	239
	240
	241	SUBROUTINE transpose_xz( f_in, f_inv )
	242
	243	!------------------------------------------------------------------------------!
	244	! Description:
	245	! ------------
[1]	246	! Transposition of input array (f_in) from x to z. For the input array, all
	247	! elements along x reside on the same PE, while after transposition, all
	248	! elements along z reside on the same PE.
	249	!------------------------------------------------------------------------------!
	250
	251	USE cpulog
	252	USE indices
	253	USE pegrid
	254	USE transpose_indices
	255
	256	IMPLICIT NONE
	257
[1111]	258	INTEGER :: i, j, k, l, xs
[1]	259
[1216]	260	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
[1]	261
[1111]	262	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
[1]	263
[1111]	264
[1]	265	!
	266	!-- If the PE grid is one-dimensional along y, the array has only to be
	267	!-- reordered locally and therefore no transposition has to be done.
	268	IF ( pdims(1) /= 1 ) THEN
[1106]	269
	270	#if defined( __parallel )
[1]	271	!
	272	!-- Reorder input array for transposition
[1111]	273	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	274	!$OMP DO
[1216]	275	!$acc data copyout( work )
[1]	276	DO l = 0, pdims(1) - 1
	277	xs = 0 + l * nnx
[1111]	278	!$acc kernels present( f_in, work )
[1003]	279	DO k = nzb_x, nzt_x
[164]	280	DO i = xs, xs + nnx - 1
[1003]	281	DO j = nys_x, nyn_x
[1111]	282	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	283	ENDDO
	284	ENDDO
	285	ENDDO
[1111]	286	!$acc end kernels
[1]	287	ENDDO
[1216]	288	!$acc end data
[683]	289	!$OMP END PARALLEL
[1]	290
	291	!
	292	!-- Transpose array
[1318]	293	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]	294	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	295	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	296	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	297	comm1dx, ierr )
[1111]	298	!$acc update device( f_inv )
[1]	299	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	300	#endif
	301
[1]	302	ELSE
[1106]	303
[1]	304	!
	305	!-- Reorder the array in a way that the z index is in first position
[683]	306	!$OMP PARALLEL PRIVATE ( i, j, k )
	307	!$OMP DO
[1216]	308	!$acc kernels present( f_in, f_inv )
[1003]	309	DO i = nxl, nxr
	310	DO j = nys, nyn
	311	DO k = 1, nz
[164]	312	f_inv(j,i,k) = f_in(i,j,k)
[1]	313	ENDDO
	314	ENDDO
	315	ENDDO
[1111]	316	!$acc end kernels
[683]	317	!$OMP END PARALLEL
[1]	318
[164]	319	ENDIF
	320
[1]	321	END SUBROUTINE transpose_xz
	322
	323
[1216]	324	SUBROUTINE resort_for_yx( f_inv, f_out )
[1]	325
	326	!------------------------------------------------------------------------------!
	327	! Description:
	328	! ------------
[1216]	329	! Resorting data after the transposition from y to x. The transposition itself
	330	! is carried out in transpose_yx
	331	!------------------------------------------------------------------------------!
	332
	333	USE indices
	334	USE transpose_indices
	335
	336	IMPLICIT NONE
	337
	338	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
	339	REAL :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
	340
	341
	342	INTEGER :: i, j, k
	343
	344	!
	345	!-- Rearrange indices of input array in order to make data to be send
	346	!-- by MPI contiguous
	347	!$OMP PARALLEL PRIVATE ( i, j, k )
	348	!$OMP DO
	349	!$acc kernels present( f_inv, f_out )
	350	DO i = 0, nx
	351	DO k = nzb_x, nzt_x
	352	DO j = nys_x, nyn_x
	353	f_out(i,j,k) = f_inv(j,k,i)
	354	ENDDO
	355	ENDDO
	356	ENDDO
	357	!$acc end kernels
	358	!$OMP END PARALLEL
	359
	360	END SUBROUTINE resort_for_yx
	361
	362
	363	SUBROUTINE transpose_yx( f_in, f_inv )
	364
	365	!------------------------------------------------------------------------------!
	366	! Description:
	367	! ------------
[1]	368	! Transposition of input array (f_in) from y to x. For the input array, all
	369	! elements along y reside on the same PE, while after transposition, all
	370	! elements along x reside on the same PE.
	371	!------------------------------------------------------------------------------!
	372
	373	USE cpulog
	374	USE indices
	375	USE pegrid
	376	USE transpose_indices
	377
	378	IMPLICIT NONE
	379
[1111]	380	INTEGER :: i, j, k, l, ys
[1]	381
[1216]	382	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
[1]	383
[1111]	384	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
	385
	386
[1106]	387	IF ( numprocs /= 1 ) THEN
	388
[1]	389	#if defined( __parallel )
	390	!
[1106]	391	!-- Reorder input array for transposition
[1111]	392	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	393	!$OMP DO
[1216]	394	!$acc data copyout( work )
[1106]	395	DO l = 0, pdims(2) - 1
	396	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	397	!$acc kernels present( f_in, work )
[1106]	398	DO i = nxl_y, nxr_y
	399	DO k = nzb_y, nzt_y
	400	DO j = ys, ys + nyn_x - nys_x
[1111]	401	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	402	ENDDO
	403	ENDDO
	404	ENDDO
[1111]	405	!$acc end kernels
[1106]	406	ENDDO
[1216]	407	!$acc end data
[1106]	408	!$OMP END PARALLEL
	409
	410	!
	411	!-- Transpose array
[1318]	412	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	413	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	414	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	415	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	416	comm1dy, ierr )
[1111]	417	!$acc update device( f_inv )
[1106]	418	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	419	#endif
	420
	421	ELSE
	422
	423	!
	424	!-- Reorder array f_in the same way as ALLTOALL did it
	425	!$OMP PARALLEL PRIVATE ( i, j, k )
	426	!$OMP DO
[1216]	427	!$acc kernels present( f_in, f_inv )
[1003]	428	DO i = nxl_y, nxr_y
	429	DO k = nzb_y, nzt_y
[1106]	430	DO j = 0, ny
	431	f_inv(j,k,i) = f_in(j,i,k)
[1]	432	ENDDO
	433	ENDDO
	434	ENDDO
[1111]	435	!$acc end kernels
[683]	436	!$OMP END PARALLEL
[1]	437
[1106]	438	ENDIF
[1]	439
	440	END SUBROUTINE transpose_yx
	441
	442
[1216]	443	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	444
	445	!------------------------------------------------------------------------------!
	446	! Description:
	447	! ------------
	448	! Transposition of input array (f_in) from y to x. For the input array, all
	449	! elements along y reside on the same PE, while after transposition, all
	450	! elements along x reside on the same PE.
	451	! This is a direct transposition for arrays with indices in regular order
	452	! (k,j,i) (cf. transpose_yx).
	453	!------------------------------------------------------------------------------!
	454
	455	USE cpulog
	456	USE indices
	457	USE pegrid
	458	USE transpose_indices
	459
	460	IMPLICIT NONE
	461
[1092]	462	INTEGER :: i, j, k, l, m, xs
[1]	463
[1003]	464	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
	465	f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
[164]	466	work(nnxnnynnz)
[1]	467
	468	#if defined( __parallel )
	469
	470	!
	471	!-- Rearrange indices of input array in order to make data to be send
	472	!-- by MPI contiguous
[1003]	473	DO k = 1, nz
	474	DO j = nys, nyn
	475	DO i = nxl, nxr
[164]	476	f_inv(i,k,j) = f_in(k,j,i)
[1]	477	ENDDO
	478	ENDDO
	479	ENDDO
	480
	481	!
	482	!-- Transpose array
	483	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	484	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	485	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	486	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	487	comm1dx, ierr )
	488	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	489
	490	!
	491	!-- Reorder transposed array
	492	m = 0
	493	DO l = 0, pdims(1) - 1
	494	xs = 0 + l * nnx
[1003]	495	DO j = nys_x, nyn_x
	496	DO k = 1, nz
[1]	497	DO i = xs, xs + nnx - 1
	498	m = m + 1
[164]	499	f_out(i,j,k) = work(m)
[1]	500	ENDDO
	501	ENDDO
	502	ENDDO
	503	ENDDO
	504
	505	#endif
	506
	507	END SUBROUTINE transpose_yxd
	508
	509
[1216]	510	SUBROUTINE resort_for_yz( f_in, f_inv )
[1]	511
	512	!------------------------------------------------------------------------------!
	513	! Description:
	514	! ------------
[1216]	515	! Resorting data for the transposition from y to z. The transposition itself
	516	! is carried out in transpose_yz
	517	!------------------------------------------------------------------------------!
	518
	519	USE indices
	520	USE transpose_indices
	521
	522	IMPLICIT NONE
	523
	524	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
	525	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
	526
	527
	528	INTEGER :: i, j, k
	529
	530	!
	531	!-- Rearrange indices of input array in order to make data to be send
	532	!-- by MPI contiguous
	533	!$OMP PARALLEL PRIVATE ( i, j, k )
	534	!$OMP DO
	535	!$acc kernels present( f_in, f_inv )
	536	DO j = 0, ny
	537	DO k = nzb_y, nzt_y
	538	DO i = nxl_y, nxr_y
	539	f_inv(i,k,j) = f_in(j,i,k)
	540	ENDDO
	541	ENDDO
	542	ENDDO
	543	!$acc end kernels
	544	!$OMP END PARALLEL
	545
	546	END SUBROUTINE resort_for_yz
	547
	548
	549	SUBROUTINE transpose_yz( f_inv, f_out )
	550
	551	!------------------------------------------------------------------------------!
	552	! Description:
	553	! ------------
[1]	554	! Transposition of input array (f_in) from y to z. For the input array, all
	555	! elements along y reside on the same PE, while after transposition, all
	556	! elements along z reside on the same PE.
	557	!------------------------------------------------------------------------------!
	558
	559	USE cpulog
	560	USE indices
	561	USE pegrid
	562	USE transpose_indices
	563
	564	IMPLICIT NONE
	565
[1111]	566	INTEGER :: i, j, k, l, zs
[1]	567
[1216]	568	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	569
[1111]	570	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
	571
	572
[1]	573	!
	574	!-- If the PE grid is one-dimensional along y, only local reordering
	575	!-- of the data is necessary and no transposition has to be done.
	576	IF ( pdims(1) == 1 ) THEN
[1106]	577
[683]	578	!$OMP PARALLEL PRIVATE ( i, j, k )
	579	!$OMP DO
[1216]	580	!$acc kernels present( f_inv, f_out )
[1003]	581	DO j = 0, ny
	582	DO k = nzb_y, nzt_y
	583	DO i = nxl_y, nxr_y
[164]	584	f_out(i,j,k) = f_inv(i,k,j)
[1]	585	ENDDO
	586	ENDDO
	587	ENDDO
[1111]	588	!$acc end kernels
[683]	589	!$OMP END PARALLEL
[1]	590
[1106]	591	ELSE
	592
	593	#if defined( __parallel )
[1]	594	!
[1106]	595	!-- Transpose array
[1318]	596	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	597	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	598	!$acc update host( f_inv )
	599	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	600	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	601	comm1dx, ierr )
	602	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	603
	604	!
[1106]	605	!-- Reorder transposed array
[1111]	606	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	607	!$OMP DO
[1216]	608	!$acc data copyin( work )
[1106]	609	DO l = 0, pdims(1) - 1
	610	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1216]	611	!$acc kernels present( f_out )
[1106]	612	DO j = nys_z, nyn_z
	613	DO k = zs, zs + nzt_y - nzb_y
	614	DO i = nxl_z, nxr_z
[1111]	615	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	616	ENDDO
[1]	617	ENDDO
	618	ENDDO
[1111]	619	!$acc end kernels
[1]	620	ENDDO
[1216]	621	!$acc end data
[683]	622	!$OMP END PARALLEL
[1]	623	#endif
	624
[1106]	625	ENDIF
	626
[1]	627	END SUBROUTINE transpose_yz
	628
	629
[1216]	630	SUBROUTINE resort_for_zx( f_in, f_inv )
[1]	631
	632	!------------------------------------------------------------------------------!
	633	! Description:
	634	! ------------
[1216]	635	! Resorting data for the transposition from z to x. The transposition itself
	636	! is carried out in transpose_zx
	637	!------------------------------------------------------------------------------!
	638
	639	USE indices
	640	USE transpose_indices
	641
	642	IMPLICIT NONE
	643
	644	REAL :: f_in(1:nz,nys:nyn,nxl:nxr)
	645	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
	646
	647
	648	INTEGER :: i, j, k
	649
	650	!
	651	!-- Rearrange indices of input array in order to make data to be send
	652	!-- by MPI contiguous
	653	!$OMP PARALLEL PRIVATE ( i, j, k )
	654	!$OMP DO
	655	!$acc kernels present( f_in, f_inv )
	656	DO k = 1,nz
	657	DO i = nxl, nxr
	658	DO j = nys, nyn
	659	f_inv(j,i,k) = f_in(k,j,i)
	660	ENDDO
	661	ENDDO
	662	ENDDO
	663	!$acc end kernels
	664	!$OMP END PARALLEL
	665
	666	END SUBROUTINE resort_for_zx
	667
	668
	669	SUBROUTINE transpose_zx( f_inv, f_out )
	670
	671	!------------------------------------------------------------------------------!
	672	! Description:
	673	! ------------
[1]	674	! Transposition of input array (f_in) from z to x. For the input array, all
	675	! elements along z reside on the same PE, while after transposition, all
	676	! elements along x reside on the same PE.
	677	!------------------------------------------------------------------------------!
	678
	679	USE cpulog
	680	USE indices
	681	USE pegrid
	682	USE transpose_indices
	683
	684	IMPLICIT NONE
	685
[1111]	686	INTEGER :: i, j, k, l, xs
[1]	687
[1216]	688	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]	689
[1111]	690	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
	691
[1]	692
	693	!
	694	!-- If the PE grid is one-dimensional along y, only local reordering
	695	!-- of the data is necessary and no transposition has to be done.
	696	IF ( pdims(1) == 1 ) THEN
[1106]	697
[683]	698	!$OMP PARALLEL PRIVATE ( i, j, k )
	699	!$OMP DO
[1216]	700	!$acc kernels present( f_inv, f_out )
[1003]	701	DO k = 1, nz
	702	DO i = nxl, nxr
	703	DO j = nys, nyn
[164]	704	f_out(i,j,k) = f_inv(j,i,k)
[1]	705	ENDDO
	706	ENDDO
	707	ENDDO
[1111]	708	!$acc end kernels
[683]	709	!$OMP END PARALLEL
[1]	710
[1106]	711	ELSE
	712
	713	#if defined( __parallel )
[1]	714	!
[1106]	715	!-- Transpose array
[1318]	716	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	717	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	718	!$acc update host( f_inv )
	719	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	720	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	721	comm1dx, ierr )
	722	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	723
	724	!
[1106]	725	!-- Reorder transposed array
[1111]	726	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	727	!$OMP DO
[1216]	728	!$acc data copyin( work )
[1106]	729	DO l = 0, pdims(1) - 1
	730	xs = 0 + l * nnx
[1216]	731	!$acc kernels present( f_out )
[1106]	732	DO k = nzb_x, nzt_x
	733	DO i = xs, xs + nnx - 1
	734	DO j = nys_x, nyn_x
[1111]	735	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	736	ENDDO
[1]	737	ENDDO
	738	ENDDO
[1111]	739	!$acc end kernels
[1]	740	ENDDO
[1216]	741	!$acc end data
[683]	742	!$OMP END PARALLEL
[1]	743	#endif
	744
[1106]	745	ENDIF
	746
[1]	747	END SUBROUTINE transpose_zx
	748
	749
[1216]	750	SUBROUTINE resort_for_zy( f_inv, f_out )
[1]	751
	752	!------------------------------------------------------------------------------!
	753	! Description:
	754	! ------------
[1216]	755	! Resorting data after the transposition from z to y. The transposition itself
	756	! is carried out in transpose_zy
	757	!------------------------------------------------------------------------------!
	758
	759	USE indices
	760	USE transpose_indices
	761
	762	IMPLICIT NONE
	763
	764	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
	765	REAL :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
	766
	767
	768	INTEGER :: i, j, k
	769
	770	!
	771	!-- Rearrange indices of input array in order to make data to be send
	772	!-- by MPI contiguous
	773	!$OMP PARALLEL PRIVATE ( i, j, k )
	774	!$OMP DO
	775	!$acc kernels present( f_inv, f_out )
	776	DO k = nzb_y, nzt_y
	777	DO j = 0, ny
	778	DO i = nxl_y, nxr_y
	779	f_out(j,i,k) = f_inv(i,k,j)
	780	ENDDO
	781	ENDDO
	782	ENDDO
	783	!$acc end kernels
	784	!$OMP END PARALLEL
	785
	786	END SUBROUTINE resort_for_zy
	787
	788
	789	SUBROUTINE transpose_zy( f_in, f_inv )
	790
	791	!------------------------------------------------------------------------------!
	792	! Description:
	793	! ------------
[1]	794	! Transposition of input array (f_in) from z to y. For the input array, all
	795	! elements along z reside on the same PE, while after transposition, all
	796	! elements along y reside on the same PE.
	797	!------------------------------------------------------------------------------!
	798
	799	USE cpulog
	800	USE indices
	801	USE pegrid
	802	USE transpose_indices
	803
	804	IMPLICIT NONE
	805
[1111]	806	INTEGER :: i, j, k, l, zs
[1]	807
[1216]	808	REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
[1]	809
[1111]	810	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
	811
	812
[1]	813	!
	814	!-- If the PE grid is one-dimensional along y, the array has only to be
	815	!-- reordered locally and therefore no transposition has to be done.
	816	IF ( pdims(1) /= 1 ) THEN
[1106]	817
	818	#if defined( __parallel )
[1]	819	!
	820	!-- Reorder input array for transposition
[1111]	821	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	822	!$OMP DO
[1216]	823	!$acc data copyout( work )
[1]	824	DO l = 0, pdims(1) - 1
[1003]	825	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]	826	!$acc kernels present( f_in, work )
[1003]	827	DO j = nys_z, nyn_z
	828	DO k = zs, zs + nzt_y - nzb_y
	829	DO i = nxl_z, nxr_z
[1111]	830	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	831	ENDDO
	832	ENDDO
	833	ENDDO
[1111]	834	!$acc end kernels
[1]	835	ENDDO
[1216]	836	!$acc end data
[683]	837	!$OMP END PARALLEL
[1]	838
	839	!
	840	!-- Transpose array
[1318]	841	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]	842	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	843	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	844	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	845	comm1dx, ierr )
[1111]	846	!$acc update device( f_inv )
[1]	847	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	848	#endif
[1]	849
	850	ELSE
	851	!
[1106]	852	!-- Reorder the array in the same way like ALLTOALL did it
[683]	853	!$OMP PARALLEL PRIVATE ( i, j, k )
	854	!$OMP DO
[1216]	855	!$acc kernels present( f_in, f_inv )
[1003]	856	DO k = nzb_y, nzt_y
	857	DO j = 0, ny
	858	DO i = nxl_y, nxr_y
[164]	859	f_inv(i,k,j) = f_in(i,j,k)
	860	ENDDO
	861	ENDDO
	862	ENDDO
[1111]	863	!$acc end kernels
[683]	864	!$OMP END PARALLEL
[1106]	865
	866	ENDIF
	867
[1]	868	END SUBROUTINE transpose_zy
	869
	870
[1216]	871	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	872
	873	!------------------------------------------------------------------------------!
	874	! Description:
	875	! ------------
	876	! Transposition of input array (f_in) from z to y. For the input array, all
	877	! elements along z reside on the same PE, while after transposition, all
	878	! elements along y reside on the same PE.
	879	! This is a direct transposition for arrays with indices in regular order
	880	! (k,j,i) (cf. transpose_zy).
	881	!------------------------------------------------------------------------------!
	882
	883	USE cpulog
	884	USE indices
	885	USE pegrid
	886	USE transpose_indices
	887
	888	IMPLICIT NONE
	889
	890	INTEGER :: i, j, k, l, m, ys
	891
[1003]	892	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
	893	f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd), &
[164]	894	work(nnxnnynnz)
[1]	895
	896	#if defined( __parallel )
	897
	898	!
	899	!-- Rearrange indices of input array in order to make data to be send
	900	!-- by MPI contiguous
[1003]	901	DO i = nxl, nxr
	902	DO j = nys, nyn
	903	DO k = 1, nz
[164]	904	f_inv(j,i,k) = f_in(k,j,i)
[1]	905	ENDDO
	906	ENDDO
	907	ENDDO
	908
	909	!
	910	!-- Move data to different array, because memory location of work1 is
	911	!-- needed further below (work1 = work2).
	912	!-- If the PE grid is one-dimensional along x, only local reordering
	913	!-- of the data is necessary and no transposition has to be done.
	914	IF ( pdims(2) == 1 ) THEN
[1003]	915	DO k = 1, nz
	916	DO i = nxl, nxr
	917	DO j = nys, nyn
[164]	918	f_out(j,i,k) = f_inv(j,i,k)
[1]	919	ENDDO
	920	ENDDO
	921	ENDDO
	922	RETURN
	923	ENDIF
	924
	925	!
	926	!-- Transpose array
	927	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	928	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	929	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	930	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	931	comm1dy, ierr )
	932	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	933
	934	!
	935	!-- Reorder transposed array
	936	m = 0
	937	DO l = 0, pdims(2) - 1
	938	ys = 0 + l * nny
[1003]	939	DO k = nzb_yd, nzt_yd
	940	DO i = nxl_yd, nxr_yd
[1]	941	DO j = ys, ys + nny - 1
	942	m = m + 1
[164]	943	f_out(j,i,k) = work(m)
[1]	944	ENDDO
	945	ENDDO
	946	ENDDO
	947	ENDDO
	948
	949	#endif
	950
	951	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |