Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1252

Last change on this file since 1252 was 1217, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 26.8 KB

Rev	Line
[1216]	1	SUBROUTINE resort_for_xy( f_in, f_inv )
[1]	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[198]	22	!
[1217]	23	!
[198]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 1217 2013-08-26 11:30:05Z heinze $
	27	!
[1217]	28	! 1216 2013-08-26 09:31:42Z raasch
	29	! re-sorting of the transposed / to be transposed arrays moved to separate
	30	! routines resort_for_...
	31	!
[1112]	32	! 1111 2013-03-08 23:54:10Z raasch
	33	! openACC directives added,
	34	! resorting data from/to work changed, work got 4 dimensions instead of 1
	35	!
[1107]	36	! 1106 2013-03-04 05:31:38Z raasch
	37	! preprocessor lines rearranged so that routines can also be used in serial
	38	! (non-parallel) mode
	39	!
[1093]	40	! 1092 2013-02-02 11:24:22Z raasch
	41	! unused variables removed
	42	!
[1037]	43	! 1036 2012-10-22 13:43:42Z raasch
	44	! code put under GPL (PALM 3.9)
	45	!
[1004]	46	! 1003 2012-09-14 14:35:53Z raasch
	47	! indices nxa, nya, etc. replaced by nx, ny, etc.
	48	!
[684]	49	! 683 2011-02-09 14:25:15Z raasch
	50	! openMP parallelization of transpositions for 2d-domain-decomposition
	51	!
[623]	52	! 622 2010-12-10 08:08:13Z raasch
	53	! optional barriers included in order to speed up collective operations
	54	!
[198]	55	! 164 2008-05-15 08:46:15Z raasch
[164]	56	! f_inv changed from subroutine argument to automatic array in order to do
	57	! re-ordering from f_in to f_inv in one step, one array work is needed instead
	58	! of work1 and work2
[1]	59	!
[198]	60	! February 2007
[3]	61	! RCS Log replace by Id keyword, revision history cleaned up
	62	!
[1]	63	! Revision 1.2 2004/04/30 13:12:17 raasch
	64	! Switched from mpi_alltoallv to the simpler mpi_alltoall,
	65	! all former transpose-routine files collected in this file, enlarged
	66	! transposition arrays introduced
	67	!
	68	! Revision 1.1 2004/04/30 13:08:16 raasch
	69	! Initial revision (collection of former routines transpose_xy, transpose_xz,
	70	! transpose_yx, transpose_yz, transpose_zx, transpose_zy)
	71	!
	72	! Revision 1.1 1997/07/24 11:25:18 raasch
	73	! Initial revision
	74	!
[1216]	75	!------------------------------------------------------------------------------!
	76	! Description:
	77	! ------------
	78	! Resorting data for the transposition from x to y. The transposition itself
	79	! is carried out in transpose_xy
	80	!------------------------------------------------------------------------------!
	81
	82	USE indices
	83	USE transpose_indices
	84
	85	IMPLICIT NONE
	86
	87	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
	88	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
	89
	90
	91	INTEGER :: i, j, k
	92
[1]	93	!
[1216]	94	!-- Rearrange indices of input array in order to make data to be send
	95	!-- by MPI contiguous
	96	!$OMP PARALLEL PRIVATE ( i, j, k )
	97	!$OMP DO
	98	!$acc kernels present( f_in, f_inv )
	99	!$acc loop
	100	DO i = 0, nx
	101	DO k = nzb_x, nzt_x
	102	!$acc loop vector( 32 )
	103	DO j = nys_x, nyn_x
	104	f_inv(j,k,i) = f_in(i,j,k)
	105	ENDDO
	106	ENDDO
	107	ENDDO
	108	!$acc end kernels
	109	!$OMP END PARALLEL
	110
	111	END SUBROUTINE resort_for_xy
	112
	113
	114	SUBROUTINE transpose_xy( f_inv, f_out )
	115
	116	!------------------------------------------------------------------------------!
[1]	117	! Description:
	118	! ------------
	119	! Transposition of input array (f_in) from x to y. For the input array, all
	120	! elements along x reside on the same PE, while after transposition, all
	121	! elements along y reside on the same PE.
	122	!------------------------------------------------------------------------------!
	123
	124	USE cpulog
	125	USE indices
	126	USE interfaces
	127	USE pegrid
	128	USE transpose_indices
	129
	130	IMPLICIT NONE
	131
[1111]	132	INTEGER :: i, j, k, l, ys
[1]	133
[1216]	134	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]	135
[1111]	136	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
	137
	138
[1106]	139	IF ( numprocs /= 1 ) THEN
	140
	141	#if defined( __parallel )
[1]	142	!
[1106]	143	!-- Transpose array
	144	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	145	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	146	!$acc update host( f_inv )
	147	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	148	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	149	comm1dy, ierr )
	150	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	151
	152	!
[1106]	153	!-- Reorder transposed array
[1111]	154	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	155	!$OMP DO
[1216]	156	!$acc data copyin( work )
[1106]	157	DO l = 0, pdims(2) - 1
	158	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	159	!$acc kernels present( f_out, work )
	160	!$acc loop
[1106]	161	DO i = nxl_y, nxr_y
	162	DO k = nzb_y, nzt_y
[1111]	163	!$acc loop vector( 32 )
[1106]	164	DO j = ys, ys + nyn_x - nys_x
[1111]	165	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	166	ENDDO
[1]	167	ENDDO
	168	ENDDO
[1111]	169	!$acc end kernels
[1]	170	ENDDO
[1216]	171	!$acc end data
[683]	172	!$OMP END PARALLEL
[1]	173	#endif
	174
[1106]	175	ELSE
	176
	177	!
	178	!-- Reorder transposed array
	179	!$OMP PARALLEL PRIVATE ( i, j, k )
	180	!$OMP DO
[1216]	181	!$acc kernels present( f_inv, f_out )
[1111]	182	!$acc loop
[1106]	183	DO k = nzb_y, nzt_y
	184	DO i = nxl_y, nxr_y
[1111]	185	!$acc loop vector( 32 )
[1106]	186	DO j = 0, ny
	187	f_out(j,i,k) = f_inv(j,k,i)
	188	ENDDO
	189	ENDDO
	190	ENDDO
[1111]	191	!$acc end kernels
[1106]	192	!$OMP END PARALLEL
	193
	194	ENDIF
	195
[1]	196	END SUBROUTINE transpose_xy
	197
	198
[1216]	199	SUBROUTINE resort_for_xz( f_inv, f_out )
[1]	200
	201	!------------------------------------------------------------------------------!
	202	! Description:
	203	! ------------
[1216]	204	! Resorting data after the transposition from x to z. The transposition itself
	205	! is carried out in transpose_xz
	206	!------------------------------------------------------------------------------!
	207
	208	USE indices
	209	USE transpose_indices
	210
	211	IMPLICIT NONE
	212
	213	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
	214	REAL :: f_out(1:nz,nys:nyn,nxl:nxr)
	215
	216
	217	INTEGER :: i, j, k
	218
	219	!
	220	!-- Rearrange indices of input array in order to make data to be send
	221	!-- by MPI contiguous.
	222	!-- In case of parallel fft/transposition, scattered store is faster in
	223	!-- backward direction!!!
	224	!$OMP PARALLEL PRIVATE ( i, j, k )
	225	!$OMP DO
	226	!$acc kernels present( f_inv, f_out )
	227	!$acc loop
	228	DO k = 1, nz
	229	DO i = nxl, nxr
	230	!$acc loop vector( 32 )
	231	DO j = nys, nyn
	232	f_out(k,j,i) = f_inv(j,i,k)
	233	ENDDO
	234	ENDDO
	235	ENDDO
	236	!$acc end kernels
	237	!$OMP END PARALLEL
	238
	239	END SUBROUTINE resort_for_xz
	240
	241
	242	SUBROUTINE transpose_xz( f_in, f_inv )
	243
	244	!------------------------------------------------------------------------------!
	245	! Description:
	246	! ------------
[1]	247	! Transposition of input array (f_in) from x to z. For the input array, all
	248	! elements along x reside on the same PE, while after transposition, all
	249	! elements along z reside on the same PE.
	250	!------------------------------------------------------------------------------!
	251
	252	USE cpulog
	253	USE indices
	254	USE interfaces
	255	USE pegrid
	256	USE transpose_indices
	257
	258	IMPLICIT NONE
	259
[1111]	260	INTEGER :: i, j, k, l, xs
[1]	261
[1216]	262	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
[1]	263
[1111]	264	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
[1]	265
[1111]	266
[1]	267	!
	268	!-- If the PE grid is one-dimensional along y, the array has only to be
	269	!-- reordered locally and therefore no transposition has to be done.
	270	IF ( pdims(1) /= 1 ) THEN
[1106]	271
	272	#if defined( __parallel )
[1]	273	!
	274	!-- Reorder input array for transposition
[1111]	275	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	276	!$OMP DO
[1216]	277	!$acc data copyout( work )
[1]	278	DO l = 0, pdims(1) - 1
	279	xs = 0 + l * nnx
[1111]	280	!$acc kernels present( f_in, work )
	281	!$acc loop
[1003]	282	DO k = nzb_x, nzt_x
[164]	283	DO i = xs, xs + nnx - 1
[1111]	284	!$acc loop vector( 32 )
[1003]	285	DO j = nys_x, nyn_x
[1111]	286	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	287	ENDDO
	288	ENDDO
	289	ENDDO
[1111]	290	!$acc end kernels
[1]	291	ENDDO
[1216]	292	!$acc end data
[683]	293	!$OMP END PARALLEL
[1]	294
	295	!
	296	!-- Transpose array
	297	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	298	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	299	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	300	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	301	comm1dx, ierr )
[1111]	302	!$acc update device( f_inv )
[1]	303	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	304	#endif
	305
[1]	306	ELSE
[1106]	307
[1]	308	!
	309	!-- Reorder the array in a way that the z index is in first position
[683]	310	!$OMP PARALLEL PRIVATE ( i, j, k )
	311	!$OMP DO
[1216]	312	!$acc kernels present( f_in, f_inv )
[1111]	313	!$acc loop
[1003]	314	DO i = nxl, nxr
	315	DO j = nys, nyn
[1111]	316	!$acc loop vector( 32 )
[1003]	317	DO k = 1, nz
[164]	318	f_inv(j,i,k) = f_in(i,j,k)
[1]	319	ENDDO
	320	ENDDO
	321	ENDDO
[1111]	322	!$acc end kernels
[683]	323	!$OMP END PARALLEL
[1]	324
[164]	325	ENDIF
	326
[1]	327	END SUBROUTINE transpose_xz
	328
	329
[1216]	330	SUBROUTINE resort_for_yx( f_inv, f_out )
[1]	331
	332	!------------------------------------------------------------------------------!
	333	! Description:
	334	! ------------
[1216]	335	! Resorting data after the transposition from y to x. The transposition itself
	336	! is carried out in transpose_yx
	337	!------------------------------------------------------------------------------!
	338
	339	USE indices
	340	USE transpose_indices
	341
	342	IMPLICIT NONE
	343
	344	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
	345	REAL :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
	346
	347
	348	INTEGER :: i, j, k
	349
	350	!
	351	!-- Rearrange indices of input array in order to make data to be send
	352	!-- by MPI contiguous
	353	!$OMP PARALLEL PRIVATE ( i, j, k )
	354	!$OMP DO
	355	!$acc kernels present( f_inv, f_out )
	356	!$acc loop
	357	DO i = 0, nx
	358	DO k = nzb_x, nzt_x
	359	!$acc loop vector( 32 )
	360	DO j = nys_x, nyn_x
	361	f_out(i,j,k) = f_inv(j,k,i)
	362	ENDDO
	363	ENDDO
	364	ENDDO
	365	!$acc end kernels
	366	!$OMP END PARALLEL
	367
	368	END SUBROUTINE resort_for_yx
	369
	370
	371	SUBROUTINE transpose_yx( f_in, f_inv )
	372
	373	!------------------------------------------------------------------------------!
	374	! Description:
	375	! ------------
[1]	376	! Transposition of input array (f_in) from y to x. For the input array, all
	377	! elements along y reside on the same PE, while after transposition, all
	378	! elements along x reside on the same PE.
	379	!------------------------------------------------------------------------------!
	380
	381	USE cpulog
	382	USE indices
	383	USE interfaces
	384	USE pegrid
	385	USE transpose_indices
	386
	387	IMPLICIT NONE
	388
[1111]	389	INTEGER :: i, j, k, l, ys
[1]	390
[1216]	391	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
[1]	392
[1111]	393	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
	394
	395
[1106]	396	IF ( numprocs /= 1 ) THEN
	397
[1]	398	#if defined( __parallel )
	399	!
[1106]	400	!-- Reorder input array for transposition
[1111]	401	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	402	!$OMP DO
[1216]	403	!$acc data copyout( work )
[1106]	404	DO l = 0, pdims(2) - 1
	405	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	406	!$acc kernels present( f_in, work )
	407	!$acc loop
[1106]	408	DO i = nxl_y, nxr_y
	409	DO k = nzb_y, nzt_y
[1111]	410	!$acc loop vector( 32 )
[1106]	411	DO j = ys, ys + nyn_x - nys_x
[1111]	412	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	413	ENDDO
	414	ENDDO
	415	ENDDO
[1111]	416	!$acc end kernels
[1106]	417	ENDDO
[1216]	418	!$acc end data
[1106]	419	!$OMP END PARALLEL
	420
	421	!
	422	!-- Transpose array
	423	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	424	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	425	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	426	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	427	comm1dy, ierr )
[1111]	428	!$acc update device( f_inv )
[1106]	429	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	430	#endif
	431
	432	ELSE
	433
	434	!
	435	!-- Reorder array f_in the same way as ALLTOALL did it
	436	!$OMP PARALLEL PRIVATE ( i, j, k )
	437	!$OMP DO
[1216]	438	!$acc kernels present( f_in, f_inv )
[1111]	439	!$acc loop
[1003]	440	DO i = nxl_y, nxr_y
	441	DO k = nzb_y, nzt_y
[1111]	442	!$acc loop vector( 32 )
[1106]	443	DO j = 0, ny
	444	f_inv(j,k,i) = f_in(j,i,k)
[1]	445	ENDDO
	446	ENDDO
	447	ENDDO
[1111]	448	!$acc end kernels
[683]	449	!$OMP END PARALLEL
[1]	450
[1106]	451	ENDIF
[1]	452
	453	END SUBROUTINE transpose_yx
	454
	455
[1216]	456	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	457
	458	!------------------------------------------------------------------------------!
	459	! Description:
	460	! ------------
	461	! Transposition of input array (f_in) from y to x. For the input array, all
	462	! elements along y reside on the same PE, while after transposition, all
	463	! elements along x reside on the same PE.
	464	! This is a direct transposition for arrays with indices in regular order
	465	! (k,j,i) (cf. transpose_yx).
	466	!------------------------------------------------------------------------------!
	467
	468	USE cpulog
	469	USE indices
	470	USE interfaces
	471	USE pegrid
	472	USE transpose_indices
	473
	474	IMPLICIT NONE
	475
[1092]	476	INTEGER :: i, j, k, l, m, xs
[1]	477
[1003]	478	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
	479	f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
[164]	480	work(nnxnnynnz)
[1]	481
	482	#if defined( __parallel )
	483
	484	!
	485	!-- Rearrange indices of input array in order to make data to be send
	486	!-- by MPI contiguous
[1003]	487	DO k = 1, nz
	488	DO j = nys, nyn
	489	DO i = nxl, nxr
[164]	490	f_inv(i,k,j) = f_in(k,j,i)
[1]	491	ENDDO
	492	ENDDO
	493	ENDDO
	494
	495	!
	496	!-- Transpose array
	497	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	498	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	499	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	500	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	501	comm1dx, ierr )
	502	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	503
	504	!
	505	!-- Reorder transposed array
	506	m = 0
	507	DO l = 0, pdims(1) - 1
	508	xs = 0 + l * nnx
[1003]	509	DO j = nys_x, nyn_x
	510	DO k = 1, nz
[1]	511	DO i = xs, xs + nnx - 1
	512	m = m + 1
[164]	513	f_out(i,j,k) = work(m)
[1]	514	ENDDO
	515	ENDDO
	516	ENDDO
	517	ENDDO
	518
	519	#endif
	520
	521	END SUBROUTINE transpose_yxd
	522
	523
[1216]	524	SUBROUTINE resort_for_yz( f_in, f_inv )
[1]	525
	526	!------------------------------------------------------------------------------!
	527	! Description:
	528	! ------------
[1216]	529	! Resorting data for the transposition from y to z. The transposition itself
	530	! is carried out in transpose_yz
	531	!------------------------------------------------------------------------------!
	532
	533	USE indices
	534	USE transpose_indices
	535
	536	IMPLICIT NONE
	537
	538	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
	539	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
	540
	541
	542	INTEGER :: i, j, k
	543
	544	!
	545	!-- Rearrange indices of input array in order to make data to be send
	546	!-- by MPI contiguous
	547	!$OMP PARALLEL PRIVATE ( i, j, k )
	548	!$OMP DO
	549	!$acc kernels present( f_in, f_inv )
	550	!$acc loop
	551	DO j = 0, ny
	552	DO k = nzb_y, nzt_y
	553	!$acc loop vector( 32 )
	554	DO i = nxl_y, nxr_y
	555	f_inv(i,k,j) = f_in(j,i,k)
	556	ENDDO
	557	ENDDO
	558	ENDDO
	559	!$acc end kernels
	560	!$OMP END PARALLEL
	561
	562	END SUBROUTINE resort_for_yz
	563
	564
	565	SUBROUTINE transpose_yz( f_inv, f_out )
	566
	567	!------------------------------------------------------------------------------!
	568	! Description:
	569	! ------------
[1]	570	! Transposition of input array (f_in) from y to z. For the input array, all
	571	! elements along y reside on the same PE, while after transposition, all
	572	! elements along z reside on the same PE.
	573	!------------------------------------------------------------------------------!
	574
	575	USE cpulog
	576	USE indices
	577	USE interfaces
	578	USE pegrid
	579	USE transpose_indices
	580
	581	IMPLICIT NONE
	582
[1111]	583	INTEGER :: i, j, k, l, zs
[1]	584
[1216]	585	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	586
[1111]	587	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
	588
	589
[1]	590	!
	591	!-- If the PE grid is one-dimensional along y, only local reordering
	592	!-- of the data is necessary and no transposition has to be done.
	593	IF ( pdims(1) == 1 ) THEN
[1106]	594
[683]	595	!$OMP PARALLEL PRIVATE ( i, j, k )
	596	!$OMP DO
[1216]	597	!$acc kernels present( f_inv, f_out )
[1111]	598	!$acc loop
[1003]	599	DO j = 0, ny
	600	DO k = nzb_y, nzt_y
[1111]	601	!$acc loop vector( 32 )
[1003]	602	DO i = nxl_y, nxr_y
[164]	603	f_out(i,j,k) = f_inv(i,k,j)
[1]	604	ENDDO
	605	ENDDO
	606	ENDDO
[1111]	607	!$acc end kernels
[683]	608	!$OMP END PARALLEL
[1]	609
[1106]	610	ELSE
	611
	612	#if defined( __parallel )
[1]	613	!
[1106]	614	!-- Transpose array
	615	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	616	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	617	!$acc update host( f_inv )
	618	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	619	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	620	comm1dx, ierr )
	621	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	622
	623	!
[1106]	624	!-- Reorder transposed array
[1111]	625	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	626	!$OMP DO
[1216]	627	!$acc data copyin( work )
[1106]	628	DO l = 0, pdims(1) - 1
	629	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1216]	630	!$acc kernels present( f_out )
[1111]	631	!$acc loop
[1106]	632	DO j = nys_z, nyn_z
	633	DO k = zs, zs + nzt_y - nzb_y
[1111]	634	!$acc loop vector( 32 )
[1106]	635	DO i = nxl_z, nxr_z
[1111]	636	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	637	ENDDO
[1]	638	ENDDO
	639	ENDDO
[1111]	640	!$acc end kernels
[1]	641	ENDDO
[1216]	642	!$acc end data
[683]	643	!$OMP END PARALLEL
[1]	644	#endif
	645
[1106]	646	ENDIF
	647
[1]	648	END SUBROUTINE transpose_yz
	649
	650
[1216]	651	SUBROUTINE resort_for_zx( f_in, f_inv )
[1]	652
	653	!------------------------------------------------------------------------------!
	654	! Description:
	655	! ------------
[1216]	656	! Resorting data for the transposition from z to x. The transposition itself
	657	! is carried out in transpose_zx
	658	!------------------------------------------------------------------------------!
	659
	660	USE indices
	661	USE transpose_indices
	662
	663	IMPLICIT NONE
	664
	665	REAL :: f_in(1:nz,nys:nyn,nxl:nxr)
	666	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
	667
	668
	669	INTEGER :: i, j, k
	670
	671	!
	672	!-- Rearrange indices of input array in order to make data to be send
	673	!-- by MPI contiguous
	674	!$OMP PARALLEL PRIVATE ( i, j, k )
	675	!$OMP DO
	676	!$acc kernels present( f_in, f_inv )
	677	!$acc loop
	678	DO k = 1,nz
	679	DO i = nxl, nxr
	680	!$acc loop vector( 32 )
	681	DO j = nys, nyn
	682	f_inv(j,i,k) = f_in(k,j,i)
	683	ENDDO
	684	ENDDO
	685	ENDDO
	686	!$acc end kernels
	687	!$OMP END PARALLEL
	688
	689	END SUBROUTINE resort_for_zx
	690
	691
	692	SUBROUTINE transpose_zx( f_inv, f_out )
	693
	694	!------------------------------------------------------------------------------!
	695	! Description:
	696	! ------------
[1]	697	! Transposition of input array (f_in) from z to x. For the input array, all
	698	! elements along z reside on the same PE, while after transposition, all
	699	! elements along x reside on the same PE.
	700	!------------------------------------------------------------------------------!
	701
	702	USE cpulog
	703	USE indices
	704	USE interfaces
	705	USE pegrid
	706	USE transpose_indices
	707
	708	IMPLICIT NONE
	709
[1111]	710	INTEGER :: i, j, k, l, xs
[1]	711
[1216]	712	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]	713
[1111]	714	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
	715
[1]	716
	717	!
	718	!-- If the PE grid is one-dimensional along y, only local reordering
	719	!-- of the data is necessary and no transposition has to be done.
	720	IF ( pdims(1) == 1 ) THEN
[1106]	721
[683]	722	!$OMP PARALLEL PRIVATE ( i, j, k )
	723	!$OMP DO
[1216]	724	!$acc kernels present( f_inv, f_out )
[1106]	725	!$acc loop
[1003]	726	DO k = 1, nz
	727	DO i = nxl, nxr
[1106]	728	!$acc loop vector( 32 )
[1003]	729	DO j = nys, nyn
[164]	730	f_out(i,j,k) = f_inv(j,i,k)
[1]	731	ENDDO
	732	ENDDO
	733	ENDDO
[1111]	734	!$acc end kernels
[683]	735	!$OMP END PARALLEL
[1]	736
[1106]	737	ELSE
	738
	739	#if defined( __parallel )
[1]	740	!
[1106]	741	!-- Transpose array
	742	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	743	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	744	!$acc update host( f_inv )
	745	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	746	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	747	comm1dx, ierr )
	748	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	749
	750	!
[1106]	751	!-- Reorder transposed array
[1111]	752	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	753	!$OMP DO
[1216]	754	!$acc data copyin( work )
[1106]	755	DO l = 0, pdims(1) - 1
	756	xs = 0 + l * nnx
[1216]	757	!$acc kernels present( f_out )
[1111]	758	!$acc loop
[1106]	759	DO k = nzb_x, nzt_x
	760	DO i = xs, xs + nnx - 1
[1111]	761	!$acc loop vector( 32 )
[1106]	762	DO j = nys_x, nyn_x
[1111]	763	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	764	ENDDO
[1]	765	ENDDO
	766	ENDDO
[1111]	767	!$acc end kernels
[1]	768	ENDDO
[1216]	769	!$acc end data
[683]	770	!$OMP END PARALLEL
[1]	771	#endif
	772
[1106]	773	ENDIF
	774
[1]	775	END SUBROUTINE transpose_zx
	776
	777
[1216]	778	SUBROUTINE resort_for_zy( f_inv, f_out )
[1]	779
	780	!------------------------------------------------------------------------------!
	781	! Description:
	782	! ------------
[1216]	783	! Resorting data after the transposition from z to y. The transposition itself
	784	! is carried out in transpose_zy
	785	!------------------------------------------------------------------------------!
	786
	787	USE indices
	788	USE transpose_indices
	789
	790	IMPLICIT NONE
	791
	792	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
	793	REAL :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
	794
	795
	796	INTEGER :: i, j, k
	797
	798	!
	799	!-- Rearrange indices of input array in order to make data to be send
	800	!-- by MPI contiguous
	801	!$OMP PARALLEL PRIVATE ( i, j, k )
	802	!$OMP DO
	803	!$acc kernels present( f_inv, f_out )
	804	!$acc loop
	805	DO k = nzb_y, nzt_y
	806	DO j = 0, ny
	807	!$acc loop vector( 32 )
	808	DO i = nxl_y, nxr_y
	809	f_out(j,i,k) = f_inv(i,k,j)
	810	ENDDO
	811	ENDDO
	812	ENDDO
	813	!$acc end kernels
	814	!$OMP END PARALLEL
	815
	816	END SUBROUTINE resort_for_zy
	817
	818
	819	SUBROUTINE transpose_zy( f_in, f_inv )
	820
	821	!------------------------------------------------------------------------------!
	822	! Description:
	823	! ------------
[1]	824	! Transposition of input array (f_in) from z to y. For the input array, all
	825	! elements along z reside on the same PE, while after transposition, all
	826	! elements along y reside on the same PE.
	827	!------------------------------------------------------------------------------!
	828
	829	USE cpulog
	830	USE indices
	831	USE interfaces
	832	USE pegrid
	833	USE transpose_indices
	834
	835	IMPLICIT NONE
	836
[1111]	837	INTEGER :: i, j, k, l, zs
[1]	838
[1216]	839	REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
[1]	840
[1111]	841	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
	842
	843
[1]	844	!
	845	!-- If the PE grid is one-dimensional along y, the array has only to be
	846	!-- reordered locally and therefore no transposition has to be done.
	847	IF ( pdims(1) /= 1 ) THEN
[1106]	848
	849	#if defined( __parallel )
[1]	850	!
	851	!-- Reorder input array for transposition
[1111]	852	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	853	!$OMP DO
[1216]	854	!$acc data copyout( work )
[1]	855	DO l = 0, pdims(1) - 1
[1003]	856	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]	857	!$acc kernels present( f_in, work )
	858	!$acc loop
[1003]	859	DO j = nys_z, nyn_z
	860	DO k = zs, zs + nzt_y - nzb_y
[1111]	861	!$acc loop vector( 32 )
[1003]	862	DO i = nxl_z, nxr_z
[1111]	863	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	864	ENDDO
	865	ENDDO
	866	ENDDO
[1111]	867	!$acc end kernels
[1]	868	ENDDO
[1216]	869	!$acc end data
[683]	870	!$OMP END PARALLEL
[1]	871
	872	!
	873	!-- Transpose array
	874	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	875	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	876	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	877	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	878	comm1dx, ierr )
[1111]	879	!$acc update device( f_inv )
[1]	880	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	881	#endif
[1]	882
	883	ELSE
	884	!
[1106]	885	!-- Reorder the array in the same way like ALLTOALL did it
[683]	886	!$OMP PARALLEL PRIVATE ( i, j, k )
	887	!$OMP DO
[1216]	888	!$acc kernels present( f_in, f_inv )
[1111]	889	!$acc loop
[1003]	890	DO k = nzb_y, nzt_y
	891	DO j = 0, ny
[1111]	892	!$acc loop vector( 32 )
[1003]	893	DO i = nxl_y, nxr_y
[164]	894	f_inv(i,k,j) = f_in(i,j,k)
	895	ENDDO
	896	ENDDO
	897	ENDDO
[1111]	898	!$acc end kernels
[683]	899	!$OMP END PARALLEL
[1106]	900
	901	ENDIF
	902
[1]	903	END SUBROUTINE transpose_zy
	904
	905
[1216]	906	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	907
	908	!------------------------------------------------------------------------------!
	909	! Description:
	910	! ------------
	911	! Transposition of input array (f_in) from z to y. For the input array, all
	912	! elements along z reside on the same PE, while after transposition, all
	913	! elements along y reside on the same PE.
	914	! This is a direct transposition for arrays with indices in regular order
	915	! (k,j,i) (cf. transpose_zy).
	916	!------------------------------------------------------------------------------!
	917
	918	USE cpulog
	919	USE indices
	920	USE interfaces
	921	USE pegrid
	922	USE transpose_indices
	923
	924	IMPLICIT NONE
	925
	926	INTEGER :: i, j, k, l, m, ys
	927
[1003]	928	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
	929	f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd), &
[164]	930	work(nnxnnynnz)
[1]	931
	932	#if defined( __parallel )
	933
	934	!
	935	!-- Rearrange indices of input array in order to make data to be send
	936	!-- by MPI contiguous
[1003]	937	DO i = nxl, nxr
	938	DO j = nys, nyn
	939	DO k = 1, nz
[164]	940	f_inv(j,i,k) = f_in(k,j,i)
[1]	941	ENDDO
	942	ENDDO
	943	ENDDO
	944
	945	!
	946	!-- Move data to different array, because memory location of work1 is
	947	!-- needed further below (work1 = work2).
	948	!-- If the PE grid is one-dimensional along x, only local reordering
	949	!-- of the data is necessary and no transposition has to be done.
	950	IF ( pdims(2) == 1 ) THEN
[1003]	951	DO k = 1, nz
	952	DO i = nxl, nxr
	953	DO j = nys, nyn
[164]	954	f_out(j,i,k) = f_inv(j,i,k)
[1]	955	ENDDO
	956	ENDDO
	957	ENDDO
	958	RETURN
	959	ENDIF
	960
	961	!
	962	!-- Transpose array
	963	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	964	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	965	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	966	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	967	comm1dy, ierr )
	968	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	969
	970	!
	971	!-- Reorder transposed array
	972	m = 0
	973	DO l = 0, pdims(2) - 1
	974	ys = 0 + l * nny
[1003]	975	DO k = nzb_yd, nzt_yd
	976	DO i = nxl_yd, nxr_yd
[1]	977	DO j = ys, ys + nny - 1
	978	m = m + 1
[164]	979	f_out(j,i,k) = work(m)
[1]	980	ENDDO
	981	ENDDO
	982	ENDDO
	983	ENDDO
	984
	985	#endif
	986
	987	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |