Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1216

Last change on this file since 1216 was 1216, checked in by raasch, 11 years ago
overlapping execution of fft and transpositions (MPI_ALLTOALL), but real overlapping is not activated so far, fftw implemented for 1D-decomposition resorting of arrays moved to separate routines resort_for_... bugfix in mbuild concerning Makefile_check
Property svn:keywords set to `Id`
File size: 26.8 KB

Rev	Line
[1216]	1	SUBROUTINE resort_for_xy( f_in, f_inv )
[1]	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1216]	22	! re-sorting of the transposed / to be transposed arrays moved to separate
	23	! routines resort_for_...
[198]	24	!
	25	! Former revisions:
	26	! -----------------
	27	! $Id: transpose.f90 1216 2013-08-26 09:31:42Z raasch $
	28	!
[1112]	29	! 1111 2013-03-08 23:54:10Z raasch
	30	! openACC directives added,
	31	! resorting data from/to work changed, work got 4 dimensions instead of 1
	32	!
[1107]	33	! 1106 2013-03-04 05:31:38Z raasch
	34	! preprocessor lines rearranged so that routines can also be used in serial
	35	! (non-parallel) mode
	36	!
[1093]	37	! 1092 2013-02-02 11:24:22Z raasch
	38	! unused variables removed
	39	!
[1037]	40	! 1036 2012-10-22 13:43:42Z raasch
	41	! code put under GPL (PALM 3.9)
	42	!
[1004]	43	! 1003 2012-09-14 14:35:53Z raasch
	44	! indices nxa, nya, etc. replaced by nx, ny, etc.
	45	!
[684]	46	! 683 2011-02-09 14:25:15Z raasch
	47	! openMP parallelization of transpositions for 2d-domain-decomposition
	48	!
[623]	49	! 622 2010-12-10 08:08:13Z raasch
	50	! optional barriers included in order to speed up collective operations
	51	!
[198]	52	! 164 2008-05-15 08:46:15Z raasch
[164]	53	! f_inv changed from subroutine argument to automatic array in order to do
	54	! re-ordering from f_in to f_inv in one step, one array work is needed instead
	55	! of work1 and work2
[1]	56	!
[198]	57	! February 2007
[3]	58	! RCS Log replace by Id keyword, revision history cleaned up
	59	!
[1]	60	! Revision 1.2 2004/04/30 13:12:17 raasch
	61	! Switched from mpi_alltoallv to the simpler mpi_alltoall,
	62	! all former transpose-routine files collected in this file, enlarged
	63	! transposition arrays introduced
	64	!
	65	! Revision 1.1 2004/04/30 13:08:16 raasch
	66	! Initial revision (collection of former routines transpose_xy, transpose_xz,
	67	! transpose_yx, transpose_yz, transpose_zx, transpose_zy)
	68	!
	69	! Revision 1.1 1997/07/24 11:25:18 raasch
	70	! Initial revision
	71	!
[1216]	72	!------------------------------------------------------------------------------!
	73	! Description:
	74	! ------------
	75	! Resorting data for the transposition from x to y. The transposition itself
	76	! is carried out in transpose_xy
	77	!------------------------------------------------------------------------------!
	78
	79	USE indices
	80	USE transpose_indices
	81
	82	IMPLICIT NONE
	83
	84	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
	85	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
	86
	87
	88	INTEGER :: i, j, k
	89
[1]	90	!
[1216]	91	!-- Rearrange indices of input array in order to make data to be send
	92	!-- by MPI contiguous
	93	!$OMP PARALLEL PRIVATE ( i, j, k )
	94	!$OMP DO
	95	!$acc kernels present( f_in, f_inv )
	96	!$acc loop
	97	DO i = 0, nx
	98	DO k = nzb_x, nzt_x
	99	!$acc loop vector( 32 )
	100	DO j = nys_x, nyn_x
	101	f_inv(j,k,i) = f_in(i,j,k)
	102	ENDDO
	103	ENDDO
	104	ENDDO
	105	!$acc end kernels
	106	!$OMP END PARALLEL
	107
	108	END SUBROUTINE resort_for_xy
	109
	110
	111	SUBROUTINE transpose_xy( f_inv, f_out )
	112
	113	!------------------------------------------------------------------------------!
[1]	114	! Description:
	115	! ------------
	116	! Transposition of input array (f_in) from x to y. For the input array, all
	117	! elements along x reside on the same PE, while after transposition, all
	118	! elements along y reside on the same PE.
	119	!------------------------------------------------------------------------------!
	120
	121	USE cpulog
	122	USE indices
	123	USE interfaces
	124	USE pegrid
	125	USE transpose_indices
	126
	127	IMPLICIT NONE
	128
[1111]	129	INTEGER :: i, j, k, l, ys
[1]	130
[1216]	131	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]	132
[1111]	133	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
	134
	135
[1106]	136	IF ( numprocs /= 1 ) THEN
	137
	138	#if defined( __parallel )
[1]	139	!
[1106]	140	!-- Transpose array
	141	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	142	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	143	!$acc update host( f_inv )
	144	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	145	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	146	comm1dy, ierr )
	147	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	148
	149	!
[1106]	150	!-- Reorder transposed array
[1111]	151	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	152	!$OMP DO
[1216]	153	!$acc data copyin( work )
[1106]	154	DO l = 0, pdims(2) - 1
	155	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	156	!$acc kernels present( f_out, work )
	157	!$acc loop
[1106]	158	DO i = nxl_y, nxr_y
	159	DO k = nzb_y, nzt_y
[1111]	160	!$acc loop vector( 32 )
[1106]	161	DO j = ys, ys + nyn_x - nys_x
[1111]	162	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	163	ENDDO
[1]	164	ENDDO
	165	ENDDO
[1111]	166	!$acc end kernels
[1]	167	ENDDO
[1216]	168	!$acc end data
[683]	169	!$OMP END PARALLEL
[1]	170	#endif
	171
[1106]	172	ELSE
	173
	174	!
	175	!-- Reorder transposed array
	176	!$OMP PARALLEL PRIVATE ( i, j, k )
	177	!$OMP DO
[1216]	178	!$acc kernels present( f_inv, f_out )
[1111]	179	!$acc loop
[1106]	180	DO k = nzb_y, nzt_y
	181	DO i = nxl_y, nxr_y
[1111]	182	!$acc loop vector( 32 )
[1106]	183	DO j = 0, ny
	184	f_out(j,i,k) = f_inv(j,k,i)
	185	ENDDO
	186	ENDDO
	187	ENDDO
[1111]	188	!$acc end kernels
[1106]	189	!$OMP END PARALLEL
	190
	191	ENDIF
	192
[1]	193	END SUBROUTINE transpose_xy
	194
	195
[1216]	196	SUBROUTINE resort_for_xz( f_inv, f_out )
[1]	197
	198	!------------------------------------------------------------------------------!
	199	! Description:
	200	! ------------
[1216]	201	! Resorting data after the transposition from x to z. The transposition itself
	202	! is carried out in transpose_xz
	203	!------------------------------------------------------------------------------!
	204
	205	USE indices
	206	USE transpose_indices
	207
	208	IMPLICIT NONE
	209
	210	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
	211	REAL :: f_out(1:nz,nys:nyn,nxl:nxr)
	212
	213
	214	INTEGER :: i, j, k
	215
	216	!
	217	!-- Rearrange indices of input array in order to make data to be send
	218	!-- by MPI contiguous.
	219	!-- In case of parallel fft/transposition, scattered store is faster in
	220	!-- backward direction!!!
	221	!$OMP PARALLEL PRIVATE ( i, j, k )
	222	!$OMP DO
	223	!$acc kernels present( f_inv, f_out )
	224	!$acc loop
	225	DO k = 1, nz
	226	DO i = nxl, nxr
	227	!$acc loop vector( 32 )
	228	DO j = nys, nyn
	229	f_out(k,j,i) = f_inv(j,i,k)
	230	ENDDO
	231	ENDDO
	232	ENDDO
	233	!$acc end kernels
	234	!$OMP END PARALLEL
	235
	236	END SUBROUTINE resort_for_xz
	237
	238
	239	SUBROUTINE transpose_xz( f_in, f_inv )
	240
	241	!------------------------------------------------------------------------------!
	242	! Description:
	243	! ------------
[1]	244	! Transposition of input array (f_in) from x to z. For the input array, all
	245	! elements along x reside on the same PE, while after transposition, all
	246	! elements along z reside on the same PE.
	247	!------------------------------------------------------------------------------!
	248
	249	USE cpulog
	250	USE indices
	251	USE interfaces
	252	USE pegrid
	253	USE transpose_indices
	254
	255	IMPLICIT NONE
	256
[1111]	257	INTEGER :: i, j, k, l, xs
[1]	258
[1216]	259	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
[1]	260
[1111]	261	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
[1]	262
[1111]	263
[1]	264	!
	265	!-- If the PE grid is one-dimensional along y, the array has only to be
	266	!-- reordered locally and therefore no transposition has to be done.
	267	IF ( pdims(1) /= 1 ) THEN
[1106]	268
	269	#if defined( __parallel )
[1]	270	!
	271	!-- Reorder input array for transposition
[1111]	272	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	273	!$OMP DO
[1216]	274	!$acc data copyout( work )
[1]	275	DO l = 0, pdims(1) - 1
	276	xs = 0 + l * nnx
[1111]	277	!$acc kernels present( f_in, work )
	278	!$acc loop
[1003]	279	DO k = nzb_x, nzt_x
[164]	280	DO i = xs, xs + nnx - 1
[1111]	281	!$acc loop vector( 32 )
[1003]	282	DO j = nys_x, nyn_x
[1111]	283	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	284	ENDDO
	285	ENDDO
	286	ENDDO
[1111]	287	!$acc end kernels
[1]	288	ENDDO
[1216]	289	!$acc end data
[683]	290	!$OMP END PARALLEL
[1]	291
	292	!
	293	!-- Transpose array
	294	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	295	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	296	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	297	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	298	comm1dx, ierr )
[1111]	299	!$acc update device( f_inv )
[1]	300	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	301	#endif
	302
[1]	303	ELSE
[1106]	304
[1]	305	!
	306	!-- Reorder the array in a way that the z index is in first position
[683]	307	!$OMP PARALLEL PRIVATE ( i, j, k )
	308	!$OMP DO
[1216]	309	!$acc kernels present( f_in, f_inv )
[1111]	310	!$acc loop
[1003]	311	DO i = nxl, nxr
	312	DO j = nys, nyn
[1111]	313	!$acc loop vector( 32 )
[1003]	314	DO k = 1, nz
[164]	315	f_inv(j,i,k) = f_in(i,j,k)
[1]	316	ENDDO
	317	ENDDO
	318	ENDDO
[1111]	319	!$acc end kernels
[683]	320	!$OMP END PARALLEL
[1]	321
[164]	322	ENDIF
	323
[1]	324	END SUBROUTINE transpose_xz
	325
	326
[1216]	327	SUBROUTINE resort_for_yx( f_inv, f_out )
[1]	328
	329	!------------------------------------------------------------------------------!
	330	! Description:
	331	! ------------
[1216]	332	! Resorting data after the transposition from y to x. The transposition itself
	333	! is carried out in transpose_yx
	334	!------------------------------------------------------------------------------!
	335
	336	USE indices
	337	USE transpose_indices
	338
	339	IMPLICIT NONE
	340
	341	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
	342	REAL :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
	343
	344
	345	INTEGER :: i, j, k
	346
	347	!
	348	!-- Rearrange indices of input array in order to make data to be send
	349	!-- by MPI contiguous
	350	!$OMP PARALLEL PRIVATE ( i, j, k )
	351	!$OMP DO
	352	!$acc kernels present( f_inv, f_out )
	353	!$acc loop
	354	DO i = 0, nx
	355	DO k = nzb_x, nzt_x
	356	!$acc loop vector( 32 )
	357	DO j = nys_x, nyn_x
	358	f_out(i,j,k) = f_inv(j,k,i)
	359	ENDDO
	360	ENDDO
	361	ENDDO
	362	!$acc end kernels
	363	!$OMP END PARALLEL
	364
	365	END SUBROUTINE resort_for_yx
	366
	367
	368	SUBROUTINE transpose_yx( f_in, f_inv )
	369
	370	!------------------------------------------------------------------------------!
	371	! Description:
	372	! ------------
[1]	373	! Transposition of input array (f_in) from y to x. For the input array, all
	374	! elements along y reside on the same PE, while after transposition, all
	375	! elements along x reside on the same PE.
	376	!------------------------------------------------------------------------------!
	377
	378	USE cpulog
	379	USE indices
	380	USE interfaces
	381	USE pegrid
	382	USE transpose_indices
	383
	384	IMPLICIT NONE
	385
[1111]	386	INTEGER :: i, j, k, l, ys
[1]	387
[1216]	388	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
[1]	389
[1111]	390	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
	391
	392
[1106]	393	IF ( numprocs /= 1 ) THEN
	394
[1]	395	#if defined( __parallel )
	396	!
[1106]	397	!-- Reorder input array for transposition
[1111]	398	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	399	!$OMP DO
[1216]	400	!$acc data copyout( work )
[1106]	401	DO l = 0, pdims(2) - 1
	402	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	403	!$acc kernels present( f_in, work )
	404	!$acc loop
[1106]	405	DO i = nxl_y, nxr_y
	406	DO k = nzb_y, nzt_y
[1111]	407	!$acc loop vector( 32 )
[1106]	408	DO j = ys, ys + nyn_x - nys_x
[1111]	409	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	410	ENDDO
	411	ENDDO
	412	ENDDO
[1111]	413	!$acc end kernels
[1106]	414	ENDDO
[1216]	415	!$acc end data
[1106]	416	!$OMP END PARALLEL
	417
	418	!
	419	!-- Transpose array
	420	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	421	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	422	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	423	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	424	comm1dy, ierr )
[1111]	425	!$acc update device( f_inv )
[1106]	426	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	427	#endif
	428
	429	ELSE
	430
	431	!
	432	!-- Reorder array f_in the same way as ALLTOALL did it
	433	!$OMP PARALLEL PRIVATE ( i, j, k )
	434	!$OMP DO
[1216]	435	!$acc kernels present( f_in, f_inv )
[1111]	436	!$acc loop
[1003]	437	DO i = nxl_y, nxr_y
	438	DO k = nzb_y, nzt_y
[1111]	439	!$acc loop vector( 32 )
[1106]	440	DO j = 0, ny
	441	f_inv(j,k,i) = f_in(j,i,k)
[1]	442	ENDDO
	443	ENDDO
	444	ENDDO
[1111]	445	!$acc end kernels
[683]	446	!$OMP END PARALLEL
[1]	447
[1106]	448	ENDIF
[1]	449
	450	END SUBROUTINE transpose_yx
	451
	452
[1216]	453	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	454
	455	!------------------------------------------------------------------------------!
	456	! Description:
	457	! ------------
	458	! Transposition of input array (f_in) from y to x. For the input array, all
	459	! elements along y reside on the same PE, while after transposition, all
	460	! elements along x reside on the same PE.
	461	! This is a direct transposition for arrays with indices in regular order
	462	! (k,j,i) (cf. transpose_yx).
	463	!------------------------------------------------------------------------------!
	464
	465	USE cpulog
	466	USE indices
	467	USE interfaces
	468	USE pegrid
	469	USE transpose_indices
	470
	471	IMPLICIT NONE
	472
[1092]	473	INTEGER :: i, j, k, l, m, xs
[1]	474
[1003]	475	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
	476	f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
[164]	477	work(nnxnnynnz)
[1]	478
	479	#if defined( __parallel )
	480
	481	!
	482	!-- Rearrange indices of input array in order to make data to be send
	483	!-- by MPI contiguous
[1003]	484	DO k = 1, nz
	485	DO j = nys, nyn
	486	DO i = nxl, nxr
[164]	487	f_inv(i,k,j) = f_in(k,j,i)
[1]	488	ENDDO
	489	ENDDO
	490	ENDDO
	491
	492	!
	493	!-- Transpose array
	494	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	495	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	496	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	497	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	498	comm1dx, ierr )
	499	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	500
	501	!
	502	!-- Reorder transposed array
	503	m = 0
	504	DO l = 0, pdims(1) - 1
	505	xs = 0 + l * nnx
[1003]	506	DO j = nys_x, nyn_x
	507	DO k = 1, nz
[1]	508	DO i = xs, xs + nnx - 1
	509	m = m + 1
[164]	510	f_out(i,j,k) = work(m)
[1]	511	ENDDO
	512	ENDDO
	513	ENDDO
	514	ENDDO
	515
	516	#endif
	517
	518	END SUBROUTINE transpose_yxd
	519
	520
[1216]	521	SUBROUTINE resort_for_yz( f_in, f_inv )
[1]	522
	523	!------------------------------------------------------------------------------!
	524	! Description:
	525	! ------------
[1216]	526	! Resorting data for the transposition from y to z. The transposition itself
	527	! is carried out in transpose_yz
	528	!------------------------------------------------------------------------------!
	529
	530	USE indices
	531	USE transpose_indices
	532
	533	IMPLICIT NONE
	534
	535	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
	536	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
	537
	538
	539	INTEGER :: i, j, k
	540
	541	!
	542	!-- Rearrange indices of input array in order to make data to be send
	543	!-- by MPI contiguous
	544	!$OMP PARALLEL PRIVATE ( i, j, k )
	545	!$OMP DO
	546	!$acc kernels present( f_in, f_inv )
	547	!$acc loop
	548	DO j = 0, ny
	549	DO k = nzb_y, nzt_y
	550	!$acc loop vector( 32 )
	551	DO i = nxl_y, nxr_y
	552	f_inv(i,k,j) = f_in(j,i,k)
	553	ENDDO
	554	ENDDO
	555	ENDDO
	556	!$acc end kernels
	557	!$OMP END PARALLEL
	558
	559	END SUBROUTINE resort_for_yz
	560
	561
	562	SUBROUTINE transpose_yz( f_inv, f_out )
	563
	564	!------------------------------------------------------------------------------!
	565	! Description:
	566	! ------------
[1]	567	! Transposition of input array (f_in) from y to z. For the input array, all
	568	! elements along y reside on the same PE, while after transposition, all
	569	! elements along z reside on the same PE.
	570	!------------------------------------------------------------------------------!
	571
	572	USE cpulog
	573	USE indices
	574	USE interfaces
	575	USE pegrid
	576	USE transpose_indices
	577
	578	IMPLICIT NONE
	579
[1111]	580	INTEGER :: i, j, k, l, zs
[1]	581
[1216]	582	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	583
[1111]	584	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
	585
	586
[1]	587	!
	588	!-- If the PE grid is one-dimensional along y, only local reordering
	589	!-- of the data is necessary and no transposition has to be done.
	590	IF ( pdims(1) == 1 ) THEN
[1106]	591
[683]	592	!$OMP PARALLEL PRIVATE ( i, j, k )
	593	!$OMP DO
[1216]	594	!$acc kernels present( f_inv, f_out )
[1111]	595	!$acc loop
[1003]	596	DO j = 0, ny
	597	DO k = nzb_y, nzt_y
[1111]	598	!$acc loop vector( 32 )
[1003]	599	DO i = nxl_y, nxr_y
[164]	600	f_out(i,j,k) = f_inv(i,k,j)
[1]	601	ENDDO
	602	ENDDO
	603	ENDDO
[1111]	604	!$acc end kernels
[683]	605	!$OMP END PARALLEL
[1]	606
[1106]	607	ELSE
	608
	609	#if defined( __parallel )
[1]	610	!
[1106]	611	!-- Transpose array
	612	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	613	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	614	!$acc update host( f_inv )
	615	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	616	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	617	comm1dx, ierr )
	618	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	619
	620	!
[1106]	621	!-- Reorder transposed array
[1111]	622	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	623	!$OMP DO
[1216]	624	!$acc data copyin( work )
[1106]	625	DO l = 0, pdims(1) - 1
	626	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1216]	627	!$acc kernels present( f_out )
[1111]	628	!$acc loop
[1106]	629	DO j = nys_z, nyn_z
	630	DO k = zs, zs + nzt_y - nzb_y
[1111]	631	!$acc loop vector( 32 )
[1106]	632	DO i = nxl_z, nxr_z
[1111]	633	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	634	ENDDO
[1]	635	ENDDO
	636	ENDDO
[1111]	637	!$acc end kernels
[1]	638	ENDDO
[1216]	639	!$acc end data
[683]	640	!$OMP END PARALLEL
[1]	641	#endif
	642
[1106]	643	ENDIF
	644
[1]	645	END SUBROUTINE transpose_yz
	646
	647
[1216]	648	SUBROUTINE resort_for_zx( f_in, f_inv )
[1]	649
	650	!------------------------------------------------------------------------------!
	651	! Description:
	652	! ------------
[1216]	653	! Resorting data for the transposition from z to x. The transposition itself
	654	! is carried out in transpose_zx
	655	!------------------------------------------------------------------------------!
	656
	657	USE indices
	658	USE transpose_indices
	659
	660	IMPLICIT NONE
	661
	662	REAL :: f_in(1:nz,nys:nyn,nxl:nxr)
	663	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
	664
	665
	666	INTEGER :: i, j, k
	667
	668	!
	669	!-- Rearrange indices of input array in order to make data to be send
	670	!-- by MPI contiguous
	671	!$OMP PARALLEL PRIVATE ( i, j, k )
	672	!$OMP DO
	673	!$acc kernels present( f_in, f_inv )
	674	!$acc loop
	675	DO k = 1,nz
	676	DO i = nxl, nxr
	677	!$acc loop vector( 32 )
	678	DO j = nys, nyn
	679	f_inv(j,i,k) = f_in(k,j,i)
	680	ENDDO
	681	ENDDO
	682	ENDDO
	683	!$acc end kernels
	684	!$OMP END PARALLEL
	685
	686	END SUBROUTINE resort_for_zx
	687
	688
	689	SUBROUTINE transpose_zx( f_inv, f_out )
	690
	691	!------------------------------------------------------------------------------!
	692	! Description:
	693	! ------------
[1]	694	! Transposition of input array (f_in) from z to x. For the input array, all
	695	! elements along z reside on the same PE, while after transposition, all
	696	! elements along x reside on the same PE.
	697	!------------------------------------------------------------------------------!
	698
	699	USE cpulog
	700	USE indices
	701	USE interfaces
	702	USE pegrid
	703	USE transpose_indices
	704
	705	IMPLICIT NONE
	706
[1111]	707	INTEGER :: i, j, k, l, xs
[1]	708
[1216]	709	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]	710
[1111]	711	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
	712
[1]	713
	714	!
	715	!-- If the PE grid is one-dimensional along y, only local reordering
	716	!-- of the data is necessary and no transposition has to be done.
	717	IF ( pdims(1) == 1 ) THEN
[1106]	718
[683]	719	!$OMP PARALLEL PRIVATE ( i, j, k )
	720	!$OMP DO
[1216]	721	!$acc kernels present( f_inv, f_out )
[1106]	722	!$acc loop
[1003]	723	DO k = 1, nz
	724	DO i = nxl, nxr
[1106]	725	!$acc loop vector( 32 )
[1003]	726	DO j = nys, nyn
[164]	727	f_out(i,j,k) = f_inv(j,i,k)
[1]	728	ENDDO
	729	ENDDO
	730	ENDDO
[1111]	731	!$acc end kernels
[683]	732	!$OMP END PARALLEL
[1]	733
[1106]	734	ELSE
	735
	736	#if defined( __parallel )
[1]	737	!
[1106]	738	!-- Transpose array
	739	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	740	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	741	!$acc update host( f_inv )
	742	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	743	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	744	comm1dx, ierr )
	745	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	746
	747	!
[1106]	748	!-- Reorder transposed array
[1111]	749	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	750	!$OMP DO
[1216]	751	!$acc data copyin( work )
[1106]	752	DO l = 0, pdims(1) - 1
	753	xs = 0 + l * nnx
[1216]	754	!$acc kernels present( f_out )
[1111]	755	!$acc loop
[1106]	756	DO k = nzb_x, nzt_x
	757	DO i = xs, xs + nnx - 1
[1111]	758	!$acc loop vector( 32 )
[1106]	759	DO j = nys_x, nyn_x
[1111]	760	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	761	ENDDO
[1]	762	ENDDO
	763	ENDDO
[1111]	764	!$acc end kernels
[1]	765	ENDDO
[1216]	766	!$acc end data
[683]	767	!$OMP END PARALLEL
[1]	768	#endif
	769
[1106]	770	ENDIF
	771
[1]	772	END SUBROUTINE transpose_zx
	773
	774
[1216]	775	SUBROUTINE resort_for_zy( f_inv, f_out )
[1]	776
	777	!------------------------------------------------------------------------------!
	778	! Description:
	779	! ------------
[1216]	780	! Resorting data after the transposition from z to y. The transposition itself
	781	! is carried out in transpose_zy
	782	!------------------------------------------------------------------------------!
	783
	784	USE indices
	785	USE transpose_indices
	786
	787	IMPLICIT NONE
	788
	789	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
	790	REAL :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
	791
	792
	793	INTEGER :: i, j, k
	794
	795	!
	796	!-- Rearrange indices of input array in order to make data to be send
	797	!-- by MPI contiguous
	798	!$OMP PARALLEL PRIVATE ( i, j, k )
	799	!$OMP DO
	800	!$acc kernels present( f_inv, f_out )
	801	!$acc loop
	802	DO k = nzb_y, nzt_y
	803	DO j = 0, ny
	804	!$acc loop vector( 32 )
	805	DO i = nxl_y, nxr_y
	806	f_out(j,i,k) = f_inv(i,k,j)
	807	ENDDO
	808	ENDDO
	809	ENDDO
	810	!$acc end kernels
	811	!$OMP END PARALLEL
	812
	813	END SUBROUTINE resort_for_zy
	814
	815
	816	SUBROUTINE transpose_zy( f_in, f_inv )
	817
	818	!------------------------------------------------------------------------------!
	819	! Description:
	820	! ------------
[1]	821	! Transposition of input array (f_in) from z to y. For the input array, all
	822	! elements along z reside on the same PE, while after transposition, all
	823	! elements along y reside on the same PE.
	824	!------------------------------------------------------------------------------!
	825
	826	USE cpulog
	827	USE indices
	828	USE interfaces
	829	USE pegrid
	830	USE transpose_indices
	831
	832	IMPLICIT NONE
	833
[1111]	834	INTEGER :: i, j, k, l, zs
[1]	835
[1216]	836	REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
[1]	837
[1111]	838	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
	839
	840
[1]	841	!
	842	!-- If the PE grid is one-dimensional along y, the array has only to be
	843	!-- reordered locally and therefore no transposition has to be done.
	844	IF ( pdims(1) /= 1 ) THEN
[1106]	845
	846	#if defined( __parallel )
[1]	847	!
	848	!-- Reorder input array for transposition
[1111]	849	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	850	!$OMP DO
[1216]	851	!$acc data copyout( work )
[1]	852	DO l = 0, pdims(1) - 1
[1003]	853	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]	854	!$acc kernels present( f_in, work )
	855	!$acc loop
[1003]	856	DO j = nys_z, nyn_z
	857	DO k = zs, zs + nzt_y - nzb_y
[1111]	858	!$acc loop vector( 32 )
[1003]	859	DO i = nxl_z, nxr_z
[1111]	860	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	861	ENDDO
	862	ENDDO
	863	ENDDO
[1111]	864	!$acc end kernels
[1]	865	ENDDO
[1216]	866	!$acc end data
[683]	867	!$OMP END PARALLEL
[1]	868
	869	!
	870	!-- Transpose array
	871	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	872	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	873	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	874	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	875	comm1dx, ierr )
[1111]	876	!$acc update device( f_inv )
[1]	877	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	878	#endif
[1]	879
	880	ELSE
	881	!
[1106]	882	!-- Reorder the array in the same way like ALLTOALL did it
[683]	883	!$OMP PARALLEL PRIVATE ( i, j, k )
	884	!$OMP DO
[1216]	885	!$acc kernels present( f_in, f_inv )
[1111]	886	!$acc loop
[1003]	887	DO k = nzb_y, nzt_y
	888	DO j = 0, ny
[1111]	889	!$acc loop vector( 32 )
[1003]	890	DO i = nxl_y, nxr_y
[164]	891	f_inv(i,k,j) = f_in(i,j,k)
	892	ENDDO
	893	ENDDO
	894	ENDDO
[1111]	895	!$acc end kernels
[683]	896	!$OMP END PARALLEL
[1106]	897
	898	ENDIF
	899
[1]	900	END SUBROUTINE transpose_zy
	901
	902
[1216]	903	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	904
	905	!------------------------------------------------------------------------------!
	906	! Description:
	907	! ------------
	908	! Transposition of input array (f_in) from z to y. For the input array, all
	909	! elements along z reside on the same PE, while after transposition, all
	910	! elements along y reside on the same PE.
	911	! This is a direct transposition for arrays with indices in regular order
	912	! (k,j,i) (cf. transpose_zy).
	913	!------------------------------------------------------------------------------!
	914
	915	USE cpulog
	916	USE indices
	917	USE interfaces
	918	USE pegrid
	919	USE transpose_indices
	920
	921	IMPLICIT NONE
	922
	923	INTEGER :: i, j, k, l, m, ys
	924
[1003]	925	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
	926	f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd), &
[164]	927	work(nnxnnynnz)
[1]	928
	929	#if defined( __parallel )
	930
	931	!
	932	!-- Rearrange indices of input array in order to make data to be send
	933	!-- by MPI contiguous
[1003]	934	DO i = nxl, nxr
	935	DO j = nys, nyn
	936	DO k = 1, nz
[164]	937	f_inv(j,i,k) = f_in(k,j,i)
[1]	938	ENDDO
	939	ENDDO
	940	ENDDO
	941
	942	!
	943	!-- Move data to different array, because memory location of work1 is
	944	!-- needed further below (work1 = work2).
	945	!-- If the PE grid is one-dimensional along x, only local reordering
	946	!-- of the data is necessary and no transposition has to be done.
	947	IF ( pdims(2) == 1 ) THEN
[1003]	948	DO k = 1, nz
	949	DO i = nxl, nxr
	950	DO j = nys, nyn
[164]	951	f_out(j,i,k) = f_inv(j,i,k)
[1]	952	ENDDO
	953	ENDDO
	954	ENDDO
	955	RETURN
	956	ENDIF
	957
	958	!
	959	!-- Transpose array
	960	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	961	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	962	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	963	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	964	comm1dy, ierr )
	965	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	966
	967	!
	968	!-- Reorder transposed array
	969	m = 0
	970	DO l = 0, pdims(2) - 1
	971	ys = 0 + l * nny
[1003]	972	DO k = nzb_yd, nzt_yd
	973	DO i = nxl_yd, nxr_yd
[1]	974	DO j = ys, ys + nny - 1
	975	m = m + 1
[164]	976	f_out(j,i,k) = work(m)
[1]	977	ENDDO
	978	ENDDO
	979	ENDDO
	980	ENDDO
	981
	982	#endif
	983
	984	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |