Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1213

Last change on this file since 1213 was 1112, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 23.9 KB

Rev	Line
[164]	1	SUBROUTINE transpose_xy( f_in, work, f_out )
[1]	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1112]	22	!
[198]	23	!
	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 1112 2013-03-09 00:34:37Z raasch $
	27	!
[1112]	28	! 1111 2013-03-08 23:54:10Z raasch
	29	! openACC directives added,
	30	! resorting data from/to work changed, work got 4 dimensions instead of 1
	31	!
[1107]	32	! 1106 2013-03-04 05:31:38Z raasch
	33	! preprocessor lines rearranged so that routines can also be used in serial
	34	! (non-parallel) mode
	35	!
[1093]	36	! 1092 2013-02-02 11:24:22Z raasch
	37	! unused variables removed
	38	!
[1037]	39	! 1036 2012-10-22 13:43:42Z raasch
	40	! code put under GPL (PALM 3.9)
	41	!
[1004]	42	! 1003 2012-09-14 14:35:53Z raasch
	43	! indices nxa, nya, etc. replaced by nx, ny, etc.
	44	!
[684]	45	! 683 2011-02-09 14:25:15Z raasch
	46	! openMP parallelization of transpositions for 2d-domain-decomposition
	47	!
[623]	48	! 622 2010-12-10 08:08:13Z raasch
	49	! optional barriers included in order to speed up collective operations
	50	!
[198]	51	! 164 2008-05-15 08:46:15Z raasch
[164]	52	! f_inv changed from subroutine argument to automatic array in order to do
	53	! re-ordering from f_in to f_inv in one step, one array work is needed instead
	54	! of work1 and work2
[1]	55	!
[198]	56	! February 2007
[3]	57	! RCS Log replace by Id keyword, revision history cleaned up
	58	!
[1]	59	! Revision 1.2 2004/04/30 13:12:17 raasch
	60	! Switched from mpi_alltoallv to the simpler mpi_alltoall,
	61	! all former transpose-routine files collected in this file, enlarged
	62	! transposition arrays introduced
	63	!
	64	! Revision 1.1 2004/04/30 13:08:16 raasch
	65	! Initial revision (collection of former routines transpose_xy, transpose_xz,
	66	! transpose_yx, transpose_yz, transpose_zx, transpose_zy)
	67	!
	68	! Revision 1.1 1997/07/24 11:25:18 raasch
	69	! Initial revision
	70	!
	71	!
	72	! Description:
	73	! ------------
	74	! Transposition of input array (f_in) from x to y. For the input array, all
	75	! elements along x reside on the same PE, while after transposition, all
	76	! elements along y reside on the same PE.
	77	!------------------------------------------------------------------------------!
	78
	79	USE cpulog
	80	USE indices
	81	USE interfaces
	82	USE pegrid
	83	USE transpose_indices
	84
	85	IMPLICIT NONE
	86
[1111]	87	INTEGER :: i, j, k, l, ys
[1]	88
[1111]	89	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]	90
[1111]	91	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
	92
	93	!$acc declare create( f_inv )
	94	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
	95
	96
[1]	97	!
	98	!-- Rearrange indices of input array in order to make data to be send
	99	!-- by MPI contiguous
[683]	100	!$OMP PARALLEL PRIVATE ( i, j, k )
	101	!$OMP DO
[1111]	102	!$acc kernels present( f_in )
	103	!$acc loop
[1003]	104	DO i = 0, nx
	105	DO k = nzb_x, nzt_x
[1111]	106	!$acc loop vector( 32 )
[1003]	107	DO j = nys_x, nyn_x
[164]	108	f_inv(j,k,i) = f_in(i,j,k)
[1]	109	ENDDO
	110	ENDDO
	111	ENDDO
[1111]	112	!$acc end kernels
[683]	113	!$OMP END PARALLEL
[1]	114
[1106]	115	IF ( numprocs /= 1 ) THEN
	116
	117	#if defined( __parallel )
[1]	118	!
[1106]	119	!-- Transpose array
	120	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	121	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	122	!$acc update host( f_inv )
	123	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	124	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	125	comm1dy, ierr )
[1111]	126	!$acc update device( work )
[1106]	127	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	128
	129	!
[1106]	130	!-- Reorder transposed array
[1111]	131	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	132	!$OMP DO
[1106]	133	DO l = 0, pdims(2) - 1
	134	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	135	!$acc kernels present( f_out, work )
	136	!$acc loop
[1106]	137	DO i = nxl_y, nxr_y
	138	DO k = nzb_y, nzt_y
[1111]	139	!$acc loop vector( 32 )
[1106]	140	DO j = ys, ys + nyn_x - nys_x
[1111]	141	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	142	ENDDO
[1]	143	ENDDO
	144	ENDDO
[1111]	145	!$acc end kernels
[1]	146	ENDDO
[683]	147	!$OMP END PARALLEL
[1]	148	#endif
	149
[1106]	150	ELSE
	151
	152	!
	153	!-- Reorder transposed array
	154	!$OMP PARALLEL PRIVATE ( i, j, k )
	155	!$OMP DO
[1111]	156	!$acc kernels present( f_out )
	157	!$acc loop
[1106]	158	DO k = nzb_y, nzt_y
	159	DO i = nxl_y, nxr_y
[1111]	160	!$acc loop vector( 32 )
[1106]	161	DO j = 0, ny
	162	f_out(j,i,k) = f_inv(j,k,i)
	163	ENDDO
	164	ENDDO
	165	ENDDO
[1111]	166	!$acc end kernels
[1106]	167	!$OMP END PARALLEL
	168
	169	ENDIF
	170
[1]	171	END SUBROUTINE transpose_xy
	172
	173
[164]	174	SUBROUTINE transpose_xz( f_in, work, f_out )
[1]	175
	176	!------------------------------------------------------------------------------!
	177	! Description:
	178	! ------------
	179	! Transposition of input array (f_in) from x to z. For the input array, all
	180	! elements along x reside on the same PE, while after transposition, all
	181	! elements along z reside on the same PE.
	182	!------------------------------------------------------------------------------!
	183
	184	USE cpulog
	185	USE indices
	186	USE interfaces
	187	USE pegrid
	188	USE transpose_indices
	189
	190	IMPLICIT NONE
	191
[1111]	192	INTEGER :: i, j, k, l, xs
[1]	193
[1111]	194	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_out(1:nz,nys:nyn,nxl:nxr)
[1]	195
[1111]	196	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
[1]	197
[1111]	198	!$acc declare create( f_inv )
	199	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
	200
	201
[1]	202	!
	203	!-- If the PE grid is one-dimensional along y, the array has only to be
	204	!-- reordered locally and therefore no transposition has to be done.
	205	IF ( pdims(1) /= 1 ) THEN
[1106]	206
	207	#if defined( __parallel )
[1]	208	!
	209	!-- Reorder input array for transposition
[1111]	210	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	211	!$OMP DO
[1]	212	DO l = 0, pdims(1) - 1
	213	xs = 0 + l * nnx
[1111]	214	!$acc kernels present( f_in, work )
	215	!$acc loop
[1003]	216	DO k = nzb_x, nzt_x
[164]	217	DO i = xs, xs + nnx - 1
[1111]	218	!$acc loop vector( 32 )
[1003]	219	DO j = nys_x, nyn_x
[1111]	220	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	221	ENDDO
	222	ENDDO
	223	ENDDO
[1111]	224	!$acc end kernels
[1]	225	ENDDO
[683]	226	!$OMP END PARALLEL
[1]	227
	228	!
	229	!-- Transpose array
	230	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	231	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	232	!$acc update host( work )
	233	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	234	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	235	comm1dx, ierr )
[1111]	236	!$acc update device( f_inv )
[1]	237	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	238
	239	!
	240	!-- Reorder transposed array in a way that the z index is in first position
[683]	241	!$OMP PARALLEL PRIVATE ( i, j, k )
	242	!$OMP DO
[1111]	243	!$acc kernels present( f_out )
	244	!$acc loop
[1003]	245	DO k = 1, nz
	246	DO i = nxl, nxr
[1111]	247	!$acc loop vector( 32 )
[1003]	248	DO j = nys, nyn
[164]	249	f_out(k,j,i) = f_inv(j,i,k)
[1]	250	ENDDO
	251	ENDDO
	252	ENDDO
[1111]	253	!$acc end kernels
[683]	254	!$OMP END PARALLEL
[1106]	255	#endif
	256
[1]	257	ELSE
[1106]	258
[1]	259	!
	260	!-- Reorder the array in a way that the z index is in first position
[683]	261	!$OMP PARALLEL PRIVATE ( i, j, k )
	262	!$OMP DO
[1111]	263	!$acc kernels present( f_in )
	264	!$acc loop
[1003]	265	DO i = nxl, nxr
	266	DO j = nys, nyn
[1111]	267	!$acc loop vector( 32 )
[1003]	268	DO k = 1, nz
[164]	269	f_inv(j,i,k) = f_in(i,j,k)
[1]	270	ENDDO
	271	ENDDO
	272	ENDDO
[1111]	273	!$acc end kernels
[683]	274	!$OMP END PARALLEL
[1]	275
[683]	276	!$OMP PARALLEL PRIVATE ( i, j, k )
	277	!$OMP DO
[1111]	278	!$acc kernels present( f_out )
	279	!$acc loop
[1003]	280	DO k = 1, nz
	281	DO i = nxl, nxr
[1111]	282	!$acc loop vector( 32 )
[1003]	283	DO j = nys, nyn
[164]	284	f_out(k,j,i) = f_inv(j,i,k)
	285	ENDDO
[1]	286	ENDDO
	287	ENDDO
[1111]	288	!$acc end kernels
[683]	289	!$OMP END PARALLEL
[1]	290
[164]	291	ENDIF
	292
[1]	293	END SUBROUTINE transpose_xz
	294
	295
[164]	296	SUBROUTINE transpose_yx( f_in, work, f_out )
[1]	297
	298	!------------------------------------------------------------------------------!
	299	! Description:
	300	! ------------
	301	! Transposition of input array (f_in) from y to x. For the input array, all
	302	! elements along y reside on the same PE, while after transposition, all
	303	! elements along x reside on the same PE.
	304	!------------------------------------------------------------------------------!
	305
	306	USE cpulog
	307	USE indices
	308	USE interfaces
	309	USE pegrid
	310	USE transpose_indices
	311
	312	IMPLICIT NONE
	313
[1111]	314	INTEGER :: i, j, k, l, ys
[1]	315
[1111]	316	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]	317
[1111]	318	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
	319
	320	!$acc declare create( f_inv )
	321	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
	322
	323
[1106]	324	IF ( numprocs /= 1 ) THEN
	325
[1]	326	#if defined( __parallel )
	327	!
[1106]	328	!-- Reorder input array for transposition
[1111]	329	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	330	!$OMP DO
[1106]	331	DO l = 0, pdims(2) - 1
	332	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	333	!$acc kernels present( f_in, work )
	334	!$acc loop
[1106]	335	DO i = nxl_y, nxr_y
	336	DO k = nzb_y, nzt_y
[1111]	337	!$acc loop vector( 32 )
[1106]	338	DO j = ys, ys + nyn_x - nys_x
[1111]	339	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	340	ENDDO
	341	ENDDO
	342	ENDDO
[1111]	343	!$acc end kernels
[1106]	344	ENDDO
	345	!$OMP END PARALLEL
	346
	347	!
	348	!-- Transpose array
	349	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	350	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	351	!$acc update host( work )
	352	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	353	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	354	comm1dy, ierr )
[1111]	355	!$acc update device( f_inv )
[1106]	356	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	357	#endif
	358
	359	ELSE
	360
	361	!
	362	!-- Reorder array f_in the same way as ALLTOALL did it
	363	!$OMP PARALLEL PRIVATE ( i, j, k )
	364	!$OMP DO
[1111]	365	!$acc kernels present( f_in )
	366	!$acc loop
[1003]	367	DO i = nxl_y, nxr_y
	368	DO k = nzb_y, nzt_y
[1111]	369	!$acc loop vector( 32 )
[1106]	370	DO j = 0, ny
	371	f_inv(j,k,i) = f_in(j,i,k)
[1]	372	ENDDO
	373	ENDDO
	374	ENDDO
[1111]	375	!$acc end kernels
[683]	376	!$OMP END PARALLEL
[1]	377
[1106]	378	ENDIF
[1]	379
	380	!
	381	!-- Reorder transposed array in a way that the x index is in first position
[683]	382	!$OMP PARALLEL PRIVATE ( i, j, k )
	383	!$OMP DO
[1111]	384	!$acc kernels present( f_out )
	385	!$acc loop
[1003]	386	DO i = 0, nx
	387	DO k = nzb_x, nzt_x
[1111]	388	!$acc loop vector( 32 )
[1003]	389	DO j = nys_x, nyn_x
[164]	390	f_out(i,j,k) = f_inv(j,k,i)
[1]	391	ENDDO
	392	ENDDO
	393	ENDDO
[1111]	394	!$acc end kernels
[683]	395	!$OMP END PARALLEL
[1]	396
	397	END SUBROUTINE transpose_yx
	398
	399
[164]	400	SUBROUTINE transpose_yxd( f_in, work, f_out )
[1]	401
	402	!------------------------------------------------------------------------------!
	403	! Description:
	404	! ------------
	405	! Transposition of input array (f_in) from y to x. For the input array, all
	406	! elements along y reside on the same PE, while after transposition, all
	407	! elements along x reside on the same PE.
	408	! This is a direct transposition for arrays with indices in regular order
	409	! (k,j,i) (cf. transpose_yx).
	410	!------------------------------------------------------------------------------!
	411
	412	USE cpulog
	413	USE indices
	414	USE interfaces
	415	USE pegrid
	416	USE transpose_indices
	417
	418	IMPLICIT NONE
	419
[1092]	420	INTEGER :: i, j, k, l, m, xs
[1]	421
[1003]	422	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
	423	f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
[164]	424	work(nnxnnynnz)
[1]	425
	426	#if defined( __parallel )
	427
	428	!
	429	!-- Rearrange indices of input array in order to make data to be send
	430	!-- by MPI contiguous
[1003]	431	DO k = 1, nz
	432	DO j = nys, nyn
	433	DO i = nxl, nxr
[164]	434	f_inv(i,k,j) = f_in(k,j,i)
[1]	435	ENDDO
	436	ENDDO
	437	ENDDO
	438
	439	!
	440	!-- Transpose array
	441	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	442	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	443	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	444	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	445	comm1dx, ierr )
	446	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	447
	448	!
	449	!-- Reorder transposed array
	450	m = 0
	451	DO l = 0, pdims(1) - 1
	452	xs = 0 + l * nnx
[1003]	453	DO j = nys_x, nyn_x
	454	DO k = 1, nz
[1]	455	DO i = xs, xs + nnx - 1
	456	m = m + 1
[164]	457	f_out(i,j,k) = work(m)
[1]	458	ENDDO
	459	ENDDO
	460	ENDDO
	461	ENDDO
	462
	463	#endif
	464
	465	END SUBROUTINE transpose_yxd
	466
	467
[164]	468	SUBROUTINE transpose_yz( f_in, work, f_out )
[1]	469
	470	!------------------------------------------------------------------------------!
	471	! Description:
	472	! ------------
	473	! Transposition of input array (f_in) from y to z. For the input array, all
	474	! elements along y reside on the same PE, while after transposition, all
	475	! elements along z reside on the same PE.
	476	!------------------------------------------------------------------------------!
	477
	478	USE cpulog
	479	USE indices
	480	USE interfaces
	481	USE pegrid
	482	USE transpose_indices
	483
	484	IMPLICIT NONE
	485
[1111]	486	INTEGER :: i, j, k, l, zs
[1]	487
[1111]	488	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	489
[1111]	490	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
	491
	492	!$acc declare create( f_inv )
	493	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
	494
	495
[1]	496	!
	497	!-- Rearrange indices of input array in order to make data to be send
	498	!-- by MPI contiguous
[683]	499	!$OMP PARALLEL PRIVATE ( i, j, k )
	500	!$OMP DO
[1111]	501	!$acc kernels present( f_in )
	502	!$acc loop
[1003]	503	DO j = 0, ny
	504	DO k = nzb_y, nzt_y
[1111]	505	!$acc loop vector( 32 )
[1003]	506	DO i = nxl_y, nxr_y
[164]	507	f_inv(i,k,j) = f_in(j,i,k)
[1]	508	ENDDO
	509	ENDDO
	510	ENDDO
[1111]	511	!$acc end kernels
[683]	512	!$OMP END PARALLEL
[1]	513
	514	!
	515	!-- Move data to different array, because memory location of work1 is
	516	!-- needed further below (work1 = work2).
	517	!-- If the PE grid is one-dimensional along y, only local reordering
	518	!-- of the data is necessary and no transposition has to be done.
	519	IF ( pdims(1) == 1 ) THEN
[1106]	520
[683]	521	!$OMP PARALLEL PRIVATE ( i, j, k )
	522	!$OMP DO
[1111]	523	!$acc kernels present( f_out )
	524	!$acc loop
[1003]	525	DO j = 0, ny
	526	DO k = nzb_y, nzt_y
[1111]	527	!$acc loop vector( 32 )
[1003]	528	DO i = nxl_y, nxr_y
[164]	529	f_out(i,j,k) = f_inv(i,k,j)
[1]	530	ENDDO
	531	ENDDO
	532	ENDDO
[1111]	533	!$acc end kernels
[683]	534	!$OMP END PARALLEL
[1]	535
[1106]	536	ELSE
	537
	538	#if defined( __parallel )
[1]	539	!
[1106]	540	!-- Transpose array
	541	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	542	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	543	!$acc update host( f_inv )
	544	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	545	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	546	comm1dx, ierr )
[1111]	547	!$acc update device( work )
[1106]	548	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	549
	550	!
[1106]	551	!-- Reorder transposed array
[1111]	552	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	553	!$OMP DO
[1106]	554	DO l = 0, pdims(1) - 1
	555	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]	556	!$acc kernels present( f_out, work )
	557	!$acc loop
[1106]	558	DO j = nys_z, nyn_z
	559	DO k = zs, zs + nzt_y - nzb_y
[1111]	560	!$acc loop vector( 32 )
[1106]	561	DO i = nxl_z, nxr_z
[1111]	562	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	563	ENDDO
[1]	564	ENDDO
	565	ENDDO
[1111]	566	!$acc end kernels
[1]	567	ENDDO
[683]	568	!$OMP END PARALLEL
[1]	569	#endif
	570
[1106]	571	ENDIF
	572
[1]	573	END SUBROUTINE transpose_yz
	574
	575
[164]	576	SUBROUTINE transpose_zx( f_in, work, f_out )
[1]	577
	578	!------------------------------------------------------------------------------!
	579	! Description:
	580	! ------------
	581	! Transposition of input array (f_in) from z to x. For the input array, all
	582	! elements along z reside on the same PE, while after transposition, all
	583	! elements along x reside on the same PE.
	584	!------------------------------------------------------------------------------!
	585
	586	USE cpulog
	587	USE indices
	588	USE interfaces
	589	USE pegrid
	590	USE transpose_indices
	591
	592	IMPLICIT NONE
	593
[1111]	594	INTEGER :: i, j, k, l, xs
[1]	595
[1111]	596	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]	597
[1111]	598	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
	599
	600	!$acc declare create( f_inv )
[1106]	601	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
[1]	602
[1106]	603
[1]	604	!
	605	!-- Rearrange indices of input array in order to make data to be send
	606	!-- by MPI contiguous
[683]	607	!$OMP PARALLEL PRIVATE ( i, j, k )
	608	!$OMP DO
[1106]	609	!$acc kernels present( f_in )
	610	!$acc loop
[1003]	611	DO k = 1,nz
	612	DO i = nxl, nxr
[1106]	613	!$acc loop vector( 32 )
[1003]	614	DO j = nys, nyn
[164]	615	f_inv(j,i,k) = f_in(k,j,i)
[1]	616	ENDDO
	617	ENDDO
	618	ENDDO
[1111]	619	!$acc end kernels
[683]	620	!$OMP END PARALLEL
[1]	621
	622	!
	623	!-- Move data to different array, because memory location of work1 is
	624	!-- needed further below (work1 = work2).
	625	!-- If the PE grid is one-dimensional along y, only local reordering
	626	!-- of the data is necessary and no transposition has to be done.
	627	IF ( pdims(1) == 1 ) THEN
[1106]	628
[683]	629	!$OMP PARALLEL PRIVATE ( i, j, k )
	630	!$OMP DO
[1106]	631	!$acc kernels present( f_out )
	632	!$acc loop
[1003]	633	DO k = 1, nz
	634	DO i = nxl, nxr
[1106]	635	!$acc loop vector( 32 )
[1003]	636	DO j = nys, nyn
[164]	637	f_out(i,j,k) = f_inv(j,i,k)
[1]	638	ENDDO
	639	ENDDO
	640	ENDDO
[1111]	641	!$acc end kernels
[683]	642	!$OMP END PARALLEL
[1]	643
[1106]	644	ELSE
	645
	646	#if defined( __parallel )
[1]	647	!
[1106]	648	!-- Transpose array
	649	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	650	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	651	!$acc update host( f_inv )
	652	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	653	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	654	comm1dx, ierr )
[1111]	655	!$acc update device( work )
[1106]	656	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	657
	658	!
[1106]	659	!-- Reorder transposed array
[1111]	660	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	661	!$OMP DO
[1106]	662	DO l = 0, pdims(1) - 1
	663	xs = 0 + l * nnx
[1111]	664	!$acc kernels present( f_out, work )
	665	!$acc loop
[1106]	666	DO k = nzb_x, nzt_x
	667	DO i = xs, xs + nnx - 1
[1111]	668	!$acc loop vector( 32 )
[1106]	669	DO j = nys_x, nyn_x
[1111]	670	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	671	ENDDO
[1]	672	ENDDO
	673	ENDDO
[1111]	674	!$acc end kernels
[1]	675	ENDDO
[683]	676	!$OMP END PARALLEL
[1]	677	#endif
	678
[1106]	679	ENDIF
	680
[1]	681	END SUBROUTINE transpose_zx
	682
	683
[164]	684	SUBROUTINE transpose_zy( f_in, work, f_out )
[1]	685
	686	!------------------------------------------------------------------------------!
	687	! Description:
	688	! ------------
	689	! Transposition of input array (f_in) from z to y. For the input array, all
	690	! elements along z reside on the same PE, while after transposition, all
	691	! elements along y reside on the same PE.
	692	!------------------------------------------------------------------------------!
	693
	694	USE cpulog
	695	USE indices
	696	USE interfaces
	697	USE pegrid
	698	USE transpose_indices
	699
	700	IMPLICIT NONE
	701
[1111]	702	INTEGER :: i, j, k, l, zs
[1]	703
[1111]	704	REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]	705
[1111]	706	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
	707
	708	!$acc declare create( f_inv )
	709	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
	710
	711
[1]	712	!
	713	!-- If the PE grid is one-dimensional along y, the array has only to be
	714	!-- reordered locally and therefore no transposition has to be done.
	715	IF ( pdims(1) /= 1 ) THEN
[1106]	716
	717	#if defined( __parallel )
[1]	718	!
	719	!-- Reorder input array for transposition
[1111]	720	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	721	!$OMP DO
[1]	722	DO l = 0, pdims(1) - 1
[1003]	723	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]	724	!$acc kernels present( f_in, work )
	725	!$acc loop
[1003]	726	DO j = nys_z, nyn_z
	727	DO k = zs, zs + nzt_y - nzb_y
[1111]	728	!$acc loop vector( 32 )
[1003]	729	DO i = nxl_z, nxr_z
[1111]	730	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	731	ENDDO
	732	ENDDO
	733	ENDDO
[1111]	734	!$acc end kernels
[1]	735	ENDDO
[683]	736	!$OMP END PARALLEL
[1]	737
	738	!
	739	!-- Transpose array
	740	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	741	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	742	!$acc update host( work )
	743	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	744	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	745	comm1dx, ierr )
[1111]	746	!$acc update device( f_inv )
[1]	747	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	748	#endif
[1]	749
	750	ELSE
	751	!
[1106]	752	!-- Reorder the array in the same way like ALLTOALL did it
[683]	753	!$OMP PARALLEL PRIVATE ( i, j, k )
	754	!$OMP DO
[1111]	755	!$acc kernels present( f_in )
	756	!$acc loop
[1003]	757	DO k = nzb_y, nzt_y
	758	DO j = 0, ny
[1111]	759	!$acc loop vector( 32 )
[1003]	760	DO i = nxl_y, nxr_y
[164]	761	f_inv(i,k,j) = f_in(i,j,k)
	762	ENDDO
	763	ENDDO
	764	ENDDO
[1111]	765	!$acc end kernels
[683]	766	!$OMP END PARALLEL
[1106]	767
	768	ENDIF
	769
[164]	770	!
[1106]	771	!-- Reorder transposed array in a way that the y index is in first position
[683]	772	!$OMP PARALLEL PRIVATE ( i, j, k )
	773	!$OMP DO
[1111]	774	!$acc kernels present( f_out )
	775	!$acc loop
[1106]	776	DO k = nzb_y, nzt_y
	777	DO i = nxl_y, nxr_y
[1111]	778	!$acc loop vector( 32 )
[1106]	779	DO j = 0, ny
	780	f_out(j,i,k) = f_inv(i,k,j)
[1]	781	ENDDO
	782	ENDDO
[1106]	783	ENDDO
[1111]	784	!$acc end kernels
[683]	785	!$OMP END PARALLEL
[164]	786
[1]	787	END SUBROUTINE transpose_zy
	788
	789
[164]	790	SUBROUTINE transpose_zyd( f_in, work, f_out )
[1]	791
	792	!------------------------------------------------------------------------------!
	793	! Description:
	794	! ------------
	795	! Transposition of input array (f_in) from z to y. For the input array, all
	796	! elements along z reside on the same PE, while after transposition, all
	797	! elements along y reside on the same PE.
	798	! This is a direct transposition for arrays with indices in regular order
	799	! (k,j,i) (cf. transpose_zy).
	800	!------------------------------------------------------------------------------!
	801
	802	USE cpulog
	803	USE indices
	804	USE interfaces
	805	USE pegrid
	806	USE transpose_indices
	807
	808	IMPLICIT NONE
	809
	810	INTEGER :: i, j, k, l, m, ys
	811
[1003]	812	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
	813	f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd), &
[164]	814	work(nnxnnynnz)
[1]	815
	816	#if defined( __parallel )
	817
	818	!
	819	!-- Rearrange indices of input array in order to make data to be send
	820	!-- by MPI contiguous
[1003]	821	DO i = nxl, nxr
	822	DO j = nys, nyn
	823	DO k = 1, nz
[164]	824	f_inv(j,i,k) = f_in(k,j,i)
[1]	825	ENDDO
	826	ENDDO
	827	ENDDO
	828
	829	!
	830	!-- Move data to different array, because memory location of work1 is
	831	!-- needed further below (work1 = work2).
	832	!-- If the PE grid is one-dimensional along x, only local reordering
	833	!-- of the data is necessary and no transposition has to be done.
	834	IF ( pdims(2) == 1 ) THEN
[1003]	835	DO k = 1, nz
	836	DO i = nxl, nxr
	837	DO j = nys, nyn
[164]	838	f_out(j,i,k) = f_inv(j,i,k)
[1]	839	ENDDO
	840	ENDDO
	841	ENDDO
	842	RETURN
	843	ENDIF
	844
	845	!
	846	!-- Transpose array
	847	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	848	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	849	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	850	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	851	comm1dy, ierr )
	852	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	853
	854	!
	855	!-- Reorder transposed array
	856	m = 0
	857	DO l = 0, pdims(2) - 1
	858	ys = 0 + l * nny
[1003]	859	DO k = nzb_yd, nzt_yd
	860	DO i = nxl_yd, nxr_yd
[1]	861	DO j = ys, ys + nny - 1
	862	m = m + 1
[164]	863	f_out(j,i,k) = work(m)
[1]	864	ENDDO
	865	ENDDO
	866	ENDDO
	867	ENDDO
	868
	869	#endif
	870
	871	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |