Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1280

Last change on this file since 1280 was 1258, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 25.9 KB

Rev	Line
[1216]	1	SUBROUTINE resort_for_xy( f_in, f_inv )
[1]	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[198]	22	!
[1258]	23	!
[198]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 1258 2013-11-08 16:09:09Z raasch $
	27	!
[1258]	28	! 1257 2013-11-08 15:18:40Z raasch
	29	! openacc loop and loop vector clauses removed
	30	!
[1217]	31	! 1216 2013-08-26 09:31:42Z raasch
	32	! re-sorting of the transposed / to be transposed arrays moved to separate
	33	! routines resort_for_...
	34	!
[1112]	35	! 1111 2013-03-08 23:54:10Z raasch
	36	! openACC directives added,
	37	! resorting data from/to work changed, work got 4 dimensions instead of 1
	38	!
[1107]	39	! 1106 2013-03-04 05:31:38Z raasch
	40	! preprocessor lines rearranged so that routines can also be used in serial
	41	! (non-parallel) mode
	42	!
[1093]	43	! 1092 2013-02-02 11:24:22Z raasch
	44	! unused variables removed
	45	!
[1037]	46	! 1036 2012-10-22 13:43:42Z raasch
	47	! code put under GPL (PALM 3.9)
	48	!
[1004]	49	! 1003 2012-09-14 14:35:53Z raasch
	50	! indices nxa, nya, etc. replaced by nx, ny, etc.
	51	!
[684]	52	! 683 2011-02-09 14:25:15Z raasch
	53	! openMP parallelization of transpositions for 2d-domain-decomposition
	54	!
[623]	55	! 622 2010-12-10 08:08:13Z raasch
	56	! optional barriers included in order to speed up collective operations
	57	!
[198]	58	! 164 2008-05-15 08:46:15Z raasch
[164]	59	! f_inv changed from subroutine argument to automatic array in order to do
	60	! re-ordering from f_in to f_inv in one step, one array work is needed instead
	61	! of work1 and work2
[1]	62	!
[198]	63	! February 2007
[3]	64	! RCS Log replace by Id keyword, revision history cleaned up
	65	!
[1]	66	! Revision 1.2 2004/04/30 13:12:17 raasch
	67	! Switched from mpi_alltoallv to the simpler mpi_alltoall,
	68	! all former transpose-routine files collected in this file, enlarged
	69	! transposition arrays introduced
	70	!
	71	! Revision 1.1 2004/04/30 13:08:16 raasch
	72	! Initial revision (collection of former routines transpose_xy, transpose_xz,
	73	! transpose_yx, transpose_yz, transpose_zx, transpose_zy)
	74	!
	75	! Revision 1.1 1997/07/24 11:25:18 raasch
	76	! Initial revision
	77	!
[1216]	78	!------------------------------------------------------------------------------!
	79	! Description:
	80	! ------------
	81	! Resorting data for the transposition from x to y. The transposition itself
	82	! is carried out in transpose_xy
	83	!------------------------------------------------------------------------------!
	84
	85	USE indices
	86	USE transpose_indices
	87
	88	IMPLICIT NONE
	89
	90	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
	91	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
	92
	93
	94	INTEGER :: i, j, k
	95
[1]	96	!
[1216]	97	!-- Rearrange indices of input array in order to make data to be send
	98	!-- by MPI contiguous
	99	!$OMP PARALLEL PRIVATE ( i, j, k )
	100	!$OMP DO
	101	!$acc kernels present( f_in, f_inv )
	102	DO i = 0, nx
	103	DO k = nzb_x, nzt_x
	104	DO j = nys_x, nyn_x
	105	f_inv(j,k,i) = f_in(i,j,k)
	106	ENDDO
	107	ENDDO
	108	ENDDO
	109	!$acc end kernels
	110	!$OMP END PARALLEL
	111
	112	END SUBROUTINE resort_for_xy
	113
	114
	115	SUBROUTINE transpose_xy( f_inv, f_out )
	116
	117	!------------------------------------------------------------------------------!
[1]	118	! Description:
	119	! ------------
	120	! Transposition of input array (f_in) from x to y. For the input array, all
	121	! elements along x reside on the same PE, while after transposition, all
	122	! elements along y reside on the same PE.
	123	!------------------------------------------------------------------------------!
	124
	125	USE cpulog
	126	USE indices
	127	USE interfaces
	128	USE pegrid
	129	USE transpose_indices
	130
	131	IMPLICIT NONE
	132
[1111]	133	INTEGER :: i, j, k, l, ys
[1]	134
[1216]	135	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
[1]	136
[1111]	137	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
	138
	139
[1106]	140	IF ( numprocs /= 1 ) THEN
	141
	142	#if defined( __parallel )
[1]	143	!
[1106]	144	!-- Transpose array
	145	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	146	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	147	!$acc update host( f_inv )
	148	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	149	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	150	comm1dy, ierr )
	151	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	152
	153	!
[1106]	154	!-- Reorder transposed array
[1111]	155	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	156	!$OMP DO
[1216]	157	!$acc data copyin( work )
[1106]	158	DO l = 0, pdims(2) - 1
	159	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	160	!$acc kernels present( f_out, work )
[1106]	161	DO i = nxl_y, nxr_y
	162	DO k = nzb_y, nzt_y
	163	DO j = ys, ys + nyn_x - nys_x
[1111]	164	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	165	ENDDO
[1]	166	ENDDO
	167	ENDDO
[1111]	168	!$acc end kernels
[1]	169	ENDDO
[1216]	170	!$acc end data
[683]	171	!$OMP END PARALLEL
[1]	172	#endif
	173
[1106]	174	ELSE
	175
	176	!
	177	!-- Reorder transposed array
	178	!$OMP PARALLEL PRIVATE ( i, j, k )
	179	!$OMP DO
[1216]	180	!$acc kernels present( f_inv, f_out )
[1106]	181	DO k = nzb_y, nzt_y
	182	DO i = nxl_y, nxr_y
	183	DO j = 0, ny
	184	f_out(j,i,k) = f_inv(j,k,i)
	185	ENDDO
	186	ENDDO
	187	ENDDO
[1111]	188	!$acc end kernels
[1106]	189	!$OMP END PARALLEL
	190
	191	ENDIF
	192
[1]	193	END SUBROUTINE transpose_xy
	194
	195
[1216]	196	SUBROUTINE resort_for_xz( f_inv, f_out )
[1]	197
	198	!------------------------------------------------------------------------------!
	199	! Description:
	200	! ------------
[1216]	201	! Resorting data after the transposition from x to z. The transposition itself
	202	! is carried out in transpose_xz
	203	!------------------------------------------------------------------------------!
	204
	205	USE indices
	206	USE transpose_indices
	207
	208	IMPLICIT NONE
	209
	210	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
	211	REAL :: f_out(1:nz,nys:nyn,nxl:nxr)
	212
	213
	214	INTEGER :: i, j, k
	215
	216	!
	217	!-- Rearrange indices of input array in order to make data to be send
	218	!-- by MPI contiguous.
	219	!-- In case of parallel fft/transposition, scattered store is faster in
	220	!-- backward direction!!!
	221	!$OMP PARALLEL PRIVATE ( i, j, k )
	222	!$OMP DO
	223	!$acc kernels present( f_inv, f_out )
	224	DO k = 1, nz
	225	DO i = nxl, nxr
	226	DO j = nys, nyn
	227	f_out(k,j,i) = f_inv(j,i,k)
	228	ENDDO
	229	ENDDO
	230	ENDDO
	231	!$acc end kernels
	232	!$OMP END PARALLEL
	233
	234	END SUBROUTINE resort_for_xz
	235
	236
	237	SUBROUTINE transpose_xz( f_in, f_inv )
	238
	239	!------------------------------------------------------------------------------!
	240	! Description:
	241	! ------------
[1]	242	! Transposition of input array (f_in) from x to z. For the input array, all
	243	! elements along x reside on the same PE, while after transposition, all
	244	! elements along z reside on the same PE.
	245	!------------------------------------------------------------------------------!
	246
	247	USE cpulog
	248	USE indices
	249	USE interfaces
	250	USE pegrid
	251	USE transpose_indices
	252
	253	IMPLICIT NONE
	254
[1111]	255	INTEGER :: i, j, k, l, xs
[1]	256
[1216]	257	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
[1]	258
[1111]	259	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
[1]	260
[1111]	261
[1]	262	!
	263	!-- If the PE grid is one-dimensional along y, the array has only to be
	264	!-- reordered locally and therefore no transposition has to be done.
	265	IF ( pdims(1) /= 1 ) THEN
[1106]	266
	267	#if defined( __parallel )
[1]	268	!
	269	!-- Reorder input array for transposition
[1111]	270	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	271	!$OMP DO
[1216]	272	!$acc data copyout( work )
[1]	273	DO l = 0, pdims(1) - 1
	274	xs = 0 + l * nnx
[1111]	275	!$acc kernels present( f_in, work )
[1003]	276	DO k = nzb_x, nzt_x
[164]	277	DO i = xs, xs + nnx - 1
[1003]	278	DO j = nys_x, nyn_x
[1111]	279	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	280	ENDDO
	281	ENDDO
	282	ENDDO
[1111]	283	!$acc end kernels
[1]	284	ENDDO
[1216]	285	!$acc end data
[683]	286	!$OMP END PARALLEL
[1]	287
	288	!
	289	!-- Transpose array
	290	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	291	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	292	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	293	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	294	comm1dx, ierr )
[1111]	295	!$acc update device( f_inv )
[1]	296	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	297	#endif
	298
[1]	299	ELSE
[1106]	300
[1]	301	!
	302	!-- Reorder the array in a way that the z index is in first position
[683]	303	!$OMP PARALLEL PRIVATE ( i, j, k )
	304	!$OMP DO
[1216]	305	!$acc kernels present( f_in, f_inv )
[1003]	306	DO i = nxl, nxr
	307	DO j = nys, nyn
	308	DO k = 1, nz
[164]	309	f_inv(j,i,k) = f_in(i,j,k)
[1]	310	ENDDO
	311	ENDDO
	312	ENDDO
[1111]	313	!$acc end kernels
[683]	314	!$OMP END PARALLEL
[1]	315
[164]	316	ENDIF
	317
[1]	318	END SUBROUTINE transpose_xz
	319
	320
[1216]	321	SUBROUTINE resort_for_yx( f_inv, f_out )
[1]	322
	323	!------------------------------------------------------------------------------!
	324	! Description:
	325	! ------------
[1216]	326	! Resorting data after the transposition from y to x. The transposition itself
	327	! is carried out in transpose_yx
	328	!------------------------------------------------------------------------------!
	329
	330	USE indices
	331	USE transpose_indices
	332
	333	IMPLICIT NONE
	334
	335	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
	336	REAL :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
	337
	338
	339	INTEGER :: i, j, k
	340
	341	!
	342	!-- Rearrange indices of input array in order to make data to be send
	343	!-- by MPI contiguous
	344	!$OMP PARALLEL PRIVATE ( i, j, k )
	345	!$OMP DO
	346	!$acc kernels present( f_inv, f_out )
	347	DO i = 0, nx
	348	DO k = nzb_x, nzt_x
	349	DO j = nys_x, nyn_x
	350	f_out(i,j,k) = f_inv(j,k,i)
	351	ENDDO
	352	ENDDO
	353	ENDDO
	354	!$acc end kernels
	355	!$OMP END PARALLEL
	356
	357	END SUBROUTINE resort_for_yx
	358
	359
	360	SUBROUTINE transpose_yx( f_in, f_inv )
	361
	362	!------------------------------------------------------------------------------!
	363	! Description:
	364	! ------------
[1]	365	! Transposition of input array (f_in) from y to x. For the input array, all
	366	! elements along y reside on the same PE, while after transposition, all
	367	! elements along x reside on the same PE.
	368	!------------------------------------------------------------------------------!
	369
	370	USE cpulog
	371	USE indices
	372	USE interfaces
	373	USE pegrid
	374	USE transpose_indices
	375
	376	IMPLICIT NONE
	377
[1111]	378	INTEGER :: i, j, k, l, ys
[1]	379
[1216]	380	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
[1]	381
[1111]	382	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
	383
	384
[1106]	385	IF ( numprocs /= 1 ) THEN
	386
[1]	387	#if defined( __parallel )
	388	!
[1106]	389	!-- Reorder input array for transposition
[1111]	390	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	391	!$OMP DO
[1216]	392	!$acc data copyout( work )
[1106]	393	DO l = 0, pdims(2) - 1
	394	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	395	!$acc kernels present( f_in, work )
[1106]	396	DO i = nxl_y, nxr_y
	397	DO k = nzb_y, nzt_y
	398	DO j = ys, ys + nyn_x - nys_x
[1111]	399	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	400	ENDDO
	401	ENDDO
	402	ENDDO
[1111]	403	!$acc end kernels
[1106]	404	ENDDO
[1216]	405	!$acc end data
[1106]	406	!$OMP END PARALLEL
	407
	408	!
	409	!-- Transpose array
	410	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	411	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	412	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	413	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	414	comm1dy, ierr )
[1111]	415	!$acc update device( f_inv )
[1106]	416	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	417	#endif
	418
	419	ELSE
	420
	421	!
	422	!-- Reorder array f_in the same way as ALLTOALL did it
	423	!$OMP PARALLEL PRIVATE ( i, j, k )
	424	!$OMP DO
[1216]	425	!$acc kernels present( f_in, f_inv )
[1003]	426	DO i = nxl_y, nxr_y
	427	DO k = nzb_y, nzt_y
[1106]	428	DO j = 0, ny
	429	f_inv(j,k,i) = f_in(j,i,k)
[1]	430	ENDDO
	431	ENDDO
	432	ENDDO
[1111]	433	!$acc end kernels
[683]	434	!$OMP END PARALLEL
[1]	435
[1106]	436	ENDIF
[1]	437
	438	END SUBROUTINE transpose_yx
	439
	440
[1216]	441	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	442
	443	!------------------------------------------------------------------------------!
	444	! Description:
	445	! ------------
	446	! Transposition of input array (f_in) from y to x. For the input array, all
	447	! elements along y reside on the same PE, while after transposition, all
	448	! elements along x reside on the same PE.
	449	! This is a direct transposition for arrays with indices in regular order
	450	! (k,j,i) (cf. transpose_yx).
	451	!------------------------------------------------------------------------------!
	452
	453	USE cpulog
	454	USE indices
	455	USE interfaces
	456	USE pegrid
	457	USE transpose_indices
	458
	459	IMPLICIT NONE
	460
[1092]	461	INTEGER :: i, j, k, l, m, xs
[1]	462
[1003]	463	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
	464	f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
[164]	465	work(nnxnnynnz)
[1]	466
	467	#if defined( __parallel )
	468
	469	!
	470	!-- Rearrange indices of input array in order to make data to be send
	471	!-- by MPI contiguous
[1003]	472	DO k = 1, nz
	473	DO j = nys, nyn
	474	DO i = nxl, nxr
[164]	475	f_inv(i,k,j) = f_in(k,j,i)
[1]	476	ENDDO
	477	ENDDO
	478	ENDDO
	479
	480	!
	481	!-- Transpose array
	482	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	483	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	484	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	485	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	486	comm1dx, ierr )
	487	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	488
	489	!
	490	!-- Reorder transposed array
	491	m = 0
	492	DO l = 0, pdims(1) - 1
	493	xs = 0 + l * nnx
[1003]	494	DO j = nys_x, nyn_x
	495	DO k = 1, nz
[1]	496	DO i = xs, xs + nnx - 1
	497	m = m + 1
[164]	498	f_out(i,j,k) = work(m)
[1]	499	ENDDO
	500	ENDDO
	501	ENDDO
	502	ENDDO
	503
	504	#endif
	505
	506	END SUBROUTINE transpose_yxd
	507
	508
[1216]	509	SUBROUTINE resort_for_yz( f_in, f_inv )
[1]	510
	511	!------------------------------------------------------------------------------!
	512	! Description:
	513	! ------------
[1216]	514	! Resorting data for the transposition from y to z. The transposition itself
	515	! is carried out in transpose_yz
	516	!------------------------------------------------------------------------------!
	517
	518	USE indices
	519	USE transpose_indices
	520
	521	IMPLICIT NONE
	522
	523	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
	524	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
	525
	526
	527	INTEGER :: i, j, k
	528
	529	!
	530	!-- Rearrange indices of input array in order to make data to be send
	531	!-- by MPI contiguous
	532	!$OMP PARALLEL PRIVATE ( i, j, k )
	533	!$OMP DO
	534	!$acc kernels present( f_in, f_inv )
	535	DO j = 0, ny
	536	DO k = nzb_y, nzt_y
	537	DO i = nxl_y, nxr_y
	538	f_inv(i,k,j) = f_in(j,i,k)
	539	ENDDO
	540	ENDDO
	541	ENDDO
	542	!$acc end kernels
	543	!$OMP END PARALLEL
	544
	545	END SUBROUTINE resort_for_yz
	546
	547
	548	SUBROUTINE transpose_yz( f_inv, f_out )
	549
	550	!------------------------------------------------------------------------------!
	551	! Description:
	552	! ------------
[1]	553	! Transposition of input array (f_in) from y to z. For the input array, all
	554	! elements along y reside on the same PE, while after transposition, all
	555	! elements along z reside on the same PE.
	556	!------------------------------------------------------------------------------!
	557
	558	USE cpulog
	559	USE indices
	560	USE interfaces
	561	USE pegrid
	562	USE transpose_indices
	563
	564	IMPLICIT NONE
	565
[1111]	566	INTEGER :: i, j, k, l, zs
[1]	567
[1216]	568	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
[1]	569
[1111]	570	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
	571
	572
[1]	573	!
	574	!-- If the PE grid is one-dimensional along y, only local reordering
	575	!-- of the data is necessary and no transposition has to be done.
	576	IF ( pdims(1) == 1 ) THEN
[1106]	577
[683]	578	!$OMP PARALLEL PRIVATE ( i, j, k )
	579	!$OMP DO
[1216]	580	!$acc kernels present( f_inv, f_out )
[1003]	581	DO j = 0, ny
	582	DO k = nzb_y, nzt_y
	583	DO i = nxl_y, nxr_y
[164]	584	f_out(i,j,k) = f_inv(i,k,j)
[1]	585	ENDDO
	586	ENDDO
	587	ENDDO
[1111]	588	!$acc end kernels
[683]	589	!$OMP END PARALLEL
[1]	590
[1106]	591	ELSE
	592
	593	#if defined( __parallel )
[1]	594	!
[1106]	595	!-- Transpose array
	596	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	597	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	598	!$acc update host( f_inv )
	599	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	600	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	601	comm1dx, ierr )
	602	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	603
	604	!
[1106]	605	!-- Reorder transposed array
[1111]	606	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	607	!$OMP DO
[1216]	608	!$acc data copyin( work )
[1106]	609	DO l = 0, pdims(1) - 1
	610	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1216]	611	!$acc kernels present( f_out )
[1106]	612	DO j = nys_z, nyn_z
	613	DO k = zs, zs + nzt_y - nzb_y
	614	DO i = nxl_z, nxr_z
[1111]	615	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	616	ENDDO
[1]	617	ENDDO
	618	ENDDO
[1111]	619	!$acc end kernels
[1]	620	ENDDO
[1216]	621	!$acc end data
[683]	622	!$OMP END PARALLEL
[1]	623	#endif
	624
[1106]	625	ENDIF
	626
[1]	627	END SUBROUTINE transpose_yz
	628
	629
[1216]	630	SUBROUTINE resort_for_zx( f_in, f_inv )
[1]	631
	632	!------------------------------------------------------------------------------!
	633	! Description:
	634	! ------------
[1216]	635	! Resorting data for the transposition from z to x. The transposition itself
	636	! is carried out in transpose_zx
	637	!------------------------------------------------------------------------------!
	638
	639	USE indices
	640	USE transpose_indices
	641
	642	IMPLICIT NONE
	643
	644	REAL :: f_in(1:nz,nys:nyn,nxl:nxr)
	645	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
	646
	647
	648	INTEGER :: i, j, k
	649
	650	!
	651	!-- Rearrange indices of input array in order to make data to be send
	652	!-- by MPI contiguous
	653	!$OMP PARALLEL PRIVATE ( i, j, k )
	654	!$OMP DO
	655	!$acc kernels present( f_in, f_inv )
	656	DO k = 1,nz
	657	DO i = nxl, nxr
	658	DO j = nys, nyn
	659	f_inv(j,i,k) = f_in(k,j,i)
	660	ENDDO
	661	ENDDO
	662	ENDDO
	663	!$acc end kernels
	664	!$OMP END PARALLEL
	665
	666	END SUBROUTINE resort_for_zx
	667
	668
	669	SUBROUTINE transpose_zx( f_inv, f_out )
	670
	671	!------------------------------------------------------------------------------!
	672	! Description:
	673	! ------------
[1]	674	! Transposition of input array (f_in) from z to x. For the input array, all
	675	! elements along z reside on the same PE, while after transposition, all
	676	! elements along x reside on the same PE.
	677	!------------------------------------------------------------------------------!
	678
	679	USE cpulog
	680	USE indices
	681	USE interfaces
	682	USE pegrid
	683	USE transpose_indices
	684
	685	IMPLICIT NONE
	686
[1111]	687	INTEGER :: i, j, k, l, xs
[1]	688
[1216]	689	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
[1]	690
[1111]	691	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
	692
[1]	693
	694	!
	695	!-- If the PE grid is one-dimensional along y, only local reordering
	696	!-- of the data is necessary and no transposition has to be done.
	697	IF ( pdims(1) == 1 ) THEN
[1106]	698
[683]	699	!$OMP PARALLEL PRIVATE ( i, j, k )
	700	!$OMP DO
[1216]	701	!$acc kernels present( f_inv, f_out )
[1003]	702	DO k = 1, nz
	703	DO i = nxl, nxr
	704	DO j = nys, nyn
[164]	705	f_out(i,j,k) = f_inv(j,i,k)
[1]	706	ENDDO
	707	ENDDO
	708	ENDDO
[1111]	709	!$acc end kernels
[683]	710	!$OMP END PARALLEL
[1]	711
[1106]	712	ELSE
	713
	714	#if defined( __parallel )
[1]	715	!
[1106]	716	!-- Transpose array
	717	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	718	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	719	!$acc update host( f_inv )
	720	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	721	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	722	comm1dx, ierr )
	723	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	724
	725	!
[1106]	726	!-- Reorder transposed array
[1111]	727	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	728	!$OMP DO
[1216]	729	!$acc data copyin( work )
[1106]	730	DO l = 0, pdims(1) - 1
	731	xs = 0 + l * nnx
[1216]	732	!$acc kernels present( f_out )
[1106]	733	DO k = nzb_x, nzt_x
	734	DO i = xs, xs + nnx - 1
	735	DO j = nys_x, nyn_x
[1111]	736	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	737	ENDDO
[1]	738	ENDDO
	739	ENDDO
[1111]	740	!$acc end kernels
[1]	741	ENDDO
[1216]	742	!$acc end data
[683]	743	!$OMP END PARALLEL
[1]	744	#endif
	745
[1106]	746	ENDIF
	747
[1]	748	END SUBROUTINE transpose_zx
	749
	750
[1216]	751	SUBROUTINE resort_for_zy( f_inv, f_out )
[1]	752
	753	!------------------------------------------------------------------------------!
	754	! Description:
	755	! ------------
[1216]	756	! Resorting data after the transposition from z to y. The transposition itself
	757	! is carried out in transpose_zy
	758	!------------------------------------------------------------------------------!
	759
	760	USE indices
	761	USE transpose_indices
	762
	763	IMPLICIT NONE
	764
	765	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
	766	REAL :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
	767
	768
	769	INTEGER :: i, j, k
	770
	771	!
	772	!-- Rearrange indices of input array in order to make data to be send
	773	!-- by MPI contiguous
	774	!$OMP PARALLEL PRIVATE ( i, j, k )
	775	!$OMP DO
	776	!$acc kernels present( f_inv, f_out )
	777	DO k = nzb_y, nzt_y
	778	DO j = 0, ny
	779	DO i = nxl_y, nxr_y
	780	f_out(j,i,k) = f_inv(i,k,j)
	781	ENDDO
	782	ENDDO
	783	ENDDO
	784	!$acc end kernels
	785	!$OMP END PARALLEL
	786
	787	END SUBROUTINE resort_for_zy
	788
	789
	790	SUBROUTINE transpose_zy( f_in, f_inv )
	791
	792	!------------------------------------------------------------------------------!
	793	! Description:
	794	! ------------
[1]	795	! Transposition of input array (f_in) from z to y. For the input array, all
	796	! elements along z reside on the same PE, while after transposition, all
	797	! elements along y reside on the same PE.
	798	!------------------------------------------------------------------------------!
	799
	800	USE cpulog
	801	USE indices
	802	USE interfaces
	803	USE pegrid
	804	USE transpose_indices
	805
	806	IMPLICIT NONE
	807
[1111]	808	INTEGER :: i, j, k, l, zs
[1]	809
[1216]	810	REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
[1]	811
[1111]	812	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
	813
	814
[1]	815	!
	816	!-- If the PE grid is one-dimensional along y, the array has only to be
	817	!-- reordered locally and therefore no transposition has to be done.
	818	IF ( pdims(1) /= 1 ) THEN
[1106]	819
	820	#if defined( __parallel )
[1]	821	!
	822	!-- Reorder input array for transposition
[1111]	823	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	824	!$OMP DO
[1216]	825	!$acc data copyout( work )
[1]	826	DO l = 0, pdims(1) - 1
[1003]	827	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]	828	!$acc kernels present( f_in, work )
[1003]	829	DO j = nys_z, nyn_z
	830	DO k = zs, zs + nzt_y - nzb_y
	831	DO i = nxl_z, nxr_z
[1111]	832	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	833	ENDDO
	834	ENDDO
	835	ENDDO
[1111]	836	!$acc end kernels
[1]	837	ENDDO
[1216]	838	!$acc end data
[683]	839	!$OMP END PARALLEL
[1]	840
	841	!
	842	!-- Transpose array
	843	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	844	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	845	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	846	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	847	comm1dx, ierr )
[1111]	848	!$acc update device( f_inv )
[1]	849	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	850	#endif
[1]	851
	852	ELSE
	853	!
[1106]	854	!-- Reorder the array in the same way like ALLTOALL did it
[683]	855	!$OMP PARALLEL PRIVATE ( i, j, k )
	856	!$OMP DO
[1216]	857	!$acc kernels present( f_in, f_inv )
[1003]	858	DO k = nzb_y, nzt_y
	859	DO j = 0, ny
	860	DO i = nxl_y, nxr_y
[164]	861	f_inv(i,k,j) = f_in(i,j,k)
	862	ENDDO
	863	ENDDO
	864	ENDDO
[1111]	865	!$acc end kernels
[683]	866	!$OMP END PARALLEL
[1106]	867
	868	ENDIF
	869
[1]	870	END SUBROUTINE transpose_zy
	871
	872
[1216]	873	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	874
	875	!------------------------------------------------------------------------------!
	876	! Description:
	877	! ------------
	878	! Transposition of input array (f_in) from z to y. For the input array, all
	879	! elements along z reside on the same PE, while after transposition, all
	880	! elements along y reside on the same PE.
	881	! This is a direct transposition for arrays with indices in regular order
	882	! (k,j,i) (cf. transpose_zy).
	883	!------------------------------------------------------------------------------!
	884
	885	USE cpulog
	886	USE indices
	887	USE interfaces
	888	USE pegrid
	889	USE transpose_indices
	890
	891	IMPLICIT NONE
	892
	893	INTEGER :: i, j, k, l, m, ys
	894
[1003]	895	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
	896	f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd), &
[164]	897	work(nnxnnynnz)
[1]	898
	899	#if defined( __parallel )
	900
	901	!
	902	!-- Rearrange indices of input array in order to make data to be send
	903	!-- by MPI contiguous
[1003]	904	DO i = nxl, nxr
	905	DO j = nys, nyn
	906	DO k = 1, nz
[164]	907	f_inv(j,i,k) = f_in(k,j,i)
[1]	908	ENDDO
	909	ENDDO
	910	ENDDO
	911
	912	!
	913	!-- Move data to different array, because memory location of work1 is
	914	!-- needed further below (work1 = work2).
	915	!-- If the PE grid is one-dimensional along x, only local reordering
	916	!-- of the data is necessary and no transposition has to be done.
	917	IF ( pdims(2) == 1 ) THEN
[1003]	918	DO k = 1, nz
	919	DO i = nxl, nxr
	920	DO j = nys, nyn
[164]	921	f_out(j,i,k) = f_inv(j,i,k)
[1]	922	ENDDO
	923	ENDDO
	924	ENDDO
	925	RETURN
	926	ENDIF
	927
	928	!
	929	!-- Transpose array
	930	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	931	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	932	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	933	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	934	comm1dy, ierr )
	935	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	936
	937	!
	938	!-- Reorder transposed array
	939	m = 0
	940	DO l = 0, pdims(2) - 1
	941	ys = 0 + l * nny
[1003]	942	DO k = nzb_yd, nzt_yd
	943	DO i = nxl_yd, nxr_yd
[1]	944	DO j = ys, ys + nny - 1
	945	m = m + 1
[164]	946	f_out(j,i,k) = work(m)
[1]	947	ENDDO
	948	ENDDO
	949	ENDDO
	950	ENDDO
	951
	952	#endif
	953
	954	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |