Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4181

Last change on this file since 4181 was 4181, checked in by raasch, 5 years ago
bugfix: define directive added, which has been deleted by mistake in r4180
Property svn:keywords set to `Id`
File size: 32.2 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[3655]	17	! Copyright 1997-2019 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 4181 2019-08-22 06:37:36Z raasch $
[4181]	27	! bugfix: define directive added, which has been deleted by mistake in r4180
	28	!
	29	! 4180 2019-08-21 14:37:54Z scharf
[3832]	30	! loop reordering for performance optimization
[4171]	31	!
	32	! 3832 2019-03-28 13:16:58Z raasch
	33	! loop reordering for performance optimization
	34	!
[3832]	35	! 3694 2019-01-23 17:01:49Z knoop
[3634]	36	! OpenACC port for SPEC
[4171]	37	!
[1216]	38	!------------------------------------------------------------------------------!
	39	! Description:
	40	! ------------
[1682]	41	!> Resorting data for the transposition from x to y. The transposition itself
	42	!> is carried out in transpose_xy
[1216]	43	!------------------------------------------------------------------------------!
[4181]	44
	45	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	46
[1682]	47	SUBROUTINE resort_for_xy( f_in, f_inv )
[1216]	48
[4171]	49
[1320]	50	USE indices, &
	51	ONLY: nx
[1216]	52
[1320]	53	USE kinds
	54
	55	USE transpose_indices, &
[3241]	56	ONLY: nyn_x, nys_x, nzb_x, nzt_x
[1320]	57
[1216]	58	IMPLICIT NONE
	59
[4171]	60	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	61	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	62
	63
[4171]	64	INTEGER(iwp) :: i !<
	65	INTEGER(iwp) :: j !<
	66	INTEGER(iwp) :: k !<
[1]	67	!
[1216]	68	!-- Rearrange indices of input array in order to make data to be send
	69	!-- by MPI contiguous
	70	!$OMP PARALLEL PRIVATE ( i, j, k )
	71	!$OMP DO
[3690]	72	#if __acc_fft_device
[3634]	73	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	74	!$ACC PRESENT(f_inv, f_in)
[3690]	75	#endif
[3832]	76	DO k = nzb_x, nzt_x
[4171]	77	DO j = nys_x, nyn_x
	78	DO i = 0, nx
[1216]	79	f_inv(j,k,i) = f_in(i,j,k)
	80	ENDDO
	81	ENDDO
	82	ENDDO
	83	!$OMP END PARALLEL
	84
	85	END SUBROUTINE resort_for_xy
	86
	87
	88	!------------------------------------------------------------------------------!
[1]	89	! Description:
	90	! ------------
[1682]	91	!> Transposition of input array (f_in) from x to y. For the input array, all
	92	!> elements along x reside on the same PE, while after transposition, all
	93	!> elements along y reside on the same PE.
[1]	94	!------------------------------------------------------------------------------!
[1682]	95	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	96
[1682]	97
[1320]	98	USE cpulog, &
	99	ONLY: cpu_log, cpu_log_nowait, log_point_s
	100
	101	USE indices, &
	102	ONLY: nx, ny
[4171]	103
[1320]	104	USE kinds
	105
[1]	106	USE pegrid
	107
[1320]	108	USE transpose_indices, &
	109	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	110
[1]	111	IMPLICIT NONE
	112
[4171]	113	INTEGER(iwp) :: i !<
	114	INTEGER(iwp) :: j !<
	115	INTEGER(iwp) :: k !<
	116	INTEGER(iwp) :: l !<
	117	INTEGER(iwp) :: ys !<
[1]	118
[4171]	119	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	120	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	121
	122	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	123	#if __acc_fft_device
[3634]	124	!$ACC DECLARE CREATE(work)
[3690]	125	#endif
[1111]	126
	127
[1106]	128	IF ( numprocs /= 1 ) THEN
	129
	130	#if defined( __parallel )
[1]	131	!
[1106]	132	!-- Transpose array
[1318]	133	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	134
	135	#if __acc_fft_device
[3657]	136	#ifndef __cuda_aware_mpi
[3634]	137	!$ACC UPDATE HOST(f_inv)
[3657]	138	#else
	139	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	140	#endif
[3690]	141	#endif
	142
[1106]	143	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	144	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	145	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	146	comm1dy, ierr )
[3690]	147
	148	#if __acc_fft_device
[3657]	149	#ifndef __cuda_aware_mpi
[3634]	150	!$ACC UPDATE DEVICE(work)
[3657]	151	#else
	152	!$ACC END HOST_DATA
	153	#endif
[3690]	154	#endif
	155
[1106]	156	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	157
	158	!
[1106]	159	!-- Reorder transposed array
[1111]	160	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	161	!$OMP DO
[1106]	162	DO l = 0, pdims(2) - 1
	163	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	164	#if __acc_fft_device
[3634]	165	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	166	!$ACC PRESENT(f_out, work)
[3690]	167	#endif
[1106]	168	DO i = nxl_y, nxr_y
	169	DO k = nzb_y, nzt_y
	170	DO j = ys, ys + nyn_x - nys_x
[1111]	171	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	172	ENDDO
[1]	173	ENDDO
	174	ENDDO
	175	ENDDO
[683]	176	!$OMP END PARALLEL
[1]	177	#endif
	178
[1106]	179	ELSE
	180
	181	!
	182	!-- Reorder transposed array
	183	!$OMP PARALLEL PRIVATE ( i, j, k )
	184	!$OMP DO
[3690]	185	#if __acc_fft_device
[3634]	186	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	187	!$ACC PRESENT(f_out, f_inv)
[3690]	188	#endif
[1106]	189	DO k = nzb_y, nzt_y
	190	DO i = nxl_y, nxr_y
	191	DO j = 0, ny
	192	f_out(j,i,k) = f_inv(j,k,i)
	193	ENDDO
	194	ENDDO
	195	ENDDO
	196	!$OMP END PARALLEL
	197
	198	ENDIF
	199
[1]	200	END SUBROUTINE transpose_xy
	201
	202
	203	!------------------------------------------------------------------------------!
	204	! Description:
	205	! ------------
[1682]	206	!> Resorting data after the transposition from x to z. The transposition itself
	207	!> is carried out in transpose_xz
[1216]	208	!------------------------------------------------------------------------------!
[1682]	209	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	210
[1682]	211
[1320]	212	USE indices, &
	213	ONLY: nxl, nxr, nyn, nys, nz
[1216]	214
[1320]	215	USE kinds
	216
[1216]	217	IMPLICIT NONE
	218
[4171]	219	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	220	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	221
[4171]	222	INTEGER(iwp) :: i !<
	223	INTEGER(iwp) :: j !<
	224	INTEGER(iwp) :: k !<
[1216]	225	!
	226	!-- Rearrange indices of input array in order to make data to be send
	227	!-- by MPI contiguous.
	228	!-- In case of parallel fft/transposition, scattered store is faster in
	229	!-- backward direction!!!
	230	!$OMP PARALLEL PRIVATE ( i, j, k )
	231	!$OMP DO
[3690]	232	#if __acc_fft_device
[3634]	233	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	234	!$ACC PRESENT(f_out, f_inv)
[3690]	235	#endif
[4171]	236	DO i = nxl, nxr
	237	DO j = nys, nyn
	238	DO k = 1, nz
[1216]	239	f_out(k,j,i) = f_inv(j,i,k)
	240	ENDDO
	241	ENDDO
	242	ENDDO
	243	!$OMP END PARALLEL
	244
	245	END SUBROUTINE resort_for_xz
	246
	247
	248	!------------------------------------------------------------------------------!
	249	! Description:
	250	! ------------
[1682]	251	!> Transposition of input array (f_in) from x to z. For the input array, all
	252	!> elements along x reside on the same PE, while after transposition, all
	253	!> elements along z reside on the same PE.
[1]	254	!------------------------------------------------------------------------------!
[1682]	255	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	256
[1682]	257
[1320]	258	USE cpulog, &
	259	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	260
[1320]	261	USE indices, &
[3241]	262	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
[1320]	263
	264	USE kinds
	265
[1324]	266	USE pegrid
[1320]	267
	268	USE transpose_indices, &
	269	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	270
[1]	271	IMPLICIT NONE
	272
[4171]	273	INTEGER(iwp) :: i !<
	274	INTEGER(iwp) :: j !<
	275	INTEGER(iwp) :: k !<
	276	INTEGER(iwp) :: l !<
	277	INTEGER(iwp) :: xs !<
[1]	278
[4171]	279	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	280	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	281
[4171]	282	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	283	#if __acc_fft_device
[3634]	284	!$ACC DECLARE CREATE(work)
[3690]	285	#endif
[1111]	286
[1320]	287
[1]	288	!
	289	!-- If the PE grid is one-dimensional along y, the array has only to be
	290	!-- reordered locally and therefore no transposition has to be done.
	291	IF ( pdims(1) /= 1 ) THEN
[1106]	292
	293	#if defined( __parallel )
[1]	294	!
	295	!-- Reorder input array for transposition
[1111]	296	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	297	!$OMP DO
[1]	298	DO l = 0, pdims(1) - 1
	299	xs = 0 + l * nnx
[3690]	300	#if __acc_fft_device
[3634]	301	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	302	!$ACC PRESENT(work, f_in)
[3690]	303	#endif
[1003]	304	DO k = nzb_x, nzt_x
[164]	305	DO i = xs, xs + nnx - 1
[1003]	306	DO j = nys_x, nyn_x
[1111]	307	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	308	ENDDO
	309	ENDDO
	310	ENDDO
	311	ENDDO
[683]	312	!$OMP END PARALLEL
[1]	313
	314	!
	315	!-- Transpose array
[1318]	316	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	317
	318	#if __acc_fft_device
[3657]	319	#ifndef __cuda_aware_mpi
[3634]	320	!$ACC UPDATE HOST(work)
[3657]	321	#else
	322	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	323	#endif
[3690]	324	#endif
	325
[622]	326	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	327	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	328	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	329	comm1dx, ierr )
[3690]	330
	331	#if __acc_fft_device
[3657]	332	#ifndef __cuda_aware_mpi
[3634]	333	!$ACC UPDATE DEVICE(f_inv)
[3657]	334	#else
	335	!$ACC END HOST_DATA
	336	#endif
[3694]	337	#endif
	338
[1]	339	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	340	#endif
	341
[1]	342	ELSE
[1106]	343
[1]	344	!
	345	!-- Reorder the array in a way that the z index is in first position
[683]	346	!$OMP PARALLEL PRIVATE ( i, j, k )
	347	!$OMP DO
[3690]	348	#if __acc_fft_device
[3634]	349	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	350	!$ACC PRESENT(f_inv, f_in)
[3690]	351	#endif
[1003]	352	DO i = nxl, nxr
	353	DO j = nys, nyn
	354	DO k = 1, nz
[164]	355	f_inv(j,i,k) = f_in(i,j,k)
[1]	356	ENDDO
	357	ENDDO
	358	ENDDO
[683]	359	!$OMP END PARALLEL
[1]	360
[164]	361	ENDIF
	362
[1]	363	END SUBROUTINE transpose_xz
	364
	365
	366	!------------------------------------------------------------------------------!
	367	! Description:
	368	! ------------
[1682]	369	!> Resorting data after the transposition from y to x. The transposition itself
	370	!> is carried out in transpose_yx
[1216]	371	!------------------------------------------------------------------------------!
[1682]	372	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	373
[1682]	374
[1320]	375	USE indices, &
	376	ONLY: nx
[1216]	377
[1320]	378	USE kinds
	379
	380	USE transpose_indices, &
	381	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	382
[1216]	383	IMPLICIT NONE
	384
[4171]	385	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	386	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	387
	388
[4171]	389	INTEGER(iwp) :: i !<
	390	INTEGER(iwp) :: j !<
	391	INTEGER(iwp) :: k !<
[1216]	392	!
	393	!-- Rearrange indices of input array in order to make data to be send
	394	!-- by MPI contiguous
	395	!$OMP PARALLEL PRIVATE ( i, j, k )
	396	!$OMP DO
[3690]	397	#if __acc_fft_device
[3634]	398	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	399	!$ACC PRESENT(f_out, f_inv)
[3690]	400	#endif
[4171]	401	DO k = nzb_x, nzt_x
	402	DO j = nys_x, nyn_x
	403	DO i = 0, nx
[1216]	404	f_out(i,j,k) = f_inv(j,k,i)
	405	ENDDO
	406	ENDDO
	407	ENDDO
	408	!$OMP END PARALLEL
	409
	410	END SUBROUTINE resort_for_yx
	411
	412
	413	!------------------------------------------------------------------------------!
	414	! Description:
	415	! ------------
[1682]	416	!> Transposition of input array (f_in) from y to x. For the input array, all
	417	!> elements along y reside on the same PE, while after transposition, all
	418	!> elements along x reside on the same PE.
[1]	419	!------------------------------------------------------------------------------!
[1682]	420	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	421
[1682]	422
[1320]	423	USE cpulog, &
	424	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	425
[1320]	426	USE indices, &
	427	ONLY: nx, ny
	428
	429	USE kinds
	430
[1324]	431	USE pegrid
[1320]	432
	433	USE transpose_indices, &
	434	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	435
[1]	436	IMPLICIT NONE
	437
[4171]	438	INTEGER(iwp) :: i !<
	439	INTEGER(iwp) :: j !<
	440	INTEGER(iwp) :: k !<
	441	INTEGER(iwp) :: l !<
	442	INTEGER(iwp) :: ys !<
[1]	443
[4171]	444	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	445	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	446
[4171]	447	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	448	#if __acc_fft_device
[3634]	449	!$ACC DECLARE CREATE(work)
[3690]	450	#endif
[1111]	451
[1320]	452
[1106]	453	IF ( numprocs /= 1 ) THEN
	454
[1]	455	#if defined( __parallel )
	456	!
[1106]	457	!-- Reorder input array for transposition
[1111]	458	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	459	!$OMP DO
[1106]	460	DO l = 0, pdims(2) - 1
	461	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	462	#if __acc_fft_device
[3634]	463	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	464	!$ACC PRESENT(work, f_in)
[3690]	465	#endif
[1106]	466	DO i = nxl_y, nxr_y
	467	DO k = nzb_y, nzt_y
	468	DO j = ys, ys + nyn_x - nys_x
[1111]	469	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	470	ENDDO
	471	ENDDO
	472	ENDDO
	473	ENDDO
	474	!$OMP END PARALLEL
	475
	476	!
	477	!-- Transpose array
[1318]	478	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	479
	480	#if __acc_fft_device
[3657]	481	#ifndef __cuda_aware_mpi
[3634]	482	!$ACC UPDATE HOST(work)
[3657]	483	#else
	484	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	485	#endif
[3690]	486	#endif
	487
[1106]	488	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	489	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	490	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	491	comm1dy, ierr )
[3690]	492
	493	#if __acc_fft_device
[3657]	494	#ifndef __cuda_aware_mpi
[3634]	495	!$ACC UPDATE DEVICE(f_inv)
[3657]	496	#else
	497	!$ACC END HOST_DATA
	498	#endif
[3690]	499	#endif
	500
[1106]	501	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	502	#endif
	503
	504	ELSE
	505
	506	!
	507	!-- Reorder array f_in the same way as ALLTOALL did it
	508	!$OMP PARALLEL PRIVATE ( i, j, k )
	509	!$OMP DO
[3690]	510	#if __acc_fft_device
[3634]	511	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	512	!$ACC PRESENT(f_inv, f_in)
[3690]	513	#endif
[1003]	514	DO i = nxl_y, nxr_y
	515	DO k = nzb_y, nzt_y
[1106]	516	DO j = 0, ny
	517	f_inv(j,k,i) = f_in(j,i,k)
[1]	518	ENDDO
	519	ENDDO
	520	ENDDO
[683]	521	!$OMP END PARALLEL
[1]	522
[1106]	523	ENDIF
[1]	524
	525	END SUBROUTINE transpose_yx
	526
	527
	528	!------------------------------------------------------------------------------!
	529	! Description:
	530	! ------------
[1682]	531	!> Transposition of input array (f_in) from y to x. For the input array, all
	532	!> elements along y reside on the same PE, while after transposition, all
	533	!> elements along x reside on the same PE.
	534	!> This is a direct transposition for arrays with indices in regular order
	535	!> (k,j,i) (cf. transpose_yx).
[1]	536	!------------------------------------------------------------------------------!
[1682]	537	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	538
[1682]	539
[1320]	540	USE cpulog, &
[3241]	541	ONLY: cpu_log, log_point_s
[1]	542
[1320]	543	USE indices, &
	544	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	545
	546	USE kinds
	547
[1324]	548	USE pegrid
[1320]	549
	550	USE transpose_indices, &
	551	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	552
[1]	553	IMPLICIT NONE
	554
[4171]	555	INTEGER(iwp) :: i !<
	556	INTEGER(iwp) :: j !<
	557	INTEGER(iwp) :: k !<
	558	INTEGER(iwp) :: l !<
	559	INTEGER(iwp) :: m !<
	560	INTEGER(iwp) :: xs !<
[1]	561
[4171]	562	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	563	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	564	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	565	REAL(wp) :: work(nnxnnynnz) !<
[1]	566	#if defined( __parallel )
	567
	568	!
	569	!-- Rearrange indices of input array in order to make data to be send
	570	!-- by MPI contiguous
[1003]	571	DO k = 1, nz
	572	DO j = nys, nyn
	573	DO i = nxl, nxr
[164]	574	f_inv(i,k,j) = f_in(k,j,i)
[1]	575	ENDDO
	576	ENDDO
	577	ENDDO
	578
	579	!
	580	!-- Transpose array
	581	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	582	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	583	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	584	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	585	comm1dx, ierr )
	586	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	587
	588	!
	589	!-- Reorder transposed array
	590	m = 0
	591	DO l = 0, pdims(1) - 1
	592	xs = 0 + l * nnx
[1003]	593	DO j = nys_x, nyn_x
	594	DO k = 1, nz
[1]	595	DO i = xs, xs + nnx - 1
	596	m = m + 1
[164]	597	f_out(i,j,k) = work(m)
[1]	598	ENDDO
	599	ENDDO
	600	ENDDO
	601	ENDDO
	602
	603	#endif
	604
	605	END SUBROUTINE transpose_yxd
	606
	607
	608	!------------------------------------------------------------------------------!
	609	! Description:
	610	! ------------
[1682]	611	!> Resorting data for the transposition from y to z. The transposition itself
	612	!> is carried out in transpose_yz
[1216]	613	!------------------------------------------------------------------------------!
[1682]	614	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	615
[1682]	616
[1320]	617	USE indices, &
	618	ONLY: ny
[1216]	619
[1320]	620	USE kinds
	621
	622	USE transpose_indices, &
	623	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	624
[1216]	625	IMPLICIT NONE
	626
[4171]	627	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	628	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	629
[4171]	630	INTEGER(iwp) :: i !<
	631	INTEGER(iwp) :: j !<
	632	INTEGER(iwp) :: k !<
[1216]	633
	634	!
	635	!-- Rearrange indices of input array in order to make data to be send
	636	!-- by MPI contiguous
	637	!$OMP PARALLEL PRIVATE ( i, j, k )
	638	!$OMP DO
[3690]	639	#if __acc_fft_device
[3634]	640	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	641	!$ACC PRESENT(f_inv, f_in)
[3690]	642	#endif
[4171]	643	DO k = nzb_y, nzt_y
	644	DO i = nxl_y, nxr_y
	645	DO j = 0, ny
[1216]	646	f_inv(i,k,j) = f_in(j,i,k)
	647	ENDDO
	648	ENDDO
	649	ENDDO
	650	!$OMP END PARALLEL
	651
	652	END SUBROUTINE resort_for_yz
	653
	654
	655	!------------------------------------------------------------------------------!
	656	! Description:
	657	! ------------
[1682]	658	!> Transposition of input array (f_in) from y to z. For the input array, all
	659	!> elements along y reside on the same PE, while after transposition, all
	660	!> elements along z reside on the same PE.
[1]	661	!------------------------------------------------------------------------------!
[1682]	662	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	663
[1682]	664
[1320]	665	USE cpulog, &
	666	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	667
[1320]	668	USE indices, &
	669	ONLY: ny, nz
	670
	671	USE kinds
	672
[1324]	673	USE pegrid
[1320]	674
	675	USE transpose_indices, &
	676	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	677
[1]	678	IMPLICIT NONE
	679
[4171]	680	INTEGER(iwp) :: i !<
	681	INTEGER(iwp) :: j !<
	682	INTEGER(iwp) :: k !<
	683	INTEGER(iwp) :: l !<
	684	INTEGER(iwp) :: zs !<
[1]	685
[4171]	686	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	687	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	688
[4171]	689	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	690	#if __acc_fft_device
[3634]	691	!$ACC DECLARE CREATE(work)
[3690]	692	#endif
[1111]	693
[1320]	694
[1]	695	!
	696	!-- If the PE grid is one-dimensional along y, only local reordering
	697	!-- of the data is necessary and no transposition has to be done.
	698	IF ( pdims(1) == 1 ) THEN
[1106]	699
[683]	700	!$OMP PARALLEL PRIVATE ( i, j, k )
	701	!$OMP DO
[3690]	702	#if __acc_fft_device
[3634]	703	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	704	!$ACC PRESENT(f_out, f_inv)
[3690]	705	#endif
[1003]	706	DO j = 0, ny
	707	DO k = nzb_y, nzt_y
	708	DO i = nxl_y, nxr_y
[164]	709	f_out(i,j,k) = f_inv(i,k,j)
[1]	710	ENDDO
	711	ENDDO
	712	ENDDO
[683]	713	!$OMP END PARALLEL
[1]	714
[1106]	715	ELSE
	716
	717	#if defined( __parallel )
[1]	718	!
[1106]	719	!-- Transpose array
[1318]	720	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	721
	722	#if __acc_fft_device
[3657]	723	#ifndef __cuda_aware_mpi
[3634]	724	!$ACC UPDATE HOST(f_inv)
[3657]	725	#else
	726	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	727	#endif
[3690]	728	#endif
	729
[1106]	730	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	731	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	732	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	733	comm1dx, ierr )
[3690]	734
	735	#if __acc_fft_device
[3657]	736	#ifndef __cuda_aware_mpi
[3634]	737	!$ACC UPDATE DEVICE(work)
[3657]	738	#else
	739	!$ACC END HOST_DATA
	740	#endif
[3690]	741	#endif
	742
[1106]	743	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	744
	745	!
[1106]	746	!-- Reorder transposed array
[1111]	747	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	748	!$OMP DO
[1106]	749	DO l = 0, pdims(1) - 1
	750	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	751	#if __acc_fft_device
[3634]	752	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	753	!$ACC PRESENT(f_out, work)
[3690]	754	#endif
[1106]	755	DO j = nys_z, nyn_z
	756	DO k = zs, zs + nzt_y - nzb_y
	757	DO i = nxl_z, nxr_z
[1111]	758	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	759	ENDDO
[1]	760	ENDDO
	761	ENDDO
	762	ENDDO
[683]	763	!$OMP END PARALLEL
[1]	764	#endif
	765
[1106]	766	ENDIF
	767
[1]	768	END SUBROUTINE transpose_yz
	769
	770
	771	!------------------------------------------------------------------------------!
	772	! Description:
	773	! ------------
[1682]	774	!> Resorting data for the transposition from z to x. The transposition itself
	775	!> is carried out in transpose_zx
[1216]	776	!------------------------------------------------------------------------------!
[1682]	777	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	778
[1682]	779
[1320]	780	USE indices, &
	781	ONLY: nxl, nxr, nyn, nys, nz
[1216]	782
[1320]	783	USE kinds
	784
[1216]	785	IMPLICIT NONE
	786
[4171]	787	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	788	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	789
[4171]	790	INTEGER(iwp) :: i !<
	791	INTEGER(iwp) :: j !<
	792	INTEGER(iwp) :: k !<
[1216]	793
	794	!
	795	!-- Rearrange indices of input array in order to make data to be send
	796	!-- by MPI contiguous
	797	!$OMP PARALLEL PRIVATE ( i, j, k )
	798	!$OMP DO
[3690]	799	#if __acc_fft_device
[3634]	800	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	801	!$ACC PRESENT(f_in, f_inv)
[3690]	802	#endif
[3832]	803	DO i = nxl, nxr
[4171]	804	DO j = nys, nyn
	805	DO k = 1,nz
[1216]	806	f_inv(j,i,k) = f_in(k,j,i)
	807	ENDDO
	808	ENDDO
	809	ENDDO
	810	!$OMP END PARALLEL
	811
	812	END SUBROUTINE resort_for_zx
	813
	814
	815	!------------------------------------------------------------------------------!
	816	! Description:
	817	! ------------
[1682]	818	!> Transposition of input array (f_in) from z to x. For the input array, all
	819	!> elements along z reside on the same PE, while after transposition, all
	820	!> elements along x reside on the same PE.
[1]	821	!------------------------------------------------------------------------------!
[1682]	822	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	823
[1682]	824
[1320]	825	USE cpulog, &
	826	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	827
[1320]	828	USE indices, &
	829	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	830
	831	USE kinds
	832
[1324]	833	USE pegrid
[1320]	834
	835	USE transpose_indices, &
	836	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	837
[1]	838	IMPLICIT NONE
	839
[4171]	840	INTEGER(iwp) :: i !<
	841	INTEGER(iwp) :: j !<
	842	INTEGER(iwp) :: k !<
	843	INTEGER(iwp) :: l !<
	844	INTEGER(iwp) :: xs !<
[1]	845
[4171]	846	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	847	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	848
[4171]	849	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	850	#if __acc_fft_device
[3634]	851	!$ACC DECLARE CREATE(work)
[3690]	852	#endif
[1]	853
[1320]	854
[1]	855	!
	856	!-- If the PE grid is one-dimensional along y, only local reordering
	857	!-- of the data is necessary and no transposition has to be done.
	858	IF ( pdims(1) == 1 ) THEN
[1106]	859
[683]	860	!$OMP PARALLEL PRIVATE ( i, j, k )
	861	!$OMP DO
[3690]	862	#if __acc_fft_device
[3634]	863	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	864	!$ACC PRESENT(f_out, f_inv)
[3690]	865	#endif
[1003]	866	DO k = 1, nz
	867	DO i = nxl, nxr
	868	DO j = nys, nyn
[164]	869	f_out(i,j,k) = f_inv(j,i,k)
[1]	870	ENDDO
	871	ENDDO
	872	ENDDO
[683]	873	!$OMP END PARALLEL
[1]	874
[1106]	875	ELSE
	876
	877	#if defined( __parallel )
[1]	878	!
[1106]	879	!-- Transpose array
[1318]	880	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	881
	882	#if __acc_fft_device
[3657]	883	#ifndef __cuda_aware_mpi
[3634]	884	!$ACC UPDATE HOST(f_inv)
[3657]	885	#else
	886	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	887	#endif
[3690]	888	#endif
	889
[1106]	890	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	891	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	892	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	893	comm1dx, ierr )
[3690]	894
	895	#if __acc_fft_device
[3657]	896	#ifndef __cuda_aware_mpi
[3634]	897	!$ACC UPDATE DEVICE(work)
[3657]	898	#else
	899	!$ACC END HOST_DATA
	900	#endif
[3690]	901	#endif
	902
[1106]	903	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	904
	905	!
[1106]	906	!-- Reorder transposed array
[1111]	907	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	908	!$OMP DO
[1106]	909	DO l = 0, pdims(1) - 1
	910	xs = 0 + l * nnx
[3690]	911	#if __acc_fft_device
[3634]	912	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	913	!$ACC PRESENT(f_out, work)
[3690]	914	#endif
[1106]	915	DO k = nzb_x, nzt_x
	916	DO i = xs, xs + nnx - 1
	917	DO j = nys_x, nyn_x
[1111]	918	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	919	ENDDO
[1]	920	ENDDO
	921	ENDDO
	922	ENDDO
[683]	923	!$OMP END PARALLEL
[1]	924	#endif
	925
[1106]	926	ENDIF
	927
[1]	928	END SUBROUTINE transpose_zx
	929
	930
	931	!------------------------------------------------------------------------------!
	932	! Description:
	933	! ------------
[1682]	934	!> Resorting data after the transposition from z to y. The transposition itself
	935	!> is carried out in transpose_zy
[1216]	936	!------------------------------------------------------------------------------!
[1682]	937	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	938
[1682]	939
[1320]	940	USE indices, &
	941	ONLY: ny
[1216]	942
[1320]	943	USE kinds
	944
	945	USE transpose_indices, &
	946	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	947
[1216]	948	IMPLICIT NONE
	949
[4171]	950	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	951	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	952
	953
[4171]	954	INTEGER(iwp) :: i !<
	955	INTEGER(iwp) :: j !<
	956	INTEGER(iwp) :: k !<
[1216]	957
	958	!
	959	!-- Rearrange indices of input array in order to make data to be send
	960	!-- by MPI contiguous
	961	!$OMP PARALLEL PRIVATE ( i, j, k )
	962	!$OMP DO
[3690]	963	#if __acc_fft_device
[3634]	964	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	965	!$ACC PRESENT(f_out, f_inv)
[3690]	966	#endif
[4171]	967	DO k = nzb_y, nzt_y
	968	DO i = nxl_y, nxr_y
	969	DO j = 0, ny
[1216]	970	f_out(j,i,k) = f_inv(i,k,j)
	971	ENDDO
	972	ENDDO
	973	ENDDO
	974	!$OMP END PARALLEL
	975
	976	END SUBROUTINE resort_for_zy
	977
	978
	979	!------------------------------------------------------------------------------!
[3241]	980	! Description:cpu_log_nowait
[1216]	981	! ------------
[1682]	982	!> Transposition of input array (f_in) from z to y. For the input array, all
	983	!> elements along z reside on the same PE, while after transposition, all
	984	!> elements along y reside on the same PE.
[1]	985	!------------------------------------------------------------------------------!
[1682]	986	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	987
[1682]	988
[1320]	989	USE cpulog, &
	990	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	991
[1320]	992	USE indices, &
	993	ONLY: ny, nz
	994
	995	USE kinds
	996
[1324]	997	USE pegrid
[1320]	998
	999	USE transpose_indices, &
	1000	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	1001
[1]	1002	IMPLICIT NONE
	1003
[4171]	1004	INTEGER(iwp) :: i !<
	1005	INTEGER(iwp) :: j !<
	1006	INTEGER(iwp) :: k !<
	1007	INTEGER(iwp) :: l !<
	1008	INTEGER(iwp) :: zs !<
[1]	1009
[4171]	1010	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	1011	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	1012
[1682]	1013	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	1014	#if __acc_fft_device
[3634]	1015	!$ACC DECLARE CREATE(work)
[3690]	1016	#endif
[1111]	1017
[1]	1018	!
	1019	!-- If the PE grid is one-dimensional along y, the array has only to be
	1020	!-- reordered locally and therefore no transposition has to be done.
	1021	IF ( pdims(1) /= 1 ) THEN
[1106]	1022
	1023	#if defined( __parallel )
[1]	1024	!
	1025	!-- Reorder input array for transposition
[1111]	1026	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	1027	!$OMP DO
[1]	1028	DO l = 0, pdims(1) - 1
[1003]	1029	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	1030	#if __acc_fft_device
[3634]	1031	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1032	!$ACC PRESENT(work, f_in)
[3690]	1033	#endif
[1003]	1034	DO j = nys_z, nyn_z
	1035	DO k = zs, zs + nzt_y - nzb_y
	1036	DO i = nxl_z, nxr_z
[1111]	1037	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	1038	ENDDO
	1039	ENDDO
	1040	ENDDO
	1041	ENDDO
[683]	1042	!$OMP END PARALLEL
[1]	1043
	1044	!
	1045	!-- Transpose array
[1318]	1046	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1047
	1048	#if __acc_fft_device
[3657]	1049	#ifndef __cuda_aware_mpi
[3634]	1050	!$ACC UPDATE HOST(work)
[3657]	1051	#else
	1052	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1053	#endif
[3690]	1054	#endif
	1055
[622]	1056	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	1057	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	1058	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	1059	comm1dx, ierr )
[3690]	1060
	1061	#if __acc_fft_device
[3657]	1062	#ifndef __cuda_aware_mpi
[3634]	1063	!$ACC UPDATE DEVICE(f_inv)
[3657]	1064	#else
	1065	!$ACC END HOST_DATA
	1066	#endif
[3690]	1067	#endif
	1068
[1]	1069	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	1070	#endif
[1]	1071
	1072	ELSE
	1073	!
[1106]	1074	!-- Reorder the array in the same way like ALLTOALL did it
[683]	1075	!$OMP PARALLEL PRIVATE ( i, j, k )
	1076	!$OMP DO
[3690]	1077	#if __acc_fft_device
[3634]	1078	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1079	!$ACC PRESENT(f_inv, f_in)
[3690]	1080	#endif
[1003]	1081	DO k = nzb_y, nzt_y
	1082	DO j = 0, ny
	1083	DO i = nxl_y, nxr_y
[164]	1084	f_inv(i,k,j) = f_in(i,j,k)
	1085	ENDDO
	1086	ENDDO
	1087	ENDDO
[683]	1088	!$OMP END PARALLEL
[1106]	1089
	1090	ENDIF
	1091
[1]	1092	END SUBROUTINE transpose_zy
	1093
	1094
	1095	!------------------------------------------------------------------------------!
	1096	! Description:
	1097	! ------------
[1682]	1098	!> Transposition of input array (f_in) from z to y. For the input array, all
	1099	!> elements along z reside on the same PE, while after transposition, all
	1100	!> elements along y reside on the same PE.
	1101	!> This is a direct transposition for arrays with indices in regular order
	1102	!> (k,j,i) (cf. transpose_zy).
[1]	1103	!------------------------------------------------------------------------------!
[1682]	1104	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1105
[1682]	1106
[1320]	1107	USE cpulog, &
[3241]	1108	ONLY: cpu_log, log_point_s
[1]	1109
[1320]	1110	USE indices, &
	1111	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1112
	1113	USE kinds
	1114
[1324]	1115	USE pegrid
[1320]	1116
	1117	USE transpose_indices, &
[3241]	1118	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
[1320]	1119
[1]	1120	IMPLICIT NONE
	1121
[4171]	1122	INTEGER(iwp) :: i !<
	1123	INTEGER(iwp) :: j !<
	1124	INTEGER(iwp) :: k !<
	1125	INTEGER(iwp) :: l !<
	1126	INTEGER(iwp) :: m !<
	1127	INTEGER(iwp) :: ys !<
[1]	1128
[4171]	1129	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1130	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1131	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1132	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1133
[1]	1134	#if defined( __parallel )
	1135
	1136	!
	1137	!-- Rearrange indices of input array in order to make data to be send
	1138	!-- by MPI contiguous
[1003]	1139	DO i = nxl, nxr
	1140	DO j = nys, nyn
	1141	DO k = 1, nz
[164]	1142	f_inv(j,i,k) = f_in(k,j,i)
[1]	1143	ENDDO
	1144	ENDDO
	1145	ENDDO
	1146
	1147	!
	1148	!-- Move data to different array, because memory location of work1 is
	1149	!-- needed further below (work1 = work2).
	1150	!-- If the PE grid is one-dimensional along x, only local reordering
	1151	!-- of the data is necessary and no transposition has to be done.
	1152	IF ( pdims(2) == 1 ) THEN
[1003]	1153	DO k = 1, nz
	1154	DO i = nxl, nxr
	1155	DO j = nys, nyn
[164]	1156	f_out(j,i,k) = f_inv(j,i,k)
[1]	1157	ENDDO
	1158	ENDDO
	1159	ENDDO
	1160	RETURN
	1161	ENDIF
	1162
	1163	!
	1164	!-- Transpose array
	1165	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1166	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1167	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1168	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1169	comm1dy, ierr )
	1170	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1171
	1172	!
	1173	!-- Reorder transposed array
	1174	m = 0
	1175	DO l = 0, pdims(2) - 1
	1176	ys = 0 + l * nny
[1003]	1177	DO k = nzb_yd, nzt_yd
	1178	DO i = nxl_yd, nxr_yd
[1]	1179	DO j = ys, ys + nny - 1
	1180	m = m + 1
[164]	1181	f_out(j,i,k) = work(m)
[1]	1182	ENDDO
	1183	ENDDO
	1184	ENDDO
	1185	ENDDO
	1186
	1187	#endif
	1188
	1189	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |