Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4237

Last change on this file since 4237 was 4236, checked in by knoop, 5 years ago
Added missing OpenMP directives within "transpose" and "tridia_solver"
Property svn:keywords set to `Id`
File size: 32.4 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[3655]	17	! Copyright 1997-2019 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 4236 2019-09-25 11:26:18Z knoop $
[4236]	27	! Added missing OpenMP directives
	28	!
	29	! 4182 2019-08-22 15:20:23Z scharf
[4182]	30	! Corrected "Former revisions" section
[4181]	31	!
[4182]	32	! 4171 2019-08-19 17:44:09Z gronemeier
[3832]	33	! loop reordering for performance optimization
[4171]	34	!
	35	! 3832 2019-03-28 13:16:58Z raasch
	36	! loop reordering for performance optimization
	37	!
[3832]	38	! 3694 2019-01-23 17:01:49Z knoop
[3634]	39	! OpenACC port for SPEC
[4171]	40	!
[4182]	41	! Revision 1.1 1997/07/24 11:25:18 raasch
	42	! Initial revision
	43	!
	44	!
[1216]	45	! Description:
	46	! ------------
[1682]	47	!> Resorting data for the transposition from x to y. The transposition itself
	48	!> is carried out in transpose_xy
[1216]	49	!------------------------------------------------------------------------------!
[4181]	50
	51	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	52
[1682]	53	SUBROUTINE resort_for_xy( f_in, f_inv )
[1216]	54
[4171]	55
[1320]	56	USE indices, &
	57	ONLY: nx
[1216]	58
[1320]	59	USE kinds
	60
	61	USE transpose_indices, &
[3241]	62	ONLY: nyn_x, nys_x, nzb_x, nzt_x
[1320]	63
[1216]	64	IMPLICIT NONE
	65
[4171]	66	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	67	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	68
	69
[4171]	70	INTEGER(iwp) :: i !<
	71	INTEGER(iwp) :: j !<
	72	INTEGER(iwp) :: k !<
[1]	73	!
[1216]	74	!-- Rearrange indices of input array in order to make data to be send
	75	!-- by MPI contiguous
	76	!$OMP PARALLEL PRIVATE ( i, j, k )
	77	!$OMP DO
[3690]	78	#if __acc_fft_device
[3634]	79	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	80	!$ACC PRESENT(f_inv, f_in)
[3690]	81	#endif
[3832]	82	DO k = nzb_x, nzt_x
[4171]	83	DO j = nys_x, nyn_x
	84	DO i = 0, nx
[1216]	85	f_inv(j,k,i) = f_in(i,j,k)
	86	ENDDO
	87	ENDDO
	88	ENDDO
	89	!$OMP END PARALLEL
	90
	91	END SUBROUTINE resort_for_xy
	92
	93
	94	!------------------------------------------------------------------------------!
[1]	95	! Description:
	96	! ------------
[1682]	97	!> Transposition of input array (f_in) from x to y. For the input array, all
	98	!> elements along x reside on the same PE, while after transposition, all
	99	!> elements along y reside on the same PE.
[1]	100	!------------------------------------------------------------------------------!
[1682]	101	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	102
[1682]	103
[1320]	104	USE cpulog, &
	105	ONLY: cpu_log, cpu_log_nowait, log_point_s
	106
	107	USE indices, &
	108	ONLY: nx, ny
[4171]	109
[1320]	110	USE kinds
	111
[1]	112	USE pegrid
	113
[1320]	114	USE transpose_indices, &
	115	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	116
[1]	117	IMPLICIT NONE
	118
[4171]	119	INTEGER(iwp) :: i !<
	120	INTEGER(iwp) :: j !<
	121	INTEGER(iwp) :: k !<
	122	INTEGER(iwp) :: l !<
	123	INTEGER(iwp) :: ys !<
[1]	124
[4171]	125	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	126	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	127
	128	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	129	#if __acc_fft_device
[3634]	130	!$ACC DECLARE CREATE(work)
[3690]	131	#endif
[1111]	132
	133
[1106]	134	IF ( numprocs /= 1 ) THEN
	135
	136	#if defined( __parallel )
[1]	137	!
[1106]	138	!-- Transpose array
[1318]	139	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	140
	141	#if __acc_fft_device
[3657]	142	#ifndef __cuda_aware_mpi
[3634]	143	!$ACC UPDATE HOST(f_inv)
[3657]	144	#else
	145	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	146	#endif
[3690]	147	#endif
	148
[1106]	149	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	150	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	151	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	152	comm1dy, ierr )
[3690]	153
	154	#if __acc_fft_device
[3657]	155	#ifndef __cuda_aware_mpi
[3634]	156	!$ACC UPDATE DEVICE(work)
[3657]	157	#else
	158	!$ACC END HOST_DATA
	159	#endif
[3690]	160	#endif
	161
[1106]	162	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	163
	164	!
[1106]	165	!-- Reorder transposed array
[1111]	166	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	167	DO l = 0, pdims(2) - 1
	168	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	169	#if __acc_fft_device
[3634]	170	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	171	!$ACC PRESENT(f_out, work)
[3690]	172	#endif
[4236]	173	!$OMP DO
[1106]	174	DO i = nxl_y, nxr_y
	175	DO k = nzb_y, nzt_y
	176	DO j = ys, ys + nyn_x - nys_x
[1111]	177	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	178	ENDDO
[1]	179	ENDDO
	180	ENDDO
[4236]	181	!$OMP END DO NOWAIT
[1]	182	ENDDO
[683]	183	!$OMP END PARALLEL
[1]	184	#endif
	185
[1106]	186	ELSE
	187
	188	!
	189	!-- Reorder transposed array
	190	!$OMP PARALLEL PRIVATE ( i, j, k )
	191	!$OMP DO
[3690]	192	#if __acc_fft_device
[3634]	193	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	194	!$ACC PRESENT(f_out, f_inv)
[3690]	195	#endif
[1106]	196	DO k = nzb_y, nzt_y
	197	DO i = nxl_y, nxr_y
	198	DO j = 0, ny
	199	f_out(j,i,k) = f_inv(j,k,i)
	200	ENDDO
	201	ENDDO
	202	ENDDO
	203	!$OMP END PARALLEL
	204
	205	ENDIF
	206
[1]	207	END SUBROUTINE transpose_xy
	208
	209
	210	!------------------------------------------------------------------------------!
	211	! Description:
	212	! ------------
[1682]	213	!> Resorting data after the transposition from x to z. The transposition itself
	214	!> is carried out in transpose_xz
[1216]	215	!------------------------------------------------------------------------------!
[1682]	216	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	217
[1682]	218
[1320]	219	USE indices, &
	220	ONLY: nxl, nxr, nyn, nys, nz
[1216]	221
[1320]	222	USE kinds
	223
[1216]	224	IMPLICIT NONE
	225
[4171]	226	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	227	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	228
[4171]	229	INTEGER(iwp) :: i !<
	230	INTEGER(iwp) :: j !<
	231	INTEGER(iwp) :: k !<
[1216]	232	!
	233	!-- Rearrange indices of input array in order to make data to be send
	234	!-- by MPI contiguous.
	235	!-- In case of parallel fft/transposition, scattered store is faster in
	236	!-- backward direction!!!
	237	!$OMP PARALLEL PRIVATE ( i, j, k )
	238	!$OMP DO
[3690]	239	#if __acc_fft_device
[3634]	240	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	241	!$ACC PRESENT(f_out, f_inv)
[3690]	242	#endif
[4171]	243	DO i = nxl, nxr
	244	DO j = nys, nyn
	245	DO k = 1, nz
[1216]	246	f_out(k,j,i) = f_inv(j,i,k)
	247	ENDDO
	248	ENDDO
	249	ENDDO
	250	!$OMP END PARALLEL
	251
	252	END SUBROUTINE resort_for_xz
	253
	254
	255	!------------------------------------------------------------------------------!
	256	! Description:
	257	! ------------
[1682]	258	!> Transposition of input array (f_in) from x to z. For the input array, all
	259	!> elements along x reside on the same PE, while after transposition, all
	260	!> elements along z reside on the same PE.
[1]	261	!------------------------------------------------------------------------------!
[1682]	262	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	263
[1682]	264
[1320]	265	USE cpulog, &
	266	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	267
[1320]	268	USE indices, &
[3241]	269	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
[1320]	270
	271	USE kinds
	272
[1324]	273	USE pegrid
[1320]	274
	275	USE transpose_indices, &
	276	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	277
[1]	278	IMPLICIT NONE
	279
[4171]	280	INTEGER(iwp) :: i !<
	281	INTEGER(iwp) :: j !<
	282	INTEGER(iwp) :: k !<
	283	INTEGER(iwp) :: l !<
	284	INTEGER(iwp) :: xs !<
[1]	285
[4171]	286	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	287	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	288
[4171]	289	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	290	#if __acc_fft_device
[3634]	291	!$ACC DECLARE CREATE(work)
[3690]	292	#endif
[1111]	293
[1320]	294
[1]	295	!
	296	!-- If the PE grid is one-dimensional along y, the array has only to be
	297	!-- reordered locally and therefore no transposition has to be done.
	298	IF ( pdims(1) /= 1 ) THEN
[1106]	299
	300	#if defined( __parallel )
[1]	301	!
	302	!-- Reorder input array for transposition
[1111]	303	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[1]	304	DO l = 0, pdims(1) - 1
	305	xs = 0 + l * nnx
[3690]	306	#if __acc_fft_device
[3634]	307	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	308	!$ACC PRESENT(work, f_in)
[3690]	309	#endif
[4236]	310	!$OMP DO
[1003]	311	DO k = nzb_x, nzt_x
[164]	312	DO i = xs, xs + nnx - 1
[1003]	313	DO j = nys_x, nyn_x
[1111]	314	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	315	ENDDO
	316	ENDDO
	317	ENDDO
[4236]	318	!$OMP END DO NOWAIT
[1]	319	ENDDO
[683]	320	!$OMP END PARALLEL
[1]	321
	322	!
	323	!-- Transpose array
[1318]	324	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	325
	326	#if __acc_fft_device
[3657]	327	#ifndef __cuda_aware_mpi
[3634]	328	!$ACC UPDATE HOST(work)
[3657]	329	#else
	330	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	331	#endif
[3690]	332	#endif
	333
[622]	334	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	335	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	336	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	337	comm1dx, ierr )
[3690]	338
	339	#if __acc_fft_device
[3657]	340	#ifndef __cuda_aware_mpi
[3634]	341	!$ACC UPDATE DEVICE(f_inv)
[3657]	342	#else
	343	!$ACC END HOST_DATA
	344	#endif
[3694]	345	#endif
	346
[1]	347	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	348	#endif
	349
[1]	350	ELSE
[1106]	351
[1]	352	!
	353	!-- Reorder the array in a way that the z index is in first position
[683]	354	!$OMP PARALLEL PRIVATE ( i, j, k )
	355	!$OMP DO
[3690]	356	#if __acc_fft_device
[3634]	357	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	358	!$ACC PRESENT(f_inv, f_in)
[3690]	359	#endif
[1003]	360	DO i = nxl, nxr
	361	DO j = nys, nyn
	362	DO k = 1, nz
[164]	363	f_inv(j,i,k) = f_in(i,j,k)
[1]	364	ENDDO
	365	ENDDO
	366	ENDDO
[683]	367	!$OMP END PARALLEL
[1]	368
[164]	369	ENDIF
	370
[1]	371	END SUBROUTINE transpose_xz
	372
	373
	374	!------------------------------------------------------------------------------!
	375	! Description:
	376	! ------------
[1682]	377	!> Resorting data after the transposition from y to x. The transposition itself
	378	!> is carried out in transpose_yx
[1216]	379	!------------------------------------------------------------------------------!
[1682]	380	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	381
[1682]	382
[1320]	383	USE indices, &
	384	ONLY: nx
[1216]	385
[1320]	386	USE kinds
	387
	388	USE transpose_indices, &
	389	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	390
[1216]	391	IMPLICIT NONE
	392
[4171]	393	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	394	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	395
	396
[4171]	397	INTEGER(iwp) :: i !<
	398	INTEGER(iwp) :: j !<
	399	INTEGER(iwp) :: k !<
[1216]	400	!
	401	!-- Rearrange indices of input array in order to make data to be send
	402	!-- by MPI contiguous
	403	!$OMP PARALLEL PRIVATE ( i, j, k )
	404	!$OMP DO
[3690]	405	#if __acc_fft_device
[3634]	406	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	407	!$ACC PRESENT(f_out, f_inv)
[3690]	408	#endif
[4171]	409	DO k = nzb_x, nzt_x
	410	DO j = nys_x, nyn_x
	411	DO i = 0, nx
[1216]	412	f_out(i,j,k) = f_inv(j,k,i)
	413	ENDDO
	414	ENDDO
	415	ENDDO
	416	!$OMP END PARALLEL
	417
	418	END SUBROUTINE resort_for_yx
	419
	420
	421	!------------------------------------------------------------------------------!
	422	! Description:
	423	! ------------
[1682]	424	!> Transposition of input array (f_in) from y to x. For the input array, all
	425	!> elements along y reside on the same PE, while after transposition, all
	426	!> elements along x reside on the same PE.
[1]	427	!------------------------------------------------------------------------------!
[1682]	428	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	429
[1682]	430
[1320]	431	USE cpulog, &
	432	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	433
[1320]	434	USE indices, &
	435	ONLY: nx, ny
	436
	437	USE kinds
	438
[1324]	439	USE pegrid
[1320]	440
	441	USE transpose_indices, &
	442	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	443
[1]	444	IMPLICIT NONE
	445
[4171]	446	INTEGER(iwp) :: i !<
	447	INTEGER(iwp) :: j !<
	448	INTEGER(iwp) :: k !<
	449	INTEGER(iwp) :: l !<
	450	INTEGER(iwp) :: ys !<
[1]	451
[4171]	452	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	453	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	454
[4171]	455	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	456	#if __acc_fft_device
[3634]	457	!$ACC DECLARE CREATE(work)
[3690]	458	#endif
[1111]	459
[1320]	460
[1106]	461	IF ( numprocs /= 1 ) THEN
	462
[1]	463	#if defined( __parallel )
	464	!
[1106]	465	!-- Reorder input array for transposition
[1111]	466	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	467	DO l = 0, pdims(2) - 1
	468	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	469	#if __acc_fft_device
[3634]	470	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	471	!$ACC PRESENT(work, f_in)
[3690]	472	#endif
[4236]	473	!$OMP DO
[1106]	474	DO i = nxl_y, nxr_y
	475	DO k = nzb_y, nzt_y
	476	DO j = ys, ys + nyn_x - nys_x
[1111]	477	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	478	ENDDO
	479	ENDDO
	480	ENDDO
[4236]	481	!$OMP END DO NOWAIT
[1106]	482	ENDDO
	483	!$OMP END PARALLEL
	484
	485	!
	486	!-- Transpose array
[1318]	487	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	488
	489	#if __acc_fft_device
[3657]	490	#ifndef __cuda_aware_mpi
[3634]	491	!$ACC UPDATE HOST(work)
[3657]	492	#else
	493	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	494	#endif
[3690]	495	#endif
	496
[1106]	497	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	498	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	499	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	500	comm1dy, ierr )
[3690]	501
	502	#if __acc_fft_device
[3657]	503	#ifndef __cuda_aware_mpi
[3634]	504	!$ACC UPDATE DEVICE(f_inv)
[3657]	505	#else
	506	!$ACC END HOST_DATA
	507	#endif
[3690]	508	#endif
	509
[1106]	510	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	511	#endif
	512
	513	ELSE
	514
	515	!
	516	!-- Reorder array f_in the same way as ALLTOALL did it
	517	!$OMP PARALLEL PRIVATE ( i, j, k )
	518	!$OMP DO
[3690]	519	#if __acc_fft_device
[3634]	520	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	521	!$ACC PRESENT(f_inv, f_in)
[3690]	522	#endif
[1003]	523	DO i = nxl_y, nxr_y
	524	DO k = nzb_y, nzt_y
[1106]	525	DO j = 0, ny
	526	f_inv(j,k,i) = f_in(j,i,k)
[1]	527	ENDDO
	528	ENDDO
	529	ENDDO
[683]	530	!$OMP END PARALLEL
[1]	531
[1106]	532	ENDIF
[1]	533
	534	END SUBROUTINE transpose_yx
	535
	536
	537	!------------------------------------------------------------------------------!
	538	! Description:
	539	! ------------
[1682]	540	!> Transposition of input array (f_in) from y to x. For the input array, all
	541	!> elements along y reside on the same PE, while after transposition, all
	542	!> elements along x reside on the same PE.
	543	!> This is a direct transposition for arrays with indices in regular order
	544	!> (k,j,i) (cf. transpose_yx).
[1]	545	!------------------------------------------------------------------------------!
[1682]	546	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	547
[1682]	548
[1320]	549	USE cpulog, &
[3241]	550	ONLY: cpu_log, log_point_s
[1]	551
[1320]	552	USE indices, &
	553	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	554
	555	USE kinds
	556
[1324]	557	USE pegrid
[1320]	558
	559	USE transpose_indices, &
	560	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	561
[1]	562	IMPLICIT NONE
	563
[4171]	564	INTEGER(iwp) :: i !<
	565	INTEGER(iwp) :: j !<
	566	INTEGER(iwp) :: k !<
	567	INTEGER(iwp) :: l !<
	568	INTEGER(iwp) :: m !<
	569	INTEGER(iwp) :: xs !<
[1]	570
[4171]	571	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	572	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	573	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	574	REAL(wp) :: work(nnxnnynnz) !<
[1]	575	#if defined( __parallel )
	576
	577	!
	578	!-- Rearrange indices of input array in order to make data to be send
	579	!-- by MPI contiguous
[1003]	580	DO k = 1, nz
	581	DO j = nys, nyn
	582	DO i = nxl, nxr
[164]	583	f_inv(i,k,j) = f_in(k,j,i)
[1]	584	ENDDO
	585	ENDDO
	586	ENDDO
	587
	588	!
	589	!-- Transpose array
	590	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	591	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	592	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	593	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	594	comm1dx, ierr )
	595	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	596
	597	!
	598	!-- Reorder transposed array
	599	m = 0
	600	DO l = 0, pdims(1) - 1
	601	xs = 0 + l * nnx
[1003]	602	DO j = nys_x, nyn_x
	603	DO k = 1, nz
[1]	604	DO i = xs, xs + nnx - 1
	605	m = m + 1
[164]	606	f_out(i,j,k) = work(m)
[1]	607	ENDDO
	608	ENDDO
	609	ENDDO
	610	ENDDO
	611
	612	#endif
	613
	614	END SUBROUTINE transpose_yxd
	615
	616
	617	!------------------------------------------------------------------------------!
	618	! Description:
	619	! ------------
[1682]	620	!> Resorting data for the transposition from y to z. The transposition itself
	621	!> is carried out in transpose_yz
[1216]	622	!------------------------------------------------------------------------------!
[1682]	623	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	624
[1682]	625
[1320]	626	USE indices, &
	627	ONLY: ny
[1216]	628
[1320]	629	USE kinds
	630
	631	USE transpose_indices, &
	632	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	633
[1216]	634	IMPLICIT NONE
	635
[4171]	636	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	637	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	638
[4171]	639	INTEGER(iwp) :: i !<
	640	INTEGER(iwp) :: j !<
	641	INTEGER(iwp) :: k !<
[1216]	642
	643	!
	644	!-- Rearrange indices of input array in order to make data to be send
	645	!-- by MPI contiguous
	646	!$OMP PARALLEL PRIVATE ( i, j, k )
	647	!$OMP DO
[3690]	648	#if __acc_fft_device
[3634]	649	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	650	!$ACC PRESENT(f_inv, f_in)
[3690]	651	#endif
[4171]	652	DO k = nzb_y, nzt_y
	653	DO i = nxl_y, nxr_y
	654	DO j = 0, ny
[1216]	655	f_inv(i,k,j) = f_in(j,i,k)
	656	ENDDO
	657	ENDDO
	658	ENDDO
	659	!$OMP END PARALLEL
	660
	661	END SUBROUTINE resort_for_yz
	662
	663
	664	!------------------------------------------------------------------------------!
	665	! Description:
	666	! ------------
[1682]	667	!> Transposition of input array (f_in) from y to z. For the input array, all
	668	!> elements along y reside on the same PE, while after transposition, all
	669	!> elements along z reside on the same PE.
[1]	670	!------------------------------------------------------------------------------!
[1682]	671	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	672
[1682]	673
[1320]	674	USE cpulog, &
	675	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	676
[1320]	677	USE indices, &
	678	ONLY: ny, nz
	679
	680	USE kinds
	681
[1324]	682	USE pegrid
[1320]	683
	684	USE transpose_indices, &
	685	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	686
[1]	687	IMPLICIT NONE
	688
[4171]	689	INTEGER(iwp) :: i !<
	690	INTEGER(iwp) :: j !<
	691	INTEGER(iwp) :: k !<
	692	INTEGER(iwp) :: l !<
	693	INTEGER(iwp) :: zs !<
[1]	694
[4171]	695	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	696	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	697
[4171]	698	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	699	#if __acc_fft_device
[3634]	700	!$ACC DECLARE CREATE(work)
[3690]	701	#endif
[1111]	702
[1320]	703
[1]	704	!
	705	!-- If the PE grid is one-dimensional along y, only local reordering
	706	!-- of the data is necessary and no transposition has to be done.
	707	IF ( pdims(1) == 1 ) THEN
[1106]	708
[683]	709	!$OMP PARALLEL PRIVATE ( i, j, k )
	710	!$OMP DO
[3690]	711	#if __acc_fft_device
[3634]	712	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	713	!$ACC PRESENT(f_out, f_inv)
[3690]	714	#endif
[1003]	715	DO j = 0, ny
	716	DO k = nzb_y, nzt_y
	717	DO i = nxl_y, nxr_y
[164]	718	f_out(i,j,k) = f_inv(i,k,j)
[1]	719	ENDDO
	720	ENDDO
	721	ENDDO
[683]	722	!$OMP END PARALLEL
[1]	723
[1106]	724	ELSE
	725
	726	#if defined( __parallel )
[1]	727	!
[1106]	728	!-- Transpose array
[1318]	729	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	730
	731	#if __acc_fft_device
[3657]	732	#ifndef __cuda_aware_mpi
[3634]	733	!$ACC UPDATE HOST(f_inv)
[3657]	734	#else
	735	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	736	#endif
[3690]	737	#endif
	738
[1106]	739	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	740	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	741	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	742	comm1dx, ierr )
[3690]	743
	744	#if __acc_fft_device
[3657]	745	#ifndef __cuda_aware_mpi
[3634]	746	!$ACC UPDATE DEVICE(work)
[3657]	747	#else
	748	!$ACC END HOST_DATA
	749	#endif
[3690]	750	#endif
	751
[1106]	752	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	753
	754	!
[1106]	755	!-- Reorder transposed array
[1111]	756	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1106]	757	DO l = 0, pdims(1) - 1
	758	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	759	#if __acc_fft_device
[3634]	760	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	761	!$ACC PRESENT(f_out, work)
[3690]	762	#endif
[4236]	763	!$OMP DO
[1106]	764	DO j = nys_z, nyn_z
	765	DO k = zs, zs + nzt_y - nzb_y
	766	DO i = nxl_z, nxr_z
[1111]	767	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	768	ENDDO
[1]	769	ENDDO
	770	ENDDO
[4236]	771	!$OMP END DO NOWAIT
[1]	772	ENDDO
[683]	773	!$OMP END PARALLEL
[1]	774	#endif
	775
[1106]	776	ENDIF
	777
[1]	778	END SUBROUTINE transpose_yz
	779
	780
	781	!------------------------------------------------------------------------------!
	782	! Description:
	783	! ------------
[1682]	784	!> Resorting data for the transposition from z to x. The transposition itself
	785	!> is carried out in transpose_zx
[1216]	786	!------------------------------------------------------------------------------!
[1682]	787	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	788
[1682]	789
[1320]	790	USE indices, &
	791	ONLY: nxl, nxr, nyn, nys, nz
[1216]	792
[1320]	793	USE kinds
	794
[1216]	795	IMPLICIT NONE
	796
[4171]	797	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	798	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	799
[4171]	800	INTEGER(iwp) :: i !<
	801	INTEGER(iwp) :: j !<
	802	INTEGER(iwp) :: k !<
[1216]	803
	804	!
	805	!-- Rearrange indices of input array in order to make data to be send
	806	!-- by MPI contiguous
	807	!$OMP PARALLEL PRIVATE ( i, j, k )
	808	!$OMP DO
[3690]	809	#if __acc_fft_device
[3634]	810	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	811	!$ACC PRESENT(f_in, f_inv)
[3690]	812	#endif
[3832]	813	DO i = nxl, nxr
[4171]	814	DO j = nys, nyn
	815	DO k = 1,nz
[1216]	816	f_inv(j,i,k) = f_in(k,j,i)
	817	ENDDO
	818	ENDDO
	819	ENDDO
	820	!$OMP END PARALLEL
	821
	822	END SUBROUTINE resort_for_zx
	823
	824
	825	!------------------------------------------------------------------------------!
	826	! Description:
	827	! ------------
[1682]	828	!> Transposition of input array (f_in) from z to x. For the input array, all
	829	!> elements along z reside on the same PE, while after transposition, all
	830	!> elements along x reside on the same PE.
[1]	831	!------------------------------------------------------------------------------!
[1682]	832	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	833
[1682]	834
[1320]	835	USE cpulog, &
	836	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	837
[1320]	838	USE indices, &
	839	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	840
	841	USE kinds
	842
[1324]	843	USE pegrid
[1320]	844
	845	USE transpose_indices, &
	846	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	847
[1]	848	IMPLICIT NONE
	849
[4171]	850	INTEGER(iwp) :: i !<
	851	INTEGER(iwp) :: j !<
	852	INTEGER(iwp) :: k !<
	853	INTEGER(iwp) :: l !<
	854	INTEGER(iwp) :: xs !<
[1]	855
[4171]	856	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	857	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	858
[4171]	859	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	860	#if __acc_fft_device
[3634]	861	!$ACC DECLARE CREATE(work)
[3690]	862	#endif
[1]	863
[1320]	864
[1]	865	!
	866	!-- If the PE grid is one-dimensional along y, only local reordering
	867	!-- of the data is necessary and no transposition has to be done.
	868	IF ( pdims(1) == 1 ) THEN
[1106]	869
[683]	870	!$OMP PARALLEL PRIVATE ( i, j, k )
	871	!$OMP DO
[3690]	872	#if __acc_fft_device
[3634]	873	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	874	!$ACC PRESENT(f_out, f_inv)
[3690]	875	#endif
[1003]	876	DO k = 1, nz
	877	DO i = nxl, nxr
	878	DO j = nys, nyn
[164]	879	f_out(i,j,k) = f_inv(j,i,k)
[1]	880	ENDDO
	881	ENDDO
	882	ENDDO
[683]	883	!$OMP END PARALLEL
[1]	884
[1106]	885	ELSE
	886
	887	#if defined( __parallel )
[1]	888	!
[1106]	889	!-- Transpose array
[1318]	890	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	891
	892	#if __acc_fft_device
[3657]	893	#ifndef __cuda_aware_mpi
[3634]	894	!$ACC UPDATE HOST(f_inv)
[3657]	895	#else
	896	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	897	#endif
[3690]	898	#endif
	899
[1106]	900	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	901	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	902	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	903	comm1dx, ierr )
[3690]	904
	905	#if __acc_fft_device
[3657]	906	#ifndef __cuda_aware_mpi
[3634]	907	!$ACC UPDATE DEVICE(work)
[3657]	908	#else
	909	!$ACC END HOST_DATA
	910	#endif
[3690]	911	#endif
	912
[1106]	913	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	914
	915	!
[1106]	916	!-- Reorder transposed array
[1111]	917	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[1106]	918	DO l = 0, pdims(1) - 1
	919	xs = 0 + l * nnx
[3690]	920	#if __acc_fft_device
[3634]	921	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	922	!$ACC PRESENT(f_out, work)
[3690]	923	#endif
[4236]	924	!$OMP DO
[1106]	925	DO k = nzb_x, nzt_x
	926	DO i = xs, xs + nnx - 1
	927	DO j = nys_x, nyn_x
[1111]	928	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	929	ENDDO
[1]	930	ENDDO
	931	ENDDO
[4236]	932	!$OMP END DO NOWAIT
[1]	933	ENDDO
[683]	934	!$OMP END PARALLEL
[1]	935	#endif
	936
[1106]	937	ENDIF
	938
[1]	939	END SUBROUTINE transpose_zx
	940
	941
	942	!------------------------------------------------------------------------------!
	943	! Description:
	944	! ------------
[1682]	945	!> Resorting data after the transposition from z to y. The transposition itself
	946	!> is carried out in transpose_zy
[1216]	947	!------------------------------------------------------------------------------!
[1682]	948	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	949
[1682]	950
[1320]	951	USE indices, &
	952	ONLY: ny
[1216]	953
[1320]	954	USE kinds
	955
	956	USE transpose_indices, &
	957	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	958
[1216]	959	IMPLICIT NONE
	960
[4171]	961	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	962	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	963
	964
[4171]	965	INTEGER(iwp) :: i !<
	966	INTEGER(iwp) :: j !<
	967	INTEGER(iwp) :: k !<
[1216]	968
	969	!
	970	!-- Rearrange indices of input array in order to make data to be send
	971	!-- by MPI contiguous
	972	!$OMP PARALLEL PRIVATE ( i, j, k )
	973	!$OMP DO
[3690]	974	#if __acc_fft_device
[3634]	975	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	976	!$ACC PRESENT(f_out, f_inv)
[3690]	977	#endif
[4171]	978	DO k = nzb_y, nzt_y
	979	DO i = nxl_y, nxr_y
	980	DO j = 0, ny
[1216]	981	f_out(j,i,k) = f_inv(i,k,j)
	982	ENDDO
	983	ENDDO
	984	ENDDO
	985	!$OMP END PARALLEL
	986
	987	END SUBROUTINE resort_for_zy
	988
	989
	990	!------------------------------------------------------------------------------!
[3241]	991	! Description:cpu_log_nowait
[1216]	992	! ------------
[1682]	993	!> Transposition of input array (f_in) from z to y. For the input array, all
	994	!> elements along z reside on the same PE, while after transposition, all
	995	!> elements along y reside on the same PE.
[1]	996	!------------------------------------------------------------------------------!
[1682]	997	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	998
[1682]	999
[1320]	1000	USE cpulog, &
	1001	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	1002
[1320]	1003	USE indices, &
	1004	ONLY: ny, nz
	1005
	1006	USE kinds
	1007
[1324]	1008	USE pegrid
[1320]	1009
	1010	USE transpose_indices, &
	1011	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	1012
[1]	1013	IMPLICIT NONE
	1014
[4171]	1015	INTEGER(iwp) :: i !<
	1016	INTEGER(iwp) :: j !<
	1017	INTEGER(iwp) :: k !<
	1018	INTEGER(iwp) :: l !<
	1019	INTEGER(iwp) :: zs !<
[1]	1020
[4171]	1021	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	1022	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	1023
[1682]	1024	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	1025	#if __acc_fft_device
[3634]	1026	!$ACC DECLARE CREATE(work)
[3690]	1027	#endif
[1111]	1028
[1]	1029	!
	1030	!-- If the PE grid is one-dimensional along y, the array has only to be
	1031	!-- reordered locally and therefore no transposition has to be done.
	1032	IF ( pdims(1) /= 1 ) THEN
[1106]	1033
	1034	#if defined( __parallel )
[1]	1035	!
	1036	!-- Reorder input array for transposition
[1111]	1037	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1]	1038	DO l = 0, pdims(1) - 1
[1003]	1039	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	1040	#if __acc_fft_device
[3634]	1041	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1042	!$ACC PRESENT(work, f_in)
[3690]	1043	#endif
[4236]	1044	!$OMP DO
[1003]	1045	DO j = nys_z, nyn_z
	1046	DO k = zs, zs + nzt_y - nzb_y
	1047	DO i = nxl_z, nxr_z
[1111]	1048	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	1049	ENDDO
	1050	ENDDO
	1051	ENDDO
[4236]	1052	!$OMP END DO NOWAIT
[1]	1053	ENDDO
[683]	1054	!$OMP END PARALLEL
[1]	1055
	1056	!
	1057	!-- Transpose array
[1318]	1058	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1059
	1060	#if __acc_fft_device
[3657]	1061	#ifndef __cuda_aware_mpi
[3634]	1062	!$ACC UPDATE HOST(work)
[3657]	1063	#else
	1064	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1065	#endif
[3690]	1066	#endif
	1067
[622]	1068	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	1069	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	1070	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	1071	comm1dx, ierr )
[3690]	1072
	1073	#if __acc_fft_device
[3657]	1074	#ifndef __cuda_aware_mpi
[3634]	1075	!$ACC UPDATE DEVICE(f_inv)
[3657]	1076	#else
	1077	!$ACC END HOST_DATA
	1078	#endif
[3690]	1079	#endif
	1080
[1]	1081	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	1082	#endif
[1]	1083
	1084	ELSE
	1085	!
[1106]	1086	!-- Reorder the array in the same way like ALLTOALL did it
[683]	1087	!$OMP PARALLEL PRIVATE ( i, j, k )
	1088	!$OMP DO
[3690]	1089	#if __acc_fft_device
[3634]	1090	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1091	!$ACC PRESENT(f_inv, f_in)
[3690]	1092	#endif
[1003]	1093	DO k = nzb_y, nzt_y
	1094	DO j = 0, ny
	1095	DO i = nxl_y, nxr_y
[164]	1096	f_inv(i,k,j) = f_in(i,j,k)
	1097	ENDDO
	1098	ENDDO
	1099	ENDDO
[683]	1100	!$OMP END PARALLEL
[1106]	1101
	1102	ENDIF
	1103
[1]	1104	END SUBROUTINE transpose_zy
	1105
	1106
	1107	!------------------------------------------------------------------------------!
	1108	! Description:
	1109	! ------------
[1682]	1110	!> Transposition of input array (f_in) from z to y. For the input array, all
	1111	!> elements along z reside on the same PE, while after transposition, all
	1112	!> elements along y reside on the same PE.
	1113	!> This is a direct transposition for arrays with indices in regular order
	1114	!> (k,j,i) (cf. transpose_zy).
[1]	1115	!------------------------------------------------------------------------------!
[1682]	1116	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1117
[1682]	1118
[1320]	1119	USE cpulog, &
[3241]	1120	ONLY: cpu_log, log_point_s
[1]	1121
[1320]	1122	USE indices, &
	1123	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1124
	1125	USE kinds
	1126
[1324]	1127	USE pegrid
[1320]	1128
	1129	USE transpose_indices, &
[3241]	1130	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
[1320]	1131
[1]	1132	IMPLICIT NONE
	1133
[4171]	1134	INTEGER(iwp) :: i !<
	1135	INTEGER(iwp) :: j !<
	1136	INTEGER(iwp) :: k !<
	1137	INTEGER(iwp) :: l !<
	1138	INTEGER(iwp) :: m !<
	1139	INTEGER(iwp) :: ys !<
[1]	1140
[4171]	1141	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1142	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1143	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1144	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1145
[1]	1146	#if defined( __parallel )
	1147
	1148	!
	1149	!-- Rearrange indices of input array in order to make data to be send
	1150	!-- by MPI contiguous
[1003]	1151	DO i = nxl, nxr
	1152	DO j = nys, nyn
	1153	DO k = 1, nz
[164]	1154	f_inv(j,i,k) = f_in(k,j,i)
[1]	1155	ENDDO
	1156	ENDDO
	1157	ENDDO
	1158
	1159	!
	1160	!-- Move data to different array, because memory location of work1 is
	1161	!-- needed further below (work1 = work2).
	1162	!-- If the PE grid is one-dimensional along x, only local reordering
	1163	!-- of the data is necessary and no transposition has to be done.
	1164	IF ( pdims(2) == 1 ) THEN
[1003]	1165	DO k = 1, nz
	1166	DO i = nxl, nxr
	1167	DO j = nys, nyn
[164]	1168	f_out(j,i,k) = f_inv(j,i,k)
[1]	1169	ENDDO
	1170	ENDDO
	1171	ENDDO
	1172	RETURN
	1173	ENDIF
	1174
	1175	!
	1176	!-- Transpose array
	1177	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1178	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1179	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1180	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1181	comm1dy, ierr )
	1182	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1183
	1184	!
	1185	!-- Reorder transposed array
	1186	m = 0
	1187	DO l = 0, pdims(2) - 1
	1188	ys = 0 + l * nny
[1003]	1189	DO k = nzb_yd, nzt_yd
	1190	DO i = nxl_yd, nxr_yd
[1]	1191	DO j = ys, ys + nny - 1
	1192	m = m + 1
[164]	1193	f_out(j,i,k) = work(m)
[1]	1194	ENDDO
	1195	ENDDO
	1196	ENDDO
	1197	ENDDO
	1198
	1199	#endif
	1200
	1201	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |