Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4664

Last change on this file since 4664 was 4540, checked in by raasch, 4 years ago
files re-formatted to follow the PALM coding standard
Property svn:keywords set to `Id`
File size: 46.1 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[4540]	2	!--------------------------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[4540]	5	! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General
	6	! Public License as published by the Free Software Foundation, either version 3 of the License, or
	7	! (at your option) any later version.
[1036]	8	!
[4540]	9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
	10	! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
	11	! Public License for more details.
[1036]	12	!
[4540]	13	! You should have received a copy of the GNU General Public License along with PALM. If not, see
	14	! <http://www.gnu.org/licenses/>.
[1036]	15	!
[4360]	16	! Copyright 1997-2020 Leibniz Universitaet Hannover
[4540]	17	!--------------------------------------------------------------------------------------------------!
[1036]	18	!
[4540]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 4540 2020-05-18 15:23:29Z gronemeier $
[4540]	27	! File re-formatted to follow the PALM coding standard
	28	!
	29	!
	30	! 4429 2020-02-27 15:24:30Z raasch
	31	! Bugfix: cpp-directives added for serial mode
	32	!
[4429]	33	! 4415 2020-02-20 10:30:33Z raasch
[4540]	34	! Bugfix for misplaced preprocessor directive
	35	!
[4415]	36	! 4370 2020-01-10 14:00:44Z raasch
[4540]	37	! Vector array renamed
	38	!
[4370]	39	! 4366 2020-01-09 08:12:43Z raasch
[4540]	40	! Modifications for NEC vectorization
	41	!
[4366]	42	! 4360 2020-01-07 11:25:50Z suehring
[4236]	43	! Added missing OpenMP directives
[4540]	44	!
[4236]	45	! 4182 2019-08-22 15:20:23Z scharf
[4182]	46	! Corrected "Former revisions" section
[4540]	47	!
[4182]	48	! 4171 2019-08-19 17:44:09Z gronemeier
[4540]	49	! Loop reordering for performance optimization
[4171]	50	!
	51	! 3832 2019-03-28 13:16:58Z raasch
[4540]	52	! Loop reordering for performance optimization
[4171]	53	!
[3832]	54	! 3694 2019-01-23 17:01:49Z knoop
[3634]	55	! OpenACC port for SPEC
[4171]	56	!
[4182]	57	! Revision 1.1 1997/07/24 11:25:18 raasch
	58	! Initial revision
	59	!
	60	!
[1216]	61	! Description:
	62	! ------------
[4540]	63	!> Resorting data for the transposition from x to y. The transposition itself is carried out in
	64	!> transpose_xy.
	65	!--------------------------------------------------------------------------------------------------!
[4181]	66
	67	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	68
[1682]	69	SUBROUTINE resort_for_xy( f_in, f_inv )
[1216]	70
[4171]	71
[4540]	72	USE indices, &
	73	ONLY: nx
[1216]	74
[4540]	75	USE kinds
[1320]	76
[4540]	77	USE transpose_indices, &
	78	ONLY: nyn_x, &
	79	nys_x, &
	80	nzb_x, &
	81	nzt_x
[1320]	82
[4540]	83	IMPLICIT NONE
[1216]	84
[4540]	85	INTEGER(iwp) :: i !<
	86	INTEGER(iwp) :: j !<
	87	INTEGER(iwp) :: k !<
[1216]	88
[4540]	89	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	90	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	91
[1]	92	!
[4540]	93	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous
[1216]	94	!$OMP PARALLEL PRIVATE ( i, j, k )
	95	!$OMP DO
[3690]	96	#if __acc_fft_device
[4540]	97	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	98	!$ACC PRESENT(f_inv, f_in)
[3690]	99	#endif
[4540]	100	DO k = nzb_x, nzt_x
	101	DO j = nys_x, nyn_x
	102	DO i = 0, nx
	103	f_inv(j,k,i) = f_in(i,j,k)
	104	ENDDO
	105	ENDDO
	106	ENDDO
	107	!$OMP END PARALLEL
[1216]	108
	109	END SUBROUTINE resort_for_xy
	110
	111
[4540]	112	!--------------------------------------------------------------------------------------------------!
[1]	113	! Description:
	114	! ------------
[4540]	115	!> Transposition of input array (f_in) from x to y. For the input array, all elements along x reside
	116	!> on the same PE, while after transposition, all elements along y reside on the same PE.
	117	!--------------------------------------------------------------------------------------------------!
[1682]	118	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	119
[1682]	120
[4429]	121	#if defined( __parallel )
[4540]	122	USE cpulog, &
	123	ONLY: cpu_log, &
	124	cpu_log_nowait, &
	125	log_point_s
[4429]	126	#endif
[1320]	127
[4540]	128	USE indices, &
	129	ONLY: nx, &
	130	ny
[4171]	131
[1320]	132	USE kinds
	133
[1]	134	USE pegrid
	135
[4540]	136	USE transpose_indices, &
	137	ONLY: nxl_y, &
	138	nxr_y, &
	139	nyn_x, &
	140	nys_x, &
	141	nzb_x, &
	142	nzb_y, &
	143	nzt_x, &
	144	nzt_y
[1320]	145
[1]	146	IMPLICIT NONE
	147
[4171]	148	INTEGER(iwp) :: i !<
	149	INTEGER(iwp) :: j !<
	150	INTEGER(iwp) :: k !<
[4429]	151
	152	#if defined( __parallel )
[4540]	153	INTEGER(iwp) :: l !<
	154	INTEGER(iwp) :: ys !<
[4429]	155	#endif
[1]	156
[4540]	157	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	158	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[4171]	159
[4429]	160	#if defined( __parallel )
[4540]	161	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	162	#if __acc_fft_device
[3634]	163	!$ACC DECLARE CREATE(work)
[3690]	164	#endif
[4429]	165	#endif
[1111]	166
	167
[1106]	168	IF ( numprocs /= 1 ) THEN
	169
	170	#if defined( __parallel )
[1]	171	!
[1106]	172	!-- Transpose array
[1318]	173	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	174
	175	#if __acc_fft_device
[3657]	176	#ifndef __cuda_aware_mpi
[3634]	177	!$ACC UPDATE HOST(f_inv)
[3657]	178	#else
	179	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	180	#endif
[3690]	181	#endif
	182
[1106]	183	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	184	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	185	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
[3690]	186
	187	#if __acc_fft_device
[3657]	188	#ifndef __cuda_aware_mpi
[3634]	189	!$ACC UPDATE DEVICE(work)
[3657]	190	#else
	191	!$ACC END HOST_DATA
	192	#endif
[3690]	193	#endif
	194
[1106]	195	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	196
	197	!
[1106]	198	!-- Reorder transposed array
[1111]	199	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	200	DO l = 0, pdims(2) - 1
	201	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	202	#if __acc_fft_device
[3634]	203	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	204	!$ACC PRESENT(f_out, work)
[3690]	205	#endif
[4236]	206	!$OMP DO
[1106]	207	DO i = nxl_y, nxr_y
	208	DO k = nzb_y, nzt_y
	209	DO j = ys, ys + nyn_x - nys_x
[1111]	210	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	211	ENDDO
[1]	212	ENDDO
	213	ENDDO
[4236]	214	!$OMP END DO NOWAIT
[1]	215	ENDDO
[683]	216	!$OMP END PARALLEL
[1]	217	#endif
	218
[1106]	219	ELSE
	220
	221	!
	222	!-- Reorder transposed array
	223	!$OMP PARALLEL PRIVATE ( i, j, k )
	224	!$OMP DO
[3690]	225	#if __acc_fft_device
[3634]	226	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	227	!$ACC PRESENT(f_out, f_inv)
[3690]	228	#endif
[1106]	229	DO k = nzb_y, nzt_y
	230	DO i = nxl_y, nxr_y
	231	DO j = 0, ny
	232	f_out(j,i,k) = f_inv(j,k,i)
	233	ENDDO
	234	ENDDO
	235	ENDDO
	236	!$OMP END PARALLEL
	237
	238	ENDIF
	239
[1]	240	END SUBROUTINE transpose_xy
	241
	242
[4540]	243	!--------------------------------------------------------------------------------------------------!
[1]	244	! Description:
	245	! ------------
[4540]	246	!> Resorting data after the transposition from x to z. The transposition itself is carried out in
	247	!> transpose_xz.
	248	!--------------------------------------------------------------------------------------------------!
[1682]	249	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	250
[1682]	251
[4540]	252	USE indices, &
	253	ONLY: nxl, &
	254	nxr, &
	255	nyn, &
	256	nys, &
	257	nz
[1216]	258
[4540]	259	USE kinds
[1320]	260
[4540]	261	IMPLICIT NONE
[1216]	262
[4540]	263	INTEGER(iwp) :: i !<
	264	INTEGER(iwp) :: j !<
	265	INTEGER(iwp) :: k !<
[1216]	266
[4540]	267	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	268	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
	269
[1216]	270	!
[4540]	271	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
	272	!-- In case of parallel fft/transposition, scattered store is faster in backward direction!!!
	273	!$OMP PARALLEL PRIVATE ( i, j, k )
	274	!$OMP DO
[3690]	275	#if __acc_fft_device
[4540]	276	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	277	!$ACC PRESENT(f_out, f_inv)
[3690]	278	#endif
[4540]	279	DO i = nxl, nxr
	280	DO j = nys, nyn
	281	DO k = 1, nz
	282	f_out(k,j,i) = f_inv(j,i,k)
	283	ENDDO
	284	ENDDO
	285	ENDDO
	286	!$OMP END PARALLEL
[1216]	287
	288	END SUBROUTINE resort_for_xz
	289
	290
[4540]	291	!--------------------------------------------------------------------------------------------------!
[1216]	292	! Description:
	293	! ------------
[4540]	294	!> Transposition of input array (f_in) from x to z. For the input array, all elements along x reside
	295	!> on the same PE, while after transposition, all elements along z reside on the same PE.
	296	!--------------------------------------------------------------------------------------------------!
[1682]	297	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	298
[4429]	299	#if defined( __parallel )
[4540]	300	USE cpulog, &
	301	ONLY: cpu_log, &
	302	cpu_log_nowait, &
	303	log_point_s
[1]	304
[4540]	305	USE fft_xy, &
	306	ONLY: f_vec_x, &
	307	temperton_fft_vec
[4429]	308	#endif
[4366]	309
[4540]	310	USE indices, &
	311	ONLY: nx, &
	312	nxl, &
	313	nxr, &
	314	nyn, &
	315	nys, &
	316	nz
	317
[4429]	318	#if defined( __parallel )
[4540]	319	USE indices, &
[4429]	320	ONLY: nnx
	321	#endif
[1320]	322
	323	USE kinds
	324
[1324]	325	USE pegrid
[1320]	326
[4540]	327	USE transpose_indices, &
	328	ONLY: nyn_x, &
	329	nys_x, &
	330	nzb_x, &
	331	nzt_x
[1320]	332
[1]	333	IMPLICIT NONE
	334
[4171]	335	INTEGER(iwp) :: i !<
	336	INTEGER(iwp) :: j !<
	337	INTEGER(iwp) :: k !<
[4540]	338
[4429]	339	#if defined( __parallel )
[4540]	340	INTEGER(iwp) :: l !<
	341	INTEGER(iwp) :: mm !<
	342	INTEGER(iwp) :: xs !<
[4429]	343	#endif
[1]	344
[4540]	345	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	346	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	347
[4429]	348	#if defined( __parallel )
[4540]	349	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	350	#if __acc_fft_device
[3634]	351	!$ACC DECLARE CREATE(work)
[3690]	352	#endif
[4429]	353	#endif
[1111]	354
[4366]	355	!
[4540]	356	!-- If the PE grid is one-dimensional along y, the array has only to be reordered locally and
	357	!-- therefore no transposition has to be done.
[1]	358	IF ( pdims(1) /= 1 ) THEN
[1106]	359
	360	#if defined( __parallel )
[1]	361	!
[4366]	362	!-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in
[4370]	363	!-- different array format (f_vec_x).
[4366]	364	IF ( temperton_fft_vec ) THEN
	365
	366	DO l = 0, pdims(1) - 1
	367	xs = 0 + l * nnx
	368	DO k = nzb_x, nzt_x
	369	DO i = xs, xs + nnx - 1
	370	DO j = nys_x, nyn_x
	371	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
[4370]	372	work(j,i-xs+1,k,l) = f_vec_x(mm,i)
[4366]	373	ENDDO
	374	ENDDO
	375	ENDDO
	376	ENDDO
	377
	378	ELSE
	379
	380	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
	381	DO l = 0, pdims(1) - 1
	382	xs = 0 + l * nnx
[3690]	383	#if __acc_fft_device
[4366]	384	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	385	!$ACC PRESENT(work, f_in)
[3690]	386	#endif
[4366]	387	!$OMP DO
	388	DO k = nzb_x, nzt_x
	389	DO i = xs, xs + nnx - 1
	390	DO j = nys_x, nyn_x
	391	work(j,i-xs+1,k,l) = f_in(i,j,k)
	392	ENDDO
[1]	393	ENDDO
	394	ENDDO
[4366]	395	!$OMP END DO NOWAIT
[1]	396	ENDDO
[4366]	397	!$OMP END PARALLEL
[1]	398
[4366]	399	ENDIF
	400
[1]	401	!
	402	!-- Transpose array
[1318]	403	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	404
	405	#if __acc_fft_device
[3657]	406	#ifndef __cuda_aware_mpi
[3634]	407	!$ACC UPDATE HOST(work)
[3657]	408	#else
	409	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	410	#endif
[3690]	411	#endif
	412
[622]	413	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	414	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	415	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, comm1dx, ierr )
[3690]	416
	417	#if __acc_fft_device
[3657]	418	#ifndef __cuda_aware_mpi
[3634]	419	!$ACC UPDATE DEVICE(f_inv)
[3657]	420	#else
	421	!$ACC END HOST_DATA
	422	#endif
[3694]	423	#endif
	424
[1]	425	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	426	#endif
	427
[1]	428	ELSE
[1106]	429
[1]	430	!
	431	!-- Reorder the array in a way that the z index is in first position
[683]	432	!$OMP PARALLEL PRIVATE ( i, j, k )
	433	!$OMP DO
[3690]	434	#if __acc_fft_device
[3634]	435	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	436	!$ACC PRESENT(f_inv, f_in)
[3690]	437	#endif
[1003]	438	DO i = nxl, nxr
	439	DO j = nys, nyn
	440	DO k = 1, nz
[164]	441	f_inv(j,i,k) = f_in(i,j,k)
[1]	442	ENDDO
	443	ENDDO
	444	ENDDO
[683]	445	!$OMP END PARALLEL
[1]	446
[164]	447	ENDIF
	448
[1]	449	END SUBROUTINE transpose_xz
	450
	451
[4540]	452	!--------------------------------------------------------------------------------------------------!
[1]	453	! Description:
	454	! ------------
[4540]	455	!> Resorting data after the transposition from y to x. The transposition itself is carried out in
	456	!> transpose_yx.
	457	!--------------------------------------------------------------------------------------------------!
[1682]	458	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	459
[1682]	460
[4540]	461	USE indices, &
	462	ONLY: nx
[1216]	463
[4540]	464	USE kinds
[1320]	465
[4540]	466	USE transpose_indices, &
	467	ONLY: nyn_x, &
	468	nys_x, &
	469	nzb_x, &
	470	nzt_x
[1320]	471
[4540]	472	IMPLICIT NONE
[1216]	473
[4540]	474	INTEGER(iwp) :: i !<
	475	INTEGER(iwp) :: j !<
	476	INTEGER(iwp) :: k !<
[1216]	477
[4540]	478	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	479	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	480
	481	!
[4540]	482	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
	483	!$OMP PARALLEL PRIVATE ( i, j, k )
	484	!$OMP DO
[3690]	485	#if __acc_fft_device
[4540]	486	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	487	!$ACC PRESENT(f_out, f_inv)
[3690]	488	#endif
[4540]	489	DO k = nzb_x, nzt_x
	490	DO j = nys_x, nyn_x
	491	DO i = 0, nx
	492	f_out(i,j,k) = f_inv(j,k,i)
	493	ENDDO
	494	ENDDO
	495	ENDDO
	496	!$OMP END PARALLEL
[1216]	497
	498	END SUBROUTINE resort_for_yx
	499
	500
[4540]	501	!--------------------------------------------------------------------------------------------------!
[1216]	502	! Description:
	503	! ------------
[4540]	504	!> Transposition of input array (f_in) from y to x. For the input array, all elements along y
	505	!> reside on the same PE, while after transposition, all elements along x reside on the same PE.
	506	!--------------------------------------------------------------------------------------------------!
[1682]	507	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	508
[1682]	509
[4429]	510	#if defined( __parallel )
[4540]	511	USE cpulog, &
	512	ONLY: cpu_log, &
	513	cpu_log_nowait, &
	514	log_point_s
[4429]	515	#endif
[1]	516
[4540]	517	USE indices, &
	518	ONLY: nx, &
	519	ny
[1320]	520
	521	USE kinds
	522
[1324]	523	USE pegrid
[1320]	524
[4540]	525	USE transpose_indices, &
	526	ONLY: nxl_y, &
	527	nxr_y, &
	528	nyn_x, &
	529	nys_x, &
	530	nzb_x, &
	531	nzb_y, &
	532	nzt_x, &
	533	nzt_y
[1320]	534
[1]	535	IMPLICIT NONE
	536
[4171]	537	INTEGER(iwp) :: i !<
	538	INTEGER(iwp) :: j !<
	539	INTEGER(iwp) :: k !<
[4540]	540
[4429]	541	#if defined( __parallel )
[4540]	542	INTEGER(iwp) :: l !<
	543	INTEGER(iwp) :: ys !<
[4429]	544	#endif
[1]	545
[4540]	546	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	547	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	548
[4429]	549	#if defined( __parallel )
[4540]	550	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	551	#if __acc_fft_device
[3634]	552	!$ACC DECLARE CREATE(work)
[3690]	553	#endif
[4429]	554	#endif
[1111]	555
[1320]	556
[1106]	557	IF ( numprocs /= 1 ) THEN
	558
[1]	559	#if defined( __parallel )
	560	!
[1106]	561	!-- Reorder input array for transposition
[1111]	562	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	563	DO l = 0, pdims(2) - 1
	564	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	565	#if __acc_fft_device
[3634]	566	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	567	!$ACC PRESENT(work, f_in)
[3690]	568	#endif
[4236]	569	!$OMP DO
[1106]	570	DO i = nxl_y, nxr_y
	571	DO k = nzb_y, nzt_y
	572	DO j = ys, ys + nyn_x - nys_x
[1111]	573	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	574	ENDDO
	575	ENDDO
	576	ENDDO
[4236]	577	!$OMP END DO NOWAIT
[1106]	578	ENDDO
	579	!$OMP END PARALLEL
	580
	581	!
	582	!-- Transpose array
[1318]	583	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	584
	585	#if __acc_fft_device
[3657]	586	#ifndef __cuda_aware_mpi
[3634]	587	!$ACC UPDATE HOST(work)
[3657]	588	#else
	589	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	590	#endif
[3690]	591	#endif
	592
[1106]	593	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	594	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	595	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
[3690]	596
	597	#if __acc_fft_device
[3657]	598	#ifndef __cuda_aware_mpi
[3634]	599	!$ACC UPDATE DEVICE(f_inv)
[3657]	600	#else
	601	!$ACC END HOST_DATA
	602	#endif
[3690]	603	#endif
	604
[1106]	605	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	606	#endif
	607
	608	ELSE
	609
	610	!
[4540]	611	!-- Reorder array f_in the same way as ALLTOALL did it.
[1106]	612	!$OMP PARALLEL PRIVATE ( i, j, k )
	613	!$OMP DO
[3690]	614	#if __acc_fft_device
[3634]	615	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	616	!$ACC PRESENT(f_inv, f_in)
[3690]	617	#endif
[1003]	618	DO i = nxl_y, nxr_y
	619	DO k = nzb_y, nzt_y
[1106]	620	DO j = 0, ny
	621	f_inv(j,k,i) = f_in(j,i,k)
[1]	622	ENDDO
	623	ENDDO
	624	ENDDO
[683]	625	!$OMP END PARALLEL
[1]	626
[1106]	627	ENDIF
[1]	628
	629	END SUBROUTINE transpose_yx
	630
	631
[4540]	632	!--------------------------------------------------------------------------------------------------!
[1]	633	! Description:
	634	! ------------
[4540]	635	!> Transposition of input array (f_in) from y to x. For the input array, all elements along y reside
	636	!> on the same PE, while after transposition, all elements along x reside on the same PE. This is a
	637	!> direct transposition for arrays with indices in regular order (k,j,i) (cf. transpose_yx).
	638	!--------------------------------------------------------------------------------------------------!
[4429]	639	#if defined( __parallel )
[1682]	640	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	641
[1682]	642
[4540]	643	USE cpulog, &
	644	ONLY: cpu_log, &
	645	log_point_s
[1]	646
[4540]	647	USE indices, &
	648	ONLY: nnx, &
	649	nny, &
	650	nnz, &
	651	nx, &
	652	nxl, &
	653	nxr, &
	654	nyn, &
	655	nys, &
	656	nz
[1320]	657
	658	USE kinds
	659
[1324]	660	USE pegrid
[1320]	661
[4540]	662	USE transpose_indices, &
	663	ONLY: nyn_x, &
	664	nys_x, &
	665	nzb_x, &
	666	nzt_x
[1320]	667
[1]	668	IMPLICIT NONE
	669
[4540]	670	INTEGER(iwp) :: i !<
	671	INTEGER(iwp) :: j !<
	672	INTEGER(iwp) :: k !<
	673	INTEGER(iwp) :: l !<
	674	INTEGER(iwp) :: m !<
	675	INTEGER(iwp) :: xs !<
[1]	676
[4540]	677	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	678	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	679	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	680	REAL(wp) :: work(nnxnnynnz) !<
[1]	681
	682	!
[4540]	683	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
[1003]	684	DO k = 1, nz
	685	DO j = nys, nyn
	686	DO i = nxl, nxr
[164]	687	f_inv(i,k,j) = f_in(k,j,i)
[1]	688	ENDDO
	689	ENDDO
	690	ENDDO
	691
	692	!
	693	!-- Transpose array
	694	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	695	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	696	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
	697	work(1), sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
[1]	698	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	699
	700	!
	701	!-- Reorder transposed array
	702	m = 0
	703	DO l = 0, pdims(1) - 1
	704	xs = 0 + l * nnx
[1003]	705	DO j = nys_x, nyn_x
	706	DO k = 1, nz
[1]	707	DO i = xs, xs + nnx - 1
	708	m = m + 1
[164]	709	f_out(i,j,k) = work(m)
[1]	710	ENDDO
	711	ENDDO
	712	ENDDO
	713	ENDDO
	714
[4429]	715	END SUBROUTINE transpose_yxd
[1]	716	#endif
	717
	718
[4540]	719	!--------------------------------------------------------------------------------------------------!
[1]	720	! Description:
	721	! ------------
[4540]	722	!> Resorting data for the transposition from y to z. The transposition itself is carried out in
	723	!> transpose_yz.
	724	!--------------------------------------------------------------------------------------------------!
[1682]	725	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	726
[1682]	727
[4540]	728	USE indices, &
	729	ONLY: ny
[1216]	730
[4540]	731	USE kinds
[1320]	732
[4540]	733	USE transpose_indices, &
	734	ONLY: nxl_y, &
	735	nxr_y, &
	736	nzb_y, &
	737	nzt_y
[1320]	738
[4540]	739	IMPLICIT NONE
[1216]	740
[4540]	741	INTEGER(iwp) :: i !<
	742	INTEGER(iwp) :: j !<
	743	INTEGER(iwp) :: k !<
[1216]	744
[4540]	745	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	746	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	747
	748	!
[4540]	749	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
	750	!$OMP PARALLEL PRIVATE ( i, j, k )
	751	!$OMP DO
[3690]	752	#if __acc_fft_device
[4540]	753	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	754	!$ACC PRESENT(f_inv, f_in)
[3690]	755	#endif
[4540]	756	DO k = nzb_y, nzt_y
	757	DO i = nxl_y, nxr_y
	758	DO j = 0, ny
	759	f_inv(i,k,j) = f_in(j,i,k)
	760	ENDDO
	761	ENDDO
	762	ENDDO
	763	!$OMP END PARALLEL
[1216]	764
	765	END SUBROUTINE resort_for_yz
	766
	767
[4540]	768	!--------------------------------------------------------------------------------------------------!
[1216]	769	! Description:
	770	! ------------
[4540]	771	!> Transposition of input array (f_in) from y to z. For the input array, all elements along y reside
	772	!> on the same PE, while after transposition, all elements along z reside on the same PE.
	773	!--------------------------------------------------------------------------------------------------!
[1682]	774	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	775
[1682]	776
[4429]	777	#if defined( __parallel )
[4540]	778	USE cpulog, &
	779	ONLY: cpu_log, &
	780	cpu_log_nowait, &
	781	log_point_s
[4429]	782	#endif
[1]	783
[4540]	784	USE indices, &
	785	ONLY: ny, &
	786	nz
[1320]	787
	788	USE kinds
	789
[1324]	790	USE pegrid
[1320]	791
[4540]	792	USE transpose_indices, &
	793	ONLY: nxl_y, &
	794	nxl_z, &
	795	nxr_y, &
	796	nxr_z, &
	797	nyn_z, &
	798	nys_z, &
	799	nzb_y, &
	800	nzt_y
[1320]	801
[1]	802	IMPLICIT NONE
	803
[4171]	804	INTEGER(iwp) :: i !<
	805	INTEGER(iwp) :: j !<
	806	INTEGER(iwp) :: k !<
[4540]	807
[4429]	808	#if defined( __parallel )
[4540]	809	INTEGER(iwp) :: l !<
	810	INTEGER(iwp) :: zs !<
[4429]	811	#endif
[1]	812
[4540]	813	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	814	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	815
[4429]	816	#if defined( __parallel )
[4540]	817	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	818	#if __acc_fft_device
[3634]	819	!$ACC DECLARE CREATE(work)
[3690]	820	#endif
[4429]	821	#endif
[1111]	822
[1320]	823
[1]	824	!
[4540]	825	!-- If the PE grid is one-dimensional along y, only local reordering of the data is necessary and no
	826	!-- transposition has to be done.
[1]	827	IF ( pdims(1) == 1 ) THEN
[1106]	828
[683]	829	!$OMP PARALLEL PRIVATE ( i, j, k )
	830	!$OMP DO
[3690]	831	#if __acc_fft_device
[3634]	832	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	833	!$ACC PRESENT(f_out, f_inv)
[3690]	834	#endif
[1003]	835	DO j = 0, ny
	836	DO k = nzb_y, nzt_y
	837	DO i = nxl_y, nxr_y
[164]	838	f_out(i,j,k) = f_inv(i,k,j)
[1]	839	ENDDO
	840	ENDDO
	841	ENDDO
[683]	842	!$OMP END PARALLEL
[1]	843
[1106]	844	ELSE
	845
	846	#if defined( __parallel )
[1]	847	!
[1106]	848	!-- Transpose array
[1318]	849	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	850
	851	#if __acc_fft_device
[3657]	852	#ifndef __cuda_aware_mpi
[3634]	853	!$ACC UPDATE HOST(f_inv)
[3657]	854	#else
	855	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	856	#endif
[3690]	857	#endif
	858
[1106]	859	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	860	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	861	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, comm1dx, ierr )
[3690]	862
	863	#if __acc_fft_device
[3657]	864	#ifndef __cuda_aware_mpi
[3634]	865	!$ACC UPDATE DEVICE(work)
[3657]	866	#else
	867	!$ACC END HOST_DATA
	868	#endif
[3690]	869	#endif
	870
[1106]	871	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	872
	873	!
[1106]	874	!-- Reorder transposed array
[1111]	875	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1106]	876	DO l = 0, pdims(1) - 1
	877	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	878	#if __acc_fft_device
[3634]	879	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	880	!$ACC PRESENT(f_out, work)
[3690]	881	#endif
[4236]	882	!$OMP DO
[1106]	883	DO j = nys_z, nyn_z
	884	DO k = zs, zs + nzt_y - nzb_y
	885	DO i = nxl_z, nxr_z
[1111]	886	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	887	ENDDO
[1]	888	ENDDO
	889	ENDDO
[4236]	890	!$OMP END DO NOWAIT
[1]	891	ENDDO
[683]	892	!$OMP END PARALLEL
[1]	893	#endif
	894
[1106]	895	ENDIF
	896
[1]	897	END SUBROUTINE transpose_yz
	898
	899
[4540]	900	!--------------------------------------------------------------------------------------------------!
[1]	901	! Description:
	902	! ------------
[4540]	903	!> Resorting data for the transposition from z to x. The transposition itself is carried out in
	904	!> transpose_zx.
	905	!--------------------------------------------------------------------------------------------------!
[1682]	906	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	907
[1682]	908
[4540]	909	USE indices, &
	910	ONLY: nxl, &
	911	nxr, &
	912	nyn, &
	913	nys, &
	914	nz
[1216]	915
[4540]	916	USE kinds
[1320]	917
[4540]	918	IMPLICIT NONE
[1216]	919
[4540]	920	INTEGER(iwp) :: i !<
	921	INTEGER(iwp) :: j !<
	922	INTEGER(iwp) :: k !<
[1216]	923
[4540]	924	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	925	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	926
	927	!
[4540]	928	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
	929	!$OMP PARALLEL PRIVATE ( i, j, k )
	930	!$OMP DO
[3690]	931	#if __acc_fft_device
[4540]	932	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	933	!$ACC PRESENT(f_in, f_inv)
[3690]	934	#endif
[4540]	935	DO i = nxl, nxr
	936	DO j = nys, nyn
	937	DO k = 1,nz
	938	f_inv(j,i,k) = f_in(k,j,i)
	939	ENDDO
	940	ENDDO
	941	ENDDO
	942	!$OMP END PARALLEL
[1216]	943
	944	END SUBROUTINE resort_for_zx
	945
	946
[4540]	947	!--------------------------------------------------------------------------------------------------!
[1216]	948	! Description:
	949	! ------------
[4540]	950	!> Transposition of input array (f_in) from z to x. For the input array, all elements along z reside
	951	!> on the same PE, while after transposition, all elements along x reside on the same PE.
	952	!--------------------------------------------------------------------------------------------------!
[1682]	953	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	954
[1682]	955
[4429]	956	#if defined( __parallel )
[4540]	957	USE cpulog, &
	958	ONLY: cpu_log, &
	959	cpu_log_nowait, &
	960	log_point_s
[1]	961
[4540]	962	USE fft_xy, &
	963	ONLY: f_vec_x, &
	964	temperton_fft_vec
[4429]	965	#endif
[4366]	966
[4540]	967	USE indices, &
	968	ONLY: nx, &
	969	nxl, &
	970	nxr, &
	971	nyn, &
	972	nys, &
	973	nz
	974
[4429]	975	#if defined( __parallel )
[4540]	976	USE indices, &
[4429]	977	ONLY: nnx
	978	#endif
[1320]	979
	980	USE kinds
	981
[1324]	982	USE pegrid
[1320]	983
[4540]	984	USE transpose_indices, &
	985	ONLY: nyn_x, &
	986	nys_x, &
	987	nzb_x, &
	988	nzt_x
[1320]	989
[1]	990	IMPLICIT NONE
	991
[4171]	992	INTEGER(iwp) :: i !<
	993	INTEGER(iwp) :: j !<
	994	INTEGER(iwp) :: k !<
[4540]	995
[4429]	996	#if defined( __parallel )
[4540]	997	INTEGER(iwp) :: l !<
	998	INTEGER(iwp) :: mm !<
	999	INTEGER(iwp) :: xs !<
[4429]	1000	#endif
[1]	1001
[4540]	1002	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1003	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	1004
[4429]	1005	#if defined( __parallel )
[4540]	1006	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	1007	#if __acc_fft_device
[3634]	1008	!$ACC DECLARE CREATE(work)
[3690]	1009	#endif
[4429]	1010	#endif
[1]	1011
[1320]	1012
[1]	1013	!
[4540]	1014	!-- If the PE grid is one-dimensional along y, only local reordering of the data is necessary and no
	1015	!-- transposition has to be done.
[1]	1016	IF ( pdims(1) == 1 ) THEN
[1106]	1017
[683]	1018	!$OMP PARALLEL PRIVATE ( i, j, k )
	1019	!$OMP DO
[3690]	1020	#if __acc_fft_device
[3634]	1021	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1022	!$ACC PRESENT(f_out, f_inv)
[3690]	1023	#endif
[1003]	1024	DO k = 1, nz
	1025	DO i = nxl, nxr
	1026	DO j = nys, nyn
[164]	1027	f_out(i,j,k) = f_inv(j,i,k)
[1]	1028	ENDDO
	1029	ENDDO
	1030	ENDDO
[683]	1031	!$OMP END PARALLEL
[1]	1032
[1106]	1033	ELSE
	1034
	1035	#if defined( __parallel )
[1]	1036	!
[1106]	1037	!-- Transpose array
[1318]	1038	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1039
	1040	#if __acc_fft_device
[3657]	1041	#ifndef __cuda_aware_mpi
[3634]	1042	!$ACC UPDATE HOST(f_inv)
[3657]	1043	#else
	1044	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1045	#endif
[3690]	1046	#endif
	1047
[1106]	1048	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	1049	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	1050	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, comm1dx, ierr )
[3690]	1051
	1052	#if __acc_fft_device
[3657]	1053	#ifndef __cuda_aware_mpi
[3634]	1054	!$ACC UPDATE DEVICE(work)
[3657]	1055	#else
	1056	!$ACC END HOST_DATA
	1057	#endif
[3690]	1058	#endif
	1059
[1106]	1060	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	1061
	1062	!
[4366]	1063	!-- Reorder transposed array.
[4370]	1064	!-- Data for the vectorized Temperton-fft is stored in different array format (f_vec_x) which
	1065	!-- saves additional data copy in fft_x.
[4366]	1066	IF ( temperton_fft_vec ) THEN
	1067
	1068	DO l = 0, pdims(1) - 1
	1069	xs = 0 + l * nnx
	1070	DO k = nzb_x, nzt_x
	1071	DO i = xs, xs + nnx - 1
	1072	DO j = nys_x, nyn_x
	1073	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
[4370]	1074	f_vec_x(mm,i) = work(j,i-xs+1,k,l)
[4366]	1075	ENDDO
	1076	ENDDO
	1077	ENDDO
	1078	ENDDO
	1079
	1080	ELSE
	1081
	1082	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
	1083	DO l = 0, pdims(1) - 1
	1084	xs = 0 + l * nnx
[3690]	1085	#if __acc_fft_device
[4366]	1086	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1087	!$ACC PRESENT(f_out, work)
[3690]	1088	#endif
[4366]	1089	!$OMP DO
	1090	DO k = nzb_x, nzt_x
	1091	DO i = xs, xs + nnx - 1
	1092	DO j = nys_x, nyn_x
	1093	f_out(i,j,k) = work(j,i-xs+1,k,l)
	1094	ENDDO
[1106]	1095	ENDDO
[1]	1096	ENDDO
[4366]	1097	!$OMP END DO NOWAIT
[1]	1098	ENDDO
[4366]	1099	!$OMP END PARALLEL
[1]	1100
[4366]	1101	ENDIF
	1102
[4415]	1103	#endif
	1104
[1106]	1105	ENDIF
	1106
[1]	1107	END SUBROUTINE transpose_zx
	1108
	1109
[4540]	1110	!--------------------------------------------------------------------------------------------------!
[1]	1111	! Description:
	1112	! ------------
[4540]	1113	!> Resorting data after the transposition from z to y. The transposition itself is carried out in
	1114	!> transpose_zy.
	1115	!--------------------------------------------------------------------------------------------------!
[1682]	1116	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	1117
[1682]	1118
[4540]	1119	USE indices, &
	1120	ONLY: ny
[1216]	1121
[4540]	1122	USE kinds
[1320]	1123
[4540]	1124	USE transpose_indices, &
	1125	ONLY: nxl_y, &
	1126	nxr_y, &
	1127	nzb_y, &
	1128	nzt_y
[1320]	1129
[4540]	1130	IMPLICIT NONE
[1216]	1131
[4540]	1132	INTEGER(iwp) :: i !<
	1133	INTEGER(iwp) :: j !<
	1134	INTEGER(iwp) :: k !<
[1216]	1135
[4540]	1136	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	1137	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	1138
	1139	!
[4540]	1140	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
[1216]	1141	!$OMP PARALLEL PRIVATE ( i, j, k )
	1142	!$OMP DO
[3690]	1143	#if __acc_fft_device
[3634]	1144	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1145	!$ACC PRESENT(f_out, f_inv)
[3690]	1146	#endif
[4540]	1147	DO k = nzb_y, nzt_y
	1148	DO i = nxl_y, nxr_y
	1149	DO j = 0, ny
	1150	f_out(j,i,k) = f_inv(i,k,j)
	1151	ENDDO
	1152	ENDDO
	1153	ENDDO
	1154	!$OMP END PARALLEL
[1216]	1155
	1156	END SUBROUTINE resort_for_zy
	1157
	1158
[4540]	1159	!--------------------------------------------------------------------------------------------------!
[3241]	1160	! Description:cpu_log_nowait
[1216]	1161	! ------------
[4540]	1162	!> Transposition of input array (f_in) from z to y. For the input array, all elements along z reside
	1163	!> on the same PE, while after transposition, all elements along y reside on the same PE.
	1164	!--------------------------------------------------------------------------------------------------!
[1682]	1165	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	1166
[1682]	1167
[4429]	1168	#if defined( __parallel )
[4540]	1169	USE cpulog, &
	1170	ONLY: cpu_log, &
	1171	cpu_log_nowait, &
	1172	log_point_s
[4429]	1173	#endif
[1]	1174
[4540]	1175	USE indices, &
	1176	ONLY: ny, &
	1177	nz
[1320]	1178
	1179	USE kinds
	1180
[1324]	1181	USE pegrid
[1320]	1182
[4540]	1183	USE transpose_indices, &
	1184	ONLY: nxl_y, &
	1185	nxl_z, &
	1186	nxr_y, &
	1187	nxr_z, &
	1188	nyn_z, &
	1189	nys_z, &
	1190	nzb_y, &
	1191	nzt_y
[1320]	1192
[1]	1193	IMPLICIT NONE
	1194
[4171]	1195	INTEGER(iwp) :: i !<
	1196	INTEGER(iwp) :: j !<
	1197	INTEGER(iwp) :: k !<
[4540]	1198
[4429]	1199	#if defined( __parallel )
[4540]	1200	INTEGER(iwp) :: l !<
	1201	INTEGER(iwp) :: zs !<
[4429]	1202	#endif
[1]	1203
[4540]	1204	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	1205	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	1206
[4429]	1207	#if defined( __parallel )
[4540]	1208	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	1209	#if __acc_fft_device
[3634]	1210	!$ACC DECLARE CREATE(work)
[3690]	1211	#endif
[4429]	1212	#endif
[1111]	1213
[1]	1214	!
[4540]	1215	!-- If the PE grid is one-dimensional along y, the array has only to be reordered locally and
	1216	!-- therefore no transposition has to be done.
[1]	1217	IF ( pdims(1) /= 1 ) THEN
[1106]	1218
	1219	#if defined( __parallel )
[1]	1220	!
	1221	!-- Reorder input array for transposition
[1111]	1222	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1]	1223	DO l = 0, pdims(1) - 1
[1003]	1224	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	1225	#if __acc_fft_device
[3634]	1226	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1227	!$ACC PRESENT(work, f_in)
[3690]	1228	#endif
[4236]	1229	!$OMP DO
[1003]	1230	DO j = nys_z, nyn_z
	1231	DO k = zs, zs + nzt_y - nzb_y
	1232	DO i = nxl_z, nxr_z
[1111]	1233	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	1234	ENDDO
	1235	ENDDO
	1236	ENDDO
[4236]	1237	!$OMP END DO NOWAIT
[1]	1238	ENDDO
[683]	1239	!$OMP END PARALLEL
[1]	1240
	1241	!
	1242	!-- Transpose array
[1318]	1243	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1244
	1245	#if __acc_fft_device
[3657]	1246	#ifndef __cuda_aware_mpi
[3634]	1247	!$ACC UPDATE HOST(work)
[3657]	1248	#else
	1249	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1250	#endif
[3690]	1251	#endif
	1252
[622]	1253	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	1254	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	1255	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, comm1dx, ierr )
[3690]	1256
	1257	#if __acc_fft_device
[3657]	1258	#ifndef __cuda_aware_mpi
[3634]	1259	!$ACC UPDATE DEVICE(f_inv)
[3657]	1260	#else
	1261	!$ACC END HOST_DATA
	1262	#endif
[3690]	1263	#endif
	1264
[1]	1265	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	1266	#endif
[1]	1267
	1268	ELSE
	1269	!
[1106]	1270	!-- Reorder the array in the same way like ALLTOALL did it
[683]	1271	!$OMP PARALLEL PRIVATE ( i, j, k )
	1272	!$OMP DO
[3690]	1273	#if __acc_fft_device
[3634]	1274	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1275	!$ACC PRESENT(f_inv, f_in)
[3690]	1276	#endif
[1003]	1277	DO k = nzb_y, nzt_y
	1278	DO j = 0, ny
	1279	DO i = nxl_y, nxr_y
[164]	1280	f_inv(i,k,j) = f_in(i,j,k)
	1281	ENDDO
	1282	ENDDO
	1283	ENDDO
[683]	1284	!$OMP END PARALLEL
[1106]	1285
	1286	ENDIF
	1287
[1]	1288	END SUBROUTINE transpose_zy
	1289
	1290
[4540]	1291	!--------------------------------------------------------------------------------------------------!
[1]	1292	! Description:
	1293	! ------------
[4540]	1294	!> Transposition of input array (f_in) from z to y. For the input array, all elements along z reside
	1295	!> on the same PE, while after transposition, all elements along y reside on the same PE. This is a
	1296	!> direct transposition for arrays with indices in regular order (k,j,i) (cf. transpose_zy).
	1297	!--------------------------------------------------------------------------------------------------!
[4429]	1298	#if defined( __parallel )
[1682]	1299	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1300
[1682]	1301
[4540]	1302	USE cpulog, &
	1303	ONLY: cpu_log, &
	1304	log_point_s
[1]	1305
[4540]	1306	USE indices, &
	1307	ONLY: nnx, &
	1308	nny, &
	1309	nnz, &
	1310	nxl, &
	1311	nxr, &
	1312	nyn, &
	1313	nys, &
	1314	ny, &
	1315	nz
[1320]	1316
	1317	USE kinds
	1318
[1324]	1319	USE pegrid
[1320]	1320
[4540]	1321	USE transpose_indices, &
	1322	ONLY: nxl_yd, &
	1323	nxr_yd, &
	1324	nzb_yd, &
	1325	nzt_yd
[1320]	1326
[1]	1327	IMPLICIT NONE
	1328
[4540]	1329	INTEGER(iwp) :: i !<
	1330	INTEGER(iwp) :: j !<
	1331	INTEGER(iwp) :: k !<
	1332	INTEGER(iwp) :: l !<
	1333	INTEGER(iwp) :: m !<
	1334	INTEGER(iwp) :: ys !<
[1]	1335
[4540]	1336	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1337	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1338	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1339	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1340
[1]	1341	!
[4540]	1342	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
[1003]	1343	DO i = nxl, nxr
	1344	DO j = nys, nyn
	1345	DO k = 1, nz
[164]	1346	f_inv(j,i,k) = f_in(k,j,i)
[1]	1347	ENDDO
	1348	ENDDO
	1349	ENDDO
	1350
	1351	!
[4540]	1352	!-- Move data to different array, because memory location of work1 is needed further below
	1353	!-- (work1 = work2). If the PE grid is one-dimensional along x, only local reordering of the data is
	1354	!-- necessary and no transposition has to be done.
[1]	1355	IF ( pdims(2) == 1 ) THEN
[1003]	1356	DO k = 1, nz
	1357	DO i = nxl, nxr
	1358	DO j = nys, nyn
[164]	1359	f_out(j,i,k) = f_inv(j,i,k)
[1]	1360	ENDDO
	1361	ENDDO
	1362	ENDDO
	1363	RETURN
	1364	ENDIF
	1365
	1366	!
	1367	!-- Transpose array
	1368	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1369	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	1370	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
	1371	work(1), sendrecvcount_zyd, MPI_REAL, comm1dy, ierr )
[1]	1372	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1373
	1374	!
	1375	!-- Reorder transposed array
	1376	m = 0
	1377	DO l = 0, pdims(2) - 1
	1378	ys = 0 + l * nny
[1003]	1379	DO k = nzb_yd, nzt_yd
	1380	DO i = nxl_yd, nxr_yd
[1]	1381	DO j = ys, ys + nny - 1
	1382	m = m + 1
[164]	1383	f_out(j,i,k) = work(m)
[1]	1384	ENDDO
	1385	ENDDO
	1386	ENDDO
	1387	ENDDO
	1388
[4429]	1389	END SUBROUTINE transpose_zyd
[1]	1390	#endif

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |