Home

Context Navigation

← Previous Revision
Next Revision →
Normal
Revision Log

transpose.f90

Last change on this file was 4828, checked in by Giersch, 3 years ago
Copyright updated to year 2021, interface pmc_sort removed to accelarate the nesting code
Property svn:keywords set to `Id`
File size: 46.3 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[4540]	2	!--------------------------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[4540]	5	! PALM is free software: you can redistribute it and/or modify it under the terms of the GNU General
	6	! Public License as published by the Free Software Foundation, either version 3 of the License, or
	7	! (at your option) any later version.
[1036]	8	!
[4540]	9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
	10	! implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
	11	! Public License for more details.
[1036]	12	!
[4540]	13	! You should have received a copy of the GNU General Public License along with PALM. If not, see
	14	! <http://www.gnu.org/licenses/>.
[1036]	15	!
[4828]	16	! Copyright 1997-2021 Leibniz Universitaet Hannover
[4540]	17	!--------------------------------------------------------------------------------------------------!
[1036]	18	!
[4540]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[4717]	22	!
	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 4828 2021-01-05 11:21:41Z banzhafs $
[4717]	27	! Formatting of OpenMP directives (J. Resler)
	28	!
	29	! 4540 2020-05-18 15:23:29Z raasch
[4540]	30	! File re-formatted to follow the PALM coding standard
	31	!
	32	! 4429 2020-02-27 15:24:30Z raasch
	33	! Bugfix: cpp-directives added for serial mode
	34	!
[4429]	35	! 4415 2020-02-20 10:30:33Z raasch
[4540]	36	! Bugfix for misplaced preprocessor directive
	37	!
[4415]	38	! 4370 2020-01-10 14:00:44Z raasch
[4540]	39	! Vector array renamed
	40	!
[4370]	41	! 4366 2020-01-09 08:12:43Z raasch
[4540]	42	! Modifications for NEC vectorization
	43	!
[4366]	44	! 4360 2020-01-07 11:25:50Z suehring
[4236]	45	! Added missing OpenMP directives
[4540]	46	!
[4236]	47	! 4182 2019-08-22 15:20:23Z scharf
[4182]	48	! Corrected "Former revisions" section
[4540]	49	!
[4182]	50	! 4171 2019-08-19 17:44:09Z gronemeier
[4540]	51	! Loop reordering for performance optimization
[4171]	52	!
	53	! 3832 2019-03-28 13:16:58Z raasch
[4540]	54	! Loop reordering for performance optimization
[4171]	55	!
[3832]	56	! 3694 2019-01-23 17:01:49Z knoop
[3634]	57	! OpenACC port for SPEC
[4171]	58	!
[4182]	59	! Revision 1.1 1997/07/24 11:25:18 raasch
	60	! Initial revision
	61	!
	62	!
[1216]	63	! Description:
	64	! ------------
[4540]	65	!> Resorting data for the transposition from x to y. The transposition itself is carried out in
	66	!> transpose_xy.
	67	!--------------------------------------------------------------------------------------------------!
[4181]	68
	69	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	70
[1682]	71	SUBROUTINE resort_for_xy( f_in, f_inv )
[1216]	72
[4171]	73
[4540]	74	USE indices, &
	75	ONLY: nx
[1216]	76
[4540]	77	USE kinds
[1320]	78
[4540]	79	USE transpose_indices, &
	80	ONLY: nyn_x, &
	81	nys_x, &
	82	nzb_x, &
	83	nzt_x
[1320]	84
[4540]	85	IMPLICIT NONE
[1216]	86
[4540]	87	INTEGER(iwp) :: i !<
	88	INTEGER(iwp) :: j !<
	89	INTEGER(iwp) :: k !<
[1216]	90
[4540]	91	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	92	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	93
[1]	94	!
[4540]	95	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous
[1216]	96	!$OMP PARALLEL PRIVATE ( i, j, k )
	97	!$OMP DO
[3690]	98	#if __acc_fft_device
[4540]	99	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	100	!$ACC PRESENT(f_inv, f_in)
[3690]	101	#endif
[4540]	102	DO k = nzb_x, nzt_x
	103	DO j = nys_x, nyn_x
	104	DO i = 0, nx
	105	f_inv(j,k,i) = f_in(i,j,k)
	106	ENDDO
	107	ENDDO
	108	ENDDO
	109	!$OMP END PARALLEL
[1216]	110
	111	END SUBROUTINE resort_for_xy
	112
	113
[4540]	114	!--------------------------------------------------------------------------------------------------!
[1]	115	! Description:
	116	! ------------
[4540]	117	!> Transposition of input array (f_in) from x to y. For the input array, all elements along x reside
	118	!> on the same PE, while after transposition, all elements along y reside on the same PE.
	119	!--------------------------------------------------------------------------------------------------!
[1682]	120	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	121
[1682]	122
[4429]	123	#if defined( __parallel )
[4540]	124	USE cpulog, &
	125	ONLY: cpu_log, &
	126	cpu_log_nowait, &
	127	log_point_s
[4429]	128	#endif
[1320]	129
[4540]	130	USE indices, &
	131	ONLY: nx, &
	132	ny
[4171]	133
[1320]	134	USE kinds
	135
[1]	136	USE pegrid
	137
[4540]	138	USE transpose_indices, &
	139	ONLY: nxl_y, &
	140	nxr_y, &
	141	nyn_x, &
	142	nys_x, &
	143	nzb_x, &
	144	nzb_y, &
	145	nzt_x, &
	146	nzt_y
[1320]	147
[1]	148	IMPLICIT NONE
	149
[4171]	150	INTEGER(iwp) :: i !<
	151	INTEGER(iwp) :: j !<
	152	INTEGER(iwp) :: k !<
[4429]	153
	154	#if defined( __parallel )
[4540]	155	INTEGER(iwp) :: l !<
	156	INTEGER(iwp) :: ys !<
[4429]	157	#endif
[1]	158
[4540]	159	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	160	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[4171]	161
[4429]	162	#if defined( __parallel )
[4540]	163	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	164	#if __acc_fft_device
[3634]	165	!$ACC DECLARE CREATE(work)
[3690]	166	#endif
[4429]	167	#endif
[1111]	168
	169
[1106]	170	IF ( numprocs /= 1 ) THEN
	171
	172	#if defined( __parallel )
[1]	173	!
[1106]	174	!-- Transpose array
[1318]	175	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	176
	177	#if __acc_fft_device
[3657]	178	#ifndef __cuda_aware_mpi
[3634]	179	!$ACC UPDATE HOST(f_inv)
[3657]	180	#else
	181	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	182	#endif
[3690]	183	#endif
	184
[1106]	185	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	186	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	187	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
[3690]	188
	189	#if __acc_fft_device
[3657]	190	#ifndef __cuda_aware_mpi
[3634]	191	!$ACC UPDATE DEVICE(work)
[3657]	192	#else
	193	!$ACC END HOST_DATA
	194	#endif
[3690]	195	#endif
	196
[1106]	197	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	198
	199	!
[1106]	200	!-- Reorder transposed array
[4717]	201	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	202	DO l = 0, pdims(2) - 1
	203	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	204	#if __acc_fft_device
[3634]	205	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	206	!$ACC PRESENT(f_out, work)
[3690]	207	#endif
[4236]	208	!$OMP DO
[1106]	209	DO i = nxl_y, nxr_y
	210	DO k = nzb_y, nzt_y
	211	DO j = ys, ys + nyn_x - nys_x
[1111]	212	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	213	ENDDO
[1]	214	ENDDO
	215	ENDDO
[4236]	216	!$OMP END DO NOWAIT
[1]	217	ENDDO
[4717]	218	!$OMP END PARALLEL
[1]	219	#endif
	220
[1106]	221	ELSE
	222
	223	!
	224	!-- Reorder transposed array
[4717]	225	!$OMP PARALLEL PRIVATE ( i, j, k )
	226	!$OMP DO
[3690]	227	#if __acc_fft_device
[3634]	228	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	229	!$ACC PRESENT(f_out, f_inv)
[3690]	230	#endif
[1106]	231	DO k = nzb_y, nzt_y
	232	DO i = nxl_y, nxr_y
	233	DO j = 0, ny
	234	f_out(j,i,k) = f_inv(j,k,i)
	235	ENDDO
	236	ENDDO
	237	ENDDO
[4717]	238	!$OMP END PARALLEL
[1106]	239
	240	ENDIF
	241
[1]	242	END SUBROUTINE transpose_xy
	243
	244
[4540]	245	!--------------------------------------------------------------------------------------------------!
[1]	246	! Description:
	247	! ------------
[4540]	248	!> Resorting data after the transposition from x to z. The transposition itself is carried out in
	249	!> transpose_xz.
	250	!--------------------------------------------------------------------------------------------------!
[1682]	251	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	252
[1682]	253
[4540]	254	USE indices, &
	255	ONLY: nxl, &
	256	nxr, &
	257	nyn, &
	258	nys, &
	259	nz
[1216]	260
[4540]	261	USE kinds
[1320]	262
[4540]	263	IMPLICIT NONE
[1216]	264
[4540]	265	INTEGER(iwp) :: i !<
	266	INTEGER(iwp) :: j !<
	267	INTEGER(iwp) :: k !<
[1216]	268
[4540]	269	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	270	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
	271
[1216]	272	!
[4540]	273	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
	274	!-- In case of parallel fft/transposition, scattered store is faster in backward direction!!!
[4717]	275	!$OMP PARALLEL PRIVATE ( i, j, k )
	276	!$OMP DO
[3690]	277	#if __acc_fft_device
[4717]	278	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	279	!$ACC PRESENT(f_out, f_inv)
[3690]	280	#endif
[4540]	281	DO i = nxl, nxr
	282	DO j = nys, nyn
	283	DO k = 1, nz
	284	f_out(k,j,i) = f_inv(j,i,k)
	285	ENDDO
	286	ENDDO
	287	ENDDO
	288	!$OMP END PARALLEL
[1216]	289
	290	END SUBROUTINE resort_for_xz
	291
	292
[4540]	293	!--------------------------------------------------------------------------------------------------!
[1216]	294	! Description:
	295	! ------------
[4540]	296	!> Transposition of input array (f_in) from x to z. For the input array, all elements along x reside
	297	!> on the same PE, while after transposition, all elements along z reside on the same PE.
	298	!--------------------------------------------------------------------------------------------------!
[1682]	299	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	300
[4429]	301	#if defined( __parallel )
[4540]	302	USE cpulog, &
	303	ONLY: cpu_log, &
	304	cpu_log_nowait, &
	305	log_point_s
[1]	306
[4540]	307	USE fft_xy, &
	308	ONLY: f_vec_x, &
	309	temperton_fft_vec
[4429]	310	#endif
[4366]	311
[4540]	312	USE indices, &
	313	ONLY: nx, &
	314	nxl, &
	315	nxr, &
	316	nyn, &
	317	nys, &
	318	nz
	319
[4429]	320	#if defined( __parallel )
[4540]	321	USE indices, &
[4429]	322	ONLY: nnx
	323	#endif
[1320]	324
	325	USE kinds
	326
[1324]	327	USE pegrid
[1320]	328
[4540]	329	USE transpose_indices, &
	330	ONLY: nyn_x, &
	331	nys_x, &
	332	nzb_x, &
	333	nzt_x
[1320]	334
[1]	335	IMPLICIT NONE
	336
[4171]	337	INTEGER(iwp) :: i !<
	338	INTEGER(iwp) :: j !<
	339	INTEGER(iwp) :: k !<
[4540]	340
[4429]	341	#if defined( __parallel )
[4540]	342	INTEGER(iwp) :: l !<
	343	INTEGER(iwp) :: mm !<
	344	INTEGER(iwp) :: xs !<
[4429]	345	#endif
[1]	346
[4540]	347	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	348	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	349
[4429]	350	#if defined( __parallel )
[4540]	351	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	352	#if __acc_fft_device
[3634]	353	!$ACC DECLARE CREATE(work)
[3690]	354	#endif
[4429]	355	#endif
[1111]	356
[4366]	357	!
[4540]	358	!-- If the PE grid is one-dimensional along y, the array has only to be reordered locally and
	359	!-- therefore no transposition has to be done.
[1]	360	IF ( pdims(1) /= 1 ) THEN
[1106]	361
	362	#if defined( __parallel )
[1]	363	!
[4366]	364	!-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in
[4370]	365	!-- different array format (f_vec_x).
[4366]	366	IF ( temperton_fft_vec ) THEN
	367
	368	DO l = 0, pdims(1) - 1
	369	xs = 0 + l * nnx
	370	DO k = nzb_x, nzt_x
	371	DO i = xs, xs + nnx - 1
	372	DO j = nys_x, nyn_x
	373	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
[4370]	374	work(j,i-xs+1,k,l) = f_vec_x(mm,i)
[4366]	375	ENDDO
	376	ENDDO
	377	ENDDO
	378	ENDDO
	379
	380	ELSE
	381
	382	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
	383	DO l = 0, pdims(1) - 1
	384	xs = 0 + l * nnx
[3690]	385	#if __acc_fft_device
[4366]	386	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	387	!$ACC PRESENT(work, f_in)
[3690]	388	#endif
[4366]	389	!$OMP DO
	390	DO k = nzb_x, nzt_x
	391	DO i = xs, xs + nnx - 1
	392	DO j = nys_x, nyn_x
	393	work(j,i-xs+1,k,l) = f_in(i,j,k)
	394	ENDDO
[1]	395	ENDDO
	396	ENDDO
[4366]	397	!$OMP END DO NOWAIT
[1]	398	ENDDO
[4366]	399	!$OMP END PARALLEL
[1]	400
[4366]	401	ENDIF
	402
[1]	403	!
	404	!-- Transpose array
[1318]	405	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	406
	407	#if __acc_fft_device
[3657]	408	#ifndef __cuda_aware_mpi
[3634]	409	!$ACC UPDATE HOST(work)
[3657]	410	#else
	411	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	412	#endif
[3690]	413	#endif
	414
[622]	415	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	416	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	417	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, comm1dx, ierr )
[3690]	418
	419	#if __acc_fft_device
[3657]	420	#ifndef __cuda_aware_mpi
[3634]	421	!$ACC UPDATE DEVICE(f_inv)
[3657]	422	#else
	423	!$ACC END HOST_DATA
	424	#endif
[3694]	425	#endif
	426
[1]	427	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	428	#endif
	429
[1]	430	ELSE
[1106]	431
[1]	432	!
	433	!-- Reorder the array in a way that the z index is in first position
[4717]	434	!$OMP PARALLEL PRIVATE ( i, j, k )
	435	!$OMP DO
[3690]	436	#if __acc_fft_device
[3634]	437	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	438	!$ACC PRESENT(f_inv, f_in)
[3690]	439	#endif
[1003]	440	DO i = nxl, nxr
	441	DO j = nys, nyn
	442	DO k = 1, nz
[164]	443	f_inv(j,i,k) = f_in(i,j,k)
[1]	444	ENDDO
	445	ENDDO
	446	ENDDO
[4717]	447	!$OMP END PARALLEL
[1]	448
[164]	449	ENDIF
	450
[1]	451	END SUBROUTINE transpose_xz
	452
	453
[4540]	454	!--------------------------------------------------------------------------------------------------!
[1]	455	! Description:
	456	! ------------
[4540]	457	!> Resorting data after the transposition from y to x. The transposition itself is carried out in
	458	!> transpose_yx.
	459	!--------------------------------------------------------------------------------------------------!
[1682]	460	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	461
[1682]	462
[4540]	463	USE indices, &
	464	ONLY: nx
[1216]	465
[4540]	466	USE kinds
[1320]	467
[4540]	468	USE transpose_indices, &
	469	ONLY: nyn_x, &
	470	nys_x, &
	471	nzb_x, &
	472	nzt_x
[1320]	473
[4540]	474	IMPLICIT NONE
[1216]	475
[4540]	476	INTEGER(iwp) :: i !<
	477	INTEGER(iwp) :: j !<
	478	INTEGER(iwp) :: k !<
[1216]	479
[4540]	480	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	481	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	482
	483	!
[4540]	484	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
[4717]	485	!$OMP PARALLEL PRIVATE ( i, j, k )
	486	!$OMP DO
[3690]	487	#if __acc_fft_device
[4540]	488	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	489	!$ACC PRESENT(f_out, f_inv)
[3690]	490	#endif
[4540]	491	DO k = nzb_x, nzt_x
	492	DO j = nys_x, nyn_x
	493	DO i = 0, nx
	494	f_out(i,j,k) = f_inv(j,k,i)
	495	ENDDO
	496	ENDDO
	497	ENDDO
[4717]	498	!$OMP END PARALLEL
[1216]	499
	500	END SUBROUTINE resort_for_yx
	501
	502
[4540]	503	!--------------------------------------------------------------------------------------------------!
[1216]	504	! Description:
	505	! ------------
[4540]	506	!> Transposition of input array (f_in) from y to x. For the input array, all elements along y
	507	!> reside on the same PE, while after transposition, all elements along x reside on the same PE.
	508	!--------------------------------------------------------------------------------------------------!
[1682]	509	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	510
[1682]	511
[4429]	512	#if defined( __parallel )
[4540]	513	USE cpulog, &
	514	ONLY: cpu_log, &
	515	cpu_log_nowait, &
	516	log_point_s
[4429]	517	#endif
[1]	518
[4540]	519	USE indices, &
	520	ONLY: nx, &
	521	ny
[1320]	522
	523	USE kinds
	524
[1324]	525	USE pegrid
[1320]	526
[4540]	527	USE transpose_indices, &
	528	ONLY: nxl_y, &
	529	nxr_y, &
	530	nyn_x, &
	531	nys_x, &
	532	nzb_x, &
	533	nzb_y, &
	534	nzt_x, &
	535	nzt_y
[1320]	536
[1]	537	IMPLICIT NONE
	538
[4171]	539	INTEGER(iwp) :: i !<
	540	INTEGER(iwp) :: j !<
	541	INTEGER(iwp) :: k !<
[4540]	542
[4429]	543	#if defined( __parallel )
[4540]	544	INTEGER(iwp) :: l !<
	545	INTEGER(iwp) :: ys !<
[4429]	546	#endif
[1]	547
[4540]	548	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	549	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	550
[4429]	551	#if defined( __parallel )
[4540]	552	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	553	#if __acc_fft_device
[3634]	554	!$ACC DECLARE CREATE(work)
[3690]	555	#endif
[4429]	556	#endif
[1111]	557
[1320]	558
[1106]	559	IF ( numprocs /= 1 ) THEN
	560
[1]	561	#if defined( __parallel )
	562	!
[1106]	563	!-- Reorder input array for transposition
[4717]	564	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	565	DO l = 0, pdims(2) - 1
	566	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	567	#if __acc_fft_device
[3634]	568	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	569	!$ACC PRESENT(work, f_in)
[3690]	570	#endif
[4236]	571	!$OMP DO
[1106]	572	DO i = nxl_y, nxr_y
	573	DO k = nzb_y, nzt_y
	574	DO j = ys, ys + nyn_x - nys_x
[1111]	575	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	576	ENDDO
	577	ENDDO
	578	ENDDO
[4236]	579	!$OMP END DO NOWAIT
[1106]	580	ENDDO
[4717]	581	!$OMP END PARALLEL
[1106]	582
	583	!
	584	!-- Transpose array
[1318]	585	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	586
	587	#if __acc_fft_device
[3657]	588	#ifndef __cuda_aware_mpi
[3634]	589	!$ACC UPDATE HOST(work)
[3657]	590	#else
	591	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	592	#endif
[3690]	593	#endif
	594
[1106]	595	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	596	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	597	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, comm1dy, ierr )
[3690]	598
	599	#if __acc_fft_device
[3657]	600	#ifndef __cuda_aware_mpi
[3634]	601	!$ACC UPDATE DEVICE(f_inv)
[3657]	602	#else
	603	!$ACC END HOST_DATA
	604	#endif
[3690]	605	#endif
	606
[1106]	607	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	608	#endif
	609
	610	ELSE
	611
	612	!
[4540]	613	!-- Reorder array f_in the same way as ALLTOALL did it.
[4717]	614	!$OMP PARALLEL PRIVATE ( i, j, k )
	615	!$OMP DO
[3690]	616	#if __acc_fft_device
[3634]	617	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	618	!$ACC PRESENT(f_inv, f_in)
[3690]	619	#endif
[1003]	620	DO i = nxl_y, nxr_y
	621	DO k = nzb_y, nzt_y
[1106]	622	DO j = 0, ny
	623	f_inv(j,k,i) = f_in(j,i,k)
[1]	624	ENDDO
	625	ENDDO
	626	ENDDO
[4717]	627	!$OMP END PARALLEL
[1]	628
[1106]	629	ENDIF
[1]	630
	631	END SUBROUTINE transpose_yx
	632
	633
[4540]	634	!--------------------------------------------------------------------------------------------------!
[1]	635	! Description:
	636	! ------------
[4540]	637	!> Transposition of input array (f_in) from y to x. For the input array, all elements along y reside
	638	!> on the same PE, while after transposition, all elements along x reside on the same PE. This is a
	639	!> direct transposition for arrays with indices in regular order (k,j,i) (cf. transpose_yx).
	640	!--------------------------------------------------------------------------------------------------!
[4429]	641	#if defined( __parallel )
[1682]	642	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	643
[1682]	644
[4540]	645	USE cpulog, &
	646	ONLY: cpu_log, &
	647	log_point_s
[1]	648
[4540]	649	USE indices, &
	650	ONLY: nnx, &
	651	nny, &
	652	nnz, &
	653	nx, &
	654	nxl, &
	655	nxr, &
	656	nyn, &
	657	nys, &
	658	nz
[1320]	659
	660	USE kinds
	661
[1324]	662	USE pegrid
[1320]	663
[4540]	664	USE transpose_indices, &
	665	ONLY: nyn_x, &
	666	nys_x, &
	667	nzb_x, &
	668	nzt_x
[1320]	669
[1]	670	IMPLICIT NONE
	671
[4540]	672	INTEGER(iwp) :: i !<
	673	INTEGER(iwp) :: j !<
	674	INTEGER(iwp) :: k !<
	675	INTEGER(iwp) :: l !<
	676	INTEGER(iwp) :: m !<
	677	INTEGER(iwp) :: xs !<
[1]	678
[4540]	679	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	680	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	681	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	682	REAL(wp) :: work(nnxnnynnz) !<
[1]	683
	684	!
[4540]	685	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
[1003]	686	DO k = 1, nz
	687	DO j = nys, nyn
	688	DO i = nxl, nxr
[164]	689	f_inv(i,k,j) = f_in(k,j,i)
[1]	690	ENDDO
	691	ENDDO
	692	ENDDO
	693
	694	!
	695	!-- Transpose array
	696	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	697	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	698	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
	699	work(1), sendrecvcount_xy, MPI_REAL, comm1dx, ierr )
[1]	700	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	701
	702	!
	703	!-- Reorder transposed array
	704	m = 0
	705	DO l = 0, pdims(1) - 1
	706	xs = 0 + l * nnx
[1003]	707	DO j = nys_x, nyn_x
	708	DO k = 1, nz
[1]	709	DO i = xs, xs + nnx - 1
	710	m = m + 1
[164]	711	f_out(i,j,k) = work(m)
[1]	712	ENDDO
	713	ENDDO
	714	ENDDO
	715	ENDDO
	716
[4429]	717	END SUBROUTINE transpose_yxd
[1]	718	#endif
	719
	720
[4540]	721	!--------------------------------------------------------------------------------------------------!
[1]	722	! Description:
	723	! ------------
[4540]	724	!> Resorting data for the transposition from y to z. The transposition itself is carried out in
	725	!> transpose_yz.
	726	!--------------------------------------------------------------------------------------------------!
[1682]	727	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	728
[1682]	729
[4540]	730	USE indices, &
	731	ONLY: ny
[1216]	732
[4540]	733	USE kinds
[1320]	734
[4540]	735	USE transpose_indices, &
	736	ONLY: nxl_y, &
	737	nxr_y, &
	738	nzb_y, &
	739	nzt_y
[1320]	740
[4540]	741	IMPLICIT NONE
[1216]	742
[4540]	743	INTEGER(iwp) :: i !<
	744	INTEGER(iwp) :: j !<
	745	INTEGER(iwp) :: k !<
[1216]	746
[4540]	747	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	748	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	749
	750	!
[4540]	751	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
[4717]	752	!$OMP PARALLEL PRIVATE ( i, j, k )
	753	!$OMP DO
[3690]	754	#if __acc_fft_device
[4540]	755	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	756	!$ACC PRESENT(f_inv, f_in)
[3690]	757	#endif
[4540]	758	DO k = nzb_y, nzt_y
	759	DO i = nxl_y, nxr_y
	760	DO j = 0, ny
	761	f_inv(i,k,j) = f_in(j,i,k)
	762	ENDDO
	763	ENDDO
	764	ENDDO
[4717]	765	!$OMP END PARALLEL
[1216]	766
	767	END SUBROUTINE resort_for_yz
	768
	769
[4540]	770	!--------------------------------------------------------------------------------------------------!
[1216]	771	! Description:
	772	! ------------
[4540]	773	!> Transposition of input array (f_in) from y to z. For the input array, all elements along y reside
	774	!> on the same PE, while after transposition, all elements along z reside on the same PE.
	775	!--------------------------------------------------------------------------------------------------!
[1682]	776	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	777
[1682]	778
[4429]	779	#if defined( __parallel )
[4540]	780	USE cpulog, &
	781	ONLY: cpu_log, &
	782	cpu_log_nowait, &
	783	log_point_s
[4429]	784	#endif
[1]	785
[4540]	786	USE indices, &
	787	ONLY: ny, &
	788	nz
[1320]	789
	790	USE kinds
	791
[1324]	792	USE pegrid
[1320]	793
[4540]	794	USE transpose_indices, &
	795	ONLY: nxl_y, &
	796	nxl_z, &
	797	nxr_y, &
	798	nxr_z, &
	799	nyn_z, &
	800	nys_z, &
	801	nzb_y, &
	802	nzt_y
[1320]	803
[1]	804	IMPLICIT NONE
	805
[4171]	806	INTEGER(iwp) :: i !<
	807	INTEGER(iwp) :: j !<
	808	INTEGER(iwp) :: k !<
[4540]	809
[4429]	810	#if defined( __parallel )
[4540]	811	INTEGER(iwp) :: l !<
	812	INTEGER(iwp) :: zs !<
[4429]	813	#endif
[1]	814
[4540]	815	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	816	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	817
[4429]	818	#if defined( __parallel )
[4540]	819	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	820	#if __acc_fft_device
[3634]	821	!$ACC DECLARE CREATE(work)
[3690]	822	#endif
[4429]	823	#endif
[1111]	824
[1320]	825
[1]	826	!
[4540]	827	!-- If the PE grid is one-dimensional along y, only local reordering of the data is necessary and no
	828	!-- transposition has to be done.
[1]	829	IF ( pdims(1) == 1 ) THEN
[1106]	830
[4717]	831	!$OMP PARALLEL PRIVATE ( i, j, k )
	832	!$OMP DO
[3690]	833	#if __acc_fft_device
[3634]	834	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	835	!$ACC PRESENT(f_out, f_inv)
[3690]	836	#endif
[1003]	837	DO j = 0, ny
	838	DO k = nzb_y, nzt_y
	839	DO i = nxl_y, nxr_y
[164]	840	f_out(i,j,k) = f_inv(i,k,j)
[1]	841	ENDDO
	842	ENDDO
	843	ENDDO
[4717]	844	!$OMP END PARALLEL
[1]	845
[1106]	846	ELSE
	847
	848	#if defined( __parallel )
[1]	849	!
[1106]	850	!-- Transpose array
[1318]	851	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	852
	853	#if __acc_fft_device
[3657]	854	#ifndef __cuda_aware_mpi
[3634]	855	!$ACC UPDATE HOST(f_inv)
[3657]	856	#else
	857	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	858	#endif
[3690]	859	#endif
	860
[1106]	861	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	862	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	863	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, comm1dx, ierr )
[3690]	864
	865	#if __acc_fft_device
[3657]	866	#ifndef __cuda_aware_mpi
[3634]	867	!$ACC UPDATE DEVICE(work)
[3657]	868	#else
	869	!$ACC END HOST_DATA
	870	#endif
[3690]	871	#endif
	872
[1106]	873	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	874
	875	!
[1106]	876	!-- Reorder transposed array
[4717]	877	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1106]	878	DO l = 0, pdims(1) - 1
	879	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	880	#if __acc_fft_device
[3634]	881	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	882	!$ACC PRESENT(f_out, work)
[3690]	883	#endif
[4236]	884	!$OMP DO
[1106]	885	DO j = nys_z, nyn_z
	886	DO k = zs, zs + nzt_y - nzb_y
	887	DO i = nxl_z, nxr_z
[1111]	888	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	889	ENDDO
[1]	890	ENDDO
	891	ENDDO
[4236]	892	!$OMP END DO NOWAIT
[1]	893	ENDDO
[4717]	894	!$OMP END PARALLEL
[1]	895	#endif
	896
[1106]	897	ENDIF
	898
[1]	899	END SUBROUTINE transpose_yz
	900
	901
[4540]	902	!--------------------------------------------------------------------------------------------------!
[1]	903	! Description:
	904	! ------------
[4540]	905	!> Resorting data for the transposition from z to x. The transposition itself is carried out in
	906	!> transpose_zx.
	907	!--------------------------------------------------------------------------------------------------!
[1682]	908	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	909
[1682]	910
[4540]	911	USE indices, &
	912	ONLY: nxl, &
	913	nxr, &
	914	nyn, &
	915	nys, &
	916	nz
[1216]	917
[4540]	918	USE kinds
[1320]	919
[4540]	920	IMPLICIT NONE
[1216]	921
[4540]	922	INTEGER(iwp) :: i !<
	923	INTEGER(iwp) :: j !<
	924	INTEGER(iwp) :: k !<
[1216]	925
[4540]	926	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	927	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	928
	929	!
[4540]	930	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
[4717]	931	!$OMP PARALLEL PRIVATE ( i, j, k )
	932	!$OMP DO
[3690]	933	#if __acc_fft_device
[4717]	934	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	935	!$ACC PRESENT(f_in, f_inv)
[3690]	936	#endif
[4540]	937	DO i = nxl, nxr
	938	DO j = nys, nyn
	939	DO k = 1,nz
	940	f_inv(j,i,k) = f_in(k,j,i)
	941	ENDDO
	942	ENDDO
	943	ENDDO
[4717]	944	!$OMP END PARALLEL
[1216]	945
	946	END SUBROUTINE resort_for_zx
	947
	948
[4540]	949	!--------------------------------------------------------------------------------------------------!
[1216]	950	! Description:
	951	! ------------
[4540]	952	!> Transposition of input array (f_in) from z to x. For the input array, all elements along z reside
	953	!> on the same PE, while after transposition, all elements along x reside on the same PE.
	954	!--------------------------------------------------------------------------------------------------!
[1682]	955	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	956
[1682]	957
[4429]	958	#if defined( __parallel )
[4540]	959	USE cpulog, &
	960	ONLY: cpu_log, &
	961	cpu_log_nowait, &
	962	log_point_s
[1]	963
[4540]	964	USE fft_xy, &
	965	ONLY: f_vec_x, &
	966	temperton_fft_vec
[4429]	967	#endif
[4366]	968
[4540]	969	USE indices, &
	970	ONLY: nx, &
	971	nxl, &
	972	nxr, &
	973	nyn, &
	974	nys, &
	975	nz
	976
[4429]	977	#if defined( __parallel )
[4540]	978	USE indices, &
[4429]	979	ONLY: nnx
	980	#endif
[1320]	981
	982	USE kinds
	983
[1324]	984	USE pegrid
[1320]	985
[4540]	986	USE transpose_indices, &
	987	ONLY: nyn_x, &
	988	nys_x, &
	989	nzb_x, &
	990	nzt_x
[1320]	991
[1]	992	IMPLICIT NONE
	993
[4171]	994	INTEGER(iwp) :: i !<
	995	INTEGER(iwp) :: j !<
	996	INTEGER(iwp) :: k !<
[4540]	997
[4429]	998	#if defined( __parallel )
[4540]	999	INTEGER(iwp) :: l !<
	1000	INTEGER(iwp) :: mm !<
	1001	INTEGER(iwp) :: xs !<
[4429]	1002	#endif
[1]	1003
[4540]	1004	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1005	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	1006
[4429]	1007	#if defined( __parallel )
[4540]	1008	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	1009	#if __acc_fft_device
[3634]	1010	!$ACC DECLARE CREATE(work)
[3690]	1011	#endif
[4429]	1012	#endif
[1]	1013
[1320]	1014
[1]	1015	!
[4540]	1016	!-- If the PE grid is one-dimensional along y, only local reordering of the data is necessary and no
	1017	!-- transposition has to be done.
[1]	1018	IF ( pdims(1) == 1 ) THEN
[1106]	1019
[4717]	1020	!$OMP PARALLEL PRIVATE ( i, j, k )
	1021	!$OMP DO
[3690]	1022	#if __acc_fft_device
[3634]	1023	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1024	!$ACC PRESENT(f_out, f_inv)
[3690]	1025	#endif
[1003]	1026	DO k = 1, nz
	1027	DO i = nxl, nxr
	1028	DO j = nys, nyn
[164]	1029	f_out(i,j,k) = f_inv(j,i,k)
[1]	1030	ENDDO
	1031	ENDDO
	1032	ENDDO
[4717]	1033	!$OMP END PARALLEL
[1]	1034
[1106]	1035	ELSE
	1036
	1037	#if defined( __parallel )
[1]	1038	!
[1106]	1039	!-- Transpose array
[1318]	1040	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1041
	1042	#if __acc_fft_device
[3657]	1043	#ifndef __cuda_aware_mpi
[3634]	1044	!$ACC UPDATE HOST(f_inv)
[3657]	1045	#else
	1046	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1047	#endif
[3690]	1048	#endif
	1049
[1106]	1050	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	1051	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	1052	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, comm1dx, ierr )
[3690]	1053
	1054	#if __acc_fft_device
[3657]	1055	#ifndef __cuda_aware_mpi
[3634]	1056	!$ACC UPDATE DEVICE(work)
[3657]	1057	#else
	1058	!$ACC END HOST_DATA
	1059	#endif
[3690]	1060	#endif
	1061
[1106]	1062	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	1063
	1064	!
[4366]	1065	!-- Reorder transposed array.
[4370]	1066	!-- Data for the vectorized Temperton-fft is stored in different array format (f_vec_x) which
	1067	!-- saves additional data copy in fft_x.
[4366]	1068	IF ( temperton_fft_vec ) THEN
	1069
	1070	DO l = 0, pdims(1) - 1
	1071	xs = 0 + l * nnx
	1072	DO k = nzb_x, nzt_x
	1073	DO i = xs, xs + nnx - 1
	1074	DO j = nys_x, nyn_x
	1075	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
[4370]	1076	f_vec_x(mm,i) = work(j,i-xs+1,k,l)
[4366]	1077	ENDDO
	1078	ENDDO
	1079	ENDDO
	1080	ENDDO
	1081
	1082	ELSE
	1083
[4717]	1084	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[4366]	1085	DO l = 0, pdims(1) - 1
	1086	xs = 0 + l * nnx
[3690]	1087	#if __acc_fft_device
[4366]	1088	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1089	!$ACC PRESENT(f_out, work)
[3690]	1090	#endif
[4366]	1091	!$OMP DO
	1092	DO k = nzb_x, nzt_x
	1093	DO i = xs, xs + nnx - 1
	1094	DO j = nys_x, nyn_x
	1095	f_out(i,j,k) = work(j,i-xs+1,k,l)
	1096	ENDDO
[1106]	1097	ENDDO
[1]	1098	ENDDO
[4366]	1099	!$OMP END DO NOWAIT
[1]	1100	ENDDO
[4717]	1101	!$OMP END PARALLEL
[1]	1102
[4366]	1103	ENDIF
	1104
[4415]	1105	#endif
	1106
[1106]	1107	ENDIF
	1108
[1]	1109	END SUBROUTINE transpose_zx
	1110
	1111
[4540]	1112	!--------------------------------------------------------------------------------------------------!
[1]	1113	! Description:
	1114	! ------------
[4540]	1115	!> Resorting data after the transposition from z to y. The transposition itself is carried out in
	1116	!> transpose_zy.
	1117	!--------------------------------------------------------------------------------------------------!
[1682]	1118	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	1119
[1682]	1120
[4540]	1121	USE indices, &
	1122	ONLY: ny
[1216]	1123
[4540]	1124	USE kinds
[1320]	1125
[4540]	1126	USE transpose_indices, &
	1127	ONLY: nxl_y, &
	1128	nxr_y, &
	1129	nzb_y, &
	1130	nzt_y
[1320]	1131
[4540]	1132	IMPLICIT NONE
[1216]	1133
[4540]	1134	INTEGER(iwp) :: i !<
	1135	INTEGER(iwp) :: j !<
	1136	INTEGER(iwp) :: k !<
[1216]	1137
[4540]	1138	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	1139	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	1140
	1141	!
[4540]	1142	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
[4717]	1143	!$OMP PARALLEL PRIVATE ( i, j, k )
	1144	!$OMP DO
[3690]	1145	#if __acc_fft_device
[3634]	1146	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1147	!$ACC PRESENT(f_out, f_inv)
[3690]	1148	#endif
[4540]	1149	DO k = nzb_y, nzt_y
	1150	DO i = nxl_y, nxr_y
	1151	DO j = 0, ny
	1152	f_out(j,i,k) = f_inv(i,k,j)
	1153	ENDDO
	1154	ENDDO
	1155	ENDDO
[4717]	1156	!$OMP END PARALLEL
[1216]	1157
	1158	END SUBROUTINE resort_for_zy
	1159
	1160
[4540]	1161	!--------------------------------------------------------------------------------------------------!
[3241]	1162	! Description:cpu_log_nowait
[1216]	1163	! ------------
[4540]	1164	!> Transposition of input array (f_in) from z to y. For the input array, all elements along z reside
	1165	!> on the same PE, while after transposition, all elements along y reside on the same PE.
	1166	!--------------------------------------------------------------------------------------------------!
[1682]	1167	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	1168
[1682]	1169
[4429]	1170	#if defined( __parallel )
[4540]	1171	USE cpulog, &
	1172	ONLY: cpu_log, &
	1173	cpu_log_nowait, &
	1174	log_point_s
[4429]	1175	#endif
[1]	1176
[4540]	1177	USE indices, &
	1178	ONLY: ny, &
	1179	nz
[1320]	1180
	1181	USE kinds
	1182
[1324]	1183	USE pegrid
[1320]	1184
[4540]	1185	USE transpose_indices, &
	1186	ONLY: nxl_y, &
	1187	nxl_z, &
	1188	nxr_y, &
	1189	nxr_z, &
	1190	nyn_z, &
	1191	nys_z, &
	1192	nzb_y, &
	1193	nzt_y
[1320]	1194
[1]	1195	IMPLICIT NONE
	1196
[4171]	1197	INTEGER(iwp) :: i !<
	1198	INTEGER(iwp) :: j !<
	1199	INTEGER(iwp) :: k !<
[4540]	1200
[4429]	1201	#if defined( __parallel )
[4540]	1202	INTEGER(iwp) :: l !<
	1203	INTEGER(iwp) :: zs !<
[4429]	1204	#endif
[1]	1205
[4540]	1206	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	1207	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	1208
[4429]	1209	#if defined( __parallel )
[4540]	1210	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	1211	#if __acc_fft_device
[3634]	1212	!$ACC DECLARE CREATE(work)
[3690]	1213	#endif
[4429]	1214	#endif
[1111]	1215
[1]	1216	!
[4540]	1217	!-- If the PE grid is one-dimensional along y, the array has only to be reordered locally and
	1218	!-- therefore no transposition has to be done.
[1]	1219	IF ( pdims(1) /= 1 ) THEN
[1106]	1220
	1221	#if defined( __parallel )
[1]	1222	!
	1223	!-- Reorder input array for transposition
[4717]	1224	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1]	1225	DO l = 0, pdims(1) - 1
[1003]	1226	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	1227	#if __acc_fft_device
[3634]	1228	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1229	!$ACC PRESENT(work, f_in)
[3690]	1230	#endif
[4236]	1231	!$OMP DO
[1003]	1232	DO j = nys_z, nyn_z
	1233	DO k = zs, zs + nzt_y - nzb_y
	1234	DO i = nxl_z, nxr_z
[1111]	1235	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	1236	ENDDO
	1237	ENDDO
	1238	ENDDO
[4236]	1239	!$OMP END DO NOWAIT
[1]	1240	ENDDO
[4717]	1241	!$OMP END PARALLEL
[1]	1242
	1243	!
	1244	!-- Transpose array
[1318]	1245	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1246
	1247	#if __acc_fft_device
[3657]	1248	#ifndef __cuda_aware_mpi
[3634]	1249	!$ACC UPDATE HOST(work)
[3657]	1250	#else
	1251	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1252	#endif
[3690]	1253	#endif
	1254
[622]	1255	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	1256	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	1257	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, comm1dx, ierr )
[3690]	1258
	1259	#if __acc_fft_device
[3657]	1260	#ifndef __cuda_aware_mpi
[3634]	1261	!$ACC UPDATE DEVICE(f_inv)
[3657]	1262	#else
	1263	!$ACC END HOST_DATA
	1264	#endif
[3690]	1265	#endif
	1266
[1]	1267	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	1268	#endif
[1]	1269
	1270	ELSE
	1271	!
[1106]	1272	!-- Reorder the array in the same way like ALLTOALL did it
[4717]	1273	!$OMP PARALLEL PRIVATE ( i, j, k )
	1274	!$OMP DO
[3690]	1275	#if __acc_fft_device
[3634]	1276	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1277	!$ACC PRESENT(f_inv, f_in)
[3690]	1278	#endif
[1003]	1279	DO k = nzb_y, nzt_y
	1280	DO j = 0, ny
	1281	DO i = nxl_y, nxr_y
[164]	1282	f_inv(i,k,j) = f_in(i,j,k)
	1283	ENDDO
	1284	ENDDO
	1285	ENDDO
[4717]	1286	!$OMP END PARALLEL
[1106]	1287
	1288	ENDIF
	1289
[1]	1290	END SUBROUTINE transpose_zy
	1291
	1292
[4540]	1293	!--------------------------------------------------------------------------------------------------!
[1]	1294	! Description:
	1295	! ------------
[4540]	1296	!> Transposition of input array (f_in) from z to y. For the input array, all elements along z reside
	1297	!> on the same PE, while after transposition, all elements along y reside on the same PE. This is a
	1298	!> direct transposition for arrays with indices in regular order (k,j,i) (cf. transpose_zy).
	1299	!--------------------------------------------------------------------------------------------------!
[4429]	1300	#if defined( __parallel )
[1682]	1301	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1302
[1682]	1303
[4540]	1304	USE cpulog, &
	1305	ONLY: cpu_log, &
	1306	log_point_s
[1]	1307
[4540]	1308	USE indices, &
	1309	ONLY: nnx, &
	1310	nny, &
	1311	nnz, &
	1312	nxl, &
	1313	nxr, &
	1314	nyn, &
	1315	nys, &
	1316	ny, &
	1317	nz
[1320]	1318
	1319	USE kinds
	1320
[1324]	1321	USE pegrid
[1320]	1322
[4540]	1323	USE transpose_indices, &
	1324	ONLY: nxl_yd, &
	1325	nxr_yd, &
	1326	nzb_yd, &
	1327	nzt_yd
[1320]	1328
[1]	1329	IMPLICIT NONE
	1330
[4540]	1331	INTEGER(iwp) :: i !<
	1332	INTEGER(iwp) :: j !<
	1333	INTEGER(iwp) :: k !<
	1334	INTEGER(iwp) :: l !<
	1335	INTEGER(iwp) :: m !<
	1336	INTEGER(iwp) :: ys !<
[1]	1337
[4540]	1338	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1339	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1340	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1341	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1342
[1]	1343	!
[4540]	1344	!-- Rearrange indices of input array in order to make data to be send by MPI contiguous.
[1003]	1345	DO i = nxl, nxr
	1346	DO j = nys, nyn
	1347	DO k = 1, nz
[164]	1348	f_inv(j,i,k) = f_in(k,j,i)
[1]	1349	ENDDO
	1350	ENDDO
	1351	ENDDO
	1352
	1353	!
[4540]	1354	!-- Move data to different array, because memory location of work1 is needed further below
	1355	!-- (work1 = work2). If the PE grid is one-dimensional along x, only local reordering of the data is
	1356	!-- necessary and no transposition has to be done.
[1]	1357	IF ( pdims(2) == 1 ) THEN
[1003]	1358	DO k = 1, nz
	1359	DO i = nxl, nxr
	1360	DO j = nys, nyn
[164]	1361	f_out(j,i,k) = f_inv(j,i,k)
[1]	1362	ENDDO
	1363	ENDDO
	1364	ENDDO
	1365	RETURN
	1366	ENDIF
	1367
	1368	!
	1369	!-- Transpose array
	1370	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1371	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[4540]	1372	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
	1373	work(1), sendrecvcount_zyd, MPI_REAL, comm1dy, ierr )
[1]	1374	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1375
	1376	!
	1377	!-- Reorder transposed array
	1378	m = 0
	1379	DO l = 0, pdims(2) - 1
	1380	ys = 0 + l * nny
[1003]	1381	DO k = nzb_yd, nzt_yd
	1382	DO i = nxl_yd, nxr_yd
[1]	1383	DO j = ys, ys + nny - 1
	1384	m = m + 1
[164]	1385	f_out(j,i,k) = work(m)
[1]	1386	ENDDO
	1387	ENDDO
	1388	ENDDO
	1389	ENDDO
	1390
[4429]	1391	END SUBROUTINE transpose_zyd
[1]	1392	#endif

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: palm/trunk/SOURCE/transpose.f90

Download in other formats: