Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4406

Last change on this file since 4406 was 4370, checked in by raasch, 5 years ago
bugfixes for previous commit: unused variables removed, Temperton-fft usage on GPU, openacc porting of vector version of Obukhov length calculation, collective read switched off on NEC to avoid hanging; some vector directives added in prognostic equations to force vectorization on Intel19 compiler, configuration files for NEC Aurora added
Property svn:keywords set to `Id`
File size: 34.1 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[4360]	17	! Copyright 1997-2020 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 4370 2020-01-10 14:00:44Z knoop $
[4370]	27	! vector array renamed
	28	!
	29	! 4366 2020-01-09 08:12:43Z raasch
[4366]	30	! modifications for NEC vectorization
	31	!
	32	! 4360 2020-01-07 11:25:50Z suehring
[4236]	33	! Added missing OpenMP directives
	34	!
	35	! 4182 2019-08-22 15:20:23Z scharf
[4182]	36	! Corrected "Former revisions" section
[4181]	37	!
[4182]	38	! 4171 2019-08-19 17:44:09Z gronemeier
[3832]	39	! loop reordering for performance optimization
[4171]	40	!
	41	! 3832 2019-03-28 13:16:58Z raasch
	42	! loop reordering for performance optimization
	43	!
[3832]	44	! 3694 2019-01-23 17:01:49Z knoop
[3634]	45	! OpenACC port for SPEC
[4171]	46	!
[4182]	47	! Revision 1.1 1997/07/24 11:25:18 raasch
	48	! Initial revision
	49	!
	50	!
[1216]	51	! Description:
	52	! ------------
[1682]	53	!> Resorting data for the transposition from x to y. The transposition itself
	54	!> is carried out in transpose_xy
[1216]	55	!------------------------------------------------------------------------------!
[4181]	56
	57	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	58
[1682]	59	SUBROUTINE resort_for_xy( f_in, f_inv )
[1216]	60
[4171]	61
[1320]	62	USE indices, &
	63	ONLY: nx
[1216]	64
[1320]	65	USE kinds
	66
	67	USE transpose_indices, &
[3241]	68	ONLY: nyn_x, nys_x, nzb_x, nzt_x
[1320]	69
[1216]	70	IMPLICIT NONE
	71
[4171]	72	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	73	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	74
	75
[4171]	76	INTEGER(iwp) :: i !<
	77	INTEGER(iwp) :: j !<
	78	INTEGER(iwp) :: k !<
[1]	79	!
[1216]	80	!-- Rearrange indices of input array in order to make data to be send
	81	!-- by MPI contiguous
	82	!$OMP PARALLEL PRIVATE ( i, j, k )
	83	!$OMP DO
[3690]	84	#if __acc_fft_device
[3634]	85	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	86	!$ACC PRESENT(f_inv, f_in)
[3690]	87	#endif
[3832]	88	DO k = nzb_x, nzt_x
[4171]	89	DO j = nys_x, nyn_x
	90	DO i = 0, nx
[1216]	91	f_inv(j,k,i) = f_in(i,j,k)
	92	ENDDO
	93	ENDDO
	94	ENDDO
	95	!$OMP END PARALLEL
	96
	97	END SUBROUTINE resort_for_xy
	98
	99
	100	!------------------------------------------------------------------------------!
[1]	101	! Description:
	102	! ------------
[1682]	103	!> Transposition of input array (f_in) from x to y. For the input array, all
	104	!> elements along x reside on the same PE, while after transposition, all
	105	!> elements along y reside on the same PE.
[1]	106	!------------------------------------------------------------------------------!
[1682]	107	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	108
[1682]	109
[1320]	110	USE cpulog, &
	111	ONLY: cpu_log, cpu_log_nowait, log_point_s
	112
	113	USE indices, &
	114	ONLY: nx, ny
[4171]	115
[1320]	116	USE kinds
	117
[1]	118	USE pegrid
	119
[1320]	120	USE transpose_indices, &
	121	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	122
[1]	123	IMPLICIT NONE
	124
[4171]	125	INTEGER(iwp) :: i !<
	126	INTEGER(iwp) :: j !<
	127	INTEGER(iwp) :: k !<
	128	INTEGER(iwp) :: l !<
	129	INTEGER(iwp) :: ys !<
[1]	130
[4171]	131	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	132	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	133
	134	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	135	#if __acc_fft_device
[3634]	136	!$ACC DECLARE CREATE(work)
[3690]	137	#endif
[1111]	138
	139
[1106]	140	IF ( numprocs /= 1 ) THEN
	141
	142	#if defined( __parallel )
[1]	143	!
[1106]	144	!-- Transpose array
[1318]	145	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	146
	147	#if __acc_fft_device
[3657]	148	#ifndef __cuda_aware_mpi
[3634]	149	!$ACC UPDATE HOST(f_inv)
[3657]	150	#else
	151	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	152	#endif
[3690]	153	#endif
	154
[1106]	155	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	156	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	157	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	158	comm1dy, ierr )
[3690]	159
	160	#if __acc_fft_device
[3657]	161	#ifndef __cuda_aware_mpi
[3634]	162	!$ACC UPDATE DEVICE(work)
[3657]	163	#else
	164	!$ACC END HOST_DATA
	165	#endif
[3690]	166	#endif
	167
[1106]	168	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	169
	170	!
[1106]	171	!-- Reorder transposed array
[1111]	172	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	173	DO l = 0, pdims(2) - 1
	174	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	175	#if __acc_fft_device
[3634]	176	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	177	!$ACC PRESENT(f_out, work)
[3690]	178	#endif
[4236]	179	!$OMP DO
[1106]	180	DO i = nxl_y, nxr_y
	181	DO k = nzb_y, nzt_y
	182	DO j = ys, ys + nyn_x - nys_x
[1111]	183	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	184	ENDDO
[1]	185	ENDDO
	186	ENDDO
[4236]	187	!$OMP END DO NOWAIT
[1]	188	ENDDO
[683]	189	!$OMP END PARALLEL
[1]	190	#endif
	191
[1106]	192	ELSE
	193
	194	!
	195	!-- Reorder transposed array
	196	!$OMP PARALLEL PRIVATE ( i, j, k )
	197	!$OMP DO
[3690]	198	#if __acc_fft_device
[3634]	199	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	200	!$ACC PRESENT(f_out, f_inv)
[3690]	201	#endif
[1106]	202	DO k = nzb_y, nzt_y
	203	DO i = nxl_y, nxr_y
	204	DO j = 0, ny
	205	f_out(j,i,k) = f_inv(j,k,i)
	206	ENDDO
	207	ENDDO
	208	ENDDO
	209	!$OMP END PARALLEL
	210
	211	ENDIF
	212
[1]	213	END SUBROUTINE transpose_xy
	214
	215
	216	!------------------------------------------------------------------------------!
	217	! Description:
	218	! ------------
[1682]	219	!> Resorting data after the transposition from x to z. The transposition itself
	220	!> is carried out in transpose_xz
[1216]	221	!------------------------------------------------------------------------------!
[1682]	222	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	223
[1682]	224
[1320]	225	USE indices, &
	226	ONLY: nxl, nxr, nyn, nys, nz
[1216]	227
[1320]	228	USE kinds
	229
[1216]	230	IMPLICIT NONE
	231
[4171]	232	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	233	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	234
[4171]	235	INTEGER(iwp) :: i !<
	236	INTEGER(iwp) :: j !<
	237	INTEGER(iwp) :: k !<
[1216]	238	!
	239	!-- Rearrange indices of input array in order to make data to be send
	240	!-- by MPI contiguous.
	241	!-- In case of parallel fft/transposition, scattered store is faster in
	242	!-- backward direction!!!
	243	!$OMP PARALLEL PRIVATE ( i, j, k )
	244	!$OMP DO
[3690]	245	#if __acc_fft_device
[3634]	246	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	247	!$ACC PRESENT(f_out, f_inv)
[3690]	248	#endif
[4171]	249	DO i = nxl, nxr
	250	DO j = nys, nyn
	251	DO k = 1, nz
[1216]	252	f_out(k,j,i) = f_inv(j,i,k)
	253	ENDDO
	254	ENDDO
	255	ENDDO
	256	!$OMP END PARALLEL
	257
	258	END SUBROUTINE resort_for_xz
	259
	260
	261	!------------------------------------------------------------------------------!
	262	! Description:
	263	! ------------
[1682]	264	!> Transposition of input array (f_in) from x to z. For the input array, all
	265	!> elements along x reside on the same PE, while after transposition, all
	266	!> elements along z reside on the same PE.
[1]	267	!------------------------------------------------------------------------------!
[1682]	268	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	269
[1682]	270
[1320]	271	USE cpulog, &
	272	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	273
[4366]	274	USE fft_xy, &
[4370]	275	ONLY: f_vec_x, temperton_fft_vec
[4366]	276
[1320]	277	USE indices, &
[3241]	278	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
[1320]	279
	280	USE kinds
	281
[1324]	282	USE pegrid
[1320]	283
	284	USE transpose_indices, &
	285	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	286
[1]	287	IMPLICIT NONE
	288
[4171]	289	INTEGER(iwp) :: i !<
	290	INTEGER(iwp) :: j !<
	291	INTEGER(iwp) :: k !<
	292	INTEGER(iwp) :: l !<
[4366]	293	INTEGER(iwp) :: mm !<
[4171]	294	INTEGER(iwp) :: xs !<
[1]	295
[4171]	296	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	297	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	298
[4171]	299	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	300	#if __acc_fft_device
[3634]	301	!$ACC DECLARE CREATE(work)
[3690]	302	#endif
[1111]	303
[4366]	304	!
	305	!-- If the PE grid is one-dimensional along y, the array has only to be
	306	!-- reordered locally and therefore no transposition has to be done.
[1]	307	IF ( pdims(1) /= 1 ) THEN
[1106]	308
	309	#if defined( __parallel )
[1]	310	!
[4366]	311	!-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in
[4370]	312	!-- different array format (f_vec_x).
[4366]	313	IF ( temperton_fft_vec ) THEN
	314
	315	DO l = 0, pdims(1) - 1
	316	xs = 0 + l * nnx
	317	DO k = nzb_x, nzt_x
	318	DO i = xs, xs + nnx - 1
	319	DO j = nys_x, nyn_x
	320	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
[4370]	321	work(j,i-xs+1,k,l) = f_vec_x(mm,i)
[4366]	322	ENDDO
	323	ENDDO
	324	ENDDO
	325	ENDDO
	326
	327	ELSE
	328
	329	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
	330	DO l = 0, pdims(1) - 1
	331	xs = 0 + l * nnx
[3690]	332	#if __acc_fft_device
[4366]	333	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	334	!$ACC PRESENT(work, f_in)
[3690]	335	#endif
[4366]	336	!$OMP DO
	337	DO k = nzb_x, nzt_x
	338	DO i = xs, xs + nnx - 1
	339	DO j = nys_x, nyn_x
	340	work(j,i-xs+1,k,l) = f_in(i,j,k)
	341	ENDDO
[1]	342	ENDDO
	343	ENDDO
[4366]	344	!$OMP END DO NOWAIT
[1]	345	ENDDO
[4366]	346	!$OMP END PARALLEL
[1]	347
[4366]	348	ENDIF
	349
[1]	350	!
	351	!-- Transpose array
[1318]	352	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	353
	354	#if __acc_fft_device
[3657]	355	#ifndef __cuda_aware_mpi
[3634]	356	!$ACC UPDATE HOST(work)
[3657]	357	#else
	358	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	359	#endif
[3690]	360	#endif
	361
[622]	362	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	363	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	364	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	365	comm1dx, ierr )
[3690]	366
	367	#if __acc_fft_device
[3657]	368	#ifndef __cuda_aware_mpi
[3634]	369	!$ACC UPDATE DEVICE(f_inv)
[3657]	370	#else
	371	!$ACC END HOST_DATA
	372	#endif
[3694]	373	#endif
	374
[1]	375	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	376	#endif
	377
[1]	378	ELSE
[1106]	379
[1]	380	!
	381	!-- Reorder the array in a way that the z index is in first position
[683]	382	!$OMP PARALLEL PRIVATE ( i, j, k )
	383	!$OMP DO
[3690]	384	#if __acc_fft_device
[3634]	385	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	386	!$ACC PRESENT(f_inv, f_in)
[3690]	387	#endif
[1003]	388	DO i = nxl, nxr
	389	DO j = nys, nyn
	390	DO k = 1, nz
[164]	391	f_inv(j,i,k) = f_in(i,j,k)
[1]	392	ENDDO
	393	ENDDO
	394	ENDDO
[683]	395	!$OMP END PARALLEL
[1]	396
[164]	397	ENDIF
	398
[1]	399	END SUBROUTINE transpose_xz
	400
	401
	402	!------------------------------------------------------------------------------!
	403	! Description:
	404	! ------------
[1682]	405	!> Resorting data after the transposition from y to x. The transposition itself
	406	!> is carried out in transpose_yx
[1216]	407	!------------------------------------------------------------------------------!
[1682]	408	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	409
[1682]	410
[1320]	411	USE indices, &
	412	ONLY: nx
[1216]	413
[1320]	414	USE kinds
	415
	416	USE transpose_indices, &
	417	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	418
[1216]	419	IMPLICIT NONE
	420
[4171]	421	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	422	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	423
	424
[4171]	425	INTEGER(iwp) :: i !<
	426	INTEGER(iwp) :: j !<
	427	INTEGER(iwp) :: k !<
[1216]	428	!
	429	!-- Rearrange indices of input array in order to make data to be send
	430	!-- by MPI contiguous
	431	!$OMP PARALLEL PRIVATE ( i, j, k )
	432	!$OMP DO
[3690]	433	#if __acc_fft_device
[3634]	434	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	435	!$ACC PRESENT(f_out, f_inv)
[3690]	436	#endif
[4171]	437	DO k = nzb_x, nzt_x
	438	DO j = nys_x, nyn_x
	439	DO i = 0, nx
[1216]	440	f_out(i,j,k) = f_inv(j,k,i)
	441	ENDDO
	442	ENDDO
	443	ENDDO
	444	!$OMP END PARALLEL
	445
	446	END SUBROUTINE resort_for_yx
	447
	448
	449	!------------------------------------------------------------------------------!
	450	! Description:
	451	! ------------
[1682]	452	!> Transposition of input array (f_in) from y to x. For the input array, all
	453	!> elements along y reside on the same PE, while after transposition, all
	454	!> elements along x reside on the same PE.
[1]	455	!------------------------------------------------------------------------------!
[1682]	456	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	457
[1682]	458
[1320]	459	USE cpulog, &
	460	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	461
[1320]	462	USE indices, &
	463	ONLY: nx, ny
	464
	465	USE kinds
	466
[1324]	467	USE pegrid
[1320]	468
	469	USE transpose_indices, &
	470	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	471
[1]	472	IMPLICIT NONE
	473
[4171]	474	INTEGER(iwp) :: i !<
	475	INTEGER(iwp) :: j !<
	476	INTEGER(iwp) :: k !<
	477	INTEGER(iwp) :: l !<
	478	INTEGER(iwp) :: ys !<
[1]	479
[4171]	480	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	481	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	482
[4171]	483	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	484	#if __acc_fft_device
[3634]	485	!$ACC DECLARE CREATE(work)
[3690]	486	#endif
[1111]	487
[1320]	488
[1106]	489	IF ( numprocs /= 1 ) THEN
	490
[1]	491	#if defined( __parallel )
	492	!
[1106]	493	!-- Reorder input array for transposition
[1111]	494	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	495	DO l = 0, pdims(2) - 1
	496	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	497	#if __acc_fft_device
[3634]	498	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	499	!$ACC PRESENT(work, f_in)
[3690]	500	#endif
[4236]	501	!$OMP DO
[1106]	502	DO i = nxl_y, nxr_y
	503	DO k = nzb_y, nzt_y
	504	DO j = ys, ys + nyn_x - nys_x
[1111]	505	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	506	ENDDO
	507	ENDDO
	508	ENDDO
[4236]	509	!$OMP END DO NOWAIT
[1106]	510	ENDDO
	511	!$OMP END PARALLEL
	512
	513	!
	514	!-- Transpose array
[1318]	515	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	516
	517	#if __acc_fft_device
[3657]	518	#ifndef __cuda_aware_mpi
[3634]	519	!$ACC UPDATE HOST(work)
[3657]	520	#else
	521	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	522	#endif
[3690]	523	#endif
	524
[1106]	525	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	526	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	527	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	528	comm1dy, ierr )
[3690]	529
	530	#if __acc_fft_device
[3657]	531	#ifndef __cuda_aware_mpi
[3634]	532	!$ACC UPDATE DEVICE(f_inv)
[3657]	533	#else
	534	!$ACC END HOST_DATA
	535	#endif
[3690]	536	#endif
	537
[1106]	538	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	539	#endif
	540
	541	ELSE
	542
	543	!
	544	!-- Reorder array f_in the same way as ALLTOALL did it
	545	!$OMP PARALLEL PRIVATE ( i, j, k )
	546	!$OMP DO
[3690]	547	#if __acc_fft_device
[3634]	548	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	549	!$ACC PRESENT(f_inv, f_in)
[3690]	550	#endif
[1003]	551	DO i = nxl_y, nxr_y
	552	DO k = nzb_y, nzt_y
[1106]	553	DO j = 0, ny
	554	f_inv(j,k,i) = f_in(j,i,k)
[1]	555	ENDDO
	556	ENDDO
	557	ENDDO
[683]	558	!$OMP END PARALLEL
[1]	559
[1106]	560	ENDIF
[1]	561
	562	END SUBROUTINE transpose_yx
	563
	564
	565	!------------------------------------------------------------------------------!
	566	! Description:
	567	! ------------
[1682]	568	!> Transposition of input array (f_in) from y to x. For the input array, all
	569	!> elements along y reside on the same PE, while after transposition, all
	570	!> elements along x reside on the same PE.
	571	!> This is a direct transposition for arrays with indices in regular order
	572	!> (k,j,i) (cf. transpose_yx).
[1]	573	!------------------------------------------------------------------------------!
[1682]	574	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	575
[1682]	576
[1320]	577	USE cpulog, &
[3241]	578	ONLY: cpu_log, log_point_s
[1]	579
[1320]	580	USE indices, &
	581	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	582
	583	USE kinds
	584
[1324]	585	USE pegrid
[1320]	586
	587	USE transpose_indices, &
	588	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	589
[1]	590	IMPLICIT NONE
	591
[4171]	592	INTEGER(iwp) :: i !<
	593	INTEGER(iwp) :: j !<
	594	INTEGER(iwp) :: k !<
	595	INTEGER(iwp) :: l !<
	596	INTEGER(iwp) :: m !<
	597	INTEGER(iwp) :: xs !<
[1]	598
[4171]	599	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	600	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	601	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	602	REAL(wp) :: work(nnxnnynnz) !<
[1]	603	#if defined( __parallel )
	604
	605	!
	606	!-- Rearrange indices of input array in order to make data to be send
	607	!-- by MPI contiguous
[1003]	608	DO k = 1, nz
	609	DO j = nys, nyn
	610	DO i = nxl, nxr
[164]	611	f_inv(i,k,j) = f_in(k,j,i)
[1]	612	ENDDO
	613	ENDDO
	614	ENDDO
	615
	616	!
	617	!-- Transpose array
	618	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	619	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	620	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	621	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	622	comm1dx, ierr )
	623	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	624
	625	!
	626	!-- Reorder transposed array
	627	m = 0
	628	DO l = 0, pdims(1) - 1
	629	xs = 0 + l * nnx
[1003]	630	DO j = nys_x, nyn_x
	631	DO k = 1, nz
[1]	632	DO i = xs, xs + nnx - 1
	633	m = m + 1
[164]	634	f_out(i,j,k) = work(m)
[1]	635	ENDDO
	636	ENDDO
	637	ENDDO
	638	ENDDO
	639
	640	#endif
	641
	642	END SUBROUTINE transpose_yxd
	643
	644
	645	!------------------------------------------------------------------------------!
	646	! Description:
	647	! ------------
[1682]	648	!> Resorting data for the transposition from y to z. The transposition itself
	649	!> is carried out in transpose_yz
[1216]	650	!------------------------------------------------------------------------------!
[1682]	651	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	652
[1682]	653
[1320]	654	USE indices, &
	655	ONLY: ny
[1216]	656
[1320]	657	USE kinds
	658
	659	USE transpose_indices, &
	660	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	661
[1216]	662	IMPLICIT NONE
	663
[4171]	664	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	665	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	666
[4171]	667	INTEGER(iwp) :: i !<
	668	INTEGER(iwp) :: j !<
	669	INTEGER(iwp) :: k !<
[1216]	670
	671	!
	672	!-- Rearrange indices of input array in order to make data to be send
	673	!-- by MPI contiguous
	674	!$OMP PARALLEL PRIVATE ( i, j, k )
	675	!$OMP DO
[3690]	676	#if __acc_fft_device
[3634]	677	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	678	!$ACC PRESENT(f_inv, f_in)
[3690]	679	#endif
[4171]	680	DO k = nzb_y, nzt_y
	681	DO i = nxl_y, nxr_y
	682	DO j = 0, ny
[1216]	683	f_inv(i,k,j) = f_in(j,i,k)
	684	ENDDO
	685	ENDDO
	686	ENDDO
	687	!$OMP END PARALLEL
	688
	689	END SUBROUTINE resort_for_yz
	690
	691
	692	!------------------------------------------------------------------------------!
	693	! Description:
	694	! ------------
[1682]	695	!> Transposition of input array (f_in) from y to z. For the input array, all
	696	!> elements along y reside on the same PE, while after transposition, all
	697	!> elements along z reside on the same PE.
[1]	698	!------------------------------------------------------------------------------!
[1682]	699	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	700
[1682]	701
[1320]	702	USE cpulog, &
	703	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	704
[1320]	705	USE indices, &
	706	ONLY: ny, nz
	707
	708	USE kinds
	709
[1324]	710	USE pegrid
[1320]	711
	712	USE transpose_indices, &
	713	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	714
[1]	715	IMPLICIT NONE
	716
[4171]	717	INTEGER(iwp) :: i !<
	718	INTEGER(iwp) :: j !<
	719	INTEGER(iwp) :: k !<
	720	INTEGER(iwp) :: l !<
	721	INTEGER(iwp) :: zs !<
[1]	722
[4171]	723	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	724	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	725
[4171]	726	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	727	#if __acc_fft_device
[3634]	728	!$ACC DECLARE CREATE(work)
[3690]	729	#endif
[1111]	730
[1320]	731
[1]	732	!
	733	!-- If the PE grid is one-dimensional along y, only local reordering
	734	!-- of the data is necessary and no transposition has to be done.
	735	IF ( pdims(1) == 1 ) THEN
[1106]	736
[683]	737	!$OMP PARALLEL PRIVATE ( i, j, k )
	738	!$OMP DO
[3690]	739	#if __acc_fft_device
[3634]	740	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	741	!$ACC PRESENT(f_out, f_inv)
[3690]	742	#endif
[1003]	743	DO j = 0, ny
	744	DO k = nzb_y, nzt_y
	745	DO i = nxl_y, nxr_y
[164]	746	f_out(i,j,k) = f_inv(i,k,j)
[1]	747	ENDDO
	748	ENDDO
	749	ENDDO
[683]	750	!$OMP END PARALLEL
[1]	751
[1106]	752	ELSE
	753
	754	#if defined( __parallel )
[1]	755	!
[1106]	756	!-- Transpose array
[1318]	757	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	758
	759	#if __acc_fft_device
[3657]	760	#ifndef __cuda_aware_mpi
[3634]	761	!$ACC UPDATE HOST(f_inv)
[3657]	762	#else
	763	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	764	#endif
[3690]	765	#endif
	766
[1106]	767	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	768	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	769	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	770	comm1dx, ierr )
[3690]	771
	772	#if __acc_fft_device
[3657]	773	#ifndef __cuda_aware_mpi
[3634]	774	!$ACC UPDATE DEVICE(work)
[3657]	775	#else
	776	!$ACC END HOST_DATA
	777	#endif
[3690]	778	#endif
	779
[1106]	780	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	781
	782	!
[1106]	783	!-- Reorder transposed array
[1111]	784	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1106]	785	DO l = 0, pdims(1) - 1
	786	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	787	#if __acc_fft_device
[3634]	788	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	789	!$ACC PRESENT(f_out, work)
[3690]	790	#endif
[4236]	791	!$OMP DO
[1106]	792	DO j = nys_z, nyn_z
	793	DO k = zs, zs + nzt_y - nzb_y
	794	DO i = nxl_z, nxr_z
[1111]	795	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	796	ENDDO
[1]	797	ENDDO
	798	ENDDO
[4236]	799	!$OMP END DO NOWAIT
[1]	800	ENDDO
[683]	801	!$OMP END PARALLEL
[1]	802	#endif
	803
[1106]	804	ENDIF
	805
[1]	806	END SUBROUTINE transpose_yz
	807
	808
	809	!------------------------------------------------------------------------------!
	810	! Description:
	811	! ------------
[1682]	812	!> Resorting data for the transposition from z to x. The transposition itself
	813	!> is carried out in transpose_zx
[1216]	814	!------------------------------------------------------------------------------!
[1682]	815	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	816
[1682]	817
[1320]	818	USE indices, &
	819	ONLY: nxl, nxr, nyn, nys, nz
[1216]	820
[1320]	821	USE kinds
	822
[1216]	823	IMPLICIT NONE
	824
[4171]	825	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	826	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	827
[4171]	828	INTEGER(iwp) :: i !<
	829	INTEGER(iwp) :: j !<
	830	INTEGER(iwp) :: k !<
[1216]	831
	832	!
	833	!-- Rearrange indices of input array in order to make data to be send
	834	!-- by MPI contiguous
	835	!$OMP PARALLEL PRIVATE ( i, j, k )
	836	!$OMP DO
[3690]	837	#if __acc_fft_device
[3634]	838	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	839	!$ACC PRESENT(f_in, f_inv)
[3690]	840	#endif
[3832]	841	DO i = nxl, nxr
[4171]	842	DO j = nys, nyn
	843	DO k = 1,nz
[1216]	844	f_inv(j,i,k) = f_in(k,j,i)
	845	ENDDO
	846	ENDDO
	847	ENDDO
	848	!$OMP END PARALLEL
	849
	850	END SUBROUTINE resort_for_zx
	851
	852
	853	!------------------------------------------------------------------------------!
	854	! Description:
	855	! ------------
[1682]	856	!> Transposition of input array (f_in) from z to x. For the input array, all
	857	!> elements along z reside on the same PE, while after transposition, all
	858	!> elements along x reside on the same PE.
[1]	859	!------------------------------------------------------------------------------!
[1682]	860	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	861
[1682]	862
[1320]	863	USE cpulog, &
	864	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	865
[4366]	866	USE fft_xy, &
[4370]	867	ONLY: f_vec_x, temperton_fft_vec
[4366]	868
[1320]	869	USE indices, &
	870	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	871
	872	USE kinds
	873
[1324]	874	USE pegrid
[1320]	875
	876	USE transpose_indices, &
	877	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	878
[1]	879	IMPLICIT NONE
	880
[4171]	881	INTEGER(iwp) :: i !<
	882	INTEGER(iwp) :: j !<
	883	INTEGER(iwp) :: k !<
	884	INTEGER(iwp) :: l !<
[4366]	885	INTEGER(iwp) :: mm !<
[4171]	886	INTEGER(iwp) :: xs !<
[1]	887
[4171]	888	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	889	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	890
[4171]	891	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	892	#if __acc_fft_device
[3634]	893	!$ACC DECLARE CREATE(work)
[3690]	894	#endif
[1]	895
[1320]	896
[1]	897	!
	898	!-- If the PE grid is one-dimensional along y, only local reordering
	899	!-- of the data is necessary and no transposition has to be done.
	900	IF ( pdims(1) == 1 ) THEN
[1106]	901
[683]	902	!$OMP PARALLEL PRIVATE ( i, j, k )
	903	!$OMP DO
[3690]	904	#if __acc_fft_device
[3634]	905	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	906	!$ACC PRESENT(f_out, f_inv)
[3690]	907	#endif
[1003]	908	DO k = 1, nz
	909	DO i = nxl, nxr
	910	DO j = nys, nyn
[164]	911	f_out(i,j,k) = f_inv(j,i,k)
[1]	912	ENDDO
	913	ENDDO
	914	ENDDO
[683]	915	!$OMP END PARALLEL
[1]	916
[1106]	917	ELSE
	918
	919	#if defined( __parallel )
[1]	920	!
[1106]	921	!-- Transpose array
[1318]	922	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	923
	924	#if __acc_fft_device
[3657]	925	#ifndef __cuda_aware_mpi
[3634]	926	!$ACC UPDATE HOST(f_inv)
[3657]	927	#else
	928	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	929	#endif
[3690]	930	#endif
	931
[1106]	932	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	933	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	934	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	935	comm1dx, ierr )
[3690]	936
	937	#if __acc_fft_device
[3657]	938	#ifndef __cuda_aware_mpi
[3634]	939	!$ACC UPDATE DEVICE(work)
[3657]	940	#else
	941	!$ACC END HOST_DATA
	942	#endif
[3690]	943	#endif
	944
[1106]	945	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	946
	947	!
[4366]	948	!-- Reorder transposed array.
[4370]	949	!-- Data for the vectorized Temperton-fft is stored in different array format (f_vec_x) which
	950	!-- saves additional data copy in fft_x.
[4366]	951	IF ( temperton_fft_vec ) THEN
	952
	953	DO l = 0, pdims(1) - 1
	954	xs = 0 + l * nnx
	955	DO k = nzb_x, nzt_x
	956	DO i = xs, xs + nnx - 1
	957	DO j = nys_x, nyn_x
	958	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
[4370]	959	f_vec_x(mm,i) = work(j,i-xs+1,k,l)
[4366]	960	ENDDO
	961	ENDDO
	962	ENDDO
	963	ENDDO
	964
	965	ELSE
	966
	967	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
	968	DO l = 0, pdims(1) - 1
	969	xs = 0 + l * nnx
[3690]	970	#if __acc_fft_device
[4366]	971	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	972	!$ACC PRESENT(f_out, work)
[3690]	973	#endif
[4366]	974	!$OMP DO
	975	DO k = nzb_x, nzt_x
	976	DO i = xs, xs + nnx - 1
	977	DO j = nys_x, nyn_x
	978	f_out(i,j,k) = work(j,i-xs+1,k,l)
	979	ENDDO
[1106]	980	ENDDO
[1]	981	ENDDO
[4366]	982	!$OMP END DO NOWAIT
[1]	983	ENDDO
[4366]	984	!$OMP END PARALLEL
[1]	985	#endif
	986
[4366]	987	ENDIF
	988
[1106]	989	ENDIF
	990
[1]	991	END SUBROUTINE transpose_zx
	992
	993
	994	!------------------------------------------------------------------------------!
	995	! Description:
	996	! ------------
[1682]	997	!> Resorting data after the transposition from z to y. The transposition itself
	998	!> is carried out in transpose_zy
[1216]	999	!------------------------------------------------------------------------------!
[1682]	1000	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	1001
[1682]	1002
[1320]	1003	USE indices, &
	1004	ONLY: ny
[1216]	1005
[1320]	1006	USE kinds
	1007
	1008	USE transpose_indices, &
	1009	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	1010
[1216]	1011	IMPLICIT NONE
	1012
[4171]	1013	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	1014	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	1015
	1016
[4171]	1017	INTEGER(iwp) :: i !<
	1018	INTEGER(iwp) :: j !<
	1019	INTEGER(iwp) :: k !<
[1216]	1020
	1021	!
	1022	!-- Rearrange indices of input array in order to make data to be send
	1023	!-- by MPI contiguous
	1024	!$OMP PARALLEL PRIVATE ( i, j, k )
	1025	!$OMP DO
[3690]	1026	#if __acc_fft_device
[3634]	1027	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1028	!$ACC PRESENT(f_out, f_inv)
[3690]	1029	#endif
[4171]	1030	DO k = nzb_y, nzt_y
	1031	DO i = nxl_y, nxr_y
	1032	DO j = 0, ny
[1216]	1033	f_out(j,i,k) = f_inv(i,k,j)
	1034	ENDDO
	1035	ENDDO
	1036	ENDDO
	1037	!$OMP END PARALLEL
	1038
	1039	END SUBROUTINE resort_for_zy
	1040
	1041
	1042	!------------------------------------------------------------------------------!
[3241]	1043	! Description:cpu_log_nowait
[1216]	1044	! ------------
[1682]	1045	!> Transposition of input array (f_in) from z to y. For the input array, all
	1046	!> elements along z reside on the same PE, while after transposition, all
	1047	!> elements along y reside on the same PE.
[1]	1048	!------------------------------------------------------------------------------!
[1682]	1049	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	1050
[1682]	1051
[1320]	1052	USE cpulog, &
	1053	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	1054
[1320]	1055	USE indices, &
	1056	ONLY: ny, nz
	1057
	1058	USE kinds
	1059
[1324]	1060	USE pegrid
[1320]	1061
	1062	USE transpose_indices, &
	1063	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	1064
[1]	1065	IMPLICIT NONE
	1066
[4171]	1067	INTEGER(iwp) :: i !<
	1068	INTEGER(iwp) :: j !<
	1069	INTEGER(iwp) :: k !<
	1070	INTEGER(iwp) :: l !<
	1071	INTEGER(iwp) :: zs !<
[1]	1072
[4171]	1073	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	1074	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	1075
[1682]	1076	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	1077	#if __acc_fft_device
[3634]	1078	!$ACC DECLARE CREATE(work)
[3690]	1079	#endif
[1111]	1080
[1]	1081	!
	1082	!-- If the PE grid is one-dimensional along y, the array has only to be
	1083	!-- reordered locally and therefore no transposition has to be done.
	1084	IF ( pdims(1) /= 1 ) THEN
[1106]	1085
	1086	#if defined( __parallel )
[1]	1087	!
	1088	!-- Reorder input array for transposition
[1111]	1089	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1]	1090	DO l = 0, pdims(1) - 1
[1003]	1091	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	1092	#if __acc_fft_device
[3634]	1093	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1094	!$ACC PRESENT(work, f_in)
[3690]	1095	#endif
[4236]	1096	!$OMP DO
[1003]	1097	DO j = nys_z, nyn_z
	1098	DO k = zs, zs + nzt_y - nzb_y
	1099	DO i = nxl_z, nxr_z
[1111]	1100	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	1101	ENDDO
	1102	ENDDO
	1103	ENDDO
[4236]	1104	!$OMP END DO NOWAIT
[1]	1105	ENDDO
[683]	1106	!$OMP END PARALLEL
[1]	1107
	1108	!
	1109	!-- Transpose array
[1318]	1110	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1111
	1112	#if __acc_fft_device
[3657]	1113	#ifndef __cuda_aware_mpi
[3634]	1114	!$ACC UPDATE HOST(work)
[3657]	1115	#else
	1116	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1117	#endif
[3690]	1118	#endif
	1119
[622]	1120	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	1121	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	1122	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	1123	comm1dx, ierr )
[3690]	1124
	1125	#if __acc_fft_device
[3657]	1126	#ifndef __cuda_aware_mpi
[3634]	1127	!$ACC UPDATE DEVICE(f_inv)
[3657]	1128	#else
	1129	!$ACC END HOST_DATA
	1130	#endif
[3690]	1131	#endif
	1132
[1]	1133	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	1134	#endif
[1]	1135
	1136	ELSE
	1137	!
[1106]	1138	!-- Reorder the array in the same way like ALLTOALL did it
[683]	1139	!$OMP PARALLEL PRIVATE ( i, j, k )
	1140	!$OMP DO
[3690]	1141	#if __acc_fft_device
[3634]	1142	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1143	!$ACC PRESENT(f_inv, f_in)
[3690]	1144	#endif
[1003]	1145	DO k = nzb_y, nzt_y
	1146	DO j = 0, ny
	1147	DO i = nxl_y, nxr_y
[164]	1148	f_inv(i,k,j) = f_in(i,j,k)
	1149	ENDDO
	1150	ENDDO
	1151	ENDDO
[683]	1152	!$OMP END PARALLEL
[1106]	1153
	1154	ENDIF
	1155
[1]	1156	END SUBROUTINE transpose_zy
	1157
	1158
	1159	!------------------------------------------------------------------------------!
	1160	! Description:
	1161	! ------------
[1682]	1162	!> Transposition of input array (f_in) from z to y. For the input array, all
	1163	!> elements along z reside on the same PE, while after transposition, all
	1164	!> elements along y reside on the same PE.
	1165	!> This is a direct transposition for arrays with indices in regular order
	1166	!> (k,j,i) (cf. transpose_zy).
[1]	1167	!------------------------------------------------------------------------------!
[1682]	1168	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1169
[1682]	1170
[1320]	1171	USE cpulog, &
[3241]	1172	ONLY: cpu_log, log_point_s
[1]	1173
[1320]	1174	USE indices, &
	1175	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1176
	1177	USE kinds
	1178
[1324]	1179	USE pegrid
[1320]	1180
	1181	USE transpose_indices, &
[3241]	1182	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
[1320]	1183
[1]	1184	IMPLICIT NONE
	1185
[4171]	1186	INTEGER(iwp) :: i !<
	1187	INTEGER(iwp) :: j !<
	1188	INTEGER(iwp) :: k !<
	1189	INTEGER(iwp) :: l !<
	1190	INTEGER(iwp) :: m !<
	1191	INTEGER(iwp) :: ys !<
[1]	1192
[4171]	1193	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1194	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1195	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1196	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1197
[1]	1198	#if defined( __parallel )
	1199
	1200	!
	1201	!-- Rearrange indices of input array in order to make data to be send
	1202	!-- by MPI contiguous
[1003]	1203	DO i = nxl, nxr
	1204	DO j = nys, nyn
	1205	DO k = 1, nz
[164]	1206	f_inv(j,i,k) = f_in(k,j,i)
[1]	1207	ENDDO
	1208	ENDDO
	1209	ENDDO
	1210
	1211	!
	1212	!-- Move data to different array, because memory location of work1 is
	1213	!-- needed further below (work1 = work2).
	1214	!-- If the PE grid is one-dimensional along x, only local reordering
	1215	!-- of the data is necessary and no transposition has to be done.
	1216	IF ( pdims(2) == 1 ) THEN
[1003]	1217	DO k = 1, nz
	1218	DO i = nxl, nxr
	1219	DO j = nys, nyn
[164]	1220	f_out(j,i,k) = f_inv(j,i,k)
[1]	1221	ENDDO
	1222	ENDDO
	1223	ENDDO
	1224	RETURN
	1225	ENDIF
	1226
	1227	!
	1228	!-- Transpose array
	1229	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1230	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1231	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1232	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1233	comm1dy, ierr )
	1234	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1235
	1236	!
	1237	!-- Reorder transposed array
	1238	m = 0
	1239	DO l = 0, pdims(2) - 1
	1240	ys = 0 + l * nny
[1003]	1241	DO k = nzb_yd, nzt_yd
	1242	DO i = nxl_yd, nxr_yd
[1]	1243	DO j = ys, ys + nny - 1
	1244	m = m + 1
[164]	1245	f_out(j,i,k) = work(m)
[1]	1246	ENDDO
	1247	ENDDO
	1248	ENDDO
	1249	ENDDO
	1250
	1251	#endif
	1252
	1253	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |