Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4418

Last change on this file since 4418 was 4415, checked in by raasch, 4 years ago
bugfix for misplaced/missing preprocessor directives
Property svn:keywords set to `Id`
File size: 34.2 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[4360]	17	! Copyright 1997-2020 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 4415 2020-02-20 10:30:33Z raasch $
[4415]	27	! bugfix for misplaced preprocessor directive
	28	!
	29	! 4370 2020-01-10 14:00:44Z raasch
[4370]	30	! vector array renamed
	31	!
	32	! 4366 2020-01-09 08:12:43Z raasch
[4366]	33	! modifications for NEC vectorization
	34	!
	35	! 4360 2020-01-07 11:25:50Z suehring
[4236]	36	! Added missing OpenMP directives
	37	!
	38	! 4182 2019-08-22 15:20:23Z scharf
[4182]	39	! Corrected "Former revisions" section
[4181]	40	!
[4182]	41	! 4171 2019-08-19 17:44:09Z gronemeier
[3832]	42	! loop reordering for performance optimization
[4171]	43	!
	44	! 3832 2019-03-28 13:16:58Z raasch
	45	! loop reordering for performance optimization
	46	!
[3832]	47	! 3694 2019-01-23 17:01:49Z knoop
[3634]	48	! OpenACC port for SPEC
[4171]	49	!
[4182]	50	! Revision 1.1 1997/07/24 11:25:18 raasch
	51	! Initial revision
	52	!
	53	!
[1216]	54	! Description:
	55	! ------------
[1682]	56	!> Resorting data for the transposition from x to y. The transposition itself
	57	!> is carried out in transpose_xy
[1216]	58	!------------------------------------------------------------------------------!
[4181]	59
	60	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	61
[1682]	62	SUBROUTINE resort_for_xy( f_in, f_inv )
[1216]	63
[4171]	64
[1320]	65	USE indices, &
	66	ONLY: nx
[1216]	67
[1320]	68	USE kinds
	69
	70	USE transpose_indices, &
[3241]	71	ONLY: nyn_x, nys_x, nzb_x, nzt_x
[1320]	72
[1216]	73	IMPLICIT NONE
	74
[4171]	75	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	76	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	77
	78
[4171]	79	INTEGER(iwp) :: i !<
	80	INTEGER(iwp) :: j !<
	81	INTEGER(iwp) :: k !<
[1]	82	!
[1216]	83	!-- Rearrange indices of input array in order to make data to be send
	84	!-- by MPI contiguous
	85	!$OMP PARALLEL PRIVATE ( i, j, k )
	86	!$OMP DO
[3690]	87	#if __acc_fft_device
[3634]	88	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	89	!$ACC PRESENT(f_inv, f_in)
[3690]	90	#endif
[3832]	91	DO k = nzb_x, nzt_x
[4171]	92	DO j = nys_x, nyn_x
	93	DO i = 0, nx
[1216]	94	f_inv(j,k,i) = f_in(i,j,k)
	95	ENDDO
	96	ENDDO
	97	ENDDO
	98	!$OMP END PARALLEL
	99
	100	END SUBROUTINE resort_for_xy
	101
	102
	103	!------------------------------------------------------------------------------!
[1]	104	! Description:
	105	! ------------
[1682]	106	!> Transposition of input array (f_in) from x to y. For the input array, all
	107	!> elements along x reside on the same PE, while after transposition, all
	108	!> elements along y reside on the same PE.
[1]	109	!------------------------------------------------------------------------------!
[1682]	110	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	111
[1682]	112
[1320]	113	USE cpulog, &
	114	ONLY: cpu_log, cpu_log_nowait, log_point_s
	115
	116	USE indices, &
	117	ONLY: nx, ny
[4171]	118
[1320]	119	USE kinds
	120
[1]	121	USE pegrid
	122
[1320]	123	USE transpose_indices, &
	124	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	125
[1]	126	IMPLICIT NONE
	127
[4171]	128	INTEGER(iwp) :: i !<
	129	INTEGER(iwp) :: j !<
	130	INTEGER(iwp) :: k !<
	131	INTEGER(iwp) :: l !<
	132	INTEGER(iwp) :: ys !<
[1]	133
[4171]	134	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	135	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	136
	137	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	138	#if __acc_fft_device
[3634]	139	!$ACC DECLARE CREATE(work)
[3690]	140	#endif
[1111]	141
	142
[1106]	143	IF ( numprocs /= 1 ) THEN
	144
	145	#if defined( __parallel )
[1]	146	!
[1106]	147	!-- Transpose array
[1318]	148	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	149
	150	#if __acc_fft_device
[3657]	151	#ifndef __cuda_aware_mpi
[3634]	152	!$ACC UPDATE HOST(f_inv)
[3657]	153	#else
	154	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	155	#endif
[3690]	156	#endif
	157
[1106]	158	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	159	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	160	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	161	comm1dy, ierr )
[3690]	162
	163	#if __acc_fft_device
[3657]	164	#ifndef __cuda_aware_mpi
[3634]	165	!$ACC UPDATE DEVICE(work)
[3657]	166	#else
	167	!$ACC END HOST_DATA
	168	#endif
[3690]	169	#endif
	170
[1106]	171	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	172
	173	!
[1106]	174	!-- Reorder transposed array
[1111]	175	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	176	DO l = 0, pdims(2) - 1
	177	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	178	#if __acc_fft_device
[3634]	179	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	180	!$ACC PRESENT(f_out, work)
[3690]	181	#endif
[4236]	182	!$OMP DO
[1106]	183	DO i = nxl_y, nxr_y
	184	DO k = nzb_y, nzt_y
	185	DO j = ys, ys + nyn_x - nys_x
[1111]	186	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	187	ENDDO
[1]	188	ENDDO
	189	ENDDO
[4236]	190	!$OMP END DO NOWAIT
[1]	191	ENDDO
[683]	192	!$OMP END PARALLEL
[1]	193	#endif
	194
[1106]	195	ELSE
	196
	197	!
	198	!-- Reorder transposed array
	199	!$OMP PARALLEL PRIVATE ( i, j, k )
	200	!$OMP DO
[3690]	201	#if __acc_fft_device
[3634]	202	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	203	!$ACC PRESENT(f_out, f_inv)
[3690]	204	#endif
[1106]	205	DO k = nzb_y, nzt_y
	206	DO i = nxl_y, nxr_y
	207	DO j = 0, ny
	208	f_out(j,i,k) = f_inv(j,k,i)
	209	ENDDO
	210	ENDDO
	211	ENDDO
	212	!$OMP END PARALLEL
	213
	214	ENDIF
	215
[1]	216	END SUBROUTINE transpose_xy
	217
	218
	219	!------------------------------------------------------------------------------!
	220	! Description:
	221	! ------------
[1682]	222	!> Resorting data after the transposition from x to z. The transposition itself
	223	!> is carried out in transpose_xz
[1216]	224	!------------------------------------------------------------------------------!
[1682]	225	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	226
[1682]	227
[1320]	228	USE indices, &
	229	ONLY: nxl, nxr, nyn, nys, nz
[1216]	230
[1320]	231	USE kinds
	232
[1216]	233	IMPLICIT NONE
	234
[4171]	235	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	236	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	237
[4171]	238	INTEGER(iwp) :: i !<
	239	INTEGER(iwp) :: j !<
	240	INTEGER(iwp) :: k !<
[1216]	241	!
	242	!-- Rearrange indices of input array in order to make data to be send
	243	!-- by MPI contiguous.
	244	!-- In case of parallel fft/transposition, scattered store is faster in
	245	!-- backward direction!!!
	246	!$OMP PARALLEL PRIVATE ( i, j, k )
	247	!$OMP DO
[3690]	248	#if __acc_fft_device
[3634]	249	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	250	!$ACC PRESENT(f_out, f_inv)
[3690]	251	#endif
[4171]	252	DO i = nxl, nxr
	253	DO j = nys, nyn
	254	DO k = 1, nz
[1216]	255	f_out(k,j,i) = f_inv(j,i,k)
	256	ENDDO
	257	ENDDO
	258	ENDDO
	259	!$OMP END PARALLEL
	260
	261	END SUBROUTINE resort_for_xz
	262
	263
	264	!------------------------------------------------------------------------------!
	265	! Description:
	266	! ------------
[1682]	267	!> Transposition of input array (f_in) from x to z. For the input array, all
	268	!> elements along x reside on the same PE, while after transposition, all
	269	!> elements along z reside on the same PE.
[1]	270	!------------------------------------------------------------------------------!
[1682]	271	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	272
[1682]	273
[1320]	274	USE cpulog, &
	275	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	276
[4366]	277	USE fft_xy, &
[4370]	278	ONLY: f_vec_x, temperton_fft_vec
[4366]	279
[1320]	280	USE indices, &
[3241]	281	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
[1320]	282
	283	USE kinds
	284
[1324]	285	USE pegrid
[1320]	286
	287	USE transpose_indices, &
	288	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	289
[1]	290	IMPLICIT NONE
	291
[4171]	292	INTEGER(iwp) :: i !<
	293	INTEGER(iwp) :: j !<
	294	INTEGER(iwp) :: k !<
	295	INTEGER(iwp) :: l !<
[4366]	296	INTEGER(iwp) :: mm !<
[4171]	297	INTEGER(iwp) :: xs !<
[1]	298
[4171]	299	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	300	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	301
[4171]	302	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	303	#if __acc_fft_device
[3634]	304	!$ACC DECLARE CREATE(work)
[3690]	305	#endif
[1111]	306
[4366]	307	!
	308	!-- If the PE grid is one-dimensional along y, the array has only to be
	309	!-- reordered locally and therefore no transposition has to be done.
[1]	310	IF ( pdims(1) /= 1 ) THEN
[1106]	311
	312	#if defined( __parallel )
[1]	313	!
[4366]	314	!-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in
[4370]	315	!-- different array format (f_vec_x).
[4366]	316	IF ( temperton_fft_vec ) THEN
	317
	318	DO l = 0, pdims(1) - 1
	319	xs = 0 + l * nnx
	320	DO k = nzb_x, nzt_x
	321	DO i = xs, xs + nnx - 1
	322	DO j = nys_x, nyn_x
	323	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
[4370]	324	work(j,i-xs+1,k,l) = f_vec_x(mm,i)
[4366]	325	ENDDO
	326	ENDDO
	327	ENDDO
	328	ENDDO
	329
	330	ELSE
	331
	332	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
	333	DO l = 0, pdims(1) - 1
	334	xs = 0 + l * nnx
[3690]	335	#if __acc_fft_device
[4366]	336	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	337	!$ACC PRESENT(work, f_in)
[3690]	338	#endif
[4366]	339	!$OMP DO
	340	DO k = nzb_x, nzt_x
	341	DO i = xs, xs + nnx - 1
	342	DO j = nys_x, nyn_x
	343	work(j,i-xs+1,k,l) = f_in(i,j,k)
	344	ENDDO
[1]	345	ENDDO
	346	ENDDO
[4366]	347	!$OMP END DO NOWAIT
[1]	348	ENDDO
[4366]	349	!$OMP END PARALLEL
[1]	350
[4366]	351	ENDIF
	352
[1]	353	!
	354	!-- Transpose array
[1318]	355	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	356
	357	#if __acc_fft_device
[3657]	358	#ifndef __cuda_aware_mpi
[3634]	359	!$ACC UPDATE HOST(work)
[3657]	360	#else
	361	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	362	#endif
[3690]	363	#endif
	364
[622]	365	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	366	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	367	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	368	comm1dx, ierr )
[3690]	369
	370	#if __acc_fft_device
[3657]	371	#ifndef __cuda_aware_mpi
[3634]	372	!$ACC UPDATE DEVICE(f_inv)
[3657]	373	#else
	374	!$ACC END HOST_DATA
	375	#endif
[3694]	376	#endif
	377
[1]	378	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	379	#endif
	380
[1]	381	ELSE
[1106]	382
[1]	383	!
	384	!-- Reorder the array in a way that the z index is in first position
[683]	385	!$OMP PARALLEL PRIVATE ( i, j, k )
	386	!$OMP DO
[3690]	387	#if __acc_fft_device
[3634]	388	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	389	!$ACC PRESENT(f_inv, f_in)
[3690]	390	#endif
[1003]	391	DO i = nxl, nxr
	392	DO j = nys, nyn
	393	DO k = 1, nz
[164]	394	f_inv(j,i,k) = f_in(i,j,k)
[1]	395	ENDDO
	396	ENDDO
	397	ENDDO
[683]	398	!$OMP END PARALLEL
[1]	399
[164]	400	ENDIF
	401
[1]	402	END SUBROUTINE transpose_xz
	403
	404
	405	!------------------------------------------------------------------------------!
	406	! Description:
	407	! ------------
[1682]	408	!> Resorting data after the transposition from y to x. The transposition itself
	409	!> is carried out in transpose_yx
[1216]	410	!------------------------------------------------------------------------------!
[1682]	411	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	412
[1682]	413
[1320]	414	USE indices, &
	415	ONLY: nx
[1216]	416
[1320]	417	USE kinds
	418
	419	USE transpose_indices, &
	420	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	421
[1216]	422	IMPLICIT NONE
	423
[4171]	424	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	425	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	426
	427
[4171]	428	INTEGER(iwp) :: i !<
	429	INTEGER(iwp) :: j !<
	430	INTEGER(iwp) :: k !<
[1216]	431	!
	432	!-- Rearrange indices of input array in order to make data to be send
	433	!-- by MPI contiguous
	434	!$OMP PARALLEL PRIVATE ( i, j, k )
	435	!$OMP DO
[3690]	436	#if __acc_fft_device
[3634]	437	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	438	!$ACC PRESENT(f_out, f_inv)
[3690]	439	#endif
[4171]	440	DO k = nzb_x, nzt_x
	441	DO j = nys_x, nyn_x
	442	DO i = 0, nx
[1216]	443	f_out(i,j,k) = f_inv(j,k,i)
	444	ENDDO
	445	ENDDO
	446	ENDDO
	447	!$OMP END PARALLEL
	448
	449	END SUBROUTINE resort_for_yx
	450
	451
	452	!------------------------------------------------------------------------------!
	453	! Description:
	454	! ------------
[1682]	455	!> Transposition of input array (f_in) from y to x. For the input array, all
	456	!> elements along y reside on the same PE, while after transposition, all
	457	!> elements along x reside on the same PE.
[1]	458	!------------------------------------------------------------------------------!
[1682]	459	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	460
[1682]	461
[1320]	462	USE cpulog, &
	463	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	464
[1320]	465	USE indices, &
	466	ONLY: nx, ny
	467
	468	USE kinds
	469
[1324]	470	USE pegrid
[1320]	471
	472	USE transpose_indices, &
	473	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	474
[1]	475	IMPLICIT NONE
	476
[4171]	477	INTEGER(iwp) :: i !<
	478	INTEGER(iwp) :: j !<
	479	INTEGER(iwp) :: k !<
	480	INTEGER(iwp) :: l !<
	481	INTEGER(iwp) :: ys !<
[1]	482
[4171]	483	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	484	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	485
[4171]	486	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	487	#if __acc_fft_device
[3634]	488	!$ACC DECLARE CREATE(work)
[3690]	489	#endif
[1111]	490
[1320]	491
[1106]	492	IF ( numprocs /= 1 ) THEN
	493
[1]	494	#if defined( __parallel )
	495	!
[1106]	496	!-- Reorder input array for transposition
[1111]	497	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	498	DO l = 0, pdims(2) - 1
	499	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	500	#if __acc_fft_device
[3634]	501	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	502	!$ACC PRESENT(work, f_in)
[3690]	503	#endif
[4236]	504	!$OMP DO
[1106]	505	DO i = nxl_y, nxr_y
	506	DO k = nzb_y, nzt_y
	507	DO j = ys, ys + nyn_x - nys_x
[1111]	508	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	509	ENDDO
	510	ENDDO
	511	ENDDO
[4236]	512	!$OMP END DO NOWAIT
[1106]	513	ENDDO
	514	!$OMP END PARALLEL
	515
	516	!
	517	!-- Transpose array
[1318]	518	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	519
	520	#if __acc_fft_device
[3657]	521	#ifndef __cuda_aware_mpi
[3634]	522	!$ACC UPDATE HOST(work)
[3657]	523	#else
	524	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	525	#endif
[3690]	526	#endif
	527
[1106]	528	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	529	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	530	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	531	comm1dy, ierr )
[3690]	532
	533	#if __acc_fft_device
[3657]	534	#ifndef __cuda_aware_mpi
[3634]	535	!$ACC UPDATE DEVICE(f_inv)
[3657]	536	#else
	537	!$ACC END HOST_DATA
	538	#endif
[3690]	539	#endif
	540
[1106]	541	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	542	#endif
	543
	544	ELSE
	545
	546	!
	547	!-- Reorder array f_in the same way as ALLTOALL did it
	548	!$OMP PARALLEL PRIVATE ( i, j, k )
	549	!$OMP DO
[3690]	550	#if __acc_fft_device
[3634]	551	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	552	!$ACC PRESENT(f_inv, f_in)
[3690]	553	#endif
[1003]	554	DO i = nxl_y, nxr_y
	555	DO k = nzb_y, nzt_y
[1106]	556	DO j = 0, ny
	557	f_inv(j,k,i) = f_in(j,i,k)
[1]	558	ENDDO
	559	ENDDO
	560	ENDDO
[683]	561	!$OMP END PARALLEL
[1]	562
[1106]	563	ENDIF
[1]	564
	565	END SUBROUTINE transpose_yx
	566
	567
	568	!------------------------------------------------------------------------------!
	569	! Description:
	570	! ------------
[1682]	571	!> Transposition of input array (f_in) from y to x. For the input array, all
	572	!> elements along y reside on the same PE, while after transposition, all
	573	!> elements along x reside on the same PE.
	574	!> This is a direct transposition for arrays with indices in regular order
	575	!> (k,j,i) (cf. transpose_yx).
[1]	576	!------------------------------------------------------------------------------!
[1682]	577	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	578
[1682]	579
[1320]	580	USE cpulog, &
[3241]	581	ONLY: cpu_log, log_point_s
[1]	582
[1320]	583	USE indices, &
	584	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	585
	586	USE kinds
	587
[1324]	588	USE pegrid
[1320]	589
	590	USE transpose_indices, &
	591	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	592
[1]	593	IMPLICIT NONE
	594
[4171]	595	INTEGER(iwp) :: i !<
	596	INTEGER(iwp) :: j !<
	597	INTEGER(iwp) :: k !<
	598	INTEGER(iwp) :: l !<
	599	INTEGER(iwp) :: m !<
	600	INTEGER(iwp) :: xs !<
[1]	601
[4171]	602	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	603	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	604	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	605	REAL(wp) :: work(nnxnnynnz) !<
[1]	606	#if defined( __parallel )
	607
	608	!
	609	!-- Rearrange indices of input array in order to make data to be send
	610	!-- by MPI contiguous
[1003]	611	DO k = 1, nz
	612	DO j = nys, nyn
	613	DO i = nxl, nxr
[164]	614	f_inv(i,k,j) = f_in(k,j,i)
[1]	615	ENDDO
	616	ENDDO
	617	ENDDO
	618
	619	!
	620	!-- Transpose array
	621	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	622	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	623	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	624	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	625	comm1dx, ierr )
	626	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	627
	628	!
	629	!-- Reorder transposed array
	630	m = 0
	631	DO l = 0, pdims(1) - 1
	632	xs = 0 + l * nnx
[1003]	633	DO j = nys_x, nyn_x
	634	DO k = 1, nz
[1]	635	DO i = xs, xs + nnx - 1
	636	m = m + 1
[164]	637	f_out(i,j,k) = work(m)
[1]	638	ENDDO
	639	ENDDO
	640	ENDDO
	641	ENDDO
	642
	643	#endif
	644
	645	END SUBROUTINE transpose_yxd
	646
	647
	648	!------------------------------------------------------------------------------!
	649	! Description:
	650	! ------------
[1682]	651	!> Resorting data for the transposition from y to z. The transposition itself
	652	!> is carried out in transpose_yz
[1216]	653	!------------------------------------------------------------------------------!
[1682]	654	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	655
[1682]	656
[1320]	657	USE indices, &
	658	ONLY: ny
[1216]	659
[1320]	660	USE kinds
	661
	662	USE transpose_indices, &
	663	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	664
[1216]	665	IMPLICIT NONE
	666
[4171]	667	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	668	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	669
[4171]	670	INTEGER(iwp) :: i !<
	671	INTEGER(iwp) :: j !<
	672	INTEGER(iwp) :: k !<
[1216]	673
	674	!
	675	!-- Rearrange indices of input array in order to make data to be send
	676	!-- by MPI contiguous
	677	!$OMP PARALLEL PRIVATE ( i, j, k )
	678	!$OMP DO
[3690]	679	#if __acc_fft_device
[3634]	680	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	681	!$ACC PRESENT(f_inv, f_in)
[3690]	682	#endif
[4171]	683	DO k = nzb_y, nzt_y
	684	DO i = nxl_y, nxr_y
	685	DO j = 0, ny
[1216]	686	f_inv(i,k,j) = f_in(j,i,k)
	687	ENDDO
	688	ENDDO
	689	ENDDO
	690	!$OMP END PARALLEL
	691
	692	END SUBROUTINE resort_for_yz
	693
	694
	695	!------------------------------------------------------------------------------!
	696	! Description:
	697	! ------------
[1682]	698	!> Transposition of input array (f_in) from y to z. For the input array, all
	699	!> elements along y reside on the same PE, while after transposition, all
	700	!> elements along z reside on the same PE.
[1]	701	!------------------------------------------------------------------------------!
[1682]	702	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	703
[1682]	704
[1320]	705	USE cpulog, &
	706	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	707
[1320]	708	USE indices, &
	709	ONLY: ny, nz
	710
	711	USE kinds
	712
[1324]	713	USE pegrid
[1320]	714
	715	USE transpose_indices, &
	716	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	717
[1]	718	IMPLICIT NONE
	719
[4171]	720	INTEGER(iwp) :: i !<
	721	INTEGER(iwp) :: j !<
	722	INTEGER(iwp) :: k !<
	723	INTEGER(iwp) :: l !<
	724	INTEGER(iwp) :: zs !<
[1]	725
[4171]	726	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	727	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	728
[4171]	729	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	730	#if __acc_fft_device
[3634]	731	!$ACC DECLARE CREATE(work)
[3690]	732	#endif
[1111]	733
[1320]	734
[1]	735	!
	736	!-- If the PE grid is one-dimensional along y, only local reordering
	737	!-- of the data is necessary and no transposition has to be done.
	738	IF ( pdims(1) == 1 ) THEN
[1106]	739
[683]	740	!$OMP PARALLEL PRIVATE ( i, j, k )
	741	!$OMP DO
[3690]	742	#if __acc_fft_device
[3634]	743	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	744	!$ACC PRESENT(f_out, f_inv)
[3690]	745	#endif
[1003]	746	DO j = 0, ny
	747	DO k = nzb_y, nzt_y
	748	DO i = nxl_y, nxr_y
[164]	749	f_out(i,j,k) = f_inv(i,k,j)
[1]	750	ENDDO
	751	ENDDO
	752	ENDDO
[683]	753	!$OMP END PARALLEL
[1]	754
[1106]	755	ELSE
	756
	757	#if defined( __parallel )
[1]	758	!
[1106]	759	!-- Transpose array
[1318]	760	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	761
	762	#if __acc_fft_device
[3657]	763	#ifndef __cuda_aware_mpi
[3634]	764	!$ACC UPDATE HOST(f_inv)
[3657]	765	#else
	766	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	767	#endif
[3690]	768	#endif
	769
[1106]	770	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	771	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	772	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	773	comm1dx, ierr )
[3690]	774
	775	#if __acc_fft_device
[3657]	776	#ifndef __cuda_aware_mpi
[3634]	777	!$ACC UPDATE DEVICE(work)
[3657]	778	#else
	779	!$ACC END HOST_DATA
	780	#endif
[3690]	781	#endif
	782
[1106]	783	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	784
	785	!
[1106]	786	!-- Reorder transposed array
[1111]	787	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1106]	788	DO l = 0, pdims(1) - 1
	789	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	790	#if __acc_fft_device
[3634]	791	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	792	!$ACC PRESENT(f_out, work)
[3690]	793	#endif
[4236]	794	!$OMP DO
[1106]	795	DO j = nys_z, nyn_z
	796	DO k = zs, zs + nzt_y - nzb_y
	797	DO i = nxl_z, nxr_z
[1111]	798	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	799	ENDDO
[1]	800	ENDDO
	801	ENDDO
[4236]	802	!$OMP END DO NOWAIT
[1]	803	ENDDO
[683]	804	!$OMP END PARALLEL
[1]	805	#endif
	806
[1106]	807	ENDIF
	808
[1]	809	END SUBROUTINE transpose_yz
	810
	811
	812	!------------------------------------------------------------------------------!
	813	! Description:
	814	! ------------
[1682]	815	!> Resorting data for the transposition from z to x. The transposition itself
	816	!> is carried out in transpose_zx
[1216]	817	!------------------------------------------------------------------------------!
[1682]	818	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	819
[1682]	820
[1320]	821	USE indices, &
	822	ONLY: nxl, nxr, nyn, nys, nz
[1216]	823
[1320]	824	USE kinds
	825
[1216]	826	IMPLICIT NONE
	827
[4171]	828	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	829	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	830
[4171]	831	INTEGER(iwp) :: i !<
	832	INTEGER(iwp) :: j !<
	833	INTEGER(iwp) :: k !<
[1216]	834
	835	!
	836	!-- Rearrange indices of input array in order to make data to be send
	837	!-- by MPI contiguous
	838	!$OMP PARALLEL PRIVATE ( i, j, k )
	839	!$OMP DO
[3690]	840	#if __acc_fft_device
[3634]	841	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	842	!$ACC PRESENT(f_in, f_inv)
[3690]	843	#endif
[3832]	844	DO i = nxl, nxr
[4171]	845	DO j = nys, nyn
	846	DO k = 1,nz
[1216]	847	f_inv(j,i,k) = f_in(k,j,i)
	848	ENDDO
	849	ENDDO
	850	ENDDO
	851	!$OMP END PARALLEL
	852
	853	END SUBROUTINE resort_for_zx
	854
	855
	856	!------------------------------------------------------------------------------!
	857	! Description:
	858	! ------------
[1682]	859	!> Transposition of input array (f_in) from z to x. For the input array, all
	860	!> elements along z reside on the same PE, while after transposition, all
	861	!> elements along x reside on the same PE.
[1]	862	!------------------------------------------------------------------------------!
[1682]	863	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	864
[1682]	865
[1320]	866	USE cpulog, &
	867	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	868
[4366]	869	USE fft_xy, &
[4370]	870	ONLY: f_vec_x, temperton_fft_vec
[4366]	871
[1320]	872	USE indices, &
	873	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	874
	875	USE kinds
	876
[1324]	877	USE pegrid
[1320]	878
	879	USE transpose_indices, &
	880	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	881
[1]	882	IMPLICIT NONE
	883
[4171]	884	INTEGER(iwp) :: i !<
	885	INTEGER(iwp) :: j !<
	886	INTEGER(iwp) :: k !<
	887	INTEGER(iwp) :: l !<
[4366]	888	INTEGER(iwp) :: mm !<
[4171]	889	INTEGER(iwp) :: xs !<
[1]	890
[4171]	891	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	892	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	893
[4171]	894	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	895	#if __acc_fft_device
[3634]	896	!$ACC DECLARE CREATE(work)
[3690]	897	#endif
[1]	898
[1320]	899
[1]	900	!
	901	!-- If the PE grid is one-dimensional along y, only local reordering
	902	!-- of the data is necessary and no transposition has to be done.
	903	IF ( pdims(1) == 1 ) THEN
[1106]	904
[683]	905	!$OMP PARALLEL PRIVATE ( i, j, k )
	906	!$OMP DO
[3690]	907	#if __acc_fft_device
[3634]	908	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	909	!$ACC PRESENT(f_out, f_inv)
[3690]	910	#endif
[1003]	911	DO k = 1, nz
	912	DO i = nxl, nxr
	913	DO j = nys, nyn
[164]	914	f_out(i,j,k) = f_inv(j,i,k)
[1]	915	ENDDO
	916	ENDDO
	917	ENDDO
[683]	918	!$OMP END PARALLEL
[1]	919
[1106]	920	ELSE
	921
	922	#if defined( __parallel )
[1]	923	!
[1106]	924	!-- Transpose array
[1318]	925	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	926
	927	#if __acc_fft_device
[3657]	928	#ifndef __cuda_aware_mpi
[3634]	929	!$ACC UPDATE HOST(f_inv)
[3657]	930	#else
	931	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	932	#endif
[3690]	933	#endif
	934
[1106]	935	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	936	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	937	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	938	comm1dx, ierr )
[3690]	939
	940	#if __acc_fft_device
[3657]	941	#ifndef __cuda_aware_mpi
[3634]	942	!$ACC UPDATE DEVICE(work)
[3657]	943	#else
	944	!$ACC END HOST_DATA
	945	#endif
[3690]	946	#endif
	947
[1106]	948	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	949
	950	!
[4366]	951	!-- Reorder transposed array.
[4370]	952	!-- Data for the vectorized Temperton-fft is stored in different array format (f_vec_x) which
	953	!-- saves additional data copy in fft_x.
[4366]	954	IF ( temperton_fft_vec ) THEN
	955
	956	DO l = 0, pdims(1) - 1
	957	xs = 0 + l * nnx
	958	DO k = nzb_x, nzt_x
	959	DO i = xs, xs + nnx - 1
	960	DO j = nys_x, nyn_x
	961	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
[4370]	962	f_vec_x(mm,i) = work(j,i-xs+1,k,l)
[4366]	963	ENDDO
	964	ENDDO
	965	ENDDO
	966	ENDDO
	967
	968	ELSE
	969
	970	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
	971	DO l = 0, pdims(1) - 1
	972	xs = 0 + l * nnx
[3690]	973	#if __acc_fft_device
[4366]	974	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	975	!$ACC PRESENT(f_out, work)
[3690]	976	#endif
[4366]	977	!$OMP DO
	978	DO k = nzb_x, nzt_x
	979	DO i = xs, xs + nnx - 1
	980	DO j = nys_x, nyn_x
	981	f_out(i,j,k) = work(j,i-xs+1,k,l)
	982	ENDDO
[1106]	983	ENDDO
[1]	984	ENDDO
[4366]	985	!$OMP END DO NOWAIT
[1]	986	ENDDO
[4366]	987	!$OMP END PARALLEL
[1]	988
[4366]	989	ENDIF
	990
[4415]	991	#endif
	992
[1106]	993	ENDIF
	994
[1]	995	END SUBROUTINE transpose_zx
	996
	997
	998	!------------------------------------------------------------------------------!
	999	! Description:
	1000	! ------------
[1682]	1001	!> Resorting data after the transposition from z to y. The transposition itself
	1002	!> is carried out in transpose_zy
[1216]	1003	!------------------------------------------------------------------------------!
[1682]	1004	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	1005
[1682]	1006
[1320]	1007	USE indices, &
	1008	ONLY: ny
[1216]	1009
[1320]	1010	USE kinds
	1011
	1012	USE transpose_indices, &
	1013	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	1014
[1216]	1015	IMPLICIT NONE
	1016
[4171]	1017	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	1018	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	1019
	1020
[4171]	1021	INTEGER(iwp) :: i !<
	1022	INTEGER(iwp) :: j !<
	1023	INTEGER(iwp) :: k !<
[1216]	1024
	1025	!
	1026	!-- Rearrange indices of input array in order to make data to be send
	1027	!-- by MPI contiguous
	1028	!$OMP PARALLEL PRIVATE ( i, j, k )
	1029	!$OMP DO
[3690]	1030	#if __acc_fft_device
[3634]	1031	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1032	!$ACC PRESENT(f_out, f_inv)
[3690]	1033	#endif
[4171]	1034	DO k = nzb_y, nzt_y
	1035	DO i = nxl_y, nxr_y
	1036	DO j = 0, ny
[1216]	1037	f_out(j,i,k) = f_inv(i,k,j)
	1038	ENDDO
	1039	ENDDO
	1040	ENDDO
	1041	!$OMP END PARALLEL
	1042
	1043	END SUBROUTINE resort_for_zy
	1044
	1045
	1046	!------------------------------------------------------------------------------!
[3241]	1047	! Description:cpu_log_nowait
[1216]	1048	! ------------
[1682]	1049	!> Transposition of input array (f_in) from z to y. For the input array, all
	1050	!> elements along z reside on the same PE, while after transposition, all
	1051	!> elements along y reside on the same PE.
[1]	1052	!------------------------------------------------------------------------------!
[1682]	1053	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	1054
[1682]	1055
[1320]	1056	USE cpulog, &
	1057	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	1058
[1320]	1059	USE indices, &
	1060	ONLY: ny, nz
	1061
	1062	USE kinds
	1063
[1324]	1064	USE pegrid
[1320]	1065
	1066	USE transpose_indices, &
	1067	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	1068
[1]	1069	IMPLICIT NONE
	1070
[4171]	1071	INTEGER(iwp) :: i !<
	1072	INTEGER(iwp) :: j !<
	1073	INTEGER(iwp) :: k !<
	1074	INTEGER(iwp) :: l !<
	1075	INTEGER(iwp) :: zs !<
[1]	1076
[4171]	1077	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	1078	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	1079
[1682]	1080	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	1081	#if __acc_fft_device
[3634]	1082	!$ACC DECLARE CREATE(work)
[3690]	1083	#endif
[1111]	1084
[1]	1085	!
	1086	!-- If the PE grid is one-dimensional along y, the array has only to be
	1087	!-- reordered locally and therefore no transposition has to be done.
	1088	IF ( pdims(1) /= 1 ) THEN
[1106]	1089
	1090	#if defined( __parallel )
[1]	1091	!
	1092	!-- Reorder input array for transposition
[1111]	1093	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1]	1094	DO l = 0, pdims(1) - 1
[1003]	1095	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	1096	#if __acc_fft_device
[3634]	1097	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1098	!$ACC PRESENT(work, f_in)
[3690]	1099	#endif
[4236]	1100	!$OMP DO
[1003]	1101	DO j = nys_z, nyn_z
	1102	DO k = zs, zs + nzt_y - nzb_y
	1103	DO i = nxl_z, nxr_z
[1111]	1104	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	1105	ENDDO
	1106	ENDDO
	1107	ENDDO
[4236]	1108	!$OMP END DO NOWAIT
[1]	1109	ENDDO
[683]	1110	!$OMP END PARALLEL
[1]	1111
	1112	!
	1113	!-- Transpose array
[1318]	1114	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1115
	1116	#if __acc_fft_device
[3657]	1117	#ifndef __cuda_aware_mpi
[3634]	1118	!$ACC UPDATE HOST(work)
[3657]	1119	#else
	1120	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1121	#endif
[3690]	1122	#endif
	1123
[622]	1124	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	1125	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	1126	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	1127	comm1dx, ierr )
[3690]	1128
	1129	#if __acc_fft_device
[3657]	1130	#ifndef __cuda_aware_mpi
[3634]	1131	!$ACC UPDATE DEVICE(f_inv)
[3657]	1132	#else
	1133	!$ACC END HOST_DATA
	1134	#endif
[3690]	1135	#endif
	1136
[1]	1137	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	1138	#endif
[1]	1139
	1140	ELSE
	1141	!
[1106]	1142	!-- Reorder the array in the same way like ALLTOALL did it
[683]	1143	!$OMP PARALLEL PRIVATE ( i, j, k )
	1144	!$OMP DO
[3690]	1145	#if __acc_fft_device
[3634]	1146	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1147	!$ACC PRESENT(f_inv, f_in)
[3690]	1148	#endif
[1003]	1149	DO k = nzb_y, nzt_y
	1150	DO j = 0, ny
	1151	DO i = nxl_y, nxr_y
[164]	1152	f_inv(i,k,j) = f_in(i,j,k)
	1153	ENDDO
	1154	ENDDO
	1155	ENDDO
[683]	1156	!$OMP END PARALLEL
[1106]	1157
	1158	ENDIF
	1159
[1]	1160	END SUBROUTINE transpose_zy
	1161
	1162
	1163	!------------------------------------------------------------------------------!
	1164	! Description:
	1165	! ------------
[1682]	1166	!> Transposition of input array (f_in) from z to y. For the input array, all
	1167	!> elements along z reside on the same PE, while after transposition, all
	1168	!> elements along y reside on the same PE.
	1169	!> This is a direct transposition for arrays with indices in regular order
	1170	!> (k,j,i) (cf. transpose_zy).
[1]	1171	!------------------------------------------------------------------------------!
[1682]	1172	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1173
[1682]	1174
[1320]	1175	USE cpulog, &
[3241]	1176	ONLY: cpu_log, log_point_s
[1]	1177
[1320]	1178	USE indices, &
	1179	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1180
	1181	USE kinds
	1182
[1324]	1183	USE pegrid
[1320]	1184
	1185	USE transpose_indices, &
[3241]	1186	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
[1320]	1187
[1]	1188	IMPLICIT NONE
	1189
[4171]	1190	INTEGER(iwp) :: i !<
	1191	INTEGER(iwp) :: j !<
	1192	INTEGER(iwp) :: k !<
	1193	INTEGER(iwp) :: l !<
	1194	INTEGER(iwp) :: m !<
	1195	INTEGER(iwp) :: ys !<
[1]	1196
[4171]	1197	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1198	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1199	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1200	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1201
[1]	1202	#if defined( __parallel )
	1203
	1204	!
	1205	!-- Rearrange indices of input array in order to make data to be send
	1206	!-- by MPI contiguous
[1003]	1207	DO i = nxl, nxr
	1208	DO j = nys, nyn
	1209	DO k = 1, nz
[164]	1210	f_inv(j,i,k) = f_in(k,j,i)
[1]	1211	ENDDO
	1212	ENDDO
	1213	ENDDO
	1214
	1215	!
	1216	!-- Move data to different array, because memory location of work1 is
	1217	!-- needed further below (work1 = work2).
	1218	!-- If the PE grid is one-dimensional along x, only local reordering
	1219	!-- of the data is necessary and no transposition has to be done.
	1220	IF ( pdims(2) == 1 ) THEN
[1003]	1221	DO k = 1, nz
	1222	DO i = nxl, nxr
	1223	DO j = nys, nyn
[164]	1224	f_out(j,i,k) = f_inv(j,i,k)
[1]	1225	ENDDO
	1226	ENDDO
	1227	ENDDO
	1228	RETURN
	1229	ENDIF
	1230
	1231	!
	1232	!-- Transpose array
	1233	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1234	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1235	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1236	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1237	comm1dy, ierr )
	1238	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1239
	1240	!
	1241	!-- Reorder transposed array
	1242	m = 0
	1243	DO l = 0, pdims(2) - 1
	1244	ys = 0 + l * nny
[1003]	1245	DO k = nzb_yd, nzt_yd
	1246	DO i = nxl_yd, nxr_yd
[1]	1247	DO j = ys, ys + nny - 1
	1248	m = m + 1
[164]	1249	f_out(j,i,k) = work(m)
[1]	1250	ENDDO
	1251	ENDDO
	1252	ENDDO
	1253	ENDDO
	1254
	1255	#endif
	1256
	1257	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |