Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4455

Last change on this file since 4455 was 4429, checked in by raasch, 5 years ago
serial (non-MPI) test case added, several bugfixes for the serial mode
Property svn:keywords set to `Id`
File size: 35.1 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[4360]	17	! Copyright 1997-2020 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 4429 2020-02-27 15:24:30Z Giersch $
[4429]	27	! bugfix: cpp-directives added for serial mode
	28	!
	29	! 4415 2020-02-20 10:30:33Z raasch
[4415]	30	! bugfix for misplaced preprocessor directive
	31	!
	32	! 4370 2020-01-10 14:00:44Z raasch
[4370]	33	! vector array renamed
	34	!
	35	! 4366 2020-01-09 08:12:43Z raasch
[4366]	36	! modifications for NEC vectorization
	37	!
	38	! 4360 2020-01-07 11:25:50Z suehring
[4236]	39	! Added missing OpenMP directives
	40	!
	41	! 4182 2019-08-22 15:20:23Z scharf
[4182]	42	! Corrected "Former revisions" section
[4181]	43	!
[4182]	44	! 4171 2019-08-19 17:44:09Z gronemeier
[3832]	45	! loop reordering for performance optimization
[4171]	46	!
	47	! 3832 2019-03-28 13:16:58Z raasch
	48	! loop reordering for performance optimization
	49	!
[3832]	50	! 3694 2019-01-23 17:01:49Z knoop
[3634]	51	! OpenACC port for SPEC
[4171]	52	!
[4182]	53	! Revision 1.1 1997/07/24 11:25:18 raasch
	54	! Initial revision
	55	!
	56	!
[1216]	57	! Description:
	58	! ------------
[1682]	59	!> Resorting data for the transposition from x to y. The transposition itself
	60	!> is carried out in transpose_xy
[1216]	61	!------------------------------------------------------------------------------!
[4181]	62
	63	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	64
[1682]	65	SUBROUTINE resort_for_xy( f_in, f_inv )
[1216]	66
[4171]	67
[1320]	68	USE indices, &
	69	ONLY: nx
[1216]	70
[1320]	71	USE kinds
	72
	73	USE transpose_indices, &
[3241]	74	ONLY: nyn_x, nys_x, nzb_x, nzt_x
[1320]	75
[1216]	76	IMPLICIT NONE
	77
[4171]	78	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	79	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	80
	81
[4171]	82	INTEGER(iwp) :: i !<
	83	INTEGER(iwp) :: j !<
	84	INTEGER(iwp) :: k !<
[1]	85	!
[1216]	86	!-- Rearrange indices of input array in order to make data to be send
	87	!-- by MPI contiguous
	88	!$OMP PARALLEL PRIVATE ( i, j, k )
	89	!$OMP DO
[3690]	90	#if __acc_fft_device
[3634]	91	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	92	!$ACC PRESENT(f_inv, f_in)
[3690]	93	#endif
[3832]	94	DO k = nzb_x, nzt_x
[4171]	95	DO j = nys_x, nyn_x
	96	DO i = 0, nx
[1216]	97	f_inv(j,k,i) = f_in(i,j,k)
	98	ENDDO
	99	ENDDO
	100	ENDDO
	101	!$OMP END PARALLEL
	102
	103	END SUBROUTINE resort_for_xy
	104
	105
	106	!------------------------------------------------------------------------------!
[1]	107	! Description:
	108	! ------------
[1682]	109	!> Transposition of input array (f_in) from x to y. For the input array, all
	110	!> elements along x reside on the same PE, while after transposition, all
	111	!> elements along y reside on the same PE.
[1]	112	!------------------------------------------------------------------------------!
[1682]	113	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	114
[1682]	115
[4429]	116	#if defined( __parallel )
[1320]	117	USE cpulog, &
	118	ONLY: cpu_log, cpu_log_nowait, log_point_s
[4429]	119	#endif
[1320]	120
	121	USE indices, &
	122	ONLY: nx, ny
[4171]	123
[1320]	124	USE kinds
	125
[1]	126	USE pegrid
	127
[1320]	128	USE transpose_indices, &
	129	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	130
[1]	131	IMPLICIT NONE
	132
[4171]	133	INTEGER(iwp) :: i !<
	134	INTEGER(iwp) :: j !<
	135	INTEGER(iwp) :: k !<
[4429]	136
	137	#if defined( __parallel )
[4171]	138	INTEGER(iwp) :: l !<
	139	INTEGER(iwp) :: ys !<
[4429]	140	#endif
[1]	141
[4171]	142	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	143	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	144
[4429]	145	#if defined( __parallel )
[4171]	146	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	147	#if __acc_fft_device
[3634]	148	!$ACC DECLARE CREATE(work)
[3690]	149	#endif
[4429]	150	#endif
[1111]	151
	152
[1106]	153	IF ( numprocs /= 1 ) THEN
	154
	155	#if defined( __parallel )
[1]	156	!
[1106]	157	!-- Transpose array
[1318]	158	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	159
	160	#if __acc_fft_device
[3657]	161	#ifndef __cuda_aware_mpi
[3634]	162	!$ACC UPDATE HOST(f_inv)
[3657]	163	#else
	164	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	165	#endif
[3690]	166	#endif
	167
[1106]	168	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	169	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	170	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	171	comm1dy, ierr )
[3690]	172
	173	#if __acc_fft_device
[3657]	174	#ifndef __cuda_aware_mpi
[3634]	175	!$ACC UPDATE DEVICE(work)
[3657]	176	#else
	177	!$ACC END HOST_DATA
	178	#endif
[3690]	179	#endif
	180
[1106]	181	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	182
	183	!
[1106]	184	!-- Reorder transposed array
[1111]	185	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	186	DO l = 0, pdims(2) - 1
	187	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	188	#if __acc_fft_device
[3634]	189	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	190	!$ACC PRESENT(f_out, work)
[3690]	191	#endif
[4236]	192	!$OMP DO
[1106]	193	DO i = nxl_y, nxr_y
	194	DO k = nzb_y, nzt_y
	195	DO j = ys, ys + nyn_x - nys_x
[1111]	196	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	197	ENDDO
[1]	198	ENDDO
	199	ENDDO
[4236]	200	!$OMP END DO NOWAIT
[1]	201	ENDDO
[683]	202	!$OMP END PARALLEL
[1]	203	#endif
	204
[1106]	205	ELSE
	206
	207	!
	208	!-- Reorder transposed array
	209	!$OMP PARALLEL PRIVATE ( i, j, k )
	210	!$OMP DO
[3690]	211	#if __acc_fft_device
[3634]	212	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	213	!$ACC PRESENT(f_out, f_inv)
[3690]	214	#endif
[1106]	215	DO k = nzb_y, nzt_y
	216	DO i = nxl_y, nxr_y
	217	DO j = 0, ny
	218	f_out(j,i,k) = f_inv(j,k,i)
	219	ENDDO
	220	ENDDO
	221	ENDDO
	222	!$OMP END PARALLEL
	223
	224	ENDIF
	225
[1]	226	END SUBROUTINE transpose_xy
	227
	228
	229	!------------------------------------------------------------------------------!
	230	! Description:
	231	! ------------
[1682]	232	!> Resorting data after the transposition from x to z. The transposition itself
	233	!> is carried out in transpose_xz
[1216]	234	!------------------------------------------------------------------------------!
[1682]	235	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	236
[1682]	237
[1320]	238	USE indices, &
	239	ONLY: nxl, nxr, nyn, nys, nz
[1216]	240
[1320]	241	USE kinds
	242
[1216]	243	IMPLICIT NONE
	244
[4171]	245	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	246	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	247
[4171]	248	INTEGER(iwp) :: i !<
	249	INTEGER(iwp) :: j !<
	250	INTEGER(iwp) :: k !<
[1216]	251	!
	252	!-- Rearrange indices of input array in order to make data to be send
	253	!-- by MPI contiguous.
	254	!-- In case of parallel fft/transposition, scattered store is faster in
	255	!-- backward direction!!!
	256	!$OMP PARALLEL PRIVATE ( i, j, k )
	257	!$OMP DO
[3690]	258	#if __acc_fft_device
[3634]	259	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	260	!$ACC PRESENT(f_out, f_inv)
[3690]	261	#endif
[4171]	262	DO i = nxl, nxr
	263	DO j = nys, nyn
	264	DO k = 1, nz
[1216]	265	f_out(k,j,i) = f_inv(j,i,k)
	266	ENDDO
	267	ENDDO
	268	ENDDO
	269	!$OMP END PARALLEL
	270
	271	END SUBROUTINE resort_for_xz
	272
	273
	274	!------------------------------------------------------------------------------!
	275	! Description:
	276	! ------------
[1682]	277	!> Transposition of input array (f_in) from x to z. For the input array, all
	278	!> elements along x reside on the same PE, while after transposition, all
	279	!> elements along z reside on the same PE.
[1]	280	!------------------------------------------------------------------------------!
[1682]	281	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	282
[4429]	283	#if defined( __parallel )
[1320]	284	USE cpulog, &
	285	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	286
[4366]	287	USE fft_xy, &
[4370]	288	ONLY: f_vec_x, temperton_fft_vec
[4429]	289	#endif
[4366]	290
[1320]	291	USE indices, &
[4429]	292	ONLY: nx, nxl, nxr, nyn, nys, nz
	293	#if defined( __parallel )
	294	USE indices, &
	295	ONLY: nnx
	296	#endif
[1320]	297
	298	USE kinds
	299
[1324]	300	USE pegrid
[1320]	301
	302	USE transpose_indices, &
	303	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	304
[1]	305	IMPLICIT NONE
	306
[4171]	307	INTEGER(iwp) :: i !<
	308	INTEGER(iwp) :: j !<
	309	INTEGER(iwp) :: k !<
[4429]	310	#if defined( __parallel )
[4171]	311	INTEGER(iwp) :: l !<
[4366]	312	INTEGER(iwp) :: mm !<
[4171]	313	INTEGER(iwp) :: xs !<
[4429]	314	#endif
[1]	315
[4171]	316	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	317	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	318
[4429]	319	#if defined( __parallel )
[4171]	320	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	321	#if __acc_fft_device
[3634]	322	!$ACC DECLARE CREATE(work)
[3690]	323	#endif
[4429]	324	#endif
[1111]	325
[4366]	326	!
	327	!-- If the PE grid is one-dimensional along y, the array has only to be
	328	!-- reordered locally and therefore no transposition has to be done.
[1]	329	IF ( pdims(1) /= 1 ) THEN
[1106]	330
	331	#if defined( __parallel )
[1]	332	!
[4366]	333	!-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in
[4370]	334	!-- different array format (f_vec_x).
[4366]	335	IF ( temperton_fft_vec ) THEN
	336
	337	DO l = 0, pdims(1) - 1
	338	xs = 0 + l * nnx
	339	DO k = nzb_x, nzt_x
	340	DO i = xs, xs + nnx - 1
	341	DO j = nys_x, nyn_x
	342	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
[4370]	343	work(j,i-xs+1,k,l) = f_vec_x(mm,i)
[4366]	344	ENDDO
	345	ENDDO
	346	ENDDO
	347	ENDDO
	348
	349	ELSE
	350
	351	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
	352	DO l = 0, pdims(1) - 1
	353	xs = 0 + l * nnx
[3690]	354	#if __acc_fft_device
[4366]	355	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	356	!$ACC PRESENT(work, f_in)
[3690]	357	#endif
[4366]	358	!$OMP DO
	359	DO k = nzb_x, nzt_x
	360	DO i = xs, xs + nnx - 1
	361	DO j = nys_x, nyn_x
	362	work(j,i-xs+1,k,l) = f_in(i,j,k)
	363	ENDDO
[1]	364	ENDDO
	365	ENDDO
[4366]	366	!$OMP END DO NOWAIT
[1]	367	ENDDO
[4366]	368	!$OMP END PARALLEL
[1]	369
[4366]	370	ENDIF
	371
[1]	372	!
	373	!-- Transpose array
[1318]	374	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	375
	376	#if __acc_fft_device
[3657]	377	#ifndef __cuda_aware_mpi
[3634]	378	!$ACC UPDATE HOST(work)
[3657]	379	#else
	380	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	381	#endif
[3690]	382	#endif
	383
[622]	384	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	385	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	386	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	387	comm1dx, ierr )
[3690]	388
	389	#if __acc_fft_device
[3657]	390	#ifndef __cuda_aware_mpi
[3634]	391	!$ACC UPDATE DEVICE(f_inv)
[3657]	392	#else
	393	!$ACC END HOST_DATA
	394	#endif
[3694]	395	#endif
	396
[1]	397	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	398	#endif
	399
[1]	400	ELSE
[1106]	401
[1]	402	!
	403	!-- Reorder the array in a way that the z index is in first position
[683]	404	!$OMP PARALLEL PRIVATE ( i, j, k )
	405	!$OMP DO
[3690]	406	#if __acc_fft_device
[3634]	407	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	408	!$ACC PRESENT(f_inv, f_in)
[3690]	409	#endif
[1003]	410	DO i = nxl, nxr
	411	DO j = nys, nyn
	412	DO k = 1, nz
[164]	413	f_inv(j,i,k) = f_in(i,j,k)
[1]	414	ENDDO
	415	ENDDO
	416	ENDDO
[683]	417	!$OMP END PARALLEL
[1]	418
[164]	419	ENDIF
	420
[1]	421	END SUBROUTINE transpose_xz
	422
	423
	424	!------------------------------------------------------------------------------!
	425	! Description:
	426	! ------------
[1682]	427	!> Resorting data after the transposition from y to x. The transposition itself
	428	!> is carried out in transpose_yx
[1216]	429	!------------------------------------------------------------------------------!
[1682]	430	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	431
[1682]	432
[1320]	433	USE indices, &
	434	ONLY: nx
[1216]	435
[1320]	436	USE kinds
	437
	438	USE transpose_indices, &
	439	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	440
[1216]	441	IMPLICIT NONE
	442
[4171]	443	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	444	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	445
	446
[4171]	447	INTEGER(iwp) :: i !<
	448	INTEGER(iwp) :: j !<
	449	INTEGER(iwp) :: k !<
[1216]	450	!
	451	!-- Rearrange indices of input array in order to make data to be send
	452	!-- by MPI contiguous
	453	!$OMP PARALLEL PRIVATE ( i, j, k )
	454	!$OMP DO
[3690]	455	#if __acc_fft_device
[3634]	456	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	457	!$ACC PRESENT(f_out, f_inv)
[3690]	458	#endif
[4171]	459	DO k = nzb_x, nzt_x
	460	DO j = nys_x, nyn_x
	461	DO i = 0, nx
[1216]	462	f_out(i,j,k) = f_inv(j,k,i)
	463	ENDDO
	464	ENDDO
	465	ENDDO
	466	!$OMP END PARALLEL
	467
	468	END SUBROUTINE resort_for_yx
	469
	470
	471	!------------------------------------------------------------------------------!
	472	! Description:
	473	! ------------
[1682]	474	!> Transposition of input array (f_in) from y to x. For the input array, all
	475	!> elements along y reside on the same PE, while after transposition, all
	476	!> elements along x reside on the same PE.
[1]	477	!------------------------------------------------------------------------------!
[1682]	478	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	479
[1682]	480
[4429]	481	#if defined( __parallel )
[1320]	482	USE cpulog, &
	483	ONLY: cpu_log, cpu_log_nowait, log_point_s
[4429]	484	#endif
[1]	485
[1320]	486	USE indices, &
	487	ONLY: nx, ny
	488
	489	USE kinds
	490
[1324]	491	USE pegrid
[1320]	492
	493	USE transpose_indices, &
	494	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	495
[1]	496	IMPLICIT NONE
	497
[4171]	498	INTEGER(iwp) :: i !<
	499	INTEGER(iwp) :: j !<
	500	INTEGER(iwp) :: k !<
[4429]	501	#if defined( __parallel )
[4171]	502	INTEGER(iwp) :: l !<
	503	INTEGER(iwp) :: ys !<
[4429]	504	#endif
[1]	505
[4171]	506	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	507	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	508
[4429]	509	#if defined( __parallel )
[4171]	510	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	511	#if __acc_fft_device
[3634]	512	!$ACC DECLARE CREATE(work)
[3690]	513	#endif
[4429]	514	#endif
[1111]	515
[1320]	516
[1106]	517	IF ( numprocs /= 1 ) THEN
	518
[1]	519	#if defined( __parallel )
	520	!
[1106]	521	!-- Reorder input array for transposition
[1111]	522	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	523	DO l = 0, pdims(2) - 1
	524	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	525	#if __acc_fft_device
[3634]	526	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	527	!$ACC PRESENT(work, f_in)
[3690]	528	#endif
[4236]	529	!$OMP DO
[1106]	530	DO i = nxl_y, nxr_y
	531	DO k = nzb_y, nzt_y
	532	DO j = ys, ys + nyn_x - nys_x
[1111]	533	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	534	ENDDO
	535	ENDDO
	536	ENDDO
[4236]	537	!$OMP END DO NOWAIT
[1106]	538	ENDDO
	539	!$OMP END PARALLEL
	540
	541	!
	542	!-- Transpose array
[1318]	543	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	544
	545	#if __acc_fft_device
[3657]	546	#ifndef __cuda_aware_mpi
[3634]	547	!$ACC UPDATE HOST(work)
[3657]	548	#else
	549	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	550	#endif
[3690]	551	#endif
	552
[1106]	553	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	554	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	555	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	556	comm1dy, ierr )
[3690]	557
	558	#if __acc_fft_device
[3657]	559	#ifndef __cuda_aware_mpi
[3634]	560	!$ACC UPDATE DEVICE(f_inv)
[3657]	561	#else
	562	!$ACC END HOST_DATA
	563	#endif
[3690]	564	#endif
	565
[1106]	566	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	567	#endif
	568
	569	ELSE
	570
	571	!
	572	!-- Reorder array f_in the same way as ALLTOALL did it
	573	!$OMP PARALLEL PRIVATE ( i, j, k )
	574	!$OMP DO
[3690]	575	#if __acc_fft_device
[3634]	576	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	577	!$ACC PRESENT(f_inv, f_in)
[3690]	578	#endif
[1003]	579	DO i = nxl_y, nxr_y
	580	DO k = nzb_y, nzt_y
[1106]	581	DO j = 0, ny
	582	f_inv(j,k,i) = f_in(j,i,k)
[1]	583	ENDDO
	584	ENDDO
	585	ENDDO
[683]	586	!$OMP END PARALLEL
[1]	587
[1106]	588	ENDIF
[1]	589
	590	END SUBROUTINE transpose_yx
	591
	592
	593	!------------------------------------------------------------------------------!
	594	! Description:
	595	! ------------
[1682]	596	!> Transposition of input array (f_in) from y to x. For the input array, all
	597	!> elements along y reside on the same PE, while after transposition, all
	598	!> elements along x reside on the same PE.
	599	!> This is a direct transposition for arrays with indices in regular order
	600	!> (k,j,i) (cf. transpose_yx).
[1]	601	!------------------------------------------------------------------------------!
[4429]	602	#if defined( __parallel )
[1682]	603	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	604
[1682]	605
[1320]	606	USE cpulog, &
[3241]	607	ONLY: cpu_log, log_point_s
[1]	608
[1320]	609	USE indices, &
	610	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	611
	612	USE kinds
	613
[1324]	614	USE pegrid
[1320]	615
	616	USE transpose_indices, &
	617	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	618
[1]	619	IMPLICIT NONE
	620
[4171]	621	INTEGER(iwp) :: i !<
	622	INTEGER(iwp) :: j !<
	623	INTEGER(iwp) :: k !<
	624	INTEGER(iwp) :: l !<
	625	INTEGER(iwp) :: m !<
	626	INTEGER(iwp) :: xs !<
[1]	627
[4171]	628	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	629	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	630	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	631	REAL(wp) :: work(nnxnnynnz) !<
[1]	632
	633	!
	634	!-- Rearrange indices of input array in order to make data to be send
	635	!-- by MPI contiguous
[1003]	636	DO k = 1, nz
	637	DO j = nys, nyn
	638	DO i = nxl, nxr
[164]	639	f_inv(i,k,j) = f_in(k,j,i)
[1]	640	ENDDO
	641	ENDDO
	642	ENDDO
	643
	644	!
	645	!-- Transpose array
	646	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	647	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	648	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	649	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	650	comm1dx, ierr )
	651	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	652
	653	!
	654	!-- Reorder transposed array
	655	m = 0
	656	DO l = 0, pdims(1) - 1
	657	xs = 0 + l * nnx
[1003]	658	DO j = nys_x, nyn_x
	659	DO k = 1, nz
[1]	660	DO i = xs, xs + nnx - 1
	661	m = m + 1
[164]	662	f_out(i,j,k) = work(m)
[1]	663	ENDDO
	664	ENDDO
	665	ENDDO
	666	ENDDO
	667
[4429]	668	END SUBROUTINE transpose_yxd
[1]	669	#endif
	670
	671
	672	!------------------------------------------------------------------------------!
	673	! Description:
	674	! ------------
[1682]	675	!> Resorting data for the transposition from y to z. The transposition itself
	676	!> is carried out in transpose_yz
[1216]	677	!------------------------------------------------------------------------------!
[1682]	678	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	679
[1682]	680
[1320]	681	USE indices, &
	682	ONLY: ny
[1216]	683
[1320]	684	USE kinds
	685
	686	USE transpose_indices, &
	687	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	688
[1216]	689	IMPLICIT NONE
	690
[4171]	691	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	692	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	693
[4171]	694	INTEGER(iwp) :: i !<
	695	INTEGER(iwp) :: j !<
	696	INTEGER(iwp) :: k !<
[1216]	697
	698	!
	699	!-- Rearrange indices of input array in order to make data to be send
	700	!-- by MPI contiguous
	701	!$OMP PARALLEL PRIVATE ( i, j, k )
	702	!$OMP DO
[3690]	703	#if __acc_fft_device
[3634]	704	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	705	!$ACC PRESENT(f_inv, f_in)
[3690]	706	#endif
[4171]	707	DO k = nzb_y, nzt_y
	708	DO i = nxl_y, nxr_y
	709	DO j = 0, ny
[1216]	710	f_inv(i,k,j) = f_in(j,i,k)
	711	ENDDO
	712	ENDDO
	713	ENDDO
	714	!$OMP END PARALLEL
	715
	716	END SUBROUTINE resort_for_yz
	717
	718
	719	!------------------------------------------------------------------------------!
	720	! Description:
	721	! ------------
[1682]	722	!> Transposition of input array (f_in) from y to z. For the input array, all
	723	!> elements along y reside on the same PE, while after transposition, all
	724	!> elements along z reside on the same PE.
[1]	725	!------------------------------------------------------------------------------!
[1682]	726	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	727
[1682]	728
[4429]	729	#if defined( __parallel )
[1320]	730	USE cpulog, &
	731	ONLY: cpu_log, cpu_log_nowait, log_point_s
[4429]	732	#endif
[1]	733
[1320]	734	USE indices, &
	735	ONLY: ny, nz
	736
	737	USE kinds
	738
[1324]	739	USE pegrid
[1320]	740
	741	USE transpose_indices, &
	742	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	743
[1]	744	IMPLICIT NONE
	745
[4171]	746	INTEGER(iwp) :: i !<
	747	INTEGER(iwp) :: j !<
	748	INTEGER(iwp) :: k !<
[4429]	749	#if defined( __parallel )
[4171]	750	INTEGER(iwp) :: l !<
	751	INTEGER(iwp) :: zs !<
[4429]	752	#endif
[1]	753
[4171]	754	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	755	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	756
[4429]	757	#if defined( __parallel )
[4171]	758	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	759	#if __acc_fft_device
[3634]	760	!$ACC DECLARE CREATE(work)
[3690]	761	#endif
[4429]	762	#endif
[1111]	763
[1320]	764
[1]	765	!
	766	!-- If the PE grid is one-dimensional along y, only local reordering
	767	!-- of the data is necessary and no transposition has to be done.
	768	IF ( pdims(1) == 1 ) THEN
[1106]	769
[683]	770	!$OMP PARALLEL PRIVATE ( i, j, k )
	771	!$OMP DO
[3690]	772	#if __acc_fft_device
[3634]	773	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	774	!$ACC PRESENT(f_out, f_inv)
[3690]	775	#endif
[1003]	776	DO j = 0, ny
	777	DO k = nzb_y, nzt_y
	778	DO i = nxl_y, nxr_y
[164]	779	f_out(i,j,k) = f_inv(i,k,j)
[1]	780	ENDDO
	781	ENDDO
	782	ENDDO
[683]	783	!$OMP END PARALLEL
[1]	784
[1106]	785	ELSE
	786
	787	#if defined( __parallel )
[1]	788	!
[1106]	789	!-- Transpose array
[1318]	790	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	791
	792	#if __acc_fft_device
[3657]	793	#ifndef __cuda_aware_mpi
[3634]	794	!$ACC UPDATE HOST(f_inv)
[3657]	795	#else
	796	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	797	#endif
[3690]	798	#endif
	799
[1106]	800	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	801	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	802	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	803	comm1dx, ierr )
[3690]	804
	805	#if __acc_fft_device
[3657]	806	#ifndef __cuda_aware_mpi
[3634]	807	!$ACC UPDATE DEVICE(work)
[3657]	808	#else
	809	!$ACC END HOST_DATA
	810	#endif
[3690]	811	#endif
	812
[1106]	813	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	814
	815	!
[1106]	816	!-- Reorder transposed array
[1111]	817	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1106]	818	DO l = 0, pdims(1) - 1
	819	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	820	#if __acc_fft_device
[3634]	821	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	822	!$ACC PRESENT(f_out, work)
[3690]	823	#endif
[4236]	824	!$OMP DO
[1106]	825	DO j = nys_z, nyn_z
	826	DO k = zs, zs + nzt_y - nzb_y
	827	DO i = nxl_z, nxr_z
[1111]	828	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	829	ENDDO
[1]	830	ENDDO
	831	ENDDO
[4236]	832	!$OMP END DO NOWAIT
[1]	833	ENDDO
[683]	834	!$OMP END PARALLEL
[1]	835	#endif
	836
[1106]	837	ENDIF
	838
[1]	839	END SUBROUTINE transpose_yz
	840
	841
	842	!------------------------------------------------------------------------------!
	843	! Description:
	844	! ------------
[1682]	845	!> Resorting data for the transposition from z to x. The transposition itself
	846	!> is carried out in transpose_zx
[1216]	847	!------------------------------------------------------------------------------!
[1682]	848	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	849
[1682]	850
[1320]	851	USE indices, &
	852	ONLY: nxl, nxr, nyn, nys, nz
[1216]	853
[1320]	854	USE kinds
	855
[1216]	856	IMPLICIT NONE
	857
[4171]	858	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	859	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	860
[4171]	861	INTEGER(iwp) :: i !<
	862	INTEGER(iwp) :: j !<
	863	INTEGER(iwp) :: k !<
[1216]	864
	865	!
	866	!-- Rearrange indices of input array in order to make data to be send
	867	!-- by MPI contiguous
	868	!$OMP PARALLEL PRIVATE ( i, j, k )
	869	!$OMP DO
[3690]	870	#if __acc_fft_device
[3634]	871	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	872	!$ACC PRESENT(f_in, f_inv)
[3690]	873	#endif
[3832]	874	DO i = nxl, nxr
[4171]	875	DO j = nys, nyn
	876	DO k = 1,nz
[1216]	877	f_inv(j,i,k) = f_in(k,j,i)
	878	ENDDO
	879	ENDDO
	880	ENDDO
	881	!$OMP END PARALLEL
	882
	883	END SUBROUTINE resort_for_zx
	884
	885
	886	!------------------------------------------------------------------------------!
	887	! Description:
	888	! ------------
[1682]	889	!> Transposition of input array (f_in) from z to x. For the input array, all
	890	!> elements along z reside on the same PE, while after transposition, all
	891	!> elements along x reside on the same PE.
[1]	892	!------------------------------------------------------------------------------!
[1682]	893	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	894
[1682]	895
[4429]	896	#if defined( __parallel )
[1320]	897	USE cpulog, &
	898	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	899
[4366]	900	USE fft_xy, &
[4370]	901	ONLY: f_vec_x, temperton_fft_vec
[4429]	902	#endif
[4366]	903
[1320]	904	USE indices, &
[4429]	905	ONLY: nx, nxl, nxr, nyn, nys, nz
	906	#if defined( __parallel )
	907	USE indices, &
	908	ONLY: nnx
	909	#endif
[1320]	910
	911	USE kinds
	912
[1324]	913	USE pegrid
[1320]	914
	915	USE transpose_indices, &
	916	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	917
[1]	918	IMPLICIT NONE
	919
[4171]	920	INTEGER(iwp) :: i !<
	921	INTEGER(iwp) :: j !<
	922	INTEGER(iwp) :: k !<
[4429]	923	#if defined( __parallel )
[4171]	924	INTEGER(iwp) :: l !<
[4366]	925	INTEGER(iwp) :: mm !<
[4171]	926	INTEGER(iwp) :: xs !<
[4429]	927	#endif
[1]	928
[4171]	929	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	930	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	931
[4429]	932	#if defined( __parallel )
[4171]	933	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	934	#if __acc_fft_device
[3634]	935	!$ACC DECLARE CREATE(work)
[3690]	936	#endif
[4429]	937	#endif
[1]	938
[1320]	939
[1]	940	!
	941	!-- If the PE grid is one-dimensional along y, only local reordering
	942	!-- of the data is necessary and no transposition has to be done.
	943	IF ( pdims(1) == 1 ) THEN
[1106]	944
[683]	945	!$OMP PARALLEL PRIVATE ( i, j, k )
	946	!$OMP DO
[3690]	947	#if __acc_fft_device
[3634]	948	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	949	!$ACC PRESENT(f_out, f_inv)
[3690]	950	#endif
[1003]	951	DO k = 1, nz
	952	DO i = nxl, nxr
	953	DO j = nys, nyn
[164]	954	f_out(i,j,k) = f_inv(j,i,k)
[1]	955	ENDDO
	956	ENDDO
	957	ENDDO
[683]	958	!$OMP END PARALLEL
[1]	959
[1106]	960	ELSE
	961
	962	#if defined( __parallel )
[1]	963	!
[1106]	964	!-- Transpose array
[1318]	965	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	966
	967	#if __acc_fft_device
[3657]	968	#ifndef __cuda_aware_mpi
[3634]	969	!$ACC UPDATE HOST(f_inv)
[3657]	970	#else
	971	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	972	#endif
[3690]	973	#endif
	974
[1106]	975	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	976	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	977	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	978	comm1dx, ierr )
[3690]	979
	980	#if __acc_fft_device
[3657]	981	#ifndef __cuda_aware_mpi
[3634]	982	!$ACC UPDATE DEVICE(work)
[3657]	983	#else
	984	!$ACC END HOST_DATA
	985	#endif
[3690]	986	#endif
	987
[1106]	988	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	989
	990	!
[4366]	991	!-- Reorder transposed array.
[4370]	992	!-- Data for the vectorized Temperton-fft is stored in different array format (f_vec_x) which
	993	!-- saves additional data copy in fft_x.
[4366]	994	IF ( temperton_fft_vec ) THEN
	995
	996	DO l = 0, pdims(1) - 1
	997	xs = 0 + l * nnx
	998	DO k = nzb_x, nzt_x
	999	DO i = xs, xs + nnx - 1
	1000	DO j = nys_x, nyn_x
	1001	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
[4370]	1002	f_vec_x(mm,i) = work(j,i-xs+1,k,l)
[4366]	1003	ENDDO
	1004	ENDDO
	1005	ENDDO
	1006	ENDDO
	1007
	1008	ELSE
	1009
	1010	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
	1011	DO l = 0, pdims(1) - 1
	1012	xs = 0 + l * nnx
[3690]	1013	#if __acc_fft_device
[4366]	1014	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1015	!$ACC PRESENT(f_out, work)
[3690]	1016	#endif
[4366]	1017	!$OMP DO
	1018	DO k = nzb_x, nzt_x
	1019	DO i = xs, xs + nnx - 1
	1020	DO j = nys_x, nyn_x
	1021	f_out(i,j,k) = work(j,i-xs+1,k,l)
	1022	ENDDO
[1106]	1023	ENDDO
[1]	1024	ENDDO
[4366]	1025	!$OMP END DO NOWAIT
[1]	1026	ENDDO
[4366]	1027	!$OMP END PARALLEL
[1]	1028
[4366]	1029	ENDIF
	1030
[4415]	1031	#endif
	1032
[1106]	1033	ENDIF
	1034
[1]	1035	END SUBROUTINE transpose_zx
	1036
	1037
	1038	!------------------------------------------------------------------------------!
	1039	! Description:
	1040	! ------------
[1682]	1041	!> Resorting data after the transposition from z to y. The transposition itself
	1042	!> is carried out in transpose_zy
[1216]	1043	!------------------------------------------------------------------------------!
[1682]	1044	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	1045
[1682]	1046
[1320]	1047	USE indices, &
	1048	ONLY: ny
[1216]	1049
[1320]	1050	USE kinds
	1051
	1052	USE transpose_indices, &
	1053	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	1054
[1216]	1055	IMPLICIT NONE
	1056
[4171]	1057	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	1058	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	1059
	1060
[4171]	1061	INTEGER(iwp) :: i !<
	1062	INTEGER(iwp) :: j !<
	1063	INTEGER(iwp) :: k !<
[1216]	1064
	1065	!
	1066	!-- Rearrange indices of input array in order to make data to be send
	1067	!-- by MPI contiguous
	1068	!$OMP PARALLEL PRIVATE ( i, j, k )
	1069	!$OMP DO
[3690]	1070	#if __acc_fft_device
[3634]	1071	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1072	!$ACC PRESENT(f_out, f_inv)
[3690]	1073	#endif
[4171]	1074	DO k = nzb_y, nzt_y
	1075	DO i = nxl_y, nxr_y
	1076	DO j = 0, ny
[1216]	1077	f_out(j,i,k) = f_inv(i,k,j)
	1078	ENDDO
	1079	ENDDO
	1080	ENDDO
	1081	!$OMP END PARALLEL
	1082
	1083	END SUBROUTINE resort_for_zy
	1084
	1085
	1086	!------------------------------------------------------------------------------!
[3241]	1087	! Description:cpu_log_nowait
[1216]	1088	! ------------
[1682]	1089	!> Transposition of input array (f_in) from z to y. For the input array, all
	1090	!> elements along z reside on the same PE, while after transposition, all
	1091	!> elements along y reside on the same PE.
[1]	1092	!------------------------------------------------------------------------------!
[1682]	1093	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	1094
[1682]	1095
[4429]	1096	#if defined( __parallel )
[1320]	1097	USE cpulog, &
	1098	ONLY: cpu_log, cpu_log_nowait, log_point_s
[4429]	1099	#endif
[1]	1100
[1320]	1101	USE indices, &
	1102	ONLY: ny, nz
	1103
	1104	USE kinds
	1105
[1324]	1106	USE pegrid
[1320]	1107
	1108	USE transpose_indices, &
	1109	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	1110
[1]	1111	IMPLICIT NONE
	1112
[4171]	1113	INTEGER(iwp) :: i !<
	1114	INTEGER(iwp) :: j !<
	1115	INTEGER(iwp) :: k !<
[4429]	1116	#if defined( __parallel )
[4171]	1117	INTEGER(iwp) :: l !<
	1118	INTEGER(iwp) :: zs !<
[4429]	1119	#endif
[1]	1120
[4171]	1121	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	1122	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	1123
[4429]	1124	#if defined( __parallel )
[1682]	1125	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	1126	#if __acc_fft_device
[3634]	1127	!$ACC DECLARE CREATE(work)
[3690]	1128	#endif
[4429]	1129	#endif
[1111]	1130
[1]	1131	!
	1132	!-- If the PE grid is one-dimensional along y, the array has only to be
	1133	!-- reordered locally and therefore no transposition has to be done.
	1134	IF ( pdims(1) /= 1 ) THEN
[1106]	1135
	1136	#if defined( __parallel )
[1]	1137	!
	1138	!-- Reorder input array for transposition
[1111]	1139	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1]	1140	DO l = 0, pdims(1) - 1
[1003]	1141	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	1142	#if __acc_fft_device
[3634]	1143	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1144	!$ACC PRESENT(work, f_in)
[3690]	1145	#endif
[4236]	1146	!$OMP DO
[1003]	1147	DO j = nys_z, nyn_z
	1148	DO k = zs, zs + nzt_y - nzb_y
	1149	DO i = nxl_z, nxr_z
[1111]	1150	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	1151	ENDDO
	1152	ENDDO
	1153	ENDDO
[4236]	1154	!$OMP END DO NOWAIT
[1]	1155	ENDDO
[683]	1156	!$OMP END PARALLEL
[1]	1157
	1158	!
	1159	!-- Transpose array
[1318]	1160	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1161
	1162	#if __acc_fft_device
[3657]	1163	#ifndef __cuda_aware_mpi
[3634]	1164	!$ACC UPDATE HOST(work)
[3657]	1165	#else
	1166	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1167	#endif
[3690]	1168	#endif
	1169
[622]	1170	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	1171	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	1172	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	1173	comm1dx, ierr )
[3690]	1174
	1175	#if __acc_fft_device
[3657]	1176	#ifndef __cuda_aware_mpi
[3634]	1177	!$ACC UPDATE DEVICE(f_inv)
[3657]	1178	#else
	1179	!$ACC END HOST_DATA
	1180	#endif
[3690]	1181	#endif
	1182
[1]	1183	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	1184	#endif
[1]	1185
	1186	ELSE
	1187	!
[1106]	1188	!-- Reorder the array in the same way like ALLTOALL did it
[683]	1189	!$OMP PARALLEL PRIVATE ( i, j, k )
	1190	!$OMP DO
[3690]	1191	#if __acc_fft_device
[3634]	1192	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1193	!$ACC PRESENT(f_inv, f_in)
[3690]	1194	#endif
[1003]	1195	DO k = nzb_y, nzt_y
	1196	DO j = 0, ny
	1197	DO i = nxl_y, nxr_y
[164]	1198	f_inv(i,k,j) = f_in(i,j,k)
	1199	ENDDO
	1200	ENDDO
	1201	ENDDO
[683]	1202	!$OMP END PARALLEL
[1106]	1203
	1204	ENDIF
	1205
[1]	1206	END SUBROUTINE transpose_zy
	1207
	1208
	1209	!------------------------------------------------------------------------------!
	1210	! Description:
	1211	! ------------
[1682]	1212	!> Transposition of input array (f_in) from z to y. For the input array, all
	1213	!> elements along z reside on the same PE, while after transposition, all
	1214	!> elements along y reside on the same PE.
	1215	!> This is a direct transposition for arrays with indices in regular order
	1216	!> (k,j,i) (cf. transpose_zy).
[1]	1217	!------------------------------------------------------------------------------!
[4429]	1218	#if defined( __parallel )
[1682]	1219	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1220
[1682]	1221
[1320]	1222	USE cpulog, &
[3241]	1223	ONLY: cpu_log, log_point_s
[1]	1224
[1320]	1225	USE indices, &
	1226	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1227
	1228	USE kinds
	1229
[1324]	1230	USE pegrid
[1320]	1231
	1232	USE transpose_indices, &
[3241]	1233	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
[1320]	1234
[1]	1235	IMPLICIT NONE
	1236
[4171]	1237	INTEGER(iwp) :: i !<
	1238	INTEGER(iwp) :: j !<
	1239	INTEGER(iwp) :: k !<
	1240	INTEGER(iwp) :: l !<
	1241	INTEGER(iwp) :: m !<
	1242	INTEGER(iwp) :: ys !<
[1]	1243
[4171]	1244	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1245	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1246	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1247	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1248
[1]	1249	!
	1250	!-- Rearrange indices of input array in order to make data to be send
	1251	!-- by MPI contiguous
[1003]	1252	DO i = nxl, nxr
	1253	DO j = nys, nyn
	1254	DO k = 1, nz
[164]	1255	f_inv(j,i,k) = f_in(k,j,i)
[1]	1256	ENDDO
	1257	ENDDO
	1258	ENDDO
	1259
	1260	!
	1261	!-- Move data to different array, because memory location of work1 is
	1262	!-- needed further below (work1 = work2).
	1263	!-- If the PE grid is one-dimensional along x, only local reordering
	1264	!-- of the data is necessary and no transposition has to be done.
	1265	IF ( pdims(2) == 1 ) THEN
[1003]	1266	DO k = 1, nz
	1267	DO i = nxl, nxr
	1268	DO j = nys, nyn
[164]	1269	f_out(j,i,k) = f_inv(j,i,k)
[1]	1270	ENDDO
	1271	ENDDO
	1272	ENDDO
	1273	RETURN
	1274	ENDIF
	1275
	1276	!
	1277	!-- Transpose array
	1278	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1279	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1280	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1281	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1282	comm1dy, ierr )
	1283	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1284
	1285	!
	1286	!-- Reorder transposed array
	1287	m = 0
	1288	DO l = 0, pdims(2) - 1
	1289	ys = 0 + l * nny
[1003]	1290	DO k = nzb_yd, nzt_yd
	1291	DO i = nxl_yd, nxr_yd
[1]	1292	DO j = ys, ys + nny - 1
	1293	m = m + 1
[164]	1294	f_out(j,i,k) = work(m)
[1]	1295	ENDDO
	1296	ENDDO
	1297	ENDDO
	1298	ENDDO
	1299
[4429]	1300	END SUBROUTINE transpose_zyd
[1]	1301	#endif

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |