Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4366

Last change on this file since 4366 was 4366, checked in by raasch, 5 years ago
code vectorization for NEC Aurora: vectorized version of Temperton FFT, vectorization of Newtor iteration for calculating the Obukhov length
Property svn:keywords set to `Id`
File size: 34.0 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[4360]	17	! Copyright 1997-2020 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 4366 2020-01-09 08:12:43Z raasch $
[4366]	27	! modifications for NEC vectorization
	28	!
	29	! 4360 2020-01-07 11:25:50Z suehring
[4236]	30	! Added missing OpenMP directives
	31	!
	32	! 4182 2019-08-22 15:20:23Z scharf
[4182]	33	! Corrected "Former revisions" section
[4181]	34	!
[4182]	35	! 4171 2019-08-19 17:44:09Z gronemeier
[3832]	36	! loop reordering for performance optimization
[4171]	37	!
	38	! 3832 2019-03-28 13:16:58Z raasch
	39	! loop reordering for performance optimization
	40	!
[3832]	41	! 3694 2019-01-23 17:01:49Z knoop
[3634]	42	! OpenACC port for SPEC
[4171]	43	!
[4182]	44	! Revision 1.1 1997/07/24 11:25:18 raasch
	45	! Initial revision
	46	!
	47	!
[1216]	48	! Description:
	49	! ------------
[1682]	50	!> Resorting data for the transposition from x to y. The transposition itself
	51	!> is carried out in transpose_xy
[1216]	52	!------------------------------------------------------------------------------!
[4181]	53
	54	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	55
[1682]	56	SUBROUTINE resort_for_xy( f_in, f_inv )
[1216]	57
[4171]	58
[1320]	59	USE indices, &
	60	ONLY: nx
[1216]	61
[1320]	62	USE kinds
	63
	64	USE transpose_indices, &
[3241]	65	ONLY: nyn_x, nys_x, nzb_x, nzt_x
[1320]	66
[1216]	67	IMPLICIT NONE
	68
[4171]	69	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	70	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	71
	72
[4171]	73	INTEGER(iwp) :: i !<
	74	INTEGER(iwp) :: j !<
	75	INTEGER(iwp) :: k !<
[1]	76	!
[1216]	77	!-- Rearrange indices of input array in order to make data to be send
	78	!-- by MPI contiguous
	79	!$OMP PARALLEL PRIVATE ( i, j, k )
	80	!$OMP DO
[3690]	81	#if __acc_fft_device
[3634]	82	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	83	!$ACC PRESENT(f_inv, f_in)
[3690]	84	#endif
[3832]	85	DO k = nzb_x, nzt_x
[4171]	86	DO j = nys_x, nyn_x
	87	DO i = 0, nx
[1216]	88	f_inv(j,k,i) = f_in(i,j,k)
	89	ENDDO
	90	ENDDO
	91	ENDDO
	92	!$OMP END PARALLEL
	93
	94	END SUBROUTINE resort_for_xy
	95
	96
	97	!------------------------------------------------------------------------------!
[1]	98	! Description:
	99	! ------------
[1682]	100	!> Transposition of input array (f_in) from x to y. For the input array, all
	101	!> elements along x reside on the same PE, while after transposition, all
	102	!> elements along y reside on the same PE.
[1]	103	!------------------------------------------------------------------------------!
[1682]	104	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	105
[1682]	106
[1320]	107	USE cpulog, &
	108	ONLY: cpu_log, cpu_log_nowait, log_point_s
	109
	110	USE indices, &
	111	ONLY: nx, ny
[4171]	112
[1320]	113	USE kinds
	114
[1]	115	USE pegrid
	116
[1320]	117	USE transpose_indices, &
	118	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	119
[1]	120	IMPLICIT NONE
	121
[4171]	122	INTEGER(iwp) :: i !<
	123	INTEGER(iwp) :: j !<
	124	INTEGER(iwp) :: k !<
	125	INTEGER(iwp) :: l !<
	126	INTEGER(iwp) :: ys !<
[1]	127
[4171]	128	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	129	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	130
	131	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	132	#if __acc_fft_device
[3634]	133	!$ACC DECLARE CREATE(work)
[3690]	134	#endif
[1111]	135
	136
[1106]	137	IF ( numprocs /= 1 ) THEN
	138
	139	#if defined( __parallel )
[1]	140	!
[1106]	141	!-- Transpose array
[1318]	142	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	143
	144	#if __acc_fft_device
[3657]	145	#ifndef __cuda_aware_mpi
[3634]	146	!$ACC UPDATE HOST(f_inv)
[3657]	147	#else
	148	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	149	#endif
[3690]	150	#endif
	151
[1106]	152	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	153	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	154	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	155	comm1dy, ierr )
[3690]	156
	157	#if __acc_fft_device
[3657]	158	#ifndef __cuda_aware_mpi
[3634]	159	!$ACC UPDATE DEVICE(work)
[3657]	160	#else
	161	!$ACC END HOST_DATA
	162	#endif
[3690]	163	#endif
	164
[1106]	165	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	166
	167	!
[1106]	168	!-- Reorder transposed array
[1111]	169	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	170	DO l = 0, pdims(2) - 1
	171	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	172	#if __acc_fft_device
[3634]	173	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	174	!$ACC PRESENT(f_out, work)
[3690]	175	#endif
[4236]	176	!$OMP DO
[1106]	177	DO i = nxl_y, nxr_y
	178	DO k = nzb_y, nzt_y
	179	DO j = ys, ys + nyn_x - nys_x
[1111]	180	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	181	ENDDO
[1]	182	ENDDO
	183	ENDDO
[4236]	184	!$OMP END DO NOWAIT
[1]	185	ENDDO
[683]	186	!$OMP END PARALLEL
[1]	187	#endif
	188
[1106]	189	ELSE
	190
	191	!
	192	!-- Reorder transposed array
	193	!$OMP PARALLEL PRIVATE ( i, j, k )
	194	!$OMP DO
[3690]	195	#if __acc_fft_device
[3634]	196	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	197	!$ACC PRESENT(f_out, f_inv)
[3690]	198	#endif
[1106]	199	DO k = nzb_y, nzt_y
	200	DO i = nxl_y, nxr_y
	201	DO j = 0, ny
	202	f_out(j,i,k) = f_inv(j,k,i)
	203	ENDDO
	204	ENDDO
	205	ENDDO
	206	!$OMP END PARALLEL
	207
	208	ENDIF
	209
[1]	210	END SUBROUTINE transpose_xy
	211
	212
	213	!------------------------------------------------------------------------------!
	214	! Description:
	215	! ------------
[1682]	216	!> Resorting data after the transposition from x to z. The transposition itself
	217	!> is carried out in transpose_xz
[1216]	218	!------------------------------------------------------------------------------!
[1682]	219	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	220
[1682]	221
[1320]	222	USE indices, &
	223	ONLY: nxl, nxr, nyn, nys, nz
[1216]	224
[1320]	225	USE kinds
	226
[1216]	227	IMPLICIT NONE
	228
[4171]	229	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	230	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	231
[4171]	232	INTEGER(iwp) :: i !<
	233	INTEGER(iwp) :: j !<
	234	INTEGER(iwp) :: k !<
[1216]	235	!
	236	!-- Rearrange indices of input array in order to make data to be send
	237	!-- by MPI contiguous.
	238	!-- In case of parallel fft/transposition, scattered store is faster in
	239	!-- backward direction!!!
	240	!$OMP PARALLEL PRIVATE ( i, j, k )
	241	!$OMP DO
[3690]	242	#if __acc_fft_device
[3634]	243	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	244	!$ACC PRESENT(f_out, f_inv)
[3690]	245	#endif
[4171]	246	DO i = nxl, nxr
	247	DO j = nys, nyn
	248	DO k = 1, nz
[1216]	249	f_out(k,j,i) = f_inv(j,i,k)
	250	ENDDO
	251	ENDDO
	252	ENDDO
	253	!$OMP END PARALLEL
	254
	255	END SUBROUTINE resort_for_xz
	256
	257
	258	!------------------------------------------------------------------------------!
	259	! Description:
	260	! ------------
[1682]	261	!> Transposition of input array (f_in) from x to z. For the input array, all
	262	!> elements along x reside on the same PE, while after transposition, all
	263	!> elements along z reside on the same PE.
[1]	264	!------------------------------------------------------------------------------!
[1682]	265	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	266
[1682]	267
[1320]	268	USE cpulog, &
	269	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	270
[4366]	271	USE fft_xy, &
	272	ONLY: f_vec, temperton_fft_vec
	273
[1320]	274	USE indices, &
[3241]	275	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
[1320]	276
	277	USE kinds
	278
[1324]	279	USE pegrid
[1320]	280
	281	USE transpose_indices, &
	282	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	283
[1]	284	IMPLICIT NONE
	285
[4171]	286	INTEGER(iwp) :: i !<
	287	INTEGER(iwp) :: j !<
	288	INTEGER(iwp) :: k !<
	289	INTEGER(iwp) :: l !<
[4366]	290	INTEGER(iwp) :: mm !<
[4171]	291	INTEGER(iwp) :: xs !<
[1]	292
[4171]	293	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	294	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	295
[4171]	296	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	297	#if __acc_fft_device
[3634]	298	!$ACC DECLARE CREATE(work)
[3690]	299	#endif
[1111]	300
[4366]	301	!
	302	!-- If the PE grid is one-dimensional along y, the array has only to be
	303	!-- reordered locally and therefore no transposition has to be done.
[1]	304	IF ( pdims(1) /= 1 ) THEN
[1106]	305
	306	#if defined( __parallel )
[1]	307	!
[4366]	308	!-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in
	309	!-- different array format (f_vec).
	310	IF ( temperton_fft_vec ) THEN
	311
	312	DO l = 0, pdims(1) - 1
	313	xs = 0 + l * nnx
	314	DO k = nzb_x, nzt_x
	315	DO i = xs, xs + nnx - 1
	316	DO j = nys_x, nyn_x
	317	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
	318	work(j,i-xs+1,k,l) = f_vec(mm,i)
	319	ENDDO
	320	ENDDO
	321	ENDDO
	322	ENDDO
	323
	324	ELSE
	325
	326	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
	327	DO l = 0, pdims(1) - 1
	328	xs = 0 + l * nnx
[3690]	329	#if __acc_fft_device
[4366]	330	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	331	!$ACC PRESENT(work, f_in)
[3690]	332	#endif
[4366]	333	!$OMP DO
	334	DO k = nzb_x, nzt_x
	335	DO i = xs, xs + nnx - 1
	336	DO j = nys_x, nyn_x
	337	work(j,i-xs+1,k,l) = f_in(i,j,k)
	338	ENDDO
[1]	339	ENDDO
	340	ENDDO
[4366]	341	!$OMP END DO NOWAIT
[1]	342	ENDDO
[4366]	343	!$OMP END PARALLEL
[1]	344
[4366]	345	ENDIF
	346
[1]	347	!
	348	!-- Transpose array
[1318]	349	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	350
	351	#if __acc_fft_device
[3657]	352	#ifndef __cuda_aware_mpi
[3634]	353	!$ACC UPDATE HOST(work)
[3657]	354	#else
	355	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	356	#endif
[3690]	357	#endif
	358
[622]	359	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	360	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	361	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	362	comm1dx, ierr )
[3690]	363
	364	#if __acc_fft_device
[3657]	365	#ifndef __cuda_aware_mpi
[3634]	366	!$ACC UPDATE DEVICE(f_inv)
[3657]	367	#else
	368	!$ACC END HOST_DATA
	369	#endif
[3694]	370	#endif
	371
[1]	372	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	373	#endif
	374
[1]	375	ELSE
[1106]	376
[1]	377	!
	378	!-- Reorder the array in a way that the z index is in first position
[683]	379	!$OMP PARALLEL PRIVATE ( i, j, k )
	380	!$OMP DO
[3690]	381	#if __acc_fft_device
[3634]	382	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	383	!$ACC PRESENT(f_inv, f_in)
[3690]	384	#endif
[1003]	385	DO i = nxl, nxr
	386	DO j = nys, nyn
	387	DO k = 1, nz
[164]	388	f_inv(j,i,k) = f_in(i,j,k)
[1]	389	ENDDO
	390	ENDDO
	391	ENDDO
[683]	392	!$OMP END PARALLEL
[1]	393
[164]	394	ENDIF
	395
[1]	396	END SUBROUTINE transpose_xz
	397
	398
	399	!------------------------------------------------------------------------------!
	400	! Description:
	401	! ------------
[1682]	402	!> Resorting data after the transposition from y to x. The transposition itself
	403	!> is carried out in transpose_yx
[1216]	404	!------------------------------------------------------------------------------!
[1682]	405	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	406
[1682]	407
[1320]	408	USE indices, &
	409	ONLY: nx
[1216]	410
[1320]	411	USE kinds
	412
	413	USE transpose_indices, &
	414	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	415
[1216]	416	IMPLICIT NONE
	417
[4171]	418	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	419	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	420
	421
[4171]	422	INTEGER(iwp) :: i !<
	423	INTEGER(iwp) :: j !<
	424	INTEGER(iwp) :: k !<
[1216]	425	!
	426	!-- Rearrange indices of input array in order to make data to be send
	427	!-- by MPI contiguous
	428	!$OMP PARALLEL PRIVATE ( i, j, k )
	429	!$OMP DO
[3690]	430	#if __acc_fft_device
[3634]	431	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	432	!$ACC PRESENT(f_out, f_inv)
[3690]	433	#endif
[4171]	434	DO k = nzb_x, nzt_x
	435	DO j = nys_x, nyn_x
	436	DO i = 0, nx
[1216]	437	f_out(i,j,k) = f_inv(j,k,i)
	438	ENDDO
	439	ENDDO
	440	ENDDO
	441	!$OMP END PARALLEL
	442
	443	END SUBROUTINE resort_for_yx
	444
	445
	446	!------------------------------------------------------------------------------!
	447	! Description:
	448	! ------------
[1682]	449	!> Transposition of input array (f_in) from y to x. For the input array, all
	450	!> elements along y reside on the same PE, while after transposition, all
	451	!> elements along x reside on the same PE.
[1]	452	!------------------------------------------------------------------------------!
[1682]	453	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	454
[1682]	455
[1320]	456	USE cpulog, &
	457	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	458
[1320]	459	USE indices, &
	460	ONLY: nx, ny
	461
	462	USE kinds
	463
[1324]	464	USE pegrid
[1320]	465
	466	USE transpose_indices, &
	467	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	468
[1]	469	IMPLICIT NONE
	470
[4171]	471	INTEGER(iwp) :: i !<
	472	INTEGER(iwp) :: j !<
	473	INTEGER(iwp) :: k !<
	474	INTEGER(iwp) :: l !<
	475	INTEGER(iwp) :: ys !<
[1]	476
[4171]	477	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	478	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	479
[4171]	480	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	481	#if __acc_fft_device
[3634]	482	!$ACC DECLARE CREATE(work)
[3690]	483	#endif
[1111]	484
[1320]	485
[1106]	486	IF ( numprocs /= 1 ) THEN
	487
[1]	488	#if defined( __parallel )
	489	!
[1106]	490	!-- Reorder input array for transposition
[1111]	491	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[1106]	492	DO l = 0, pdims(2) - 1
	493	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	494	#if __acc_fft_device
[3634]	495	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	496	!$ACC PRESENT(work, f_in)
[3690]	497	#endif
[4236]	498	!$OMP DO
[1106]	499	DO i = nxl_y, nxr_y
	500	DO k = nzb_y, nzt_y
	501	DO j = ys, ys + nyn_x - nys_x
[1111]	502	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	503	ENDDO
	504	ENDDO
	505	ENDDO
[4236]	506	!$OMP END DO NOWAIT
[1106]	507	ENDDO
	508	!$OMP END PARALLEL
	509
	510	!
	511	!-- Transpose array
[1318]	512	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	513
	514	#if __acc_fft_device
[3657]	515	#ifndef __cuda_aware_mpi
[3634]	516	!$ACC UPDATE HOST(work)
[3657]	517	#else
	518	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	519	#endif
[3690]	520	#endif
	521
[1106]	522	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	523	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	524	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	525	comm1dy, ierr )
[3690]	526
	527	#if __acc_fft_device
[3657]	528	#ifndef __cuda_aware_mpi
[3634]	529	!$ACC UPDATE DEVICE(f_inv)
[3657]	530	#else
	531	!$ACC END HOST_DATA
	532	#endif
[3690]	533	#endif
	534
[1106]	535	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	536	#endif
	537
	538	ELSE
	539
	540	!
	541	!-- Reorder array f_in the same way as ALLTOALL did it
	542	!$OMP PARALLEL PRIVATE ( i, j, k )
	543	!$OMP DO
[3690]	544	#if __acc_fft_device
[3634]	545	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	546	!$ACC PRESENT(f_inv, f_in)
[3690]	547	#endif
[1003]	548	DO i = nxl_y, nxr_y
	549	DO k = nzb_y, nzt_y
[1106]	550	DO j = 0, ny
	551	f_inv(j,k,i) = f_in(j,i,k)
[1]	552	ENDDO
	553	ENDDO
	554	ENDDO
[683]	555	!$OMP END PARALLEL
[1]	556
[1106]	557	ENDIF
[1]	558
	559	END SUBROUTINE transpose_yx
	560
	561
	562	!------------------------------------------------------------------------------!
	563	! Description:
	564	! ------------
[1682]	565	!> Transposition of input array (f_in) from y to x. For the input array, all
	566	!> elements along y reside on the same PE, while after transposition, all
	567	!> elements along x reside on the same PE.
	568	!> This is a direct transposition for arrays with indices in regular order
	569	!> (k,j,i) (cf. transpose_yx).
[1]	570	!------------------------------------------------------------------------------!
[1682]	571	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	572
[1682]	573
[1320]	574	USE cpulog, &
[3241]	575	ONLY: cpu_log, log_point_s
[1]	576
[1320]	577	USE indices, &
	578	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	579
	580	USE kinds
	581
[1324]	582	USE pegrid
[1320]	583
	584	USE transpose_indices, &
	585	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	586
[1]	587	IMPLICIT NONE
	588
[4171]	589	INTEGER(iwp) :: i !<
	590	INTEGER(iwp) :: j !<
	591	INTEGER(iwp) :: k !<
	592	INTEGER(iwp) :: l !<
	593	INTEGER(iwp) :: m !<
	594	INTEGER(iwp) :: xs !<
[1]	595
[4171]	596	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	597	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	598	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	599	REAL(wp) :: work(nnxnnynnz) !<
[1]	600	#if defined( __parallel )
	601
	602	!
	603	!-- Rearrange indices of input array in order to make data to be send
	604	!-- by MPI contiguous
[1003]	605	DO k = 1, nz
	606	DO j = nys, nyn
	607	DO i = nxl, nxr
[164]	608	f_inv(i,k,j) = f_in(k,j,i)
[1]	609	ENDDO
	610	ENDDO
	611	ENDDO
	612
	613	!
	614	!-- Transpose array
	615	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	616	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	617	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	618	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	619	comm1dx, ierr )
	620	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	621
	622	!
	623	!-- Reorder transposed array
	624	m = 0
	625	DO l = 0, pdims(1) - 1
	626	xs = 0 + l * nnx
[1003]	627	DO j = nys_x, nyn_x
	628	DO k = 1, nz
[1]	629	DO i = xs, xs + nnx - 1
	630	m = m + 1
[164]	631	f_out(i,j,k) = work(m)
[1]	632	ENDDO
	633	ENDDO
	634	ENDDO
	635	ENDDO
	636
	637	#endif
	638
	639	END SUBROUTINE transpose_yxd
	640
	641
	642	!------------------------------------------------------------------------------!
	643	! Description:
	644	! ------------
[1682]	645	!> Resorting data for the transposition from y to z. The transposition itself
	646	!> is carried out in transpose_yz
[1216]	647	!------------------------------------------------------------------------------!
[1682]	648	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	649
[1682]	650
[1320]	651	USE indices, &
	652	ONLY: ny
[1216]	653
[1320]	654	USE kinds
	655
	656	USE transpose_indices, &
	657	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	658
[1216]	659	IMPLICIT NONE
	660
[4171]	661	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	662	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	663
[4171]	664	INTEGER(iwp) :: i !<
	665	INTEGER(iwp) :: j !<
	666	INTEGER(iwp) :: k !<
[1216]	667
	668	!
	669	!-- Rearrange indices of input array in order to make data to be send
	670	!-- by MPI contiguous
	671	!$OMP PARALLEL PRIVATE ( i, j, k )
	672	!$OMP DO
[3690]	673	#if __acc_fft_device
[3634]	674	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	675	!$ACC PRESENT(f_inv, f_in)
[3690]	676	#endif
[4171]	677	DO k = nzb_y, nzt_y
	678	DO i = nxl_y, nxr_y
	679	DO j = 0, ny
[1216]	680	f_inv(i,k,j) = f_in(j,i,k)
	681	ENDDO
	682	ENDDO
	683	ENDDO
	684	!$OMP END PARALLEL
	685
	686	END SUBROUTINE resort_for_yz
	687
	688
	689	!------------------------------------------------------------------------------!
	690	! Description:
	691	! ------------
[1682]	692	!> Transposition of input array (f_in) from y to z. For the input array, all
	693	!> elements along y reside on the same PE, while after transposition, all
	694	!> elements along z reside on the same PE.
[1]	695	!------------------------------------------------------------------------------!
[1682]	696	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	697
[1682]	698
[1320]	699	USE cpulog, &
	700	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	701
[1320]	702	USE indices, &
	703	ONLY: ny, nz
	704
	705	USE kinds
	706
[1324]	707	USE pegrid
[1320]	708
	709	USE transpose_indices, &
	710	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	711
[1]	712	IMPLICIT NONE
	713
[4171]	714	INTEGER(iwp) :: i !<
	715	INTEGER(iwp) :: j !<
	716	INTEGER(iwp) :: k !<
	717	INTEGER(iwp) :: l !<
	718	INTEGER(iwp) :: zs !<
[1]	719
[4171]	720	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	721	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	722
[4171]	723	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	724	#if __acc_fft_device
[3634]	725	!$ACC DECLARE CREATE(work)
[3690]	726	#endif
[1111]	727
[1320]	728
[1]	729	!
	730	!-- If the PE grid is one-dimensional along y, only local reordering
	731	!-- of the data is necessary and no transposition has to be done.
	732	IF ( pdims(1) == 1 ) THEN
[1106]	733
[683]	734	!$OMP PARALLEL PRIVATE ( i, j, k )
	735	!$OMP DO
[3690]	736	#if __acc_fft_device
[3634]	737	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	738	!$ACC PRESENT(f_out, f_inv)
[3690]	739	#endif
[1003]	740	DO j = 0, ny
	741	DO k = nzb_y, nzt_y
	742	DO i = nxl_y, nxr_y
[164]	743	f_out(i,j,k) = f_inv(i,k,j)
[1]	744	ENDDO
	745	ENDDO
	746	ENDDO
[683]	747	!$OMP END PARALLEL
[1]	748
[1106]	749	ELSE
	750
	751	#if defined( __parallel )
[1]	752	!
[1106]	753	!-- Transpose array
[1318]	754	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	755
	756	#if __acc_fft_device
[3657]	757	#ifndef __cuda_aware_mpi
[3634]	758	!$ACC UPDATE HOST(f_inv)
[3657]	759	#else
	760	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	761	#endif
[3690]	762	#endif
	763
[1106]	764	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	765	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	766	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	767	comm1dx, ierr )
[3690]	768
	769	#if __acc_fft_device
[3657]	770	#ifndef __cuda_aware_mpi
[3634]	771	!$ACC UPDATE DEVICE(work)
[3657]	772	#else
	773	!$ACC END HOST_DATA
	774	#endif
[3690]	775	#endif
	776
[1106]	777	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	778
	779	!
[1106]	780	!-- Reorder transposed array
[1111]	781	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1106]	782	DO l = 0, pdims(1) - 1
	783	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	784	#if __acc_fft_device
[3634]	785	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	786	!$ACC PRESENT(f_out, work)
[3690]	787	#endif
[4236]	788	!$OMP DO
[1106]	789	DO j = nys_z, nyn_z
	790	DO k = zs, zs + nzt_y - nzb_y
	791	DO i = nxl_z, nxr_z
[1111]	792	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	793	ENDDO
[1]	794	ENDDO
	795	ENDDO
[4236]	796	!$OMP END DO NOWAIT
[1]	797	ENDDO
[683]	798	!$OMP END PARALLEL
[1]	799	#endif
	800
[1106]	801	ENDIF
	802
[1]	803	END SUBROUTINE transpose_yz
	804
	805
	806	!------------------------------------------------------------------------------!
	807	! Description:
	808	! ------------
[1682]	809	!> Resorting data for the transposition from z to x. The transposition itself
	810	!> is carried out in transpose_zx
[1216]	811	!------------------------------------------------------------------------------!
[1682]	812	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	813
[1682]	814
[1320]	815	USE indices, &
	816	ONLY: nxl, nxr, nyn, nys, nz
[1216]	817
[1320]	818	USE kinds
	819
[1216]	820	IMPLICIT NONE
	821
[4171]	822	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	823	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	824
[4171]	825	INTEGER(iwp) :: i !<
	826	INTEGER(iwp) :: j !<
	827	INTEGER(iwp) :: k !<
[1216]	828
	829	!
	830	!-- Rearrange indices of input array in order to make data to be send
	831	!-- by MPI contiguous
	832	!$OMP PARALLEL PRIVATE ( i, j, k )
	833	!$OMP DO
[3690]	834	#if __acc_fft_device
[3634]	835	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	836	!$ACC PRESENT(f_in, f_inv)
[3690]	837	#endif
[3832]	838	DO i = nxl, nxr
[4171]	839	DO j = nys, nyn
	840	DO k = 1,nz
[1216]	841	f_inv(j,i,k) = f_in(k,j,i)
	842	ENDDO
	843	ENDDO
	844	ENDDO
	845	!$OMP END PARALLEL
	846
	847	END SUBROUTINE resort_for_zx
	848
	849
	850	!------------------------------------------------------------------------------!
	851	! Description:
	852	! ------------
[1682]	853	!> Transposition of input array (f_in) from z to x. For the input array, all
	854	!> elements along z reside on the same PE, while after transposition, all
	855	!> elements along x reside on the same PE.
[1]	856	!------------------------------------------------------------------------------!
[1682]	857	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	858
[1682]	859
[1320]	860	USE cpulog, &
	861	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	862
[4366]	863	USE fft_xy, &
	864	ONLY: f_vec, temperton_fft_vec
	865
[1320]	866	USE indices, &
	867	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	868
	869	USE kinds
	870
[1324]	871	USE pegrid
[1320]	872
	873	USE transpose_indices, &
	874	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	875
[1]	876	IMPLICIT NONE
	877
[4171]	878	INTEGER(iwp) :: i !<
	879	INTEGER(iwp) :: j !<
	880	INTEGER(iwp) :: k !<
	881	INTEGER(iwp) :: l !<
[4366]	882	INTEGER(iwp) :: mm !<
[4171]	883	INTEGER(iwp) :: xs !<
[1]	884
[4171]	885	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	886	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	887
[4171]	888	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	889	#if __acc_fft_device
[3634]	890	!$ACC DECLARE CREATE(work)
[3690]	891	#endif
[1]	892
[1320]	893
[1]	894	!
	895	!-- If the PE grid is one-dimensional along y, only local reordering
	896	!-- of the data is necessary and no transposition has to be done.
	897	IF ( pdims(1) == 1 ) THEN
[1106]	898
[683]	899	!$OMP PARALLEL PRIVATE ( i, j, k )
	900	!$OMP DO
[3690]	901	#if __acc_fft_device
[3634]	902	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	903	!$ACC PRESENT(f_out, f_inv)
[3690]	904	#endif
[1003]	905	DO k = 1, nz
	906	DO i = nxl, nxr
	907	DO j = nys, nyn
[164]	908	f_out(i,j,k) = f_inv(j,i,k)
[1]	909	ENDDO
	910	ENDDO
	911	ENDDO
[683]	912	!$OMP END PARALLEL
[1]	913
[1106]	914	ELSE
	915
	916	#if defined( __parallel )
[1]	917	!
[1106]	918	!-- Transpose array
[1318]	919	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	920
	921	#if __acc_fft_device
[3657]	922	#ifndef __cuda_aware_mpi
[3634]	923	!$ACC UPDATE HOST(f_inv)
[3657]	924	#else
	925	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	926	#endif
[3690]	927	#endif
	928
[1106]	929	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	930	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	931	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	932	comm1dx, ierr )
[3690]	933
	934	#if __acc_fft_device
[3657]	935	#ifndef __cuda_aware_mpi
[3634]	936	!$ACC UPDATE DEVICE(work)
[3657]	937	#else
	938	!$ACC END HOST_DATA
	939	#endif
[3690]	940	#endif
	941
[1106]	942	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	943
	944	!
[4366]	945	!-- Reorder transposed array.
	946	!-- Data for the vectorized Temperton-fft is stored in different array format (f_vec) which saves
	947	!-- additional data copy in fft_x.
	948	IF ( temperton_fft_vec ) THEN
	949
	950	DO l = 0, pdims(1) - 1
	951	xs = 0 + l * nnx
	952	DO k = nzb_x, nzt_x
	953	DO i = xs, xs + nnx - 1
	954	DO j = nys_x, nyn_x
	955	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
	956	f_vec(mm,i) = work(j,i-xs+1,k,l)
	957	ENDDO
	958	ENDDO
	959	ENDDO
	960	ENDDO
	961
	962	ELSE
	963
	964	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
	965	DO l = 0, pdims(1) - 1
	966	xs = 0 + l * nnx
[3690]	967	#if __acc_fft_device
[4366]	968	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	969	!$ACC PRESENT(f_out, work)
[3690]	970	#endif
[4366]	971	!$OMP DO
	972	DO k = nzb_x, nzt_x
	973	DO i = xs, xs + nnx - 1
	974	DO j = nys_x, nyn_x
	975	f_out(i,j,k) = work(j,i-xs+1,k,l)
	976	ENDDO
[1106]	977	ENDDO
[1]	978	ENDDO
[4366]	979	!$OMP END DO NOWAIT
[1]	980	ENDDO
[4366]	981	!$OMP END PARALLEL
[1]	982	#endif
	983
[4366]	984	ENDIF
	985
[1106]	986	ENDIF
	987
[1]	988	END SUBROUTINE transpose_zx
	989
	990
	991	!------------------------------------------------------------------------------!
	992	! Description:
	993	! ------------
[1682]	994	!> Resorting data after the transposition from z to y. The transposition itself
	995	!> is carried out in transpose_zy
[1216]	996	!------------------------------------------------------------------------------!
[1682]	997	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	998
[1682]	999
[1320]	1000	USE indices, &
	1001	ONLY: ny
[1216]	1002
[1320]	1003	USE kinds
	1004
	1005	USE transpose_indices, &
	1006	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	1007
[1216]	1008	IMPLICIT NONE
	1009
[4171]	1010	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	1011	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	1012
	1013
[4171]	1014	INTEGER(iwp) :: i !<
	1015	INTEGER(iwp) :: j !<
	1016	INTEGER(iwp) :: k !<
[1216]	1017
	1018	!
	1019	!-- Rearrange indices of input array in order to make data to be send
	1020	!-- by MPI contiguous
	1021	!$OMP PARALLEL PRIVATE ( i, j, k )
	1022	!$OMP DO
[3690]	1023	#if __acc_fft_device
[3634]	1024	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1025	!$ACC PRESENT(f_out, f_inv)
[3690]	1026	#endif
[4171]	1027	DO k = nzb_y, nzt_y
	1028	DO i = nxl_y, nxr_y
	1029	DO j = 0, ny
[1216]	1030	f_out(j,i,k) = f_inv(i,k,j)
	1031	ENDDO
	1032	ENDDO
	1033	ENDDO
	1034	!$OMP END PARALLEL
	1035
	1036	END SUBROUTINE resort_for_zy
	1037
	1038
	1039	!------------------------------------------------------------------------------!
[3241]	1040	! Description:cpu_log_nowait
[1216]	1041	! ------------
[1682]	1042	!> Transposition of input array (f_in) from z to y. For the input array, all
	1043	!> elements along z reside on the same PE, while after transposition, all
	1044	!> elements along y reside on the same PE.
[1]	1045	!------------------------------------------------------------------------------!
[1682]	1046	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	1047
[1682]	1048
[1320]	1049	USE cpulog, &
	1050	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	1051
[1320]	1052	USE indices, &
	1053	ONLY: ny, nz
	1054
	1055	USE kinds
	1056
[1324]	1057	USE pegrid
[1320]	1058
	1059	USE transpose_indices, &
	1060	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	1061
[1]	1062	IMPLICIT NONE
	1063
[4171]	1064	INTEGER(iwp) :: i !<
	1065	INTEGER(iwp) :: j !<
	1066	INTEGER(iwp) :: k !<
	1067	INTEGER(iwp) :: l !<
	1068	INTEGER(iwp) :: zs !<
[1]	1069
[4171]	1070	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	1071	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	1072
[1682]	1073	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	1074	#if __acc_fft_device
[3634]	1075	!$ACC DECLARE CREATE(work)
[3690]	1076	#endif
[1111]	1077
[1]	1078	!
	1079	!-- If the PE grid is one-dimensional along y, the array has only to be
	1080	!-- reordered locally and therefore no transposition has to be done.
	1081	IF ( pdims(1) /= 1 ) THEN
[1106]	1082
	1083	#if defined( __parallel )
[1]	1084	!
	1085	!-- Reorder input array for transposition
[1111]	1086	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[1]	1087	DO l = 0, pdims(1) - 1
[1003]	1088	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	1089	#if __acc_fft_device
[3634]	1090	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1091	!$ACC PRESENT(work, f_in)
[3690]	1092	#endif
[4236]	1093	!$OMP DO
[1003]	1094	DO j = nys_z, nyn_z
	1095	DO k = zs, zs + nzt_y - nzb_y
	1096	DO i = nxl_z, nxr_z
[1111]	1097	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	1098	ENDDO
	1099	ENDDO
	1100	ENDDO
[4236]	1101	!$OMP END DO NOWAIT
[1]	1102	ENDDO
[683]	1103	!$OMP END PARALLEL
[1]	1104
	1105	!
	1106	!-- Transpose array
[1318]	1107	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1108
	1109	#if __acc_fft_device
[3657]	1110	#ifndef __cuda_aware_mpi
[3634]	1111	!$ACC UPDATE HOST(work)
[3657]	1112	#else
	1113	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1114	#endif
[3690]	1115	#endif
	1116
[622]	1117	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	1118	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	1119	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	1120	comm1dx, ierr )
[3690]	1121
	1122	#if __acc_fft_device
[3657]	1123	#ifndef __cuda_aware_mpi
[3634]	1124	!$ACC UPDATE DEVICE(f_inv)
[3657]	1125	#else
	1126	!$ACC END HOST_DATA
	1127	#endif
[3690]	1128	#endif
	1129
[1]	1130	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	1131	#endif
[1]	1132
	1133	ELSE
	1134	!
[1106]	1135	!-- Reorder the array in the same way like ALLTOALL did it
[683]	1136	!$OMP PARALLEL PRIVATE ( i, j, k )
	1137	!$OMP DO
[3690]	1138	#if __acc_fft_device
[3634]	1139	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1140	!$ACC PRESENT(f_inv, f_in)
[3690]	1141	#endif
[1003]	1142	DO k = nzb_y, nzt_y
	1143	DO j = 0, ny
	1144	DO i = nxl_y, nxr_y
[164]	1145	f_inv(i,k,j) = f_in(i,j,k)
	1146	ENDDO
	1147	ENDDO
	1148	ENDDO
[683]	1149	!$OMP END PARALLEL
[1106]	1150
	1151	ENDIF
	1152
[1]	1153	END SUBROUTINE transpose_zy
	1154
	1155
	1156	!------------------------------------------------------------------------------!
	1157	! Description:
	1158	! ------------
[1682]	1159	!> Transposition of input array (f_in) from z to y. For the input array, all
	1160	!> elements along z reside on the same PE, while after transposition, all
	1161	!> elements along y reside on the same PE.
	1162	!> This is a direct transposition for arrays with indices in regular order
	1163	!> (k,j,i) (cf. transpose_zy).
[1]	1164	!------------------------------------------------------------------------------!
[1682]	1165	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1166
[1682]	1167
[1320]	1168	USE cpulog, &
[3241]	1169	ONLY: cpu_log, log_point_s
[1]	1170
[1320]	1171	USE indices, &
	1172	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1173
	1174	USE kinds
	1175
[1324]	1176	USE pegrid
[1320]	1177
	1178	USE transpose_indices, &
[3241]	1179	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
[1320]	1180
[1]	1181	IMPLICIT NONE
	1182
[4171]	1183	INTEGER(iwp) :: i !<
	1184	INTEGER(iwp) :: j !<
	1185	INTEGER(iwp) :: k !<
	1186	INTEGER(iwp) :: l !<
	1187	INTEGER(iwp) :: m !<
	1188	INTEGER(iwp) :: ys !<
[1]	1189
[4171]	1190	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1191	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1192	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1193	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1194
[1]	1195	#if defined( __parallel )
	1196
	1197	!
	1198	!-- Rearrange indices of input array in order to make data to be send
	1199	!-- by MPI contiguous
[1003]	1200	DO i = nxl, nxr
	1201	DO j = nys, nyn
	1202	DO k = 1, nz
[164]	1203	f_inv(j,i,k) = f_in(k,j,i)
[1]	1204	ENDDO
	1205	ENDDO
	1206	ENDDO
	1207
	1208	!
	1209	!-- Move data to different array, because memory location of work1 is
	1210	!-- needed further below (work1 = work2).
	1211	!-- If the PE grid is one-dimensional along x, only local reordering
	1212	!-- of the data is necessary and no transposition has to be done.
	1213	IF ( pdims(2) == 1 ) THEN
[1003]	1214	DO k = 1, nz
	1215	DO i = nxl, nxr
	1216	DO j = nys, nyn
[164]	1217	f_out(j,i,k) = f_inv(j,i,k)
[1]	1218	ENDDO
	1219	ENDDO
	1220	ENDDO
	1221	RETURN
	1222	ENDIF
	1223
	1224	!
	1225	!-- Transpose array
	1226	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1227	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1228	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1229	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1230	comm1dy, ierr )
	1231	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1232
	1233	!
	1234	!-- Reorder transposed array
	1235	m = 0
	1236	DO l = 0, pdims(2) - 1
	1237	ys = 0 + l * nny
[1003]	1238	DO k = nzb_yd, nzt_yd
	1239	DO i = nxl_yd, nxr_yd
[1]	1240	DO j = ys, ys + nny - 1
	1241	m = m + 1
[164]	1242	f_out(j,i,k) = work(m)
[1]	1243	ENDDO
	1244	ENDDO
	1245	ENDDO
	1246	ENDDO
	1247
	1248	#endif
	1249
	1250	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |