Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4173

Last change on this file since 4173 was 4171, checked in by gronemeier, 5 years ago
loop reordering for performance optimization
Property svn:keywords set to `Id`
File size: 34.0 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[3655]	17	! Copyright 1997-2019 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 4171 2019-08-19 17:44:09Z gronemeier $
[3832]	27	! loop reordering for performance optimization
[4171]	28	!
	29	! 3832 2019-03-28 13:16:58Z raasch
	30	! loop reordering for performance optimization
	31	!
[3832]	32	! 3694 2019-01-23 17:01:49Z knoop
[3634]	33	! OpenACC port for SPEC
[4171]	34	!
[3634]	35	! 3241 2018-09-12 15:02:00Z raasch
[3241]	36	! unused variables removed
[4171]	37	!
[3241]	38	! 2718 2018-01-02 08:49:38Z maronga
[2716]	39	! Corrected "Former revisions" section
[4171]	40	!
[2716]	41	! 2696 2017-12-14 17:12:51Z kanani
	42	! Change in file header (GPL part)
[1321]	43	!
[2716]	44	! 2119 2017-01-17 16:51:50Z raasch
	45	!
[2119]	46	! 2118 2017-01-17 16:38:49Z raasch
	47	! OpenACC directives removed
[4171]	48	!
[2001]	49	! 2000 2016-08-20 18:09:15Z knoop
	50	! Forced header and separation lines into 80 columns
[4171]	51	!
[1683]	52	! 1682 2015-10-07 23:56:08Z knoop
[4171]	53	! Code annotations made doxygen readable
	54	!
[1325]	55	! 1324 2014-03-21 09:13:16Z suehring
[4171]	56	! Bugfix: ONLY statement for module pegrid removed
[1325]	57	!
[1321]	58	! 1320 2014-03-20 08:40:49Z raasch
[1320]	59	! ONLY-attribute added to USE-statements,
	60	! kind-parameters added to all INTEGER and REAL declaration statements,
	61	! kinds are defined in new module kinds,
	62	! old module precision_kind is removed,
	63	! revision history before 2012 removed,
	64	! comment fields (!:) to be used for variable explanations added to
[4171]	65	! all variable declaration statements
[198]	66	!
[1319]	67	! 1318 2014-03-17 13:35:16Z raasch
	68	! cpu_log_nowait parameter added to cpu measurements of the transpositions
	69	! required for solving the Poisson equation (poisfft),
	70	! module interfaces removed
	71	!
[1258]	72	! 1257 2013-11-08 15:18:40Z raasch
	73	! openacc loop and loop vector clauses removed
	74	!
[1217]	75	! 1216 2013-08-26 09:31:42Z raasch
	76	! re-sorting of the transposed / to be transposed arrays moved to separate
	77	! routines resort_for_...
	78	!
[1112]	79	! 1111 2013-03-08 23:54:10Z raasch
	80	! openACC directives added,
	81	! resorting data from/to work changed, work got 4 dimensions instead of 1
	82	!
[1107]	83	! 1106 2013-03-04 05:31:38Z raasch
	84	! preprocessor lines rearranged so that routines can also be used in serial
	85	! (non-parallel) mode
	86	!
[1093]	87	! 1092 2013-02-02 11:24:22Z raasch
	88	! unused variables removed
	89	!
[1037]	90	! 1036 2012-10-22 13:43:42Z raasch
	91	! code put under GPL (PALM 3.9)
	92	!
[1004]	93	! 1003 2012-09-14 14:35:53Z raasch
	94	! indices nxa, nya, etc. replaced by nx, ny, etc.
	95	!
[1]	96	! Revision 1.1 1997/07/24 11:25:18 raasch
	97	! Initial revision
	98	!
[3690]	99
	100	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	101
[1216]	102	!------------------------------------------------------------------------------!
	103	! Description:
	104	! ------------
[1682]	105	!> Resorting data for the transposition from x to y. The transposition itself
	106	!> is carried out in transpose_xy
[1216]	107	!------------------------------------------------------------------------------!
[1682]	108	SUBROUTINE resort_for_xy( f_in, f_inv )
[1216]	109
[4171]	110
[1320]	111	USE indices, &
	112	ONLY: nx
[1216]	113
[1320]	114	USE kinds
	115
	116	USE transpose_indices, &
[3241]	117	ONLY: nyn_x, nys_x, nzb_x, nzt_x
[1320]	118
[1216]	119	IMPLICIT NONE
	120
[4171]	121	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	122	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	123
	124
[4171]	125	INTEGER(iwp) :: i !<
	126	INTEGER(iwp) :: j !<
	127	INTEGER(iwp) :: k !<
[1]	128	!
[1216]	129	!-- Rearrange indices of input array in order to make data to be send
	130	!-- by MPI contiguous
	131	!$OMP PARALLEL PRIVATE ( i, j, k )
	132	!$OMP DO
[3690]	133	#if __acc_fft_device
[3634]	134	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	135	!$ACC PRESENT(f_inv, f_in)
[3690]	136	#endif
[3832]	137	DO k = nzb_x, nzt_x
[4171]	138	DO j = nys_x, nyn_x
	139	DO i = 0, nx
[1216]	140	f_inv(j,k,i) = f_in(i,j,k)
	141	ENDDO
	142	ENDDO
	143	ENDDO
	144	!$OMP END PARALLEL
	145
	146	END SUBROUTINE resort_for_xy
	147
	148
	149	!------------------------------------------------------------------------------!
[1]	150	! Description:
	151	! ------------
[1682]	152	!> Transposition of input array (f_in) from x to y. For the input array, all
	153	!> elements along x reside on the same PE, while after transposition, all
	154	!> elements along y reside on the same PE.
[1]	155	!------------------------------------------------------------------------------!
[1682]	156	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	157
[1682]	158
[1320]	159	USE cpulog, &
	160	ONLY: cpu_log, cpu_log_nowait, log_point_s
	161
	162	USE indices, &
	163	ONLY: nx, ny
[4171]	164
[1320]	165	USE kinds
	166
[1]	167	USE pegrid
	168
[1320]	169	USE transpose_indices, &
	170	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	171
[1]	172	IMPLICIT NONE
	173
[4171]	174	INTEGER(iwp) :: i !<
	175	INTEGER(iwp) :: j !<
	176	INTEGER(iwp) :: k !<
	177	INTEGER(iwp) :: l !<
	178	INTEGER(iwp) :: ys !<
[1]	179
[4171]	180	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	181	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	182
	183	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	184	#if __acc_fft_device
[3634]	185	!$ACC DECLARE CREATE(work)
[3690]	186	#endif
[1111]	187
	188
[1106]	189	IF ( numprocs /= 1 ) THEN
	190
	191	#if defined( __parallel )
[1]	192	!
[1106]	193	!-- Transpose array
[1318]	194	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	195
	196	#if __acc_fft_device
[3657]	197	#ifndef __cuda_aware_mpi
[3634]	198	!$ACC UPDATE HOST(f_inv)
[3657]	199	#else
	200	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	201	#endif
[3690]	202	#endif
	203
[1106]	204	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	205	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	206	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	207	comm1dy, ierr )
[3690]	208
	209	#if __acc_fft_device
[3657]	210	#ifndef __cuda_aware_mpi
[3634]	211	!$ACC UPDATE DEVICE(work)
[3657]	212	#else
	213	!$ACC END HOST_DATA
	214	#endif
[3690]	215	#endif
	216
[1106]	217	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	218
	219	!
[1106]	220	!-- Reorder transposed array
[1111]	221	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	222	!$OMP DO
[1106]	223	DO l = 0, pdims(2) - 1
	224	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	225	#if __acc_fft_device
[3634]	226	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	227	!$ACC PRESENT(f_out, work)
[3690]	228	#endif
[1106]	229	DO i = nxl_y, nxr_y
	230	DO k = nzb_y, nzt_y
	231	DO j = ys, ys + nyn_x - nys_x
[1111]	232	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	233	ENDDO
[1]	234	ENDDO
	235	ENDDO
	236	ENDDO
[683]	237	!$OMP END PARALLEL
[1]	238	#endif
	239
[1106]	240	ELSE
	241
	242	!
	243	!-- Reorder transposed array
	244	!$OMP PARALLEL PRIVATE ( i, j, k )
	245	!$OMP DO
[3690]	246	#if __acc_fft_device
[3634]	247	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	248	!$ACC PRESENT(f_out, f_inv)
[3690]	249	#endif
[1106]	250	DO k = nzb_y, nzt_y
	251	DO i = nxl_y, nxr_y
	252	DO j = 0, ny
	253	f_out(j,i,k) = f_inv(j,k,i)
	254	ENDDO
	255	ENDDO
	256	ENDDO
	257	!$OMP END PARALLEL
	258
	259	ENDIF
	260
[1]	261	END SUBROUTINE transpose_xy
	262
	263
	264	!------------------------------------------------------------------------------!
	265	! Description:
	266	! ------------
[1682]	267	!> Resorting data after the transposition from x to z. The transposition itself
	268	!> is carried out in transpose_xz
[1216]	269	!------------------------------------------------------------------------------!
[1682]	270	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	271
[1682]	272
[1320]	273	USE indices, &
	274	ONLY: nxl, nxr, nyn, nys, nz
[1216]	275
[1320]	276	USE kinds
	277
[1216]	278	IMPLICIT NONE
	279
[4171]	280	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	281	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	282
[4171]	283	INTEGER(iwp) :: i !<
	284	INTEGER(iwp) :: j !<
	285	INTEGER(iwp) :: k !<
[1216]	286	!
	287	!-- Rearrange indices of input array in order to make data to be send
	288	!-- by MPI contiguous.
	289	!-- In case of parallel fft/transposition, scattered store is faster in
	290	!-- backward direction!!!
	291	!$OMP PARALLEL PRIVATE ( i, j, k )
	292	!$OMP DO
[3690]	293	#if __acc_fft_device
[3634]	294	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	295	!$ACC PRESENT(f_out, f_inv)
[3690]	296	#endif
[4171]	297	DO i = nxl, nxr
	298	DO j = nys, nyn
	299	DO k = 1, nz
[1216]	300	f_out(k,j,i) = f_inv(j,i,k)
	301	ENDDO
	302	ENDDO
	303	ENDDO
	304	!$OMP END PARALLEL
	305
	306	END SUBROUTINE resort_for_xz
	307
	308
	309	!------------------------------------------------------------------------------!
	310	! Description:
	311	! ------------
[1682]	312	!> Transposition of input array (f_in) from x to z. For the input array, all
	313	!> elements along x reside on the same PE, while after transposition, all
	314	!> elements along z reside on the same PE.
[1]	315	!------------------------------------------------------------------------------!
[1682]	316	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	317
[1682]	318
[1320]	319	USE cpulog, &
	320	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	321
[1320]	322	USE indices, &
[3241]	323	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
[1320]	324
	325	USE kinds
	326
[1324]	327	USE pegrid
[1320]	328
	329	USE transpose_indices, &
	330	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	331
[1]	332	IMPLICIT NONE
	333
[4171]	334	INTEGER(iwp) :: i !<
	335	INTEGER(iwp) :: j !<
	336	INTEGER(iwp) :: k !<
	337	INTEGER(iwp) :: l !<
	338	INTEGER(iwp) :: xs !<
[1]	339
[4171]	340	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	341	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	342
[4171]	343	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	344	#if __acc_fft_device
[3634]	345	!$ACC DECLARE CREATE(work)
[3690]	346	#endif
[1111]	347
[1320]	348
[1]	349	!
	350	!-- If the PE grid is one-dimensional along y, the array has only to be
	351	!-- reordered locally and therefore no transposition has to be done.
	352	IF ( pdims(1) /= 1 ) THEN
[1106]	353
	354	#if defined( __parallel )
[1]	355	!
	356	!-- Reorder input array for transposition
[1111]	357	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	358	!$OMP DO
[1]	359	DO l = 0, pdims(1) - 1
	360	xs = 0 + l * nnx
[3690]	361	#if __acc_fft_device
[3634]	362	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	363	!$ACC PRESENT(work, f_in)
[3690]	364	#endif
[1003]	365	DO k = nzb_x, nzt_x
[164]	366	DO i = xs, xs + nnx - 1
[1003]	367	DO j = nys_x, nyn_x
[1111]	368	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	369	ENDDO
	370	ENDDO
	371	ENDDO
	372	ENDDO
[683]	373	!$OMP END PARALLEL
[1]	374
	375	!
	376	!-- Transpose array
[1318]	377	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	378
	379	#if __acc_fft_device
[3657]	380	#ifndef __cuda_aware_mpi
[3634]	381	!$ACC UPDATE HOST(work)
[3657]	382	#else
	383	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	384	#endif
[3690]	385	#endif
	386
[622]	387	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	388	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	389	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	390	comm1dx, ierr )
[3690]	391
	392	#if __acc_fft_device
[3657]	393	#ifndef __cuda_aware_mpi
[3634]	394	!$ACC UPDATE DEVICE(f_inv)
[3657]	395	#else
	396	!$ACC END HOST_DATA
	397	#endif
[3694]	398	#endif
	399
[1]	400	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	401	#endif
	402
[1]	403	ELSE
[1106]	404
[1]	405	!
	406	!-- Reorder the array in a way that the z index is in first position
[683]	407	!$OMP PARALLEL PRIVATE ( i, j, k )
	408	!$OMP DO
[3690]	409	#if __acc_fft_device
[3634]	410	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	411	!$ACC PRESENT(f_inv, f_in)
[3690]	412	#endif
[1003]	413	DO i = nxl, nxr
	414	DO j = nys, nyn
	415	DO k = 1, nz
[164]	416	f_inv(j,i,k) = f_in(i,j,k)
[1]	417	ENDDO
	418	ENDDO
	419	ENDDO
[683]	420	!$OMP END PARALLEL
[1]	421
[164]	422	ENDIF
	423
[1]	424	END SUBROUTINE transpose_xz
	425
	426
	427	!------------------------------------------------------------------------------!
	428	! Description:
	429	! ------------
[1682]	430	!> Resorting data after the transposition from y to x. The transposition itself
	431	!> is carried out in transpose_yx
[1216]	432	!------------------------------------------------------------------------------!
[1682]	433	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	434
[1682]	435
[1320]	436	USE indices, &
	437	ONLY: nx
[1216]	438
[1320]	439	USE kinds
	440
	441	USE transpose_indices, &
	442	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	443
[1216]	444	IMPLICIT NONE
	445
[4171]	446	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	447	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	448
	449
[4171]	450	INTEGER(iwp) :: i !<
	451	INTEGER(iwp) :: j !<
	452	INTEGER(iwp) :: k !<
[1216]	453	!
	454	!-- Rearrange indices of input array in order to make data to be send
	455	!-- by MPI contiguous
	456	!$OMP PARALLEL PRIVATE ( i, j, k )
	457	!$OMP DO
[3690]	458	#if __acc_fft_device
[3634]	459	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	460	!$ACC PRESENT(f_out, f_inv)
[3690]	461	#endif
[4171]	462	DO k = nzb_x, nzt_x
	463	DO j = nys_x, nyn_x
	464	DO i = 0, nx
[1216]	465	f_out(i,j,k) = f_inv(j,k,i)
	466	ENDDO
	467	ENDDO
	468	ENDDO
	469	!$OMP END PARALLEL
	470
	471	END SUBROUTINE resort_for_yx
	472
	473
	474	!------------------------------------------------------------------------------!
	475	! Description:
	476	! ------------
[1682]	477	!> Transposition of input array (f_in) from y to x. For the input array, all
	478	!> elements along y reside on the same PE, while after transposition, all
	479	!> elements along x reside on the same PE.
[1]	480	!------------------------------------------------------------------------------!
[1682]	481	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	482
[1682]	483
[1320]	484	USE cpulog, &
	485	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	486
[1320]	487	USE indices, &
	488	ONLY: nx, ny
	489
	490	USE kinds
	491
[1324]	492	USE pegrid
[1320]	493
	494	USE transpose_indices, &
	495	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	496
[1]	497	IMPLICIT NONE
	498
[4171]	499	INTEGER(iwp) :: i !<
	500	INTEGER(iwp) :: j !<
	501	INTEGER(iwp) :: k !<
	502	INTEGER(iwp) :: l !<
	503	INTEGER(iwp) :: ys !<
[1]	504
[4171]	505	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	506	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	507
[4171]	508	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	509	#if __acc_fft_device
[3634]	510	!$ACC DECLARE CREATE(work)
[3690]	511	#endif
[1111]	512
[1320]	513
[1106]	514	IF ( numprocs /= 1 ) THEN
	515
[1]	516	#if defined( __parallel )
	517	!
[1106]	518	!-- Reorder input array for transposition
[1111]	519	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	520	!$OMP DO
[1106]	521	DO l = 0, pdims(2) - 1
	522	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	523	#if __acc_fft_device
[3634]	524	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	525	!$ACC PRESENT(work, f_in)
[3690]	526	#endif
[1106]	527	DO i = nxl_y, nxr_y
	528	DO k = nzb_y, nzt_y
	529	DO j = ys, ys + nyn_x - nys_x
[1111]	530	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	531	ENDDO
	532	ENDDO
	533	ENDDO
	534	ENDDO
	535	!$OMP END PARALLEL
	536
	537	!
	538	!-- Transpose array
[1318]	539	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	540
	541	#if __acc_fft_device
[3657]	542	#ifndef __cuda_aware_mpi
[3634]	543	!$ACC UPDATE HOST(work)
[3657]	544	#else
	545	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	546	#endif
[3690]	547	#endif
	548
[1106]	549	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	550	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	551	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	552	comm1dy, ierr )
[3690]	553
	554	#if __acc_fft_device
[3657]	555	#ifndef __cuda_aware_mpi
[3634]	556	!$ACC UPDATE DEVICE(f_inv)
[3657]	557	#else
	558	!$ACC END HOST_DATA
	559	#endif
[3690]	560	#endif
	561
[1106]	562	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	563	#endif
	564
	565	ELSE
	566
	567	!
	568	!-- Reorder array f_in the same way as ALLTOALL did it
	569	!$OMP PARALLEL PRIVATE ( i, j, k )
	570	!$OMP DO
[3690]	571	#if __acc_fft_device
[3634]	572	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	573	!$ACC PRESENT(f_inv, f_in)
[3690]	574	#endif
[1003]	575	DO i = nxl_y, nxr_y
	576	DO k = nzb_y, nzt_y
[1106]	577	DO j = 0, ny
	578	f_inv(j,k,i) = f_in(j,i,k)
[1]	579	ENDDO
	580	ENDDO
	581	ENDDO
[683]	582	!$OMP END PARALLEL
[1]	583
[1106]	584	ENDIF
[1]	585
	586	END SUBROUTINE transpose_yx
	587
	588
	589	!------------------------------------------------------------------------------!
	590	! Description:
	591	! ------------
[1682]	592	!> Transposition of input array (f_in) from y to x. For the input array, all
	593	!> elements along y reside on the same PE, while after transposition, all
	594	!> elements along x reside on the same PE.
	595	!> This is a direct transposition for arrays with indices in regular order
	596	!> (k,j,i) (cf. transpose_yx).
[1]	597	!------------------------------------------------------------------------------!
[1682]	598	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	599
[1682]	600
[1320]	601	USE cpulog, &
[3241]	602	ONLY: cpu_log, log_point_s
[1]	603
[1320]	604	USE indices, &
	605	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	606
	607	USE kinds
	608
[1324]	609	USE pegrid
[1320]	610
	611	USE transpose_indices, &
	612	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	613
[1]	614	IMPLICIT NONE
	615
[4171]	616	INTEGER(iwp) :: i !<
	617	INTEGER(iwp) :: j !<
	618	INTEGER(iwp) :: k !<
	619	INTEGER(iwp) :: l !<
	620	INTEGER(iwp) :: m !<
	621	INTEGER(iwp) :: xs !<
[1]	622
[4171]	623	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	624	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	625	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	626	REAL(wp) :: work(nnxnnynnz) !<
[1]	627	#if defined( __parallel )
	628
	629	!
	630	!-- Rearrange indices of input array in order to make data to be send
	631	!-- by MPI contiguous
[1003]	632	DO k = 1, nz
	633	DO j = nys, nyn
	634	DO i = nxl, nxr
[164]	635	f_inv(i,k,j) = f_in(k,j,i)
[1]	636	ENDDO
	637	ENDDO
	638	ENDDO
	639
	640	!
	641	!-- Transpose array
	642	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	643	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	644	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	645	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	646	comm1dx, ierr )
	647	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	648
	649	!
	650	!-- Reorder transposed array
	651	m = 0
	652	DO l = 0, pdims(1) - 1
	653	xs = 0 + l * nnx
[1003]	654	DO j = nys_x, nyn_x
	655	DO k = 1, nz
[1]	656	DO i = xs, xs + nnx - 1
	657	m = m + 1
[164]	658	f_out(i,j,k) = work(m)
[1]	659	ENDDO
	660	ENDDO
	661	ENDDO
	662	ENDDO
	663
	664	#endif
	665
	666	END SUBROUTINE transpose_yxd
	667
	668
	669	!------------------------------------------------------------------------------!
	670	! Description:
	671	! ------------
[1682]	672	!> Resorting data for the transposition from y to z. The transposition itself
	673	!> is carried out in transpose_yz
[1216]	674	!------------------------------------------------------------------------------!
[1682]	675	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	676
[1682]	677
[1320]	678	USE indices, &
	679	ONLY: ny
[1216]	680
[1320]	681	USE kinds
	682
	683	USE transpose_indices, &
	684	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	685
[1216]	686	IMPLICIT NONE
	687
[4171]	688	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	689	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	690
[4171]	691	INTEGER(iwp) :: i !<
	692	INTEGER(iwp) :: j !<
	693	INTEGER(iwp) :: k !<
[1216]	694
	695	!
	696	!-- Rearrange indices of input array in order to make data to be send
	697	!-- by MPI contiguous
	698	!$OMP PARALLEL PRIVATE ( i, j, k )
	699	!$OMP DO
[3690]	700	#if __acc_fft_device
[3634]	701	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	702	!$ACC PRESENT(f_inv, f_in)
[3690]	703	#endif
[4171]	704	DO k = nzb_y, nzt_y
	705	DO i = nxl_y, nxr_y
	706	DO j = 0, ny
[1216]	707	f_inv(i,k,j) = f_in(j,i,k)
	708	ENDDO
	709	ENDDO
	710	ENDDO
	711	!$OMP END PARALLEL
	712
	713	END SUBROUTINE resort_for_yz
	714
	715
	716	!------------------------------------------------------------------------------!
	717	! Description:
	718	! ------------
[1682]	719	!> Transposition of input array (f_in) from y to z. For the input array, all
	720	!> elements along y reside on the same PE, while after transposition, all
	721	!> elements along z reside on the same PE.
[1]	722	!------------------------------------------------------------------------------!
[1682]	723	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	724
[1682]	725
[1320]	726	USE cpulog, &
	727	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	728
[1320]	729	USE indices, &
	730	ONLY: ny, nz
	731
	732	USE kinds
	733
[1324]	734	USE pegrid
[1320]	735
	736	USE transpose_indices, &
	737	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	738
[1]	739	IMPLICIT NONE
	740
[4171]	741	INTEGER(iwp) :: i !<
	742	INTEGER(iwp) :: j !<
	743	INTEGER(iwp) :: k !<
	744	INTEGER(iwp) :: l !<
	745	INTEGER(iwp) :: zs !<
[1]	746
[4171]	747	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	748	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	749
[4171]	750	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	751	#if __acc_fft_device
[3634]	752	!$ACC DECLARE CREATE(work)
[3690]	753	#endif
[1111]	754
[1320]	755
[1]	756	!
	757	!-- If the PE grid is one-dimensional along y, only local reordering
	758	!-- of the data is necessary and no transposition has to be done.
	759	IF ( pdims(1) == 1 ) THEN
[1106]	760
[683]	761	!$OMP PARALLEL PRIVATE ( i, j, k )
	762	!$OMP DO
[3690]	763	#if __acc_fft_device
[3634]	764	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	765	!$ACC PRESENT(f_out, f_inv)
[3690]	766	#endif
[1003]	767	DO j = 0, ny
	768	DO k = nzb_y, nzt_y
	769	DO i = nxl_y, nxr_y
[164]	770	f_out(i,j,k) = f_inv(i,k,j)
[1]	771	ENDDO
	772	ENDDO
	773	ENDDO
[683]	774	!$OMP END PARALLEL
[1]	775
[1106]	776	ELSE
	777
	778	#if defined( __parallel )
[1]	779	!
[1106]	780	!-- Transpose array
[1318]	781	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	782
	783	#if __acc_fft_device
[3657]	784	#ifndef __cuda_aware_mpi
[3634]	785	!$ACC UPDATE HOST(f_inv)
[3657]	786	#else
	787	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	788	#endif
[3690]	789	#endif
	790
[1106]	791	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	792	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	793	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	794	comm1dx, ierr )
[3690]	795
	796	#if __acc_fft_device
[3657]	797	#ifndef __cuda_aware_mpi
[3634]	798	!$ACC UPDATE DEVICE(work)
[3657]	799	#else
	800	!$ACC END HOST_DATA
	801	#endif
[3690]	802	#endif
	803
[1106]	804	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	805
	806	!
[1106]	807	!-- Reorder transposed array
[1111]	808	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	809	!$OMP DO
[1106]	810	DO l = 0, pdims(1) - 1
	811	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	812	#if __acc_fft_device
[3634]	813	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	814	!$ACC PRESENT(f_out, work)
[3690]	815	#endif
[1106]	816	DO j = nys_z, nyn_z
	817	DO k = zs, zs + nzt_y - nzb_y
	818	DO i = nxl_z, nxr_z
[1111]	819	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	820	ENDDO
[1]	821	ENDDO
	822	ENDDO
	823	ENDDO
[683]	824	!$OMP END PARALLEL
[1]	825	#endif
	826
[1106]	827	ENDIF
	828
[1]	829	END SUBROUTINE transpose_yz
	830
	831
	832	!------------------------------------------------------------------------------!
	833	! Description:
	834	! ------------
[1682]	835	!> Resorting data for the transposition from z to x. The transposition itself
	836	!> is carried out in transpose_zx
[1216]	837	!------------------------------------------------------------------------------!
[1682]	838	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	839
[1682]	840
[1320]	841	USE indices, &
	842	ONLY: nxl, nxr, nyn, nys, nz
[1216]	843
[1320]	844	USE kinds
	845
[1216]	846	IMPLICIT NONE
	847
[4171]	848	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	849	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	850
[4171]	851	INTEGER(iwp) :: i !<
	852	INTEGER(iwp) :: j !<
	853	INTEGER(iwp) :: k !<
[1216]	854
	855	!
	856	!-- Rearrange indices of input array in order to make data to be send
	857	!-- by MPI contiguous
	858	!$OMP PARALLEL PRIVATE ( i, j, k )
	859	!$OMP DO
[3690]	860	#if __acc_fft_device
[3634]	861	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	862	!$ACC PRESENT(f_in, f_inv)
[3690]	863	#endif
[3832]	864	DO i = nxl, nxr
[4171]	865	DO j = nys, nyn
	866	DO k = 1,nz
[1216]	867	f_inv(j,i,k) = f_in(k,j,i)
	868	ENDDO
	869	ENDDO
	870	ENDDO
	871	!$OMP END PARALLEL
	872
	873	END SUBROUTINE resort_for_zx
	874
	875
	876	!------------------------------------------------------------------------------!
	877	! Description:
	878	! ------------
[1682]	879	!> Transposition of input array (f_in) from z to x. For the input array, all
	880	!> elements along z reside on the same PE, while after transposition, all
	881	!> elements along x reside on the same PE.
[1]	882	!------------------------------------------------------------------------------!
[1682]	883	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	884
[1682]	885
[1320]	886	USE cpulog, &
	887	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	888
[1320]	889	USE indices, &
	890	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	891
	892	USE kinds
	893
[1324]	894	USE pegrid
[1320]	895
	896	USE transpose_indices, &
	897	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	898
[1]	899	IMPLICIT NONE
	900
[4171]	901	INTEGER(iwp) :: i !<
	902	INTEGER(iwp) :: j !<
	903	INTEGER(iwp) :: k !<
	904	INTEGER(iwp) :: l !<
	905	INTEGER(iwp) :: xs !<
[1]	906
[4171]	907	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	908	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	909
[4171]	910	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	911	#if __acc_fft_device
[3634]	912	!$ACC DECLARE CREATE(work)
[3690]	913	#endif
[1]	914
[1320]	915
[1]	916	!
	917	!-- If the PE grid is one-dimensional along y, only local reordering
	918	!-- of the data is necessary and no transposition has to be done.
	919	IF ( pdims(1) == 1 ) THEN
[1106]	920
[683]	921	!$OMP PARALLEL PRIVATE ( i, j, k )
	922	!$OMP DO
[3690]	923	#if __acc_fft_device
[3634]	924	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	925	!$ACC PRESENT(f_out, f_inv)
[3690]	926	#endif
[1003]	927	DO k = 1, nz
	928	DO i = nxl, nxr
	929	DO j = nys, nyn
[164]	930	f_out(i,j,k) = f_inv(j,i,k)
[1]	931	ENDDO
	932	ENDDO
	933	ENDDO
[683]	934	!$OMP END PARALLEL
[1]	935
[1106]	936	ELSE
	937
	938	#if defined( __parallel )
[1]	939	!
[1106]	940	!-- Transpose array
[1318]	941	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	942
	943	#if __acc_fft_device
[3657]	944	#ifndef __cuda_aware_mpi
[3634]	945	!$ACC UPDATE HOST(f_inv)
[3657]	946	#else
	947	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	948	#endif
[3690]	949	#endif
	950
[1106]	951	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	952	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	953	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	954	comm1dx, ierr )
[3690]	955
	956	#if __acc_fft_device
[3657]	957	#ifndef __cuda_aware_mpi
[3634]	958	!$ACC UPDATE DEVICE(work)
[3657]	959	#else
	960	!$ACC END HOST_DATA
	961	#endif
[3690]	962	#endif
	963
[1106]	964	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	965
	966	!
[1106]	967	!-- Reorder transposed array
[1111]	968	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	969	!$OMP DO
[1106]	970	DO l = 0, pdims(1) - 1
	971	xs = 0 + l * nnx
[3690]	972	#if __acc_fft_device
[3634]	973	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	974	!$ACC PRESENT(f_out, work)
[3690]	975	#endif
[1106]	976	DO k = nzb_x, nzt_x
	977	DO i = xs, xs + nnx - 1
	978	DO j = nys_x, nyn_x
[1111]	979	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	980	ENDDO
[1]	981	ENDDO
	982	ENDDO
	983	ENDDO
[683]	984	!$OMP END PARALLEL
[1]	985	#endif
	986
[1106]	987	ENDIF
	988
[1]	989	END SUBROUTINE transpose_zx
	990
	991
	992	!------------------------------------------------------------------------------!
	993	! Description:
	994	! ------------
[1682]	995	!> Resorting data after the transposition from z to y. The transposition itself
	996	!> is carried out in transpose_zy
[1216]	997	!------------------------------------------------------------------------------!
[1682]	998	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	999
[1682]	1000
[1320]	1001	USE indices, &
	1002	ONLY: ny
[1216]	1003
[1320]	1004	USE kinds
	1005
	1006	USE transpose_indices, &
	1007	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	1008
[1216]	1009	IMPLICIT NONE
	1010
[4171]	1011	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	1012	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	1013
	1014
[4171]	1015	INTEGER(iwp) :: i !<
	1016	INTEGER(iwp) :: j !<
	1017	INTEGER(iwp) :: k !<
[1216]	1018
	1019	!
	1020	!-- Rearrange indices of input array in order to make data to be send
	1021	!-- by MPI contiguous
	1022	!$OMP PARALLEL PRIVATE ( i, j, k )
	1023	!$OMP DO
[3690]	1024	#if __acc_fft_device
[3634]	1025	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1026	!$ACC PRESENT(f_out, f_inv)
[3690]	1027	#endif
[4171]	1028	DO k = nzb_y, nzt_y
	1029	DO i = nxl_y, nxr_y
	1030	DO j = 0, ny
[1216]	1031	f_out(j,i,k) = f_inv(i,k,j)
	1032	ENDDO
	1033	ENDDO
	1034	ENDDO
	1035	!$OMP END PARALLEL
	1036
	1037	END SUBROUTINE resort_for_zy
	1038
	1039
	1040	!------------------------------------------------------------------------------!
[3241]	1041	! Description:cpu_log_nowait
[1216]	1042	! ------------
[1682]	1043	!> Transposition of input array (f_in) from z to y. For the input array, all
	1044	!> elements along z reside on the same PE, while after transposition, all
	1045	!> elements along y reside on the same PE.
[1]	1046	!------------------------------------------------------------------------------!
[1682]	1047	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	1048
[1682]	1049
[1320]	1050	USE cpulog, &
	1051	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	1052
[1320]	1053	USE indices, &
	1054	ONLY: ny, nz
	1055
	1056	USE kinds
	1057
[1324]	1058	USE pegrid
[1320]	1059
	1060	USE transpose_indices, &
	1061	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	1062
[1]	1063	IMPLICIT NONE
	1064
[4171]	1065	INTEGER(iwp) :: i !<
	1066	INTEGER(iwp) :: j !<
	1067	INTEGER(iwp) :: k !<
	1068	INTEGER(iwp) :: l !<
	1069	INTEGER(iwp) :: zs !<
[1]	1070
[4171]	1071	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	1072	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	1073
[1682]	1074	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	1075	#if __acc_fft_device
[3634]	1076	!$ACC DECLARE CREATE(work)
[3690]	1077	#endif
[1111]	1078
[1]	1079	!
	1080	!-- If the PE grid is one-dimensional along y, the array has only to be
	1081	!-- reordered locally and therefore no transposition has to be done.
	1082	IF ( pdims(1) /= 1 ) THEN
[1106]	1083
	1084	#if defined( __parallel )
[1]	1085	!
	1086	!-- Reorder input array for transposition
[1111]	1087	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	1088	!$OMP DO
[1]	1089	DO l = 0, pdims(1) - 1
[1003]	1090	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	1091	#if __acc_fft_device
[3634]	1092	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1093	!$ACC PRESENT(work, f_in)
[3690]	1094	#endif
[1003]	1095	DO j = nys_z, nyn_z
	1096	DO k = zs, zs + nzt_y - nzb_y
	1097	DO i = nxl_z, nxr_z
[1111]	1098	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	1099	ENDDO
	1100	ENDDO
	1101	ENDDO
	1102	ENDDO
[683]	1103	!$OMP END PARALLEL
[1]	1104
	1105	!
	1106	!-- Transpose array
[1318]	1107	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1108
	1109	#if __acc_fft_device
[3657]	1110	#ifndef __cuda_aware_mpi
[3634]	1111	!$ACC UPDATE HOST(work)
[3657]	1112	#else
	1113	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1114	#endif
[3690]	1115	#endif
	1116
[622]	1117	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	1118	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	1119	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	1120	comm1dx, ierr )
[3690]	1121
	1122	#if __acc_fft_device
[3657]	1123	#ifndef __cuda_aware_mpi
[3634]	1124	!$ACC UPDATE DEVICE(f_inv)
[3657]	1125	#else
	1126	!$ACC END HOST_DATA
	1127	#endif
[3690]	1128	#endif
	1129
[1]	1130	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	1131	#endif
[1]	1132
	1133	ELSE
	1134	!
[1106]	1135	!-- Reorder the array in the same way like ALLTOALL did it
[683]	1136	!$OMP PARALLEL PRIVATE ( i, j, k )
	1137	!$OMP DO
[3690]	1138	#if __acc_fft_device
[3634]	1139	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1140	!$ACC PRESENT(f_inv, f_in)
[3690]	1141	#endif
[1003]	1142	DO k = nzb_y, nzt_y
	1143	DO j = 0, ny
	1144	DO i = nxl_y, nxr_y
[164]	1145	f_inv(i,k,j) = f_in(i,j,k)
	1146	ENDDO
	1147	ENDDO
	1148	ENDDO
[683]	1149	!$OMP END PARALLEL
[1106]	1150
	1151	ENDIF
	1152
[1]	1153	END SUBROUTINE transpose_zy
	1154
	1155
	1156	!------------------------------------------------------------------------------!
	1157	! Description:
	1158	! ------------
[1682]	1159	!> Transposition of input array (f_in) from z to y. For the input array, all
	1160	!> elements along z reside on the same PE, while after transposition, all
	1161	!> elements along y reside on the same PE.
	1162	!> This is a direct transposition for arrays with indices in regular order
	1163	!> (k,j,i) (cf. transpose_zy).
[1]	1164	!------------------------------------------------------------------------------!
[1682]	1165	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1166
[1682]	1167
[1320]	1168	USE cpulog, &
[3241]	1169	ONLY: cpu_log, log_point_s
[1]	1170
[1320]	1171	USE indices, &
	1172	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1173
	1174	USE kinds
	1175
[1324]	1176	USE pegrid
[1320]	1177
	1178	USE transpose_indices, &
[3241]	1179	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
[1320]	1180
[1]	1181	IMPLICIT NONE
	1182
[4171]	1183	INTEGER(iwp) :: i !<
	1184	INTEGER(iwp) :: j !<
	1185	INTEGER(iwp) :: k !<
	1186	INTEGER(iwp) :: l !<
	1187	INTEGER(iwp) :: m !<
	1188	INTEGER(iwp) :: ys !<
[1]	1189
[4171]	1190	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1191	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1192	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1193	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1194
[1]	1195	#if defined( __parallel )
	1196
	1197	!
	1198	!-- Rearrange indices of input array in order to make data to be send
	1199	!-- by MPI contiguous
[1003]	1200	DO i = nxl, nxr
	1201	DO j = nys, nyn
	1202	DO k = 1, nz
[164]	1203	f_inv(j,i,k) = f_in(k,j,i)
[1]	1204	ENDDO
	1205	ENDDO
	1206	ENDDO
	1207
	1208	!
	1209	!-- Move data to different array, because memory location of work1 is
	1210	!-- needed further below (work1 = work2).
	1211	!-- If the PE grid is one-dimensional along x, only local reordering
	1212	!-- of the data is necessary and no transposition has to be done.
	1213	IF ( pdims(2) == 1 ) THEN
[1003]	1214	DO k = 1, nz
	1215	DO i = nxl, nxr
	1216	DO j = nys, nyn
[164]	1217	f_out(j,i,k) = f_inv(j,i,k)
[1]	1218	ENDDO
	1219	ENDDO
	1220	ENDDO
	1221	RETURN
	1222	ENDIF
	1223
	1224	!
	1225	!-- Transpose array
	1226	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1227	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1228	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1229	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1230	comm1dy, ierr )
	1231	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1232
	1233	!
	1234	!-- Reorder transposed array
	1235	m = 0
	1236	DO l = 0, pdims(2) - 1
	1237	ys = 0 + l * nny
[1003]	1238	DO k = nzb_yd, nzt_yd
	1239	DO i = nxl_yd, nxr_yd
[1]	1240	DO j = ys, ys + nny - 1
	1241	m = m + 1
[164]	1242	f_out(j,i,k) = work(m)
[1]	1243	ENDDO
	1244	ENDDO
	1245	ENDDO
	1246	ENDDO
	1247
	1248	#endif
	1249
	1250	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |