Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4087

Last change on this file since 4087 was 3832, checked in by raasch, 6 years ago
some routines instrumented with openmp directives, loop reordering for performance optimization
Property svn:keywords set to `Id`
File size: 34.0 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[3655]	17	! Copyright 1997-2019 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 3832 2019-03-28 13:16:58Z gronemeier $
[3832]	27	! loop reordering for performance optimization
	28	!
	29	! 3694 2019-01-23 17:01:49Z knoop
[3634]	30	! OpenACC port for SPEC
	31	!
	32	! 3241 2018-09-12 15:02:00Z raasch
[3241]	33	! unused variables removed
	34	!
	35	! 2718 2018-01-02 08:49:38Z maronga
[2716]	36	! Corrected "Former revisions" section
	37	!
	38	! 2696 2017-12-14 17:12:51Z kanani
	39	! Change in file header (GPL part)
[1321]	40	!
[2716]	41	! 2119 2017-01-17 16:51:50Z raasch
	42	!
[2119]	43	! 2118 2017-01-17 16:38:49Z raasch
	44	! OpenACC directives removed
	45	!
[2001]	46	! 2000 2016-08-20 18:09:15Z knoop
	47	! Forced header and separation lines into 80 columns
	48	!
[1683]	49	! 1682 2015-10-07 23:56:08Z knoop
	50	! Code annotations made doxygen readable
	51	!
[1325]	52	! 1324 2014-03-21 09:13:16Z suehring
	53	! Bugfix: ONLY statement for module pegrid removed
	54	!
[1321]	55	! 1320 2014-03-20 08:40:49Z raasch
[1320]	56	! ONLY-attribute added to USE-statements,
	57	! kind-parameters added to all INTEGER and REAL declaration statements,
	58	! kinds are defined in new module kinds,
	59	! old module precision_kind is removed,
	60	! revision history before 2012 removed,
	61	! comment fields (!:) to be used for variable explanations added to
	62	! all variable declaration statements
[198]	63	!
[1319]	64	! 1318 2014-03-17 13:35:16Z raasch
	65	! cpu_log_nowait parameter added to cpu measurements of the transpositions
	66	! required for solving the Poisson equation (poisfft),
	67	! module interfaces removed
	68	!
[1258]	69	! 1257 2013-11-08 15:18:40Z raasch
	70	! openacc loop and loop vector clauses removed
	71	!
[1217]	72	! 1216 2013-08-26 09:31:42Z raasch
	73	! re-sorting of the transposed / to be transposed arrays moved to separate
	74	! routines resort_for_...
	75	!
[1112]	76	! 1111 2013-03-08 23:54:10Z raasch
	77	! openACC directives added,
	78	! resorting data from/to work changed, work got 4 dimensions instead of 1
	79	!
[1107]	80	! 1106 2013-03-04 05:31:38Z raasch
	81	! preprocessor lines rearranged so that routines can also be used in serial
	82	! (non-parallel) mode
	83	!
[1093]	84	! 1092 2013-02-02 11:24:22Z raasch
	85	! unused variables removed
	86	!
[1037]	87	! 1036 2012-10-22 13:43:42Z raasch
	88	! code put under GPL (PALM 3.9)
	89	!
[1004]	90	! 1003 2012-09-14 14:35:53Z raasch
	91	! indices nxa, nya, etc. replaced by nx, ny, etc.
	92	!
[1]	93	! Revision 1.1 1997/07/24 11:25:18 raasch
	94	! Initial revision
	95	!
[3690]	96
	97	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	98
[1216]	99	!------------------------------------------------------------------------------!
	100	! Description:
	101	! ------------
[1682]	102	!> Resorting data for the transposition from x to y. The transposition itself
	103	!> is carried out in transpose_xy
[1216]	104	!------------------------------------------------------------------------------!
[1682]	105	SUBROUTINE resort_for_xy( f_in, f_inv )
	106
[1216]	107
[1320]	108	USE indices, &
	109	ONLY: nx
[1216]	110
[1320]	111	USE kinds
	112
	113	USE transpose_indices, &
[3241]	114	ONLY: nyn_x, nys_x, nzb_x, nzt_x
[1320]	115
[1216]	116	IMPLICIT NONE
	117
[1682]	118	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	119	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	120
	121
[1682]	122	INTEGER(iwp) :: i !<
	123	INTEGER(iwp) :: j !<
	124	INTEGER(iwp) :: k !<
[1]	125	!
[1216]	126	!-- Rearrange indices of input array in order to make data to be send
	127	!-- by MPI contiguous
	128	!$OMP PARALLEL PRIVATE ( i, j, k )
	129	!$OMP DO
[3690]	130	#if __acc_fft_device
[3634]	131	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	132	!$ACC PRESENT(f_inv, f_in)
[3690]	133	#endif
[3832]	134	DO k = nzb_x, nzt_x
	135	DO i = 0, nx
[1216]	136	DO j = nys_x, nyn_x
	137	f_inv(j,k,i) = f_in(i,j,k)
	138	ENDDO
	139	ENDDO
	140	ENDDO
	141	!$OMP END PARALLEL
	142
	143	END SUBROUTINE resort_for_xy
	144
	145
	146	!------------------------------------------------------------------------------!
[1]	147	! Description:
	148	! ------------
[1682]	149	!> Transposition of input array (f_in) from x to y. For the input array, all
	150	!> elements along x reside on the same PE, while after transposition, all
	151	!> elements along y reside on the same PE.
[1]	152	!------------------------------------------------------------------------------!
[1682]	153	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	154
[1682]	155
[1320]	156	USE cpulog, &
	157	ONLY: cpu_log, cpu_log_nowait, log_point_s
	158
	159	USE indices, &
	160	ONLY: nx, ny
	161
	162	USE kinds
	163
[1]	164	USE pegrid
	165
[1320]	166	USE transpose_indices, &
	167	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	168
[1]	169	IMPLICIT NONE
	170
[1682]	171	INTEGER(iwp) :: i !<
	172	INTEGER(iwp) :: j !<
	173	INTEGER(iwp) :: k !<
	174	INTEGER(iwp) :: l !<
	175	INTEGER(iwp) :: ys !<
[1320]	176
[1682]	177	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	178	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1]	179
[1682]	180	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	181	#if __acc_fft_device
[3634]	182	!$ACC DECLARE CREATE(work)
[3690]	183	#endif
[1111]	184
	185
[1106]	186	IF ( numprocs /= 1 ) THEN
	187
	188	#if defined( __parallel )
[1]	189	!
[1106]	190	!-- Transpose array
[1318]	191	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	192
	193	#if __acc_fft_device
[3657]	194	#ifndef __cuda_aware_mpi
[3634]	195	!$ACC UPDATE HOST(f_inv)
[3657]	196	#else
	197	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	198	#endif
[3690]	199	#endif
	200
[1106]	201	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	202	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	203	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	204	comm1dy, ierr )
[3690]	205
	206	#if __acc_fft_device
[3657]	207	#ifndef __cuda_aware_mpi
[3634]	208	!$ACC UPDATE DEVICE(work)
[3657]	209	#else
	210	!$ACC END HOST_DATA
	211	#endif
[3690]	212	#endif
	213
[1106]	214	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	215
	216	!
[1106]	217	!-- Reorder transposed array
[1111]	218	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	219	!$OMP DO
[1106]	220	DO l = 0, pdims(2) - 1
	221	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	222	#if __acc_fft_device
[3634]	223	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	224	!$ACC PRESENT(f_out, work)
[3690]	225	#endif
[1106]	226	DO i = nxl_y, nxr_y
	227	DO k = nzb_y, nzt_y
	228	DO j = ys, ys + nyn_x - nys_x
[1111]	229	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	230	ENDDO
[1]	231	ENDDO
	232	ENDDO
	233	ENDDO
[683]	234	!$OMP END PARALLEL
[1]	235	#endif
	236
[1106]	237	ELSE
	238
	239	!
	240	!-- Reorder transposed array
	241	!$OMP PARALLEL PRIVATE ( i, j, k )
	242	!$OMP DO
[3690]	243	#if __acc_fft_device
[3634]	244	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	245	!$ACC PRESENT(f_out, f_inv)
[3690]	246	#endif
[1106]	247	DO k = nzb_y, nzt_y
	248	DO i = nxl_y, nxr_y
	249	DO j = 0, ny
	250	f_out(j,i,k) = f_inv(j,k,i)
	251	ENDDO
	252	ENDDO
	253	ENDDO
	254	!$OMP END PARALLEL
	255
	256	ENDIF
	257
[1]	258	END SUBROUTINE transpose_xy
	259
	260
	261	!------------------------------------------------------------------------------!
	262	! Description:
	263	! ------------
[1682]	264	!> Resorting data after the transposition from x to z. The transposition itself
	265	!> is carried out in transpose_xz
[1216]	266	!------------------------------------------------------------------------------!
[1682]	267	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	268
[1682]	269
[1320]	270	USE indices, &
	271	ONLY: nxl, nxr, nyn, nys, nz
[1216]	272
[1320]	273	USE kinds
	274
[1216]	275	IMPLICIT NONE
	276
[1682]	277	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	278	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	279
[1682]	280	INTEGER(iwp) :: i !<
	281	INTEGER(iwp) :: j !<
	282	INTEGER(iwp) :: k !<
[1216]	283	!
	284	!-- Rearrange indices of input array in order to make data to be send
	285	!-- by MPI contiguous.
	286	!-- In case of parallel fft/transposition, scattered store is faster in
	287	!-- backward direction!!!
	288	!$OMP PARALLEL PRIVATE ( i, j, k )
	289	!$OMP DO
[3690]	290	#if __acc_fft_device
[3634]	291	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	292	!$ACC PRESENT(f_out, f_inv)
[3690]	293	#endif
[1216]	294	DO k = 1, nz
	295	DO i = nxl, nxr
	296	DO j = nys, nyn
	297	f_out(k,j,i) = f_inv(j,i,k)
	298	ENDDO
	299	ENDDO
	300	ENDDO
	301	!$OMP END PARALLEL
	302
	303	END SUBROUTINE resort_for_xz
	304
	305
	306	!------------------------------------------------------------------------------!
	307	! Description:
	308	! ------------
[1682]	309	!> Transposition of input array (f_in) from x to z. For the input array, all
	310	!> elements along x reside on the same PE, while after transposition, all
	311	!> elements along z reside on the same PE.
[1]	312	!------------------------------------------------------------------------------!
[1682]	313	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	314
[1682]	315
[1320]	316	USE cpulog, &
	317	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	318
[1320]	319	USE indices, &
[3241]	320	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
[1320]	321
	322	USE kinds
	323
[1324]	324	USE pegrid
[1320]	325
	326	USE transpose_indices, &
	327	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	328
[1]	329	IMPLICIT NONE
	330
[1682]	331	INTEGER(iwp) :: i !<
	332	INTEGER(iwp) :: j !<
	333	INTEGER(iwp) :: k !<
	334	INTEGER(iwp) :: l !<
	335	INTEGER(iwp) :: xs !<
[1]	336
[1682]	337	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	338	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	339
[1682]	340	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	341	#if __acc_fft_device
[3634]	342	!$ACC DECLARE CREATE(work)
[3690]	343	#endif
[1111]	344
[1320]	345
[1]	346	!
	347	!-- If the PE grid is one-dimensional along y, the array has only to be
	348	!-- reordered locally and therefore no transposition has to be done.
	349	IF ( pdims(1) /= 1 ) THEN
[1106]	350
	351	#if defined( __parallel )
[1]	352	!
	353	!-- Reorder input array for transposition
[1111]	354	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	355	!$OMP DO
[1]	356	DO l = 0, pdims(1) - 1
	357	xs = 0 + l * nnx
[3690]	358	#if __acc_fft_device
[3634]	359	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	360	!$ACC PRESENT(work, f_in)
[3690]	361	#endif
[1003]	362	DO k = nzb_x, nzt_x
[164]	363	DO i = xs, xs + nnx - 1
[1003]	364	DO j = nys_x, nyn_x
[1111]	365	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	366	ENDDO
	367	ENDDO
	368	ENDDO
	369	ENDDO
[683]	370	!$OMP END PARALLEL
[1]	371
	372	!
	373	!-- Transpose array
[1318]	374	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	375
	376	#if __acc_fft_device
[3657]	377	#ifndef __cuda_aware_mpi
[3634]	378	!$ACC UPDATE HOST(work)
[3657]	379	#else
	380	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	381	#endif
[3690]	382	#endif
	383
[622]	384	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	385	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	386	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	387	comm1dx, ierr )
[3690]	388
	389	#if __acc_fft_device
[3657]	390	#ifndef __cuda_aware_mpi
[3634]	391	!$ACC UPDATE DEVICE(f_inv)
[3657]	392	#else
	393	!$ACC END HOST_DATA
	394	#endif
[3694]	395	#endif
	396
[1]	397	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	398	#endif
	399
[1]	400	ELSE
[1106]	401
[1]	402	!
	403	!-- Reorder the array in a way that the z index is in first position
[683]	404	!$OMP PARALLEL PRIVATE ( i, j, k )
	405	!$OMP DO
[3690]	406	#if __acc_fft_device
[3634]	407	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	408	!$ACC PRESENT(f_inv, f_in)
[3690]	409	#endif
[1003]	410	DO i = nxl, nxr
	411	DO j = nys, nyn
	412	DO k = 1, nz
[164]	413	f_inv(j,i,k) = f_in(i,j,k)
[1]	414	ENDDO
	415	ENDDO
	416	ENDDO
[683]	417	!$OMP END PARALLEL
[1]	418
[164]	419	ENDIF
	420
[1]	421	END SUBROUTINE transpose_xz
	422
	423
	424	!------------------------------------------------------------------------------!
	425	! Description:
	426	! ------------
[1682]	427	!> Resorting data after the transposition from y to x. The transposition itself
	428	!> is carried out in transpose_yx
[1216]	429	!------------------------------------------------------------------------------!
[1682]	430	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	431
[1682]	432
[1320]	433	USE indices, &
	434	ONLY: nx
[1216]	435
[1320]	436	USE kinds
	437
	438	USE transpose_indices, &
	439	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	440
[1216]	441	IMPLICIT NONE
	442
[1682]	443	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	444	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	445
	446
[1682]	447	INTEGER(iwp) :: i !<
	448	INTEGER(iwp) :: j !<
	449	INTEGER(iwp) :: k !<
[1216]	450	!
	451	!-- Rearrange indices of input array in order to make data to be send
	452	!-- by MPI contiguous
	453	!$OMP PARALLEL PRIVATE ( i, j, k )
	454	!$OMP DO
[3690]	455	#if __acc_fft_device
[3634]	456	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	457	!$ACC PRESENT(f_out, f_inv)
[3690]	458	#endif
[1216]	459	DO i = 0, nx
	460	DO k = nzb_x, nzt_x
	461	DO j = nys_x, nyn_x
	462	f_out(i,j,k) = f_inv(j,k,i)
	463	ENDDO
	464	ENDDO
	465	ENDDO
	466	!$OMP END PARALLEL
	467
	468	END SUBROUTINE resort_for_yx
	469
	470
	471	!------------------------------------------------------------------------------!
	472	! Description:
	473	! ------------
[1682]	474	!> Transposition of input array (f_in) from y to x. For the input array, all
	475	!> elements along y reside on the same PE, while after transposition, all
	476	!> elements along x reside on the same PE.
[1]	477	!------------------------------------------------------------------------------!
[1682]	478	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	479
[1682]	480
[1320]	481	USE cpulog, &
	482	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	483
[1320]	484	USE indices, &
	485	ONLY: nx, ny
	486
	487	USE kinds
	488
[1324]	489	USE pegrid
[1320]	490
	491	USE transpose_indices, &
	492	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	493
[1]	494	IMPLICIT NONE
	495
[1682]	496	INTEGER(iwp) :: i !<
	497	INTEGER(iwp) :: j !<
	498	INTEGER(iwp) :: k !<
	499	INTEGER(iwp) :: l !<
	500	INTEGER(iwp) :: ys !<
[1]	501
[1682]	502	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	503	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	504
[1682]	505	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	506	#if __acc_fft_device
[3634]	507	!$ACC DECLARE CREATE(work)
[3690]	508	#endif
[1111]	509
[1320]	510
[1106]	511	IF ( numprocs /= 1 ) THEN
	512
[1]	513	#if defined( __parallel )
	514	!
[1106]	515	!-- Reorder input array for transposition
[1111]	516	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	517	!$OMP DO
[1106]	518	DO l = 0, pdims(2) - 1
	519	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	520	#if __acc_fft_device
[3634]	521	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	522	!$ACC PRESENT(work, f_in)
[3690]	523	#endif
[1106]	524	DO i = nxl_y, nxr_y
	525	DO k = nzb_y, nzt_y
	526	DO j = ys, ys + nyn_x - nys_x
[1111]	527	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	528	ENDDO
	529	ENDDO
	530	ENDDO
	531	ENDDO
	532	!$OMP END PARALLEL
	533
	534	!
	535	!-- Transpose array
[1318]	536	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	537
	538	#if __acc_fft_device
[3657]	539	#ifndef __cuda_aware_mpi
[3634]	540	!$ACC UPDATE HOST(work)
[3657]	541	#else
	542	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	543	#endif
[3690]	544	#endif
	545
[1106]	546	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	547	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	548	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	549	comm1dy, ierr )
[3690]	550
	551	#if __acc_fft_device
[3657]	552	#ifndef __cuda_aware_mpi
[3634]	553	!$ACC UPDATE DEVICE(f_inv)
[3657]	554	#else
	555	!$ACC END HOST_DATA
	556	#endif
[3690]	557	#endif
	558
[1106]	559	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	560	#endif
	561
	562	ELSE
	563
	564	!
	565	!-- Reorder array f_in the same way as ALLTOALL did it
	566	!$OMP PARALLEL PRIVATE ( i, j, k )
	567	!$OMP DO
[3690]	568	#if __acc_fft_device
[3634]	569	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	570	!$ACC PRESENT(f_inv, f_in)
[3690]	571	#endif
[1003]	572	DO i = nxl_y, nxr_y
	573	DO k = nzb_y, nzt_y
[1106]	574	DO j = 0, ny
	575	f_inv(j,k,i) = f_in(j,i,k)
[1]	576	ENDDO
	577	ENDDO
	578	ENDDO
[683]	579	!$OMP END PARALLEL
[1]	580
[1106]	581	ENDIF
[1]	582
	583	END SUBROUTINE transpose_yx
	584
	585
	586	!------------------------------------------------------------------------------!
	587	! Description:
	588	! ------------
[1682]	589	!> Transposition of input array (f_in) from y to x. For the input array, all
	590	!> elements along y reside on the same PE, while after transposition, all
	591	!> elements along x reside on the same PE.
	592	!> This is a direct transposition for arrays with indices in regular order
	593	!> (k,j,i) (cf. transpose_yx).
[1]	594	!------------------------------------------------------------------------------!
[1682]	595	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	596
[1682]	597
[1320]	598	USE cpulog, &
[3241]	599	ONLY: cpu_log, log_point_s
[1]	600
[1320]	601	USE indices, &
	602	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	603
	604	USE kinds
	605
[1324]	606	USE pegrid
[1320]	607
	608	USE transpose_indices, &
	609	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	610
[1]	611	IMPLICIT NONE
	612
[1682]	613	INTEGER(iwp) :: i !<
	614	INTEGER(iwp) :: j !<
	615	INTEGER(iwp) :: k !<
	616	INTEGER(iwp) :: l !<
	617	INTEGER(iwp) :: m !<
	618	INTEGER(iwp) :: xs !<
[1]	619
[1682]	620	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	621	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	622	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	623	REAL(wp) :: work(nnxnnynnz) !<
[1]	624	#if defined( __parallel )
	625
	626	!
	627	!-- Rearrange indices of input array in order to make data to be send
	628	!-- by MPI contiguous
[1003]	629	DO k = 1, nz
	630	DO j = nys, nyn
	631	DO i = nxl, nxr
[164]	632	f_inv(i,k,j) = f_in(k,j,i)
[1]	633	ENDDO
	634	ENDDO
	635	ENDDO
	636
	637	!
	638	!-- Transpose array
	639	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	640	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	641	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	642	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	643	comm1dx, ierr )
	644	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	645
	646	!
	647	!-- Reorder transposed array
	648	m = 0
	649	DO l = 0, pdims(1) - 1
	650	xs = 0 + l * nnx
[1003]	651	DO j = nys_x, nyn_x
	652	DO k = 1, nz
[1]	653	DO i = xs, xs + nnx - 1
	654	m = m + 1
[164]	655	f_out(i,j,k) = work(m)
[1]	656	ENDDO
	657	ENDDO
	658	ENDDO
	659	ENDDO
	660
	661	#endif
	662
	663	END SUBROUTINE transpose_yxd
	664
	665
	666	!------------------------------------------------------------------------------!
	667	! Description:
	668	! ------------
[1682]	669	!> Resorting data for the transposition from y to z. The transposition itself
	670	!> is carried out in transpose_yz
[1216]	671	!------------------------------------------------------------------------------!
[1682]	672	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	673
[1682]	674
[1320]	675	USE indices, &
	676	ONLY: ny
[1216]	677
[1320]	678	USE kinds
	679
	680	USE transpose_indices, &
	681	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	682
[1216]	683	IMPLICIT NONE
	684
[1682]	685	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	686	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	687
[1682]	688	INTEGER(iwp) :: i !<
	689	INTEGER(iwp) :: j !<
	690	INTEGER(iwp) :: k !<
[1216]	691
	692	!
	693	!-- Rearrange indices of input array in order to make data to be send
	694	!-- by MPI contiguous
	695	!$OMP PARALLEL PRIVATE ( i, j, k )
	696	!$OMP DO
[3690]	697	#if __acc_fft_device
[3634]	698	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	699	!$ACC PRESENT(f_inv, f_in)
[3690]	700	#endif
[1216]	701	DO j = 0, ny
	702	DO k = nzb_y, nzt_y
	703	DO i = nxl_y, nxr_y
	704	f_inv(i,k,j) = f_in(j,i,k)
	705	ENDDO
	706	ENDDO
	707	ENDDO
	708	!$OMP END PARALLEL
	709
	710	END SUBROUTINE resort_for_yz
	711
	712
	713	!------------------------------------------------------------------------------!
	714	! Description:
	715	! ------------
[1682]	716	!> Transposition of input array (f_in) from y to z. For the input array, all
	717	!> elements along y reside on the same PE, while after transposition, all
	718	!> elements along z reside on the same PE.
[1]	719	!------------------------------------------------------------------------------!
[1682]	720	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	721
[1682]	722
[1320]	723	USE cpulog, &
	724	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	725
[1320]	726	USE indices, &
	727	ONLY: ny, nz
	728
	729	USE kinds
	730
[1324]	731	USE pegrid
[1320]	732
	733	USE transpose_indices, &
	734	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	735
[1]	736	IMPLICIT NONE
	737
[1682]	738	INTEGER(iwp) :: i !<
	739	INTEGER(iwp) :: j !<
	740	INTEGER(iwp) :: k !<
	741	INTEGER(iwp) :: l !<
	742	INTEGER(iwp) :: zs !<
[1]	743
[1682]	744	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	745	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	746
[1682]	747	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	748	#if __acc_fft_device
[3634]	749	!$ACC DECLARE CREATE(work)
[3690]	750	#endif
[1111]	751
[1320]	752
[1]	753	!
	754	!-- If the PE grid is one-dimensional along y, only local reordering
	755	!-- of the data is necessary and no transposition has to be done.
	756	IF ( pdims(1) == 1 ) THEN
[1106]	757
[683]	758	!$OMP PARALLEL PRIVATE ( i, j, k )
	759	!$OMP DO
[3690]	760	#if __acc_fft_device
[3634]	761	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	762	!$ACC PRESENT(f_out, f_inv)
[3690]	763	#endif
[1003]	764	DO j = 0, ny
	765	DO k = nzb_y, nzt_y
	766	DO i = nxl_y, nxr_y
[164]	767	f_out(i,j,k) = f_inv(i,k,j)
[1]	768	ENDDO
	769	ENDDO
	770	ENDDO
[683]	771	!$OMP END PARALLEL
[1]	772
[1106]	773	ELSE
	774
	775	#if defined( __parallel )
[1]	776	!
[1106]	777	!-- Transpose array
[1318]	778	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	779
	780	#if __acc_fft_device
[3657]	781	#ifndef __cuda_aware_mpi
[3634]	782	!$ACC UPDATE HOST(f_inv)
[3657]	783	#else
	784	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	785	#endif
[3690]	786	#endif
	787
[1106]	788	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	789	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	790	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	791	comm1dx, ierr )
[3690]	792
	793	#if __acc_fft_device
[3657]	794	#ifndef __cuda_aware_mpi
[3634]	795	!$ACC UPDATE DEVICE(work)
[3657]	796	#else
	797	!$ACC END HOST_DATA
	798	#endif
[3690]	799	#endif
	800
[1106]	801	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	802
	803	!
[1106]	804	!-- Reorder transposed array
[1111]	805	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	806	!$OMP DO
[1106]	807	DO l = 0, pdims(1) - 1
	808	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	809	#if __acc_fft_device
[3634]	810	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	811	!$ACC PRESENT(f_out, work)
[3690]	812	#endif
[1106]	813	DO j = nys_z, nyn_z
	814	DO k = zs, zs + nzt_y - nzb_y
	815	DO i = nxl_z, nxr_z
[1111]	816	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	817	ENDDO
[1]	818	ENDDO
	819	ENDDO
	820	ENDDO
[683]	821	!$OMP END PARALLEL
[1]	822	#endif
	823
[1106]	824	ENDIF
	825
[1]	826	END SUBROUTINE transpose_yz
	827
	828
	829	!------------------------------------------------------------------------------!
	830	! Description:
	831	! ------------
[1682]	832	!> Resorting data for the transposition from z to x. The transposition itself
	833	!> is carried out in transpose_zx
[1216]	834	!------------------------------------------------------------------------------!
[1682]	835	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	836
[1682]	837
[1320]	838	USE indices, &
	839	ONLY: nxl, nxr, nyn, nys, nz
[1216]	840
[1320]	841	USE kinds
	842
[1216]	843	IMPLICIT NONE
	844
[1682]	845	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	846	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	847
[1682]	848	INTEGER(iwp) :: i !<
	849	INTEGER(iwp) :: j !<
	850	INTEGER(iwp) :: k !<
[1216]	851
	852	!
	853	!-- Rearrange indices of input array in order to make data to be send
	854	!-- by MPI contiguous
	855	!$OMP PARALLEL PRIVATE ( i, j, k )
	856	!$OMP DO
[3690]	857	#if __acc_fft_device
[3634]	858	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	859	!$ACC PRESENT(f_in, f_inv)
[3690]	860	#endif
[3832]	861	DO i = nxl, nxr
	862	DO k = 1,nz
[1216]	863	DO j = nys, nyn
	864	f_inv(j,i,k) = f_in(k,j,i)
	865	ENDDO
	866	ENDDO
	867	ENDDO
	868	!$OMP END PARALLEL
	869
	870	END SUBROUTINE resort_for_zx
	871
	872
	873	!------------------------------------------------------------------------------!
	874	! Description:
	875	! ------------
[1682]	876	!> Transposition of input array (f_in) from z to x. For the input array, all
	877	!> elements along z reside on the same PE, while after transposition, all
	878	!> elements along x reside on the same PE.
[1]	879	!------------------------------------------------------------------------------!
[1682]	880	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	881
[1682]	882
[1320]	883	USE cpulog, &
	884	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	885
[1320]	886	USE indices, &
	887	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	888
	889	USE kinds
	890
[1324]	891	USE pegrid
[1320]	892
	893	USE transpose_indices, &
	894	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	895
[1]	896	IMPLICIT NONE
	897
[1682]	898	INTEGER(iwp) :: i !<
	899	INTEGER(iwp) :: j !<
	900	INTEGER(iwp) :: k !<
	901	INTEGER(iwp) :: l !<
	902	INTEGER(iwp) :: xs !<
[1]	903
[1682]	904	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	905	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	906
[1682]	907	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	908	#if __acc_fft_device
[3634]	909	!$ACC DECLARE CREATE(work)
[3690]	910	#endif
[1]	911
[1320]	912
[1]	913	!
	914	!-- If the PE grid is one-dimensional along y, only local reordering
	915	!-- of the data is necessary and no transposition has to be done.
	916	IF ( pdims(1) == 1 ) THEN
[1106]	917
[683]	918	!$OMP PARALLEL PRIVATE ( i, j, k )
	919	!$OMP DO
[3690]	920	#if __acc_fft_device
[3634]	921	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	922	!$ACC PRESENT(f_out, f_inv)
[3690]	923	#endif
[1003]	924	DO k = 1, nz
	925	DO i = nxl, nxr
	926	DO j = nys, nyn
[164]	927	f_out(i,j,k) = f_inv(j,i,k)
[1]	928	ENDDO
	929	ENDDO
	930	ENDDO
[683]	931	!$OMP END PARALLEL
[1]	932
[1106]	933	ELSE
	934
	935	#if defined( __parallel )
[1]	936	!
[1106]	937	!-- Transpose array
[1318]	938	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	939
	940	#if __acc_fft_device
[3657]	941	#ifndef __cuda_aware_mpi
[3634]	942	!$ACC UPDATE HOST(f_inv)
[3657]	943	#else
	944	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	945	#endif
[3690]	946	#endif
	947
[1106]	948	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	949	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	950	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	951	comm1dx, ierr )
[3690]	952
	953	#if __acc_fft_device
[3657]	954	#ifndef __cuda_aware_mpi
[3634]	955	!$ACC UPDATE DEVICE(work)
[3657]	956	#else
	957	!$ACC END HOST_DATA
	958	#endif
[3690]	959	#endif
	960
[1106]	961	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	962
	963	!
[1106]	964	!-- Reorder transposed array
[1111]	965	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	966	!$OMP DO
[1106]	967	DO l = 0, pdims(1) - 1
	968	xs = 0 + l * nnx
[3690]	969	#if __acc_fft_device
[3634]	970	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	971	!$ACC PRESENT(f_out, work)
[3690]	972	#endif
[1106]	973	DO k = nzb_x, nzt_x
	974	DO i = xs, xs + nnx - 1
	975	DO j = nys_x, nyn_x
[1111]	976	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	977	ENDDO
[1]	978	ENDDO
	979	ENDDO
	980	ENDDO
[683]	981	!$OMP END PARALLEL
[1]	982	#endif
	983
[1106]	984	ENDIF
	985
[1]	986	END SUBROUTINE transpose_zx
	987
	988
	989	!------------------------------------------------------------------------------!
	990	! Description:
	991	! ------------
[1682]	992	!> Resorting data after the transposition from z to y. The transposition itself
	993	!> is carried out in transpose_zy
[1216]	994	!------------------------------------------------------------------------------!
[1682]	995	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	996
[1682]	997
[1320]	998	USE indices, &
	999	ONLY: ny
[1216]	1000
[1320]	1001	USE kinds
	1002
	1003	USE transpose_indices, &
	1004	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	1005
[1216]	1006	IMPLICIT NONE
	1007
[1682]	1008	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	1009	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	1010
	1011
[1682]	1012	INTEGER(iwp) :: i !<
	1013	INTEGER(iwp) :: j !<
	1014	INTEGER(iwp) :: k !<
[1216]	1015
	1016	!
	1017	!-- Rearrange indices of input array in order to make data to be send
	1018	!-- by MPI contiguous
	1019	!$OMP PARALLEL PRIVATE ( i, j, k )
	1020	!$OMP DO
[3690]	1021	#if __acc_fft_device
[3634]	1022	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1023	!$ACC PRESENT(f_out, f_inv)
[3690]	1024	#endif
[3832]	1025	DO j = 0, ny
	1026	DO k = nzb_y, nzt_y
[1216]	1027	DO i = nxl_y, nxr_y
	1028	f_out(j,i,k) = f_inv(i,k,j)
	1029	ENDDO
	1030	ENDDO
	1031	ENDDO
	1032	!$OMP END PARALLEL
	1033
	1034	END SUBROUTINE resort_for_zy
	1035
	1036
	1037	!------------------------------------------------------------------------------!
[3241]	1038	! Description:cpu_log_nowait
[1216]	1039	! ------------
[1682]	1040	!> Transposition of input array (f_in) from z to y. For the input array, all
	1041	!> elements along z reside on the same PE, while after transposition, all
	1042	!> elements along y reside on the same PE.
[1]	1043	!------------------------------------------------------------------------------!
[1682]	1044	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	1045
[1682]	1046
[1320]	1047	USE cpulog, &
	1048	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	1049
[1320]	1050	USE indices, &
	1051	ONLY: ny, nz
	1052
	1053	USE kinds
	1054
[1324]	1055	USE pegrid
[1320]	1056
	1057	USE transpose_indices, &
	1058	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	1059
[1]	1060	IMPLICIT NONE
	1061
[1682]	1062	INTEGER(iwp) :: i !<
	1063	INTEGER(iwp) :: j !<
	1064	INTEGER(iwp) :: k !<
	1065	INTEGER(iwp) :: l !<
	1066	INTEGER(iwp) :: zs !<
[1]	1067
[1682]	1068	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	1069	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	1070
[1682]	1071	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	1072	#if __acc_fft_device
[3634]	1073	!$ACC DECLARE CREATE(work)
[3690]	1074	#endif
[1111]	1075
[1]	1076	!
	1077	!-- If the PE grid is one-dimensional along y, the array has only to be
	1078	!-- reordered locally and therefore no transposition has to be done.
	1079	IF ( pdims(1) /= 1 ) THEN
[1106]	1080
	1081	#if defined( __parallel )
[1]	1082	!
	1083	!-- Reorder input array for transposition
[1111]	1084	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	1085	!$OMP DO
[1]	1086	DO l = 0, pdims(1) - 1
[1003]	1087	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	1088	#if __acc_fft_device
[3634]	1089	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1090	!$ACC PRESENT(work, f_in)
[3690]	1091	#endif
[1003]	1092	DO j = nys_z, nyn_z
	1093	DO k = zs, zs + nzt_y - nzb_y
	1094	DO i = nxl_z, nxr_z
[1111]	1095	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	1096	ENDDO
	1097	ENDDO
	1098	ENDDO
	1099	ENDDO
[683]	1100	!$OMP END PARALLEL
[1]	1101
	1102	!
	1103	!-- Transpose array
[1318]	1104	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1105
	1106	#if __acc_fft_device
[3657]	1107	#ifndef __cuda_aware_mpi
[3634]	1108	!$ACC UPDATE HOST(work)
[3657]	1109	#else
	1110	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1111	#endif
[3690]	1112	#endif
	1113
[622]	1114	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	1115	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	1116	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	1117	comm1dx, ierr )
[3690]	1118
	1119	#if __acc_fft_device
[3657]	1120	#ifndef __cuda_aware_mpi
[3634]	1121	!$ACC UPDATE DEVICE(f_inv)
[3657]	1122	#else
	1123	!$ACC END HOST_DATA
	1124	#endif
[3690]	1125	#endif
	1126
[1]	1127	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	1128	#endif
[1]	1129
	1130	ELSE
	1131	!
[1106]	1132	!-- Reorder the array in the same way like ALLTOALL did it
[683]	1133	!$OMP PARALLEL PRIVATE ( i, j, k )
	1134	!$OMP DO
[3690]	1135	#if __acc_fft_device
[3634]	1136	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1137	!$ACC PRESENT(f_inv, f_in)
[3690]	1138	#endif
[1003]	1139	DO k = nzb_y, nzt_y
	1140	DO j = 0, ny
	1141	DO i = nxl_y, nxr_y
[164]	1142	f_inv(i,k,j) = f_in(i,j,k)
	1143	ENDDO
	1144	ENDDO
	1145	ENDDO
[683]	1146	!$OMP END PARALLEL
[1106]	1147
	1148	ENDIF
	1149
[1]	1150	END SUBROUTINE transpose_zy
	1151
	1152
	1153	!------------------------------------------------------------------------------!
	1154	! Description:
	1155	! ------------
[1682]	1156	!> Transposition of input array (f_in) from z to y. For the input array, all
	1157	!> elements along z reside on the same PE, while after transposition, all
	1158	!> elements along y reside on the same PE.
	1159	!> This is a direct transposition for arrays with indices in regular order
	1160	!> (k,j,i) (cf. transpose_zy).
[1]	1161	!------------------------------------------------------------------------------!
[1682]	1162	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1163
[1682]	1164
[1320]	1165	USE cpulog, &
[3241]	1166	ONLY: cpu_log, log_point_s
[1]	1167
[1320]	1168	USE indices, &
	1169	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1170
	1171	USE kinds
	1172
[1324]	1173	USE pegrid
[1320]	1174
	1175	USE transpose_indices, &
[3241]	1176	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
[1320]	1177
[1]	1178	IMPLICIT NONE
	1179
[1682]	1180	INTEGER(iwp) :: i !<
	1181	INTEGER(iwp) :: j !<
	1182	INTEGER(iwp) :: k !<
	1183	INTEGER(iwp) :: l !<
	1184	INTEGER(iwp) :: m !<
	1185	INTEGER(iwp) :: ys !<
[1]	1186
[1682]	1187	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1188	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1189	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1190	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1191
[1]	1192	#if defined( __parallel )
	1193
	1194	!
	1195	!-- Rearrange indices of input array in order to make data to be send
	1196	!-- by MPI contiguous
[1003]	1197	DO i = nxl, nxr
	1198	DO j = nys, nyn
	1199	DO k = 1, nz
[164]	1200	f_inv(j,i,k) = f_in(k,j,i)
[1]	1201	ENDDO
	1202	ENDDO
	1203	ENDDO
	1204
	1205	!
	1206	!-- Move data to different array, because memory location of work1 is
	1207	!-- needed further below (work1 = work2).
	1208	!-- If the PE grid is one-dimensional along x, only local reordering
	1209	!-- of the data is necessary and no transposition has to be done.
	1210	IF ( pdims(2) == 1 ) THEN
[1003]	1211	DO k = 1, nz
	1212	DO i = nxl, nxr
	1213	DO j = nys, nyn
[164]	1214	f_out(j,i,k) = f_inv(j,i,k)
[1]	1215	ENDDO
	1216	ENDDO
	1217	ENDDO
	1218	RETURN
	1219	ENDIF
	1220
	1221	!
	1222	!-- Transpose array
	1223	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1224	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1225	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1226	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1227	comm1dy, ierr )
	1228	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1229
	1230	!
	1231	!-- Reorder transposed array
	1232	m = 0
	1233	DO l = 0, pdims(2) - 1
	1234	ys = 0 + l * nny
[1003]	1235	DO k = nzb_yd, nzt_yd
	1236	DO i = nxl_yd, nxr_yd
[1]	1237	DO j = ys, ys + nny - 1
	1238	m = m + 1
[164]	1239	f_out(j,i,k) = work(m)
[1]	1240	ENDDO
	1241	ENDDO
	1242	ENDDO
	1243	ENDDO
	1244
	1245	#endif
	1246
	1247	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |