Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 3692

Last change on this file since 3692 was 3690, checked in by knoop, 5 years ago
Enabled OpenACC usage without using the cudaFFT library. Added respective palmtest build configuration and testcase.
Property svn:keywords set to `Id`
File size: 33.9 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[3655]	17	! Copyright 1997-2019 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 3690 2019-01-22 22:56:42Z suehring $
[3634]	27	! OpenACC port for SPEC
	28	!
	29	! 3241 2018-09-12 15:02:00Z raasch
[3241]	30	! unused variables removed
	31	!
	32	! 2718 2018-01-02 08:49:38Z maronga
[2716]	33	! Corrected "Former revisions" section
	34	!
	35	! 2696 2017-12-14 17:12:51Z kanani
	36	! Change in file header (GPL part)
[1321]	37	!
[2716]	38	! 2119 2017-01-17 16:51:50Z raasch
	39	!
[2119]	40	! 2118 2017-01-17 16:38:49Z raasch
	41	! OpenACC directives removed
	42	!
[2001]	43	! 2000 2016-08-20 18:09:15Z knoop
	44	! Forced header and separation lines into 80 columns
	45	!
[1683]	46	! 1682 2015-10-07 23:56:08Z knoop
	47	! Code annotations made doxygen readable
	48	!
[1325]	49	! 1324 2014-03-21 09:13:16Z suehring
	50	! Bugfix: ONLY statement for module pegrid removed
	51	!
[1321]	52	! 1320 2014-03-20 08:40:49Z raasch
[1320]	53	! ONLY-attribute added to USE-statements,
	54	! kind-parameters added to all INTEGER and REAL declaration statements,
	55	! kinds are defined in new module kinds,
	56	! old module precision_kind is removed,
	57	! revision history before 2012 removed,
	58	! comment fields (!:) to be used for variable explanations added to
	59	! all variable declaration statements
[198]	60	!
[1319]	61	! 1318 2014-03-17 13:35:16Z raasch
	62	! cpu_log_nowait parameter added to cpu measurements of the transpositions
	63	! required for solving the Poisson equation (poisfft),
	64	! module interfaces removed
	65	!
[1258]	66	! 1257 2013-11-08 15:18:40Z raasch
	67	! openacc loop and loop vector clauses removed
	68	!
[1217]	69	! 1216 2013-08-26 09:31:42Z raasch
	70	! re-sorting of the transposed / to be transposed arrays moved to separate
	71	! routines resort_for_...
	72	!
[1112]	73	! 1111 2013-03-08 23:54:10Z raasch
	74	! openACC directives added,
	75	! resorting data from/to work changed, work got 4 dimensions instead of 1
	76	!
[1107]	77	! 1106 2013-03-04 05:31:38Z raasch
	78	! preprocessor lines rearranged so that routines can also be used in serial
	79	! (non-parallel) mode
	80	!
[1093]	81	! 1092 2013-02-02 11:24:22Z raasch
	82	! unused variables removed
	83	!
[1037]	84	! 1036 2012-10-22 13:43:42Z raasch
	85	! code put under GPL (PALM 3.9)
	86	!
[1004]	87	! 1003 2012-09-14 14:35:53Z raasch
	88	! indices nxa, nya, etc. replaced by nx, ny, etc.
	89	!
[1]	90	! Revision 1.1 1997/07/24 11:25:18 raasch
	91	! Initial revision
	92	!
[3690]	93
	94	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	95
[1216]	96	!------------------------------------------------------------------------------!
	97	! Description:
	98	! ------------
[1682]	99	!> Resorting data for the transposition from x to y. The transposition itself
	100	!> is carried out in transpose_xy
[1216]	101	!------------------------------------------------------------------------------!
[1682]	102	SUBROUTINE resort_for_xy( f_in, f_inv )
	103
[1216]	104
[1320]	105	USE indices, &
	106	ONLY: nx
[1216]	107
[1320]	108	USE kinds
	109
	110	USE transpose_indices, &
[3241]	111	ONLY: nyn_x, nys_x, nzb_x, nzt_x
[1320]	112
[1216]	113	IMPLICIT NONE
	114
[1682]	115	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	116	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	117
	118
[1682]	119	INTEGER(iwp) :: i !<
	120	INTEGER(iwp) :: j !<
	121	INTEGER(iwp) :: k !<
[1]	122	!
[1216]	123	!-- Rearrange indices of input array in order to make data to be send
	124	!-- by MPI contiguous
	125	!$OMP PARALLEL PRIVATE ( i, j, k )
	126	!$OMP DO
[3690]	127	#if __acc_fft_device
[3634]	128	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	129	!$ACC PRESENT(f_inv, f_in)
[3690]	130	#endif
[1216]	131	DO i = 0, nx
	132	DO k = nzb_x, nzt_x
	133	DO j = nys_x, nyn_x
	134	f_inv(j,k,i) = f_in(i,j,k)
	135	ENDDO
	136	ENDDO
	137	ENDDO
	138	!$OMP END PARALLEL
	139
	140	END SUBROUTINE resort_for_xy
	141
	142
	143	!------------------------------------------------------------------------------!
[1]	144	! Description:
	145	! ------------
[1682]	146	!> Transposition of input array (f_in) from x to y. For the input array, all
	147	!> elements along x reside on the same PE, while after transposition, all
	148	!> elements along y reside on the same PE.
[1]	149	!------------------------------------------------------------------------------!
[1682]	150	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	151
[1682]	152
[1320]	153	USE cpulog, &
	154	ONLY: cpu_log, cpu_log_nowait, log_point_s
	155
	156	USE indices, &
	157	ONLY: nx, ny
	158
	159	USE kinds
	160
[1]	161	USE pegrid
	162
[1320]	163	USE transpose_indices, &
	164	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	165
[1]	166	IMPLICIT NONE
	167
[1682]	168	INTEGER(iwp) :: i !<
	169	INTEGER(iwp) :: j !<
	170	INTEGER(iwp) :: k !<
	171	INTEGER(iwp) :: l !<
	172	INTEGER(iwp) :: ys !<
[1320]	173
[1682]	174	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	175	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1]	176
[1682]	177	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	178	#if __acc_fft_device
[3634]	179	!$ACC DECLARE CREATE(work)
[3690]	180	#endif
[1111]	181
	182
[1106]	183	IF ( numprocs /= 1 ) THEN
	184
	185	#if defined( __parallel )
[1]	186	!
[1106]	187	!-- Transpose array
[1318]	188	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	189
	190	#if __acc_fft_device
[3657]	191	#ifndef __cuda_aware_mpi
[3634]	192	!$ACC UPDATE HOST(f_inv)
[3657]	193	#else
	194	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	195	#endif
[3690]	196	#endif
	197
[1106]	198	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	199	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	200	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	201	comm1dy, ierr )
[3690]	202
	203	#if __acc_fft_device
[3657]	204	#ifndef __cuda_aware_mpi
[3634]	205	!$ACC UPDATE DEVICE(work)
[3657]	206	#else
	207	!$ACC END HOST_DATA
	208	#endif
[3690]	209	#endif
	210
[1106]	211	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	212
	213	!
[1106]	214	!-- Reorder transposed array
[1111]	215	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	216	!$OMP DO
[1106]	217	DO l = 0, pdims(2) - 1
	218	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	219	#if __acc_fft_device
[3634]	220	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	221	!$ACC PRESENT(f_out, work)
[3690]	222	#endif
[1106]	223	DO i = nxl_y, nxr_y
	224	DO k = nzb_y, nzt_y
	225	DO j = ys, ys + nyn_x - nys_x
[1111]	226	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	227	ENDDO
[1]	228	ENDDO
	229	ENDDO
	230	ENDDO
[683]	231	!$OMP END PARALLEL
[1]	232	#endif
	233
[1106]	234	ELSE
	235
	236	!
	237	!-- Reorder transposed array
	238	!$OMP PARALLEL PRIVATE ( i, j, k )
	239	!$OMP DO
[3690]	240	#if __acc_fft_device
[3634]	241	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	242	!$ACC PRESENT(f_out, f_inv)
[3690]	243	#endif
[1106]	244	DO k = nzb_y, nzt_y
	245	DO i = nxl_y, nxr_y
	246	DO j = 0, ny
	247	f_out(j,i,k) = f_inv(j,k,i)
	248	ENDDO
	249	ENDDO
	250	ENDDO
	251	!$OMP END PARALLEL
	252
	253	ENDIF
	254
[1]	255	END SUBROUTINE transpose_xy
	256
	257
	258	!------------------------------------------------------------------------------!
	259	! Description:
	260	! ------------
[1682]	261	!> Resorting data after the transposition from x to z. The transposition itself
	262	!> is carried out in transpose_xz
[1216]	263	!------------------------------------------------------------------------------!
[1682]	264	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	265
[1682]	266
[1320]	267	USE indices, &
	268	ONLY: nxl, nxr, nyn, nys, nz
[1216]	269
[1320]	270	USE kinds
	271
[1216]	272	IMPLICIT NONE
	273
[1682]	274	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	275	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	276
[1682]	277	INTEGER(iwp) :: i !<
	278	INTEGER(iwp) :: j !<
	279	INTEGER(iwp) :: k !<
[1216]	280	!
	281	!-- Rearrange indices of input array in order to make data to be send
	282	!-- by MPI contiguous.
	283	!-- In case of parallel fft/transposition, scattered store is faster in
	284	!-- backward direction!!!
	285	!$OMP PARALLEL PRIVATE ( i, j, k )
	286	!$OMP DO
[3690]	287	#if __acc_fft_device
[3634]	288	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	289	!$ACC PRESENT(f_out, f_inv)
[3690]	290	#endif
[1216]	291	DO k = 1, nz
	292	DO i = nxl, nxr
	293	DO j = nys, nyn
	294	f_out(k,j,i) = f_inv(j,i,k)
	295	ENDDO
	296	ENDDO
	297	ENDDO
	298	!$OMP END PARALLEL
	299
	300	END SUBROUTINE resort_for_xz
	301
	302
	303	!------------------------------------------------------------------------------!
	304	! Description:
	305	! ------------
[1682]	306	!> Transposition of input array (f_in) from x to z. For the input array, all
	307	!> elements along x reside on the same PE, while after transposition, all
	308	!> elements along z reside on the same PE.
[1]	309	!------------------------------------------------------------------------------!
[1682]	310	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	311
[1682]	312
[1320]	313	USE cpulog, &
	314	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	315
[1320]	316	USE indices, &
[3241]	317	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
[1320]	318
	319	USE kinds
	320
[1324]	321	USE pegrid
[1320]	322
	323	USE transpose_indices, &
	324	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	325
[1]	326	IMPLICIT NONE
	327
[1682]	328	INTEGER(iwp) :: i !<
	329	INTEGER(iwp) :: j !<
	330	INTEGER(iwp) :: k !<
	331	INTEGER(iwp) :: l !<
	332	INTEGER(iwp) :: xs !<
[1]	333
[1682]	334	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	335	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	336
[1682]	337	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	338	#if __acc_fft_device
[3634]	339	!$ACC DECLARE CREATE(work)
[3690]	340	#endif
[1111]	341
[1320]	342
[1]	343	!
	344	!-- If the PE grid is one-dimensional along y, the array has only to be
	345	!-- reordered locally and therefore no transposition has to be done.
	346	IF ( pdims(1) /= 1 ) THEN
[1106]	347
	348	#if defined( __parallel )
[1]	349	!
	350	!-- Reorder input array for transposition
[1111]	351	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	352	!$OMP DO
[1]	353	DO l = 0, pdims(1) - 1
	354	xs = 0 + l * nnx
[3690]	355	#if __acc_fft_device
[3634]	356	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	357	!$ACC PRESENT(work, f_in)
[3690]	358	#endif
[1003]	359	DO k = nzb_x, nzt_x
[164]	360	DO i = xs, xs + nnx - 1
[1003]	361	DO j = nys_x, nyn_x
[1111]	362	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	363	ENDDO
	364	ENDDO
	365	ENDDO
	366	ENDDO
[683]	367	!$OMP END PARALLEL
[1]	368
	369	!
	370	!-- Transpose array
[1318]	371	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	372
	373	#if __acc_fft_device
[3657]	374	#ifndef __cuda_aware_mpi
[3634]	375	!$ACC UPDATE HOST(work)
[3657]	376	#else
	377	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	378	#endif
[3690]	379	#endif
	380
[622]	381	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	382	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	383	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	384	comm1dx, ierr )
[3690]	385
	386	#if __acc_fft_device
[3657]	387	#ifndef __cuda_aware_mpi
[3634]	388	!$ACC UPDATE DEVICE(f_inv)
[3657]	389	#else
	390	!$ACC END HOST_DATA
	391	#endif
[1]	392	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	393	#endif
[3690]	394	#endif
[1106]	395
[1]	396	ELSE
[1106]	397
[1]	398	!
	399	!-- Reorder the array in a way that the z index is in first position
[683]	400	!$OMP PARALLEL PRIVATE ( i, j, k )
	401	!$OMP DO
[3690]	402	#if __acc_fft_device
[3634]	403	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	404	!$ACC PRESENT(f_inv, f_in)
[3690]	405	#endif
[1003]	406	DO i = nxl, nxr
	407	DO j = nys, nyn
	408	DO k = 1, nz
[164]	409	f_inv(j,i,k) = f_in(i,j,k)
[1]	410	ENDDO
	411	ENDDO
	412	ENDDO
[683]	413	!$OMP END PARALLEL
[1]	414
[164]	415	ENDIF
	416
[1]	417	END SUBROUTINE transpose_xz
	418
	419
	420	!------------------------------------------------------------------------------!
	421	! Description:
	422	! ------------
[1682]	423	!> Resorting data after the transposition from y to x. The transposition itself
	424	!> is carried out in transpose_yx
[1216]	425	!------------------------------------------------------------------------------!
[1682]	426	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	427
[1682]	428
[1320]	429	USE indices, &
	430	ONLY: nx
[1216]	431
[1320]	432	USE kinds
	433
	434	USE transpose_indices, &
	435	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	436
[1216]	437	IMPLICIT NONE
	438
[1682]	439	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	440	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	441
	442
[1682]	443	INTEGER(iwp) :: i !<
	444	INTEGER(iwp) :: j !<
	445	INTEGER(iwp) :: k !<
[1216]	446	!
	447	!-- Rearrange indices of input array in order to make data to be send
	448	!-- by MPI contiguous
	449	!$OMP PARALLEL PRIVATE ( i, j, k )
	450	!$OMP DO
[3690]	451	#if __acc_fft_device
[3634]	452	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	453	!$ACC PRESENT(f_out, f_inv)
[3690]	454	#endif
[1216]	455	DO i = 0, nx
	456	DO k = nzb_x, nzt_x
	457	DO j = nys_x, nyn_x
	458	f_out(i,j,k) = f_inv(j,k,i)
	459	ENDDO
	460	ENDDO
	461	ENDDO
	462	!$OMP END PARALLEL
	463
	464	END SUBROUTINE resort_for_yx
	465
	466
	467	!------------------------------------------------------------------------------!
	468	! Description:
	469	! ------------
[1682]	470	!> Transposition of input array (f_in) from y to x. For the input array, all
	471	!> elements along y reside on the same PE, while after transposition, all
	472	!> elements along x reside on the same PE.
[1]	473	!------------------------------------------------------------------------------!
[1682]	474	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	475
[1682]	476
[1320]	477	USE cpulog, &
	478	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	479
[1320]	480	USE indices, &
	481	ONLY: nx, ny
	482
	483	USE kinds
	484
[1324]	485	USE pegrid
[1320]	486
	487	USE transpose_indices, &
	488	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	489
[1]	490	IMPLICIT NONE
	491
[1682]	492	INTEGER(iwp) :: i !<
	493	INTEGER(iwp) :: j !<
	494	INTEGER(iwp) :: k !<
	495	INTEGER(iwp) :: l !<
	496	INTEGER(iwp) :: ys !<
[1]	497
[1682]	498	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	499	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	500
[1682]	501	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	502	#if __acc_fft_device
[3634]	503	!$ACC DECLARE CREATE(work)
[3690]	504	#endif
[1111]	505
[1320]	506
[1106]	507	IF ( numprocs /= 1 ) THEN
	508
[1]	509	#if defined( __parallel )
	510	!
[1106]	511	!-- Reorder input array for transposition
[1111]	512	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	513	!$OMP DO
[1106]	514	DO l = 0, pdims(2) - 1
	515	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	516	#if __acc_fft_device
[3634]	517	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	518	!$ACC PRESENT(work, f_in)
[3690]	519	#endif
[1106]	520	DO i = nxl_y, nxr_y
	521	DO k = nzb_y, nzt_y
	522	DO j = ys, ys + nyn_x - nys_x
[1111]	523	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	524	ENDDO
	525	ENDDO
	526	ENDDO
	527	ENDDO
	528	!$OMP END PARALLEL
	529
	530	!
	531	!-- Transpose array
[1318]	532	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	533
	534	#if __acc_fft_device
[3657]	535	#ifndef __cuda_aware_mpi
[3634]	536	!$ACC UPDATE HOST(work)
[3657]	537	#else
	538	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	539	#endif
[3690]	540	#endif
	541
[1106]	542	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	543	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	544	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	545	comm1dy, ierr )
[3690]	546
	547	#if __acc_fft_device
[3657]	548	#ifndef __cuda_aware_mpi
[3634]	549	!$ACC UPDATE DEVICE(f_inv)
[3657]	550	#else
	551	!$ACC END HOST_DATA
	552	#endif
[3690]	553	#endif
	554
[1106]	555	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	556	#endif
	557
	558	ELSE
	559
	560	!
	561	!-- Reorder array f_in the same way as ALLTOALL did it
	562	!$OMP PARALLEL PRIVATE ( i, j, k )
	563	!$OMP DO
[3690]	564	#if __acc_fft_device
[3634]	565	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	566	!$ACC PRESENT(f_inv, f_in)
[3690]	567	#endif
[1003]	568	DO i = nxl_y, nxr_y
	569	DO k = nzb_y, nzt_y
[1106]	570	DO j = 0, ny
	571	f_inv(j,k,i) = f_in(j,i,k)
[1]	572	ENDDO
	573	ENDDO
	574	ENDDO
[683]	575	!$OMP END PARALLEL
[1]	576
[1106]	577	ENDIF
[1]	578
	579	END SUBROUTINE transpose_yx
	580
	581
	582	!------------------------------------------------------------------------------!
	583	! Description:
	584	! ------------
[1682]	585	!> Transposition of input array (f_in) from y to x. For the input array, all
	586	!> elements along y reside on the same PE, while after transposition, all
	587	!> elements along x reside on the same PE.
	588	!> This is a direct transposition for arrays with indices in regular order
	589	!> (k,j,i) (cf. transpose_yx).
[1]	590	!------------------------------------------------------------------------------!
[1682]	591	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	592
[1682]	593
[1320]	594	USE cpulog, &
[3241]	595	ONLY: cpu_log, log_point_s
[1]	596
[1320]	597	USE indices, &
	598	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	599
	600	USE kinds
	601
[1324]	602	USE pegrid
[1320]	603
	604	USE transpose_indices, &
	605	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	606
[1]	607	IMPLICIT NONE
	608
[1682]	609	INTEGER(iwp) :: i !<
	610	INTEGER(iwp) :: j !<
	611	INTEGER(iwp) :: k !<
	612	INTEGER(iwp) :: l !<
	613	INTEGER(iwp) :: m !<
	614	INTEGER(iwp) :: xs !<
[1]	615
[1682]	616	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	617	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	618	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	619	REAL(wp) :: work(nnxnnynnz) !<
[1]	620	#if defined( __parallel )
	621
	622	!
	623	!-- Rearrange indices of input array in order to make data to be send
	624	!-- by MPI contiguous
[1003]	625	DO k = 1, nz
	626	DO j = nys, nyn
	627	DO i = nxl, nxr
[164]	628	f_inv(i,k,j) = f_in(k,j,i)
[1]	629	ENDDO
	630	ENDDO
	631	ENDDO
	632
	633	!
	634	!-- Transpose array
	635	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	636	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	637	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	638	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	639	comm1dx, ierr )
	640	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	641
	642	!
	643	!-- Reorder transposed array
	644	m = 0
	645	DO l = 0, pdims(1) - 1
	646	xs = 0 + l * nnx
[1003]	647	DO j = nys_x, nyn_x
	648	DO k = 1, nz
[1]	649	DO i = xs, xs + nnx - 1
	650	m = m + 1
[164]	651	f_out(i,j,k) = work(m)
[1]	652	ENDDO
	653	ENDDO
	654	ENDDO
	655	ENDDO
	656
	657	#endif
	658
	659	END SUBROUTINE transpose_yxd
	660
	661
	662	!------------------------------------------------------------------------------!
	663	! Description:
	664	! ------------
[1682]	665	!> Resorting data for the transposition from y to z. The transposition itself
	666	!> is carried out in transpose_yz
[1216]	667	!------------------------------------------------------------------------------!
[1682]	668	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	669
[1682]	670
[1320]	671	USE indices, &
	672	ONLY: ny
[1216]	673
[1320]	674	USE kinds
	675
	676	USE transpose_indices, &
	677	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	678
[1216]	679	IMPLICIT NONE
	680
[1682]	681	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	682	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	683
[1682]	684	INTEGER(iwp) :: i !<
	685	INTEGER(iwp) :: j !<
	686	INTEGER(iwp) :: k !<
[1216]	687
	688	!
	689	!-- Rearrange indices of input array in order to make data to be send
	690	!-- by MPI contiguous
	691	!$OMP PARALLEL PRIVATE ( i, j, k )
	692	!$OMP DO
[3690]	693	#if __acc_fft_device
[3634]	694	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	695	!$ACC PRESENT(f_inv, f_in)
[3690]	696	#endif
[1216]	697	DO j = 0, ny
	698	DO k = nzb_y, nzt_y
	699	DO i = nxl_y, nxr_y
	700	f_inv(i,k,j) = f_in(j,i,k)
	701	ENDDO
	702	ENDDO
	703	ENDDO
	704	!$OMP END PARALLEL
	705
	706	END SUBROUTINE resort_for_yz
	707
	708
	709	!------------------------------------------------------------------------------!
	710	! Description:
	711	! ------------
[1682]	712	!> Transposition of input array (f_in) from y to z. For the input array, all
	713	!> elements along y reside on the same PE, while after transposition, all
	714	!> elements along z reside on the same PE.
[1]	715	!------------------------------------------------------------------------------!
[1682]	716	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	717
[1682]	718
[1320]	719	USE cpulog, &
	720	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	721
[1320]	722	USE indices, &
	723	ONLY: ny, nz
	724
	725	USE kinds
	726
[1324]	727	USE pegrid
[1320]	728
	729	USE transpose_indices, &
	730	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	731
[1]	732	IMPLICIT NONE
	733
[1682]	734	INTEGER(iwp) :: i !<
	735	INTEGER(iwp) :: j !<
	736	INTEGER(iwp) :: k !<
	737	INTEGER(iwp) :: l !<
	738	INTEGER(iwp) :: zs !<
[1]	739
[1682]	740	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	741	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	742
[1682]	743	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	744	#if __acc_fft_device
[3634]	745	!$ACC DECLARE CREATE(work)
[3690]	746	#endif
[1111]	747
[1320]	748
[1]	749	!
	750	!-- If the PE grid is one-dimensional along y, only local reordering
	751	!-- of the data is necessary and no transposition has to be done.
	752	IF ( pdims(1) == 1 ) THEN
[1106]	753
[683]	754	!$OMP PARALLEL PRIVATE ( i, j, k )
	755	!$OMP DO
[3690]	756	#if __acc_fft_device
[3634]	757	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	758	!$ACC PRESENT(f_out, f_inv)
[3690]	759	#endif
[1003]	760	DO j = 0, ny
	761	DO k = nzb_y, nzt_y
	762	DO i = nxl_y, nxr_y
[164]	763	f_out(i,j,k) = f_inv(i,k,j)
[1]	764	ENDDO
	765	ENDDO
	766	ENDDO
[683]	767	!$OMP END PARALLEL
[1]	768
[1106]	769	ELSE
	770
	771	#if defined( __parallel )
[1]	772	!
[1106]	773	!-- Transpose array
[1318]	774	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	775
	776	#if __acc_fft_device
[3657]	777	#ifndef __cuda_aware_mpi
[3634]	778	!$ACC UPDATE HOST(f_inv)
[3657]	779	#else
	780	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	781	#endif
[3690]	782	#endif
	783
[1106]	784	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	785	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	786	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	787	comm1dx, ierr )
[3690]	788
	789	#if __acc_fft_device
[3657]	790	#ifndef __cuda_aware_mpi
[3634]	791	!$ACC UPDATE DEVICE(work)
[3657]	792	#else
	793	!$ACC END HOST_DATA
	794	#endif
[3690]	795	#endif
	796
[1106]	797	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	798
	799	!
[1106]	800	!-- Reorder transposed array
[1111]	801	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	802	!$OMP DO
[1106]	803	DO l = 0, pdims(1) - 1
	804	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	805	#if __acc_fft_device
[3634]	806	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	807	!$ACC PRESENT(f_out, work)
[3690]	808	#endif
[1106]	809	DO j = nys_z, nyn_z
	810	DO k = zs, zs + nzt_y - nzb_y
	811	DO i = nxl_z, nxr_z
[1111]	812	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	813	ENDDO
[1]	814	ENDDO
	815	ENDDO
	816	ENDDO
[683]	817	!$OMP END PARALLEL
[1]	818	#endif
	819
[1106]	820	ENDIF
	821
[1]	822	END SUBROUTINE transpose_yz
	823
	824
	825	!------------------------------------------------------------------------------!
	826	! Description:
	827	! ------------
[1682]	828	!> Resorting data for the transposition from z to x. The transposition itself
	829	!> is carried out in transpose_zx
[1216]	830	!------------------------------------------------------------------------------!
[1682]	831	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	832
[1682]	833
[1320]	834	USE indices, &
	835	ONLY: nxl, nxr, nyn, nys, nz
[1216]	836
[1320]	837	USE kinds
	838
[1216]	839	IMPLICIT NONE
	840
[1682]	841	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	842	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	843
[1682]	844	INTEGER(iwp) :: i !<
	845	INTEGER(iwp) :: j !<
	846	INTEGER(iwp) :: k !<
[1216]	847
	848	!
	849	!-- Rearrange indices of input array in order to make data to be send
	850	!-- by MPI contiguous
	851	!$OMP PARALLEL PRIVATE ( i, j, k )
	852	!$OMP DO
[3690]	853	#if __acc_fft_device
[3634]	854	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	855	!$ACC PRESENT(f_in, f_inv)
[3690]	856	#endif
[1216]	857	DO k = 1,nz
	858	DO i = nxl, nxr
	859	DO j = nys, nyn
	860	f_inv(j,i,k) = f_in(k,j,i)
	861	ENDDO
	862	ENDDO
	863	ENDDO
	864	!$OMP END PARALLEL
	865
	866	END SUBROUTINE resort_for_zx
	867
	868
	869	!------------------------------------------------------------------------------!
	870	! Description:
	871	! ------------
[1682]	872	!> Transposition of input array (f_in) from z to x. For the input array, all
	873	!> elements along z reside on the same PE, while after transposition, all
	874	!> elements along x reside on the same PE.
[1]	875	!------------------------------------------------------------------------------!
[1682]	876	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	877
[1682]	878
[1320]	879	USE cpulog, &
	880	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	881
[1320]	882	USE indices, &
	883	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	884
	885	USE kinds
	886
[1324]	887	USE pegrid
[1320]	888
	889	USE transpose_indices, &
	890	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	891
[1]	892	IMPLICIT NONE
	893
[1682]	894	INTEGER(iwp) :: i !<
	895	INTEGER(iwp) :: j !<
	896	INTEGER(iwp) :: k !<
	897	INTEGER(iwp) :: l !<
	898	INTEGER(iwp) :: xs !<
[1]	899
[1682]	900	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	901	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	902
[1682]	903	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	904	#if __acc_fft_device
[3634]	905	!$ACC DECLARE CREATE(work)
[3690]	906	#endif
[1]	907
[1320]	908
[1]	909	!
	910	!-- If the PE grid is one-dimensional along y, only local reordering
	911	!-- of the data is necessary and no transposition has to be done.
	912	IF ( pdims(1) == 1 ) THEN
[1106]	913
[683]	914	!$OMP PARALLEL PRIVATE ( i, j, k )
	915	!$OMP DO
[3690]	916	#if __acc_fft_device
[3634]	917	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	918	!$ACC PRESENT(f_out, f_inv)
[3690]	919	#endif
[1003]	920	DO k = 1, nz
	921	DO i = nxl, nxr
	922	DO j = nys, nyn
[164]	923	f_out(i,j,k) = f_inv(j,i,k)
[1]	924	ENDDO
	925	ENDDO
	926	ENDDO
[683]	927	!$OMP END PARALLEL
[1]	928
[1106]	929	ELSE
	930
	931	#if defined( __parallel )
[1]	932	!
[1106]	933	!-- Transpose array
[1318]	934	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	935
	936	#if __acc_fft_device
[3657]	937	#ifndef __cuda_aware_mpi
[3634]	938	!$ACC UPDATE HOST(f_inv)
[3657]	939	#else
	940	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	941	#endif
[3690]	942	#endif
	943
[1106]	944	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	945	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	946	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	947	comm1dx, ierr )
[3690]	948
	949	#if __acc_fft_device
[3657]	950	#ifndef __cuda_aware_mpi
[3634]	951	!$ACC UPDATE DEVICE(work)
[3657]	952	#else
	953	!$ACC END HOST_DATA
	954	#endif
[3690]	955	#endif
	956
[1106]	957	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	958
	959	!
[1106]	960	!-- Reorder transposed array
[1111]	961	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	962	!$OMP DO
[1106]	963	DO l = 0, pdims(1) - 1
	964	xs = 0 + l * nnx
[3690]	965	#if __acc_fft_device
[3634]	966	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	967	!$ACC PRESENT(f_out, work)
[3690]	968	#endif
[1106]	969	DO k = nzb_x, nzt_x
	970	DO i = xs, xs + nnx - 1
	971	DO j = nys_x, nyn_x
[1111]	972	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	973	ENDDO
[1]	974	ENDDO
	975	ENDDO
	976	ENDDO
[683]	977	!$OMP END PARALLEL
[1]	978	#endif
	979
[1106]	980	ENDIF
	981
[1]	982	END SUBROUTINE transpose_zx
	983
	984
	985	!------------------------------------------------------------------------------!
	986	! Description:
	987	! ------------
[1682]	988	!> Resorting data after the transposition from z to y. The transposition itself
	989	!> is carried out in transpose_zy
[1216]	990	!------------------------------------------------------------------------------!
[1682]	991	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	992
[1682]	993
[1320]	994	USE indices, &
	995	ONLY: ny
[1216]	996
[1320]	997	USE kinds
	998
	999	USE transpose_indices, &
	1000	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	1001
[1216]	1002	IMPLICIT NONE
	1003
[1682]	1004	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	1005	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	1006
	1007
[1682]	1008	INTEGER(iwp) :: i !<
	1009	INTEGER(iwp) :: j !<
	1010	INTEGER(iwp) :: k !<
[1216]	1011
	1012	!
	1013	!-- Rearrange indices of input array in order to make data to be send
	1014	!-- by MPI contiguous
	1015	!$OMP PARALLEL PRIVATE ( i, j, k )
	1016	!$OMP DO
[3690]	1017	#if __acc_fft_device
[3634]	1018	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1019	!$ACC PRESENT(f_out, f_inv)
[3690]	1020	#endif
[1216]	1021	DO k = nzb_y, nzt_y
	1022	DO j = 0, ny
	1023	DO i = nxl_y, nxr_y
	1024	f_out(j,i,k) = f_inv(i,k,j)
	1025	ENDDO
	1026	ENDDO
	1027	ENDDO
	1028	!$OMP END PARALLEL
	1029
	1030	END SUBROUTINE resort_for_zy
	1031
	1032
	1033	!------------------------------------------------------------------------------!
[3241]	1034	! Description:cpu_log_nowait
[1216]	1035	! ------------
[1682]	1036	!> Transposition of input array (f_in) from z to y. For the input array, all
	1037	!> elements along z reside on the same PE, while after transposition, all
	1038	!> elements along y reside on the same PE.
[1]	1039	!------------------------------------------------------------------------------!
[1682]	1040	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	1041
[1682]	1042
[1320]	1043	USE cpulog, &
	1044	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	1045
[1320]	1046	USE indices, &
	1047	ONLY: ny, nz
	1048
	1049	USE kinds
	1050
[1324]	1051	USE pegrid
[1320]	1052
	1053	USE transpose_indices, &
	1054	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	1055
[1]	1056	IMPLICIT NONE
	1057
[1682]	1058	INTEGER(iwp) :: i !<
	1059	INTEGER(iwp) :: j !<
	1060	INTEGER(iwp) :: k !<
	1061	INTEGER(iwp) :: l !<
	1062	INTEGER(iwp) :: zs !<
[1]	1063
[1682]	1064	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	1065	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	1066
[1682]	1067	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	1068	#if __acc_fft_device
[3634]	1069	!$ACC DECLARE CREATE(work)
[3690]	1070	#endif
[1111]	1071
[1]	1072	!
	1073	!-- If the PE grid is one-dimensional along y, the array has only to be
	1074	!-- reordered locally and therefore no transposition has to be done.
	1075	IF ( pdims(1) /= 1 ) THEN
[1106]	1076
	1077	#if defined( __parallel )
[1]	1078	!
	1079	!-- Reorder input array for transposition
[1111]	1080	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	1081	!$OMP DO
[1]	1082	DO l = 0, pdims(1) - 1
[1003]	1083	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	1084	#if __acc_fft_device
[3634]	1085	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1086	!$ACC PRESENT(work, f_in)
[3690]	1087	#endif
[1003]	1088	DO j = nys_z, nyn_z
	1089	DO k = zs, zs + nzt_y - nzb_y
	1090	DO i = nxl_z, nxr_z
[1111]	1091	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	1092	ENDDO
	1093	ENDDO
	1094	ENDDO
	1095	ENDDO
[683]	1096	!$OMP END PARALLEL
[1]	1097
	1098	!
	1099	!-- Transpose array
[1318]	1100	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1101
	1102	#if __acc_fft_device
[3657]	1103	#ifndef __cuda_aware_mpi
[3634]	1104	!$ACC UPDATE HOST(work)
[3657]	1105	#else
	1106	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1107	#endif
[3690]	1108	#endif
	1109
[622]	1110	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	1111	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	1112	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	1113	comm1dx, ierr )
[3690]	1114
	1115	#if __acc_fft_device
[3657]	1116	#ifndef __cuda_aware_mpi
[3634]	1117	!$ACC UPDATE DEVICE(f_inv)
[3657]	1118	#else
	1119	!$ACC END HOST_DATA
	1120	#endif
[3690]	1121	#endif
	1122
[1]	1123	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	1124	#endif
[1]	1125
	1126	ELSE
	1127	!
[1106]	1128	!-- Reorder the array in the same way like ALLTOALL did it
[683]	1129	!$OMP PARALLEL PRIVATE ( i, j, k )
	1130	!$OMP DO
[3690]	1131	#if __acc_fft_device
[3634]	1132	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1133	!$ACC PRESENT(f_inv, f_in)
[3690]	1134	#endif
[1003]	1135	DO k = nzb_y, nzt_y
	1136	DO j = 0, ny
	1137	DO i = nxl_y, nxr_y
[164]	1138	f_inv(i,k,j) = f_in(i,j,k)
	1139	ENDDO
	1140	ENDDO
	1141	ENDDO
[683]	1142	!$OMP END PARALLEL
[1106]	1143
	1144	ENDIF
	1145
[1]	1146	END SUBROUTINE transpose_zy
	1147
	1148
	1149	!------------------------------------------------------------------------------!
	1150	! Description:
	1151	! ------------
[1682]	1152	!> Transposition of input array (f_in) from z to y. For the input array, all
	1153	!> elements along z reside on the same PE, while after transposition, all
	1154	!> elements along y reside on the same PE.
	1155	!> This is a direct transposition for arrays with indices in regular order
	1156	!> (k,j,i) (cf. transpose_zy).
[1]	1157	!------------------------------------------------------------------------------!
[1682]	1158	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1159
[1682]	1160
[1320]	1161	USE cpulog, &
[3241]	1162	ONLY: cpu_log, log_point_s
[1]	1163
[1320]	1164	USE indices, &
	1165	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1166
	1167	USE kinds
	1168
[1324]	1169	USE pegrid
[1320]	1170
	1171	USE transpose_indices, &
[3241]	1172	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
[1320]	1173
[1]	1174	IMPLICIT NONE
	1175
[1682]	1176	INTEGER(iwp) :: i !<
	1177	INTEGER(iwp) :: j !<
	1178	INTEGER(iwp) :: k !<
	1179	INTEGER(iwp) :: l !<
	1180	INTEGER(iwp) :: m !<
	1181	INTEGER(iwp) :: ys !<
[1]	1182
[1682]	1183	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1184	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1185	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1186	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1187
[1]	1188	#if defined( __parallel )
	1189
	1190	!
	1191	!-- Rearrange indices of input array in order to make data to be send
	1192	!-- by MPI contiguous
[1003]	1193	DO i = nxl, nxr
	1194	DO j = nys, nyn
	1195	DO k = 1, nz
[164]	1196	f_inv(j,i,k) = f_in(k,j,i)
[1]	1197	ENDDO
	1198	ENDDO
	1199	ENDDO
	1200
	1201	!
	1202	!-- Move data to different array, because memory location of work1 is
	1203	!-- needed further below (work1 = work2).
	1204	!-- If the PE grid is one-dimensional along x, only local reordering
	1205	!-- of the data is necessary and no transposition has to be done.
	1206	IF ( pdims(2) == 1 ) THEN
[1003]	1207	DO k = 1, nz
	1208	DO i = nxl, nxr
	1209	DO j = nys, nyn
[164]	1210	f_out(j,i,k) = f_inv(j,i,k)
[1]	1211	ENDDO
	1212	ENDDO
	1213	ENDDO
	1214	RETURN
	1215	ENDIF
	1216
	1217	!
	1218	!-- Transpose array
	1219	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1220	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1221	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1222	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1223	comm1dy, ierr )
	1224	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1225
	1226	!
	1227	!-- Reorder transposed array
	1228	m = 0
	1229	DO l = 0, pdims(2) - 1
	1230	ys = 0 + l * nny
[1003]	1231	DO k = nzb_yd, nzt_yd
	1232	DO i = nxl_yd, nxr_yd
[1]	1233	DO j = ys, ys + nny - 1
	1234	m = m + 1
[164]	1235	f_out(j,i,k) = work(m)
[1]	1236	ENDDO
	1237	ENDDO
	1238	ENDDO
	1239	ENDDO
	1240
	1241	#endif
	1242
	1243	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |