Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 3701

Last change on this file since 3701 was 3694, checked in by knoop, 6 years ago
Bugfix: moved OpenACC specific cpp #endif , so cpu_log call is not affected.
Property svn:keywords set to `Id`
File size: 33.9 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[3655]	17	! Copyright 1997-2019 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 3694 2019-01-23 17:01:49Z knoop $
[3634]	27	! OpenACC port for SPEC
	28	!
	29	! 3241 2018-09-12 15:02:00Z raasch
[3241]	30	! unused variables removed
	31	!
	32	! 2718 2018-01-02 08:49:38Z maronga
[2716]	33	! Corrected "Former revisions" section
	34	!
	35	! 2696 2017-12-14 17:12:51Z kanani
	36	! Change in file header (GPL part)
[1321]	37	!
[2716]	38	! 2119 2017-01-17 16:51:50Z raasch
	39	!
[2119]	40	! 2118 2017-01-17 16:38:49Z raasch
	41	! OpenACC directives removed
	42	!
[2001]	43	! 2000 2016-08-20 18:09:15Z knoop
	44	! Forced header and separation lines into 80 columns
	45	!
[1683]	46	! 1682 2015-10-07 23:56:08Z knoop
	47	! Code annotations made doxygen readable
	48	!
[1325]	49	! 1324 2014-03-21 09:13:16Z suehring
	50	! Bugfix: ONLY statement for module pegrid removed
	51	!
[1321]	52	! 1320 2014-03-20 08:40:49Z raasch
[1320]	53	! ONLY-attribute added to USE-statements,
	54	! kind-parameters added to all INTEGER and REAL declaration statements,
	55	! kinds are defined in new module kinds,
	56	! old module precision_kind is removed,
	57	! revision history before 2012 removed,
	58	! comment fields (!:) to be used for variable explanations added to
	59	! all variable declaration statements
[198]	60	!
[1319]	61	! 1318 2014-03-17 13:35:16Z raasch
	62	! cpu_log_nowait parameter added to cpu measurements of the transpositions
	63	! required for solving the Poisson equation (poisfft),
	64	! module interfaces removed
	65	!
[1258]	66	! 1257 2013-11-08 15:18:40Z raasch
	67	! openacc loop and loop vector clauses removed
	68	!
[1217]	69	! 1216 2013-08-26 09:31:42Z raasch
	70	! re-sorting of the transposed / to be transposed arrays moved to separate
	71	! routines resort_for_...
	72	!
[1112]	73	! 1111 2013-03-08 23:54:10Z raasch
	74	! openACC directives added,
	75	! resorting data from/to work changed, work got 4 dimensions instead of 1
	76	!
[1107]	77	! 1106 2013-03-04 05:31:38Z raasch
	78	! preprocessor lines rearranged so that routines can also be used in serial
	79	! (non-parallel) mode
	80	!
[1093]	81	! 1092 2013-02-02 11:24:22Z raasch
	82	! unused variables removed
	83	!
[1037]	84	! 1036 2012-10-22 13:43:42Z raasch
	85	! code put under GPL (PALM 3.9)
	86	!
[1004]	87	! 1003 2012-09-14 14:35:53Z raasch
	88	! indices nxa, nya, etc. replaced by nx, ny, etc.
	89	!
[1]	90	! Revision 1.1 1997/07/24 11:25:18 raasch
	91	! Initial revision
	92	!
[3690]	93
	94	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
	95
[1216]	96	!------------------------------------------------------------------------------!
	97	! Description:
	98	! ------------
[1682]	99	!> Resorting data for the transposition from x to y. The transposition itself
	100	!> is carried out in transpose_xy
[1216]	101	!------------------------------------------------------------------------------!
[1682]	102	SUBROUTINE resort_for_xy( f_in, f_inv )
	103
[1216]	104
[1320]	105	USE indices, &
	106	ONLY: nx
[1216]	107
[1320]	108	USE kinds
	109
	110	USE transpose_indices, &
[3241]	111	ONLY: nyn_x, nys_x, nzb_x, nzt_x
[1320]	112
[1216]	113	IMPLICIT NONE
	114
[1682]	115	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	116	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	117
	118
[1682]	119	INTEGER(iwp) :: i !<
	120	INTEGER(iwp) :: j !<
	121	INTEGER(iwp) :: k !<
[1]	122	!
[1216]	123	!-- Rearrange indices of input array in order to make data to be send
	124	!-- by MPI contiguous
	125	!$OMP PARALLEL PRIVATE ( i, j, k )
	126	!$OMP DO
[3690]	127	#if __acc_fft_device
[3634]	128	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	129	!$ACC PRESENT(f_inv, f_in)
[3690]	130	#endif
[1216]	131	DO i = 0, nx
	132	DO k = nzb_x, nzt_x
	133	DO j = nys_x, nyn_x
	134	f_inv(j,k,i) = f_in(i,j,k)
	135	ENDDO
	136	ENDDO
	137	ENDDO
	138	!$OMP END PARALLEL
	139
	140	END SUBROUTINE resort_for_xy
	141
	142
	143	!------------------------------------------------------------------------------!
[1]	144	! Description:
	145	! ------------
[1682]	146	!> Transposition of input array (f_in) from x to y. For the input array, all
	147	!> elements along x reside on the same PE, while after transposition, all
	148	!> elements along y reside on the same PE.
[1]	149	!------------------------------------------------------------------------------!
[1682]	150	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	151
[1682]	152
[1320]	153	USE cpulog, &
	154	ONLY: cpu_log, cpu_log_nowait, log_point_s
	155
	156	USE indices, &
	157	ONLY: nx, ny
	158
	159	USE kinds
	160
[1]	161	USE pegrid
	162
[1320]	163	USE transpose_indices, &
	164	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	165
[1]	166	IMPLICIT NONE
	167
[1682]	168	INTEGER(iwp) :: i !<
	169	INTEGER(iwp) :: j !<
	170	INTEGER(iwp) :: k !<
	171	INTEGER(iwp) :: l !<
	172	INTEGER(iwp) :: ys !<
[1320]	173
[1682]	174	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	175	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1]	176
[1682]	177	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	178	#if __acc_fft_device
[3634]	179	!$ACC DECLARE CREATE(work)
[3690]	180	#endif
[1111]	181
	182
[1106]	183	IF ( numprocs /= 1 ) THEN
	184
	185	#if defined( __parallel )
[1]	186	!
[1106]	187	!-- Transpose array
[1318]	188	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	189
	190	#if __acc_fft_device
[3657]	191	#ifndef __cuda_aware_mpi
[3634]	192	!$ACC UPDATE HOST(f_inv)
[3657]	193	#else
	194	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	195	#endif
[3690]	196	#endif
	197
[1106]	198	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	199	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	200	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	201	comm1dy, ierr )
[3690]	202
	203	#if __acc_fft_device
[3657]	204	#ifndef __cuda_aware_mpi
[3634]	205	!$ACC UPDATE DEVICE(work)
[3657]	206	#else
	207	!$ACC END HOST_DATA
	208	#endif
[3690]	209	#endif
	210
[1106]	211	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	212
	213	!
[1106]	214	!-- Reorder transposed array
[1111]	215	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	216	!$OMP DO
[1106]	217	DO l = 0, pdims(2) - 1
	218	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	219	#if __acc_fft_device
[3634]	220	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	221	!$ACC PRESENT(f_out, work)
[3690]	222	#endif
[1106]	223	DO i = nxl_y, nxr_y
	224	DO k = nzb_y, nzt_y
	225	DO j = ys, ys + nyn_x - nys_x
[1111]	226	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	227	ENDDO
[1]	228	ENDDO
	229	ENDDO
	230	ENDDO
[683]	231	!$OMP END PARALLEL
[1]	232	#endif
	233
[1106]	234	ELSE
	235
	236	!
	237	!-- Reorder transposed array
	238	!$OMP PARALLEL PRIVATE ( i, j, k )
	239	!$OMP DO
[3690]	240	#if __acc_fft_device
[3634]	241	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	242	!$ACC PRESENT(f_out, f_inv)
[3690]	243	#endif
[1106]	244	DO k = nzb_y, nzt_y
	245	DO i = nxl_y, nxr_y
	246	DO j = 0, ny
	247	f_out(j,i,k) = f_inv(j,k,i)
	248	ENDDO
	249	ENDDO
	250	ENDDO
	251	!$OMP END PARALLEL
	252
	253	ENDIF
	254
[1]	255	END SUBROUTINE transpose_xy
	256
	257
	258	!------------------------------------------------------------------------------!
	259	! Description:
	260	! ------------
[1682]	261	!> Resorting data after the transposition from x to z. The transposition itself
	262	!> is carried out in transpose_xz
[1216]	263	!------------------------------------------------------------------------------!
[1682]	264	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	265
[1682]	266
[1320]	267	USE indices, &
	268	ONLY: nxl, nxr, nyn, nys, nz
[1216]	269
[1320]	270	USE kinds
	271
[1216]	272	IMPLICIT NONE
	273
[1682]	274	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	275	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	276
[1682]	277	INTEGER(iwp) :: i !<
	278	INTEGER(iwp) :: j !<
	279	INTEGER(iwp) :: k !<
[1216]	280	!
	281	!-- Rearrange indices of input array in order to make data to be send
	282	!-- by MPI contiguous.
	283	!-- In case of parallel fft/transposition, scattered store is faster in
	284	!-- backward direction!!!
	285	!$OMP PARALLEL PRIVATE ( i, j, k )
	286	!$OMP DO
[3690]	287	#if __acc_fft_device
[3634]	288	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	289	!$ACC PRESENT(f_out, f_inv)
[3690]	290	#endif
[1216]	291	DO k = 1, nz
	292	DO i = nxl, nxr
	293	DO j = nys, nyn
	294	f_out(k,j,i) = f_inv(j,i,k)
	295	ENDDO
	296	ENDDO
	297	ENDDO
	298	!$OMP END PARALLEL
	299
	300	END SUBROUTINE resort_for_xz
	301
	302
	303	!------------------------------------------------------------------------------!
	304	! Description:
	305	! ------------
[1682]	306	!> Transposition of input array (f_in) from x to z. For the input array, all
	307	!> elements along x reside on the same PE, while after transposition, all
	308	!> elements along z reside on the same PE.
[1]	309	!------------------------------------------------------------------------------!
[1682]	310	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	311
[1682]	312
[1320]	313	USE cpulog, &
	314	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	315
[1320]	316	USE indices, &
[3241]	317	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
[1320]	318
	319	USE kinds
	320
[1324]	321	USE pegrid
[1320]	322
	323	USE transpose_indices, &
	324	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	325
[1]	326	IMPLICIT NONE
	327
[1682]	328	INTEGER(iwp) :: i !<
	329	INTEGER(iwp) :: j !<
	330	INTEGER(iwp) :: k !<
	331	INTEGER(iwp) :: l !<
	332	INTEGER(iwp) :: xs !<
[1]	333
[1682]	334	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	335	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	336
[1682]	337	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	338	#if __acc_fft_device
[3634]	339	!$ACC DECLARE CREATE(work)
[3690]	340	#endif
[1111]	341
[1320]	342
[1]	343	!
	344	!-- If the PE grid is one-dimensional along y, the array has only to be
	345	!-- reordered locally and therefore no transposition has to be done.
	346	IF ( pdims(1) /= 1 ) THEN
[1106]	347
	348	#if defined( __parallel )
[1]	349	!
	350	!-- Reorder input array for transposition
[1111]	351	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	352	!$OMP DO
[1]	353	DO l = 0, pdims(1) - 1
	354	xs = 0 + l * nnx
[3690]	355	#if __acc_fft_device
[3634]	356	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	357	!$ACC PRESENT(work, f_in)
[3690]	358	#endif
[1003]	359	DO k = nzb_x, nzt_x
[164]	360	DO i = xs, xs + nnx - 1
[1003]	361	DO j = nys_x, nyn_x
[1111]	362	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	363	ENDDO
	364	ENDDO
	365	ENDDO
	366	ENDDO
[683]	367	!$OMP END PARALLEL
[1]	368
	369	!
	370	!-- Transpose array
[1318]	371	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	372
	373	#if __acc_fft_device
[3657]	374	#ifndef __cuda_aware_mpi
[3634]	375	!$ACC UPDATE HOST(work)
[3657]	376	#else
	377	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	378	#endif
[3690]	379	#endif
	380
[622]	381	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	382	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	383	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	384	comm1dx, ierr )
[3690]	385
	386	#if __acc_fft_device
[3657]	387	#ifndef __cuda_aware_mpi
[3634]	388	!$ACC UPDATE DEVICE(f_inv)
[3657]	389	#else
	390	!$ACC END HOST_DATA
	391	#endif
[3694]	392	#endif
	393
[1]	394	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	395	#endif
	396
[1]	397	ELSE
[1106]	398
[1]	399	!
	400	!-- Reorder the array in a way that the z index is in first position
[683]	401	!$OMP PARALLEL PRIVATE ( i, j, k )
	402	!$OMP DO
[3690]	403	#if __acc_fft_device
[3634]	404	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	405	!$ACC PRESENT(f_inv, f_in)
[3690]	406	#endif
[1003]	407	DO i = nxl, nxr
	408	DO j = nys, nyn
	409	DO k = 1, nz
[164]	410	f_inv(j,i,k) = f_in(i,j,k)
[1]	411	ENDDO
	412	ENDDO
	413	ENDDO
[683]	414	!$OMP END PARALLEL
[1]	415
[164]	416	ENDIF
	417
[1]	418	END SUBROUTINE transpose_xz
	419
	420
	421	!------------------------------------------------------------------------------!
	422	! Description:
	423	! ------------
[1682]	424	!> Resorting data after the transposition from y to x. The transposition itself
	425	!> is carried out in transpose_yx
[1216]	426	!------------------------------------------------------------------------------!
[1682]	427	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	428
[1682]	429
[1320]	430	USE indices, &
	431	ONLY: nx
[1216]	432
[1320]	433	USE kinds
	434
	435	USE transpose_indices, &
	436	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	437
[1216]	438	IMPLICIT NONE
	439
[1682]	440	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	441	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	442
	443
[1682]	444	INTEGER(iwp) :: i !<
	445	INTEGER(iwp) :: j !<
	446	INTEGER(iwp) :: k !<
[1216]	447	!
	448	!-- Rearrange indices of input array in order to make data to be send
	449	!-- by MPI contiguous
	450	!$OMP PARALLEL PRIVATE ( i, j, k )
	451	!$OMP DO
[3690]	452	#if __acc_fft_device
[3634]	453	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	454	!$ACC PRESENT(f_out, f_inv)
[3690]	455	#endif
[1216]	456	DO i = 0, nx
	457	DO k = nzb_x, nzt_x
	458	DO j = nys_x, nyn_x
	459	f_out(i,j,k) = f_inv(j,k,i)
	460	ENDDO
	461	ENDDO
	462	ENDDO
	463	!$OMP END PARALLEL
	464
	465	END SUBROUTINE resort_for_yx
	466
	467
	468	!------------------------------------------------------------------------------!
	469	! Description:
	470	! ------------
[1682]	471	!> Transposition of input array (f_in) from y to x. For the input array, all
	472	!> elements along y reside on the same PE, while after transposition, all
	473	!> elements along x reside on the same PE.
[1]	474	!------------------------------------------------------------------------------!
[1682]	475	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	476
[1682]	477
[1320]	478	USE cpulog, &
	479	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	480
[1320]	481	USE indices, &
	482	ONLY: nx, ny
	483
	484	USE kinds
	485
[1324]	486	USE pegrid
[1320]	487
	488	USE transpose_indices, &
	489	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	490
[1]	491	IMPLICIT NONE
	492
[1682]	493	INTEGER(iwp) :: i !<
	494	INTEGER(iwp) :: j !<
	495	INTEGER(iwp) :: k !<
	496	INTEGER(iwp) :: l !<
	497	INTEGER(iwp) :: ys !<
[1]	498
[1682]	499	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	500	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	501
[1682]	502	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3690]	503	#if __acc_fft_device
[3634]	504	!$ACC DECLARE CREATE(work)
[3690]	505	#endif
[1111]	506
[1320]	507
[1106]	508	IF ( numprocs /= 1 ) THEN
	509
[1]	510	#if defined( __parallel )
	511	!
[1106]	512	!-- Reorder input array for transposition
[1111]	513	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	514	!$OMP DO
[1106]	515	DO l = 0, pdims(2) - 1
	516	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3690]	517	#if __acc_fft_device
[3634]	518	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	519	!$ACC PRESENT(work, f_in)
[3690]	520	#endif
[1106]	521	DO i = nxl_y, nxr_y
	522	DO k = nzb_y, nzt_y
	523	DO j = ys, ys + nyn_x - nys_x
[1111]	524	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	525	ENDDO
	526	ENDDO
	527	ENDDO
	528	ENDDO
	529	!$OMP END PARALLEL
	530
	531	!
	532	!-- Transpose array
[1318]	533	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	534
	535	#if __acc_fft_device
[3657]	536	#ifndef __cuda_aware_mpi
[3634]	537	!$ACC UPDATE HOST(work)
[3657]	538	#else
	539	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	540	#endif
[3690]	541	#endif
	542
[1106]	543	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	544	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	545	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	546	comm1dy, ierr )
[3690]	547
	548	#if __acc_fft_device
[3657]	549	#ifndef __cuda_aware_mpi
[3634]	550	!$ACC UPDATE DEVICE(f_inv)
[3657]	551	#else
	552	!$ACC END HOST_DATA
	553	#endif
[3690]	554	#endif
	555
[1106]	556	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	557	#endif
	558
	559	ELSE
	560
	561	!
	562	!-- Reorder array f_in the same way as ALLTOALL did it
	563	!$OMP PARALLEL PRIVATE ( i, j, k )
	564	!$OMP DO
[3690]	565	#if __acc_fft_device
[3634]	566	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	567	!$ACC PRESENT(f_inv, f_in)
[3690]	568	#endif
[1003]	569	DO i = nxl_y, nxr_y
	570	DO k = nzb_y, nzt_y
[1106]	571	DO j = 0, ny
	572	f_inv(j,k,i) = f_in(j,i,k)
[1]	573	ENDDO
	574	ENDDO
	575	ENDDO
[683]	576	!$OMP END PARALLEL
[1]	577
[1106]	578	ENDIF
[1]	579
	580	END SUBROUTINE transpose_yx
	581
	582
	583	!------------------------------------------------------------------------------!
	584	! Description:
	585	! ------------
[1682]	586	!> Transposition of input array (f_in) from y to x. For the input array, all
	587	!> elements along y reside on the same PE, while after transposition, all
	588	!> elements along x reside on the same PE.
	589	!> This is a direct transposition for arrays with indices in regular order
	590	!> (k,j,i) (cf. transpose_yx).
[1]	591	!------------------------------------------------------------------------------!
[1682]	592	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	593
[1682]	594
[1320]	595	USE cpulog, &
[3241]	596	ONLY: cpu_log, log_point_s
[1]	597
[1320]	598	USE indices, &
	599	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	600
	601	USE kinds
	602
[1324]	603	USE pegrid
[1320]	604
	605	USE transpose_indices, &
	606	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	607
[1]	608	IMPLICIT NONE
	609
[1682]	610	INTEGER(iwp) :: i !<
	611	INTEGER(iwp) :: j !<
	612	INTEGER(iwp) :: k !<
	613	INTEGER(iwp) :: l !<
	614	INTEGER(iwp) :: m !<
	615	INTEGER(iwp) :: xs !<
[1]	616
[1682]	617	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	618	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	619	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	620	REAL(wp) :: work(nnxnnynnz) !<
[1]	621	#if defined( __parallel )
	622
	623	!
	624	!-- Rearrange indices of input array in order to make data to be send
	625	!-- by MPI contiguous
[1003]	626	DO k = 1, nz
	627	DO j = nys, nyn
	628	DO i = nxl, nxr
[164]	629	f_inv(i,k,j) = f_in(k,j,i)
[1]	630	ENDDO
	631	ENDDO
	632	ENDDO
	633
	634	!
	635	!-- Transpose array
	636	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	637	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	638	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	639	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	640	comm1dx, ierr )
	641	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	642
	643	!
	644	!-- Reorder transposed array
	645	m = 0
	646	DO l = 0, pdims(1) - 1
	647	xs = 0 + l * nnx
[1003]	648	DO j = nys_x, nyn_x
	649	DO k = 1, nz
[1]	650	DO i = xs, xs + nnx - 1
	651	m = m + 1
[164]	652	f_out(i,j,k) = work(m)
[1]	653	ENDDO
	654	ENDDO
	655	ENDDO
	656	ENDDO
	657
	658	#endif
	659
	660	END SUBROUTINE transpose_yxd
	661
	662
	663	!------------------------------------------------------------------------------!
	664	! Description:
	665	! ------------
[1682]	666	!> Resorting data for the transposition from y to z. The transposition itself
	667	!> is carried out in transpose_yz
[1216]	668	!------------------------------------------------------------------------------!
[1682]	669	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	670
[1682]	671
[1320]	672	USE indices, &
	673	ONLY: ny
[1216]	674
[1320]	675	USE kinds
	676
	677	USE transpose_indices, &
	678	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	679
[1216]	680	IMPLICIT NONE
	681
[1682]	682	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	683	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	684
[1682]	685	INTEGER(iwp) :: i !<
	686	INTEGER(iwp) :: j !<
	687	INTEGER(iwp) :: k !<
[1216]	688
	689	!
	690	!-- Rearrange indices of input array in order to make data to be send
	691	!-- by MPI contiguous
	692	!$OMP PARALLEL PRIVATE ( i, j, k )
	693	!$OMP DO
[3690]	694	#if __acc_fft_device
[3634]	695	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	696	!$ACC PRESENT(f_inv, f_in)
[3690]	697	#endif
[1216]	698	DO j = 0, ny
	699	DO k = nzb_y, nzt_y
	700	DO i = nxl_y, nxr_y
	701	f_inv(i,k,j) = f_in(j,i,k)
	702	ENDDO
	703	ENDDO
	704	ENDDO
	705	!$OMP END PARALLEL
	706
	707	END SUBROUTINE resort_for_yz
	708
	709
	710	!------------------------------------------------------------------------------!
	711	! Description:
	712	! ------------
[1682]	713	!> Transposition of input array (f_in) from y to z. For the input array, all
	714	!> elements along y reside on the same PE, while after transposition, all
	715	!> elements along z reside on the same PE.
[1]	716	!------------------------------------------------------------------------------!
[1682]	717	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	718
[1682]	719
[1320]	720	USE cpulog, &
	721	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	722
[1320]	723	USE indices, &
	724	ONLY: ny, nz
	725
	726	USE kinds
	727
[1324]	728	USE pegrid
[1320]	729
	730	USE transpose_indices, &
	731	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	732
[1]	733	IMPLICIT NONE
	734
[1682]	735	INTEGER(iwp) :: i !<
	736	INTEGER(iwp) :: j !<
	737	INTEGER(iwp) :: k !<
	738	INTEGER(iwp) :: l !<
	739	INTEGER(iwp) :: zs !<
[1]	740
[1682]	741	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	742	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	743
[1682]	744	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	745	#if __acc_fft_device
[3634]	746	!$ACC DECLARE CREATE(work)
[3690]	747	#endif
[1111]	748
[1320]	749
[1]	750	!
	751	!-- If the PE grid is one-dimensional along y, only local reordering
	752	!-- of the data is necessary and no transposition has to be done.
	753	IF ( pdims(1) == 1 ) THEN
[1106]	754
[683]	755	!$OMP PARALLEL PRIVATE ( i, j, k )
	756	!$OMP DO
[3690]	757	#if __acc_fft_device
[3634]	758	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	759	!$ACC PRESENT(f_out, f_inv)
[3690]	760	#endif
[1003]	761	DO j = 0, ny
	762	DO k = nzb_y, nzt_y
	763	DO i = nxl_y, nxr_y
[164]	764	f_out(i,j,k) = f_inv(i,k,j)
[1]	765	ENDDO
	766	ENDDO
	767	ENDDO
[683]	768	!$OMP END PARALLEL
[1]	769
[1106]	770	ELSE
	771
	772	#if defined( __parallel )
[1]	773	!
[1106]	774	!-- Transpose array
[1318]	775	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	776
	777	#if __acc_fft_device
[3657]	778	#ifndef __cuda_aware_mpi
[3634]	779	!$ACC UPDATE HOST(f_inv)
[3657]	780	#else
	781	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	782	#endif
[3690]	783	#endif
	784
[1106]	785	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	786	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	787	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	788	comm1dx, ierr )
[3690]	789
	790	#if __acc_fft_device
[3657]	791	#ifndef __cuda_aware_mpi
[3634]	792	!$ACC UPDATE DEVICE(work)
[3657]	793	#else
	794	!$ACC END HOST_DATA
	795	#endif
[3690]	796	#endif
	797
[1106]	798	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	799
	800	!
[1106]	801	!-- Reorder transposed array
[1111]	802	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	803	!$OMP DO
[1106]	804	DO l = 0, pdims(1) - 1
	805	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	806	#if __acc_fft_device
[3634]	807	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	808	!$ACC PRESENT(f_out, work)
[3690]	809	#endif
[1106]	810	DO j = nys_z, nyn_z
	811	DO k = zs, zs + nzt_y - nzb_y
	812	DO i = nxl_z, nxr_z
[1111]	813	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	814	ENDDO
[1]	815	ENDDO
	816	ENDDO
	817	ENDDO
[683]	818	!$OMP END PARALLEL
[1]	819	#endif
	820
[1106]	821	ENDIF
	822
[1]	823	END SUBROUTINE transpose_yz
	824
	825
	826	!------------------------------------------------------------------------------!
	827	! Description:
	828	! ------------
[1682]	829	!> Resorting data for the transposition from z to x. The transposition itself
	830	!> is carried out in transpose_zx
[1216]	831	!------------------------------------------------------------------------------!
[1682]	832	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	833
[1682]	834
[1320]	835	USE indices, &
	836	ONLY: nxl, nxr, nyn, nys, nz
[1216]	837
[1320]	838	USE kinds
	839
[1216]	840	IMPLICIT NONE
	841
[1682]	842	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	843	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	844
[1682]	845	INTEGER(iwp) :: i !<
	846	INTEGER(iwp) :: j !<
	847	INTEGER(iwp) :: k !<
[1216]	848
	849	!
	850	!-- Rearrange indices of input array in order to make data to be send
	851	!-- by MPI contiguous
	852	!$OMP PARALLEL PRIVATE ( i, j, k )
	853	!$OMP DO
[3690]	854	#if __acc_fft_device
[3634]	855	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	856	!$ACC PRESENT(f_in, f_inv)
[3690]	857	#endif
[1216]	858	DO k = 1,nz
	859	DO i = nxl, nxr
	860	DO j = nys, nyn
	861	f_inv(j,i,k) = f_in(k,j,i)
	862	ENDDO
	863	ENDDO
	864	ENDDO
	865	!$OMP END PARALLEL
	866
	867	END SUBROUTINE resort_for_zx
	868
	869
	870	!------------------------------------------------------------------------------!
	871	! Description:
	872	! ------------
[1682]	873	!> Transposition of input array (f_in) from z to x. For the input array, all
	874	!> elements along z reside on the same PE, while after transposition, all
	875	!> elements along x reside on the same PE.
[1]	876	!------------------------------------------------------------------------------!
[1682]	877	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	878
[1682]	879
[1320]	880	USE cpulog, &
	881	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	882
[1320]	883	USE indices, &
	884	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	885
	886	USE kinds
	887
[1324]	888	USE pegrid
[1320]	889
	890	USE transpose_indices, &
	891	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	892
[1]	893	IMPLICIT NONE
	894
[1682]	895	INTEGER(iwp) :: i !<
	896	INTEGER(iwp) :: j !<
	897	INTEGER(iwp) :: k !<
	898	INTEGER(iwp) :: l !<
	899	INTEGER(iwp) :: xs !<
[1]	900
[1682]	901	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	902	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	903
[1682]	904	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3690]	905	#if __acc_fft_device
[3634]	906	!$ACC DECLARE CREATE(work)
[3690]	907	#endif
[1]	908
[1320]	909
[1]	910	!
	911	!-- If the PE grid is one-dimensional along y, only local reordering
	912	!-- of the data is necessary and no transposition has to be done.
	913	IF ( pdims(1) == 1 ) THEN
[1106]	914
[683]	915	!$OMP PARALLEL PRIVATE ( i, j, k )
	916	!$OMP DO
[3690]	917	#if __acc_fft_device
[3634]	918	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	919	!$ACC PRESENT(f_out, f_inv)
[3690]	920	#endif
[1003]	921	DO k = 1, nz
	922	DO i = nxl, nxr
	923	DO j = nys, nyn
[164]	924	f_out(i,j,k) = f_inv(j,i,k)
[1]	925	ENDDO
	926	ENDDO
	927	ENDDO
[683]	928	!$OMP END PARALLEL
[1]	929
[1106]	930	ELSE
	931
	932	#if defined( __parallel )
[1]	933	!
[1106]	934	!-- Transpose array
[1318]	935	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	936
	937	#if __acc_fft_device
[3657]	938	#ifndef __cuda_aware_mpi
[3634]	939	!$ACC UPDATE HOST(f_inv)
[3657]	940	#else
	941	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	942	#endif
[3690]	943	#endif
	944
[1106]	945	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	946	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	947	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	948	comm1dx, ierr )
[3690]	949
	950	#if __acc_fft_device
[3657]	951	#ifndef __cuda_aware_mpi
[3634]	952	!$ACC UPDATE DEVICE(work)
[3657]	953	#else
	954	!$ACC END HOST_DATA
	955	#endif
[3690]	956	#endif
	957
[1106]	958	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	959
	960	!
[1106]	961	!-- Reorder transposed array
[1111]	962	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	963	!$OMP DO
[1106]	964	DO l = 0, pdims(1) - 1
	965	xs = 0 + l * nnx
[3690]	966	#if __acc_fft_device
[3634]	967	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	968	!$ACC PRESENT(f_out, work)
[3690]	969	#endif
[1106]	970	DO k = nzb_x, nzt_x
	971	DO i = xs, xs + nnx - 1
	972	DO j = nys_x, nyn_x
[1111]	973	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	974	ENDDO
[1]	975	ENDDO
	976	ENDDO
	977	ENDDO
[683]	978	!$OMP END PARALLEL
[1]	979	#endif
	980
[1106]	981	ENDIF
	982
[1]	983	END SUBROUTINE transpose_zx
	984
	985
	986	!------------------------------------------------------------------------------!
	987	! Description:
	988	! ------------
[1682]	989	!> Resorting data after the transposition from z to y. The transposition itself
	990	!> is carried out in transpose_zy
[1216]	991	!------------------------------------------------------------------------------!
[1682]	992	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	993
[1682]	994
[1320]	995	USE indices, &
	996	ONLY: ny
[1216]	997
[1320]	998	USE kinds
	999
	1000	USE transpose_indices, &
	1001	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	1002
[1216]	1003	IMPLICIT NONE
	1004
[1682]	1005	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	1006	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	1007
	1008
[1682]	1009	INTEGER(iwp) :: i !<
	1010	INTEGER(iwp) :: j !<
	1011	INTEGER(iwp) :: k !<
[1216]	1012
	1013	!
	1014	!-- Rearrange indices of input array in order to make data to be send
	1015	!-- by MPI contiguous
	1016	!$OMP PARALLEL PRIVATE ( i, j, k )
	1017	!$OMP DO
[3690]	1018	#if __acc_fft_device
[3634]	1019	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1020	!$ACC PRESENT(f_out, f_inv)
[3690]	1021	#endif
[1216]	1022	DO k = nzb_y, nzt_y
	1023	DO j = 0, ny
	1024	DO i = nxl_y, nxr_y
	1025	f_out(j,i,k) = f_inv(i,k,j)
	1026	ENDDO
	1027	ENDDO
	1028	ENDDO
	1029	!$OMP END PARALLEL
	1030
	1031	END SUBROUTINE resort_for_zy
	1032
	1033
	1034	!------------------------------------------------------------------------------!
[3241]	1035	! Description:cpu_log_nowait
[1216]	1036	! ------------
[1682]	1037	!> Transposition of input array (f_in) from z to y. For the input array, all
	1038	!> elements along z reside on the same PE, while after transposition, all
	1039	!> elements along y reside on the same PE.
[1]	1040	!------------------------------------------------------------------------------!
[1682]	1041	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	1042
[1682]	1043
[1320]	1044	USE cpulog, &
	1045	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	1046
[1320]	1047	USE indices, &
	1048	ONLY: ny, nz
	1049
	1050	USE kinds
	1051
[1324]	1052	USE pegrid
[1320]	1053
	1054	USE transpose_indices, &
	1055	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	1056
[1]	1057	IMPLICIT NONE
	1058
[1682]	1059	INTEGER(iwp) :: i !<
	1060	INTEGER(iwp) :: j !<
	1061	INTEGER(iwp) :: k !<
	1062	INTEGER(iwp) :: l !<
	1063	INTEGER(iwp) :: zs !<
[1]	1064
[1682]	1065	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	1066	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	1067
[1682]	1068	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3690]	1069	#if __acc_fft_device
[3634]	1070	!$ACC DECLARE CREATE(work)
[3690]	1071	#endif
[1111]	1072
[1]	1073	!
	1074	!-- If the PE grid is one-dimensional along y, the array has only to be
	1075	!-- reordered locally and therefore no transposition has to be done.
	1076	IF ( pdims(1) /= 1 ) THEN
[1106]	1077
	1078	#if defined( __parallel )
[1]	1079	!
	1080	!-- Reorder input array for transposition
[1111]	1081	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	1082	!$OMP DO
[1]	1083	DO l = 0, pdims(1) - 1
[1003]	1084	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3690]	1085	#if __acc_fft_device
[3634]	1086	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1087	!$ACC PRESENT(work, f_in)
[3690]	1088	#endif
[1003]	1089	DO j = nys_z, nyn_z
	1090	DO k = zs, zs + nzt_y - nzb_y
	1091	DO i = nxl_z, nxr_z
[1111]	1092	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	1093	ENDDO
	1094	ENDDO
	1095	ENDDO
	1096	ENDDO
[683]	1097	!$OMP END PARALLEL
[1]	1098
	1099	!
	1100	!-- Transpose array
[1318]	1101	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3690]	1102
	1103	#if __acc_fft_device
[3657]	1104	#ifndef __cuda_aware_mpi
[3634]	1105	!$ACC UPDATE HOST(work)
[3657]	1106	#else
	1107	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1108	#endif
[3690]	1109	#endif
	1110
[622]	1111	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	1112	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	1113	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	1114	comm1dx, ierr )
[3690]	1115
	1116	#if __acc_fft_device
[3657]	1117	#ifndef __cuda_aware_mpi
[3634]	1118	!$ACC UPDATE DEVICE(f_inv)
[3657]	1119	#else
	1120	!$ACC END HOST_DATA
	1121	#endif
[3690]	1122	#endif
	1123
[1]	1124	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	1125	#endif
[1]	1126
	1127	ELSE
	1128	!
[1106]	1129	!-- Reorder the array in the same way like ALLTOALL did it
[683]	1130	!$OMP PARALLEL PRIVATE ( i, j, k )
	1131	!$OMP DO
[3690]	1132	#if __acc_fft_device
[3634]	1133	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1134	!$ACC PRESENT(f_inv, f_in)
[3690]	1135	#endif
[1003]	1136	DO k = nzb_y, nzt_y
	1137	DO j = 0, ny
	1138	DO i = nxl_y, nxr_y
[164]	1139	f_inv(i,k,j) = f_in(i,j,k)
	1140	ENDDO
	1141	ENDDO
	1142	ENDDO
[683]	1143	!$OMP END PARALLEL
[1106]	1144
	1145	ENDIF
	1146
[1]	1147	END SUBROUTINE transpose_zy
	1148
	1149
	1150	!------------------------------------------------------------------------------!
	1151	! Description:
	1152	! ------------
[1682]	1153	!> Transposition of input array (f_in) from z to y. For the input array, all
	1154	!> elements along z reside on the same PE, while after transposition, all
	1155	!> elements along y reside on the same PE.
	1156	!> This is a direct transposition for arrays with indices in regular order
	1157	!> (k,j,i) (cf. transpose_zy).
[1]	1158	!------------------------------------------------------------------------------!
[1682]	1159	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1160
[1682]	1161
[1320]	1162	USE cpulog, &
[3241]	1163	ONLY: cpu_log, log_point_s
[1]	1164
[1320]	1165	USE indices, &
	1166	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1167
	1168	USE kinds
	1169
[1324]	1170	USE pegrid
[1320]	1171
	1172	USE transpose_indices, &
[3241]	1173	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
[1320]	1174
[1]	1175	IMPLICIT NONE
	1176
[1682]	1177	INTEGER(iwp) :: i !<
	1178	INTEGER(iwp) :: j !<
	1179	INTEGER(iwp) :: k !<
	1180	INTEGER(iwp) :: l !<
	1181	INTEGER(iwp) :: m !<
	1182	INTEGER(iwp) :: ys !<
[1]	1183
[1682]	1184	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1185	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1186	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1187	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1188
[1]	1189	#if defined( __parallel )
	1190
	1191	!
	1192	!-- Rearrange indices of input array in order to make data to be send
	1193	!-- by MPI contiguous
[1003]	1194	DO i = nxl, nxr
	1195	DO j = nys, nyn
	1196	DO k = 1, nz
[164]	1197	f_inv(j,i,k) = f_in(k,j,i)
[1]	1198	ENDDO
	1199	ENDDO
	1200	ENDDO
	1201
	1202	!
	1203	!-- Move data to different array, because memory location of work1 is
	1204	!-- needed further below (work1 = work2).
	1205	!-- If the PE grid is one-dimensional along x, only local reordering
	1206	!-- of the data is necessary and no transposition has to be done.
	1207	IF ( pdims(2) == 1 ) THEN
[1003]	1208	DO k = 1, nz
	1209	DO i = nxl, nxr
	1210	DO j = nys, nyn
[164]	1211	f_out(j,i,k) = f_inv(j,i,k)
[1]	1212	ENDDO
	1213	ENDDO
	1214	ENDDO
	1215	RETURN
	1216	ENDIF
	1217
	1218	!
	1219	!-- Transpose array
	1220	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1221	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1222	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1223	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1224	comm1dy, ierr )
	1225	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1226
	1227	!
	1228	!-- Reorder transposed array
	1229	m = 0
	1230	DO l = 0, pdims(2) - 1
	1231	ys = 0 + l * nny
[1003]	1232	DO k = nzb_yd, nzt_yd
	1233	DO i = nxl_yd, nxr_yd
[1]	1234	DO j = ys, ys + nny - 1
	1235	m = m + 1
[164]	1236	f_out(j,i,k) = work(m)
[1]	1237	ENDDO
	1238	ENDDO
	1239	ENDDO
	1240	ENDDO
	1241
	1242	#endif
	1243
	1244	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |