Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 3688

Last change on this file since 3688 was 3657, checked in by knoop, 6 years ago
OpenACC: cuda-aware-mpi in transpose and acc update async in exchange_horiz added
Property svn:keywords set to `Id`
File size: 32.8 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[3655]	17	! Copyright 1997-2019 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 3657 2019-01-07 20:14:18Z maronga $
[3634]	27	! OpenACC port for SPEC
	28	!
	29	! 3241 2018-09-12 15:02:00Z raasch
[3241]	30	! unused variables removed
	31	!
	32	! 2718 2018-01-02 08:49:38Z maronga
[2716]	33	! Corrected "Former revisions" section
	34	!
	35	! 2696 2017-12-14 17:12:51Z kanani
	36	! Change in file header (GPL part)
[1321]	37	!
[2716]	38	! 2119 2017-01-17 16:51:50Z raasch
	39	!
[2119]	40	! 2118 2017-01-17 16:38:49Z raasch
	41	! OpenACC directives removed
	42	!
[2001]	43	! 2000 2016-08-20 18:09:15Z knoop
	44	! Forced header and separation lines into 80 columns
	45	!
[1683]	46	! 1682 2015-10-07 23:56:08Z knoop
	47	! Code annotations made doxygen readable
	48	!
[1325]	49	! 1324 2014-03-21 09:13:16Z suehring
	50	! Bugfix: ONLY statement for module pegrid removed
	51	!
[1321]	52	! 1320 2014-03-20 08:40:49Z raasch
[1320]	53	! ONLY-attribute added to USE-statements,
	54	! kind-parameters added to all INTEGER and REAL declaration statements,
	55	! kinds are defined in new module kinds,
	56	! old module precision_kind is removed,
	57	! revision history before 2012 removed,
	58	! comment fields (!:) to be used for variable explanations added to
	59	! all variable declaration statements
[198]	60	!
[1319]	61	! 1318 2014-03-17 13:35:16Z raasch
	62	! cpu_log_nowait parameter added to cpu measurements of the transpositions
	63	! required for solving the Poisson equation (poisfft),
	64	! module interfaces removed
	65	!
[1258]	66	! 1257 2013-11-08 15:18:40Z raasch
	67	! openacc loop and loop vector clauses removed
	68	!
[1217]	69	! 1216 2013-08-26 09:31:42Z raasch
	70	! re-sorting of the transposed / to be transposed arrays moved to separate
	71	! routines resort_for_...
	72	!
[1112]	73	! 1111 2013-03-08 23:54:10Z raasch
	74	! openACC directives added,
	75	! resorting data from/to work changed, work got 4 dimensions instead of 1
	76	!
[1107]	77	! 1106 2013-03-04 05:31:38Z raasch
	78	! preprocessor lines rearranged so that routines can also be used in serial
	79	! (non-parallel) mode
	80	!
[1093]	81	! 1092 2013-02-02 11:24:22Z raasch
	82	! unused variables removed
	83	!
[1037]	84	! 1036 2012-10-22 13:43:42Z raasch
	85	! code put under GPL (PALM 3.9)
	86	!
[1004]	87	! 1003 2012-09-14 14:35:53Z raasch
	88	! indices nxa, nya, etc. replaced by nx, ny, etc.
	89	!
[1]	90	! Revision 1.1 1997/07/24 11:25:18 raasch
	91	! Initial revision
	92	!
[1216]	93	!------------------------------------------------------------------------------!
	94	! Description:
	95	! ------------
[1682]	96	!> Resorting data for the transposition from x to y. The transposition itself
	97	!> is carried out in transpose_xy
[1216]	98	!------------------------------------------------------------------------------!
[1682]	99	SUBROUTINE resort_for_xy( f_in, f_inv )
	100
[1216]	101
[1320]	102	USE indices, &
	103	ONLY: nx
[1216]	104
[1320]	105	USE kinds
	106
	107	USE transpose_indices, &
[3241]	108	ONLY: nyn_x, nys_x, nzb_x, nzt_x
[1320]	109
[1216]	110	IMPLICIT NONE
	111
[1682]	112	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	113	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	114
	115
[1682]	116	INTEGER(iwp) :: i !<
	117	INTEGER(iwp) :: j !<
	118	INTEGER(iwp) :: k !<
[1]	119	!
[1216]	120	!-- Rearrange indices of input array in order to make data to be send
	121	!-- by MPI contiguous
	122	!$OMP PARALLEL PRIVATE ( i, j, k )
	123	!$OMP DO
[3634]	124	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	125	!$ACC PRESENT(f_inv, f_in)
[1216]	126	DO i = 0, nx
	127	DO k = nzb_x, nzt_x
	128	DO j = nys_x, nyn_x
	129	f_inv(j,k,i) = f_in(i,j,k)
	130	ENDDO
	131	ENDDO
	132	ENDDO
	133	!$OMP END PARALLEL
	134
	135	END SUBROUTINE resort_for_xy
	136
	137
	138	!------------------------------------------------------------------------------!
[1]	139	! Description:
	140	! ------------
[1682]	141	!> Transposition of input array (f_in) from x to y. For the input array, all
	142	!> elements along x reside on the same PE, while after transposition, all
	143	!> elements along y reside on the same PE.
[1]	144	!------------------------------------------------------------------------------!
[1682]	145	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	146
[1682]	147
[1320]	148	USE cpulog, &
	149	ONLY: cpu_log, cpu_log_nowait, log_point_s
	150
	151	USE indices, &
	152	ONLY: nx, ny
	153
	154	USE kinds
	155
[1]	156	USE pegrid
	157
[1320]	158	USE transpose_indices, &
	159	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	160
[1]	161	IMPLICIT NONE
	162
[1682]	163	INTEGER(iwp) :: i !<
	164	INTEGER(iwp) :: j !<
	165	INTEGER(iwp) :: k !<
	166	INTEGER(iwp) :: l !<
	167	INTEGER(iwp) :: ys !<
[1320]	168
[1682]	169	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	170	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1]	171
[1682]	172	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3634]	173	!$ACC DECLARE CREATE(work)
[1111]	174
	175
[1106]	176	IF ( numprocs /= 1 ) THEN
	177
	178	#if defined( __parallel )
[1]	179	!
[1106]	180	!-- Transpose array
[1318]	181	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3657]	182	#ifndef __cuda_aware_mpi
[3634]	183	!$ACC UPDATE HOST(f_inv)
[3657]	184	#else
	185	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	186	#endif
[1106]	187	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	188	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	189	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	190	comm1dy, ierr )
[3657]	191	#ifndef __cuda_aware_mpi
[3634]	192	!$ACC UPDATE DEVICE(work)
[3657]	193	#else
	194	!$ACC END HOST_DATA
	195	#endif
[1106]	196	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	197
	198	!
[1106]	199	!-- Reorder transposed array
[1111]	200	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	201	!$OMP DO
[1106]	202	DO l = 0, pdims(2) - 1
	203	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3634]	204	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	205	!$ACC PRESENT(f_out, work)
[1106]	206	DO i = nxl_y, nxr_y
	207	DO k = nzb_y, nzt_y
	208	DO j = ys, ys + nyn_x - nys_x
[1111]	209	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	210	ENDDO
[1]	211	ENDDO
	212	ENDDO
	213	ENDDO
[683]	214	!$OMP END PARALLEL
[1]	215	#endif
	216
[1106]	217	ELSE
	218
	219	!
	220	!-- Reorder transposed array
	221	!$OMP PARALLEL PRIVATE ( i, j, k )
	222	!$OMP DO
[3634]	223	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	224	!$ACC PRESENT(f_out, f_inv)
[1106]	225	DO k = nzb_y, nzt_y
	226	DO i = nxl_y, nxr_y
	227	DO j = 0, ny
	228	f_out(j,i,k) = f_inv(j,k,i)
	229	ENDDO
	230	ENDDO
	231	ENDDO
	232	!$OMP END PARALLEL
	233
	234	ENDIF
	235
[1]	236	END SUBROUTINE transpose_xy
	237
	238
	239	!------------------------------------------------------------------------------!
	240	! Description:
	241	! ------------
[1682]	242	!> Resorting data after the transposition from x to z. The transposition itself
	243	!> is carried out in transpose_xz
[1216]	244	!------------------------------------------------------------------------------!
[1682]	245	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	246
[1682]	247
[1320]	248	USE indices, &
	249	ONLY: nxl, nxr, nyn, nys, nz
[1216]	250
[1320]	251	USE kinds
	252
[1216]	253	IMPLICIT NONE
	254
[1682]	255	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	256	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	257
[1682]	258	INTEGER(iwp) :: i !<
	259	INTEGER(iwp) :: j !<
	260	INTEGER(iwp) :: k !<
[1216]	261	!
	262	!-- Rearrange indices of input array in order to make data to be send
	263	!-- by MPI contiguous.
	264	!-- In case of parallel fft/transposition, scattered store is faster in
	265	!-- backward direction!!!
	266	!$OMP PARALLEL PRIVATE ( i, j, k )
	267	!$OMP DO
[3634]	268	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	269	!$ACC PRESENT(f_out, f_inv)
[1216]	270	DO k = 1, nz
	271	DO i = nxl, nxr
	272	DO j = nys, nyn
	273	f_out(k,j,i) = f_inv(j,i,k)
	274	ENDDO
	275	ENDDO
	276	ENDDO
	277	!$OMP END PARALLEL
	278
	279	END SUBROUTINE resort_for_xz
	280
	281
	282	!------------------------------------------------------------------------------!
	283	! Description:
	284	! ------------
[1682]	285	!> Transposition of input array (f_in) from x to z. For the input array, all
	286	!> elements along x reside on the same PE, while after transposition, all
	287	!> elements along z reside on the same PE.
[1]	288	!------------------------------------------------------------------------------!
[1682]	289	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	290
[1682]	291
[1320]	292	USE cpulog, &
	293	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	294
[1320]	295	USE indices, &
[3241]	296	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
[1320]	297
	298	USE kinds
	299
[1324]	300	USE pegrid
[1320]	301
	302	USE transpose_indices, &
	303	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	304
[1]	305	IMPLICIT NONE
	306
[1682]	307	INTEGER(iwp) :: i !<
	308	INTEGER(iwp) :: j !<
	309	INTEGER(iwp) :: k !<
	310	INTEGER(iwp) :: l !<
	311	INTEGER(iwp) :: xs !<
[1]	312
[1682]	313	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	314	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	315
[1682]	316	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3634]	317	!$ACC DECLARE CREATE(work)
[1111]	318
[1320]	319
[1]	320	!
	321	!-- If the PE grid is one-dimensional along y, the array has only to be
	322	!-- reordered locally and therefore no transposition has to be done.
	323	IF ( pdims(1) /= 1 ) THEN
[1106]	324
	325	#if defined( __parallel )
[1]	326	!
	327	!-- Reorder input array for transposition
[1111]	328	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	329	!$OMP DO
[1]	330	DO l = 0, pdims(1) - 1
	331	xs = 0 + l * nnx
[3634]	332	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	333	!$ACC PRESENT(work, f_in)
[1003]	334	DO k = nzb_x, nzt_x
[164]	335	DO i = xs, xs + nnx - 1
[1003]	336	DO j = nys_x, nyn_x
[1111]	337	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	338	ENDDO
	339	ENDDO
	340	ENDDO
	341	ENDDO
[683]	342	!$OMP END PARALLEL
[1]	343
	344	!
	345	!-- Transpose array
[1318]	346	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3657]	347	#ifndef __cuda_aware_mpi
[3634]	348	!$ACC UPDATE HOST(work)
[3657]	349	#else
	350	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	351	#endif
[622]	352	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	353	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	354	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	355	comm1dx, ierr )
[3657]	356	#ifndef __cuda_aware_mpi
[3634]	357	!$ACC UPDATE DEVICE(f_inv)
[3657]	358	#else
	359	!$ACC END HOST_DATA
	360	#endif
[1]	361	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	362	#endif
	363
[1]	364	ELSE
[1106]	365
[1]	366	!
	367	!-- Reorder the array in a way that the z index is in first position
[683]	368	!$OMP PARALLEL PRIVATE ( i, j, k )
	369	!$OMP DO
[3634]	370	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	371	!$ACC PRESENT(f_inv, f_in)
[1003]	372	DO i = nxl, nxr
	373	DO j = nys, nyn
	374	DO k = 1, nz
[164]	375	f_inv(j,i,k) = f_in(i,j,k)
[1]	376	ENDDO
	377	ENDDO
	378	ENDDO
[683]	379	!$OMP END PARALLEL
[1]	380
[164]	381	ENDIF
	382
[1]	383	END SUBROUTINE transpose_xz
	384
	385
	386	!------------------------------------------------------------------------------!
	387	! Description:
	388	! ------------
[1682]	389	!> Resorting data after the transposition from y to x. The transposition itself
	390	!> is carried out in transpose_yx
[1216]	391	!------------------------------------------------------------------------------!
[1682]	392	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	393
[1682]	394
[1320]	395	USE indices, &
	396	ONLY: nx
[1216]	397
[1320]	398	USE kinds
	399
	400	USE transpose_indices, &
	401	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	402
[1216]	403	IMPLICIT NONE
	404
[1682]	405	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	406	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	407
	408
[1682]	409	INTEGER(iwp) :: i !<
	410	INTEGER(iwp) :: j !<
	411	INTEGER(iwp) :: k !<
[1216]	412	!
	413	!-- Rearrange indices of input array in order to make data to be send
	414	!-- by MPI contiguous
	415	!$OMP PARALLEL PRIVATE ( i, j, k )
	416	!$OMP DO
[3634]	417	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	418	!$ACC PRESENT(f_out, f_inv)
[1216]	419	DO i = 0, nx
	420	DO k = nzb_x, nzt_x
	421	DO j = nys_x, nyn_x
	422	f_out(i,j,k) = f_inv(j,k,i)
	423	ENDDO
	424	ENDDO
	425	ENDDO
	426	!$OMP END PARALLEL
	427
	428	END SUBROUTINE resort_for_yx
	429
	430
	431	!------------------------------------------------------------------------------!
	432	! Description:
	433	! ------------
[1682]	434	!> Transposition of input array (f_in) from y to x. For the input array, all
	435	!> elements along y reside on the same PE, while after transposition, all
	436	!> elements along x reside on the same PE.
[1]	437	!------------------------------------------------------------------------------!
[1682]	438	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	439
[1682]	440
[1320]	441	USE cpulog, &
	442	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	443
[1320]	444	USE indices, &
	445	ONLY: nx, ny
	446
	447	USE kinds
	448
[1324]	449	USE pegrid
[1320]	450
	451	USE transpose_indices, &
	452	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	453
[1]	454	IMPLICIT NONE
	455
[1682]	456	INTEGER(iwp) :: i !<
	457	INTEGER(iwp) :: j !<
	458	INTEGER(iwp) :: k !<
	459	INTEGER(iwp) :: l !<
	460	INTEGER(iwp) :: ys !<
[1]	461
[1682]	462	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	463	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	464
[1682]	465	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3634]	466	!$ACC DECLARE CREATE(work)
[1111]	467
[1320]	468
[1106]	469	IF ( numprocs /= 1 ) THEN
	470
[1]	471	#if defined( __parallel )
	472	!
[1106]	473	!-- Reorder input array for transposition
[1111]	474	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	475	!$OMP DO
[1106]	476	DO l = 0, pdims(2) - 1
	477	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3634]	478	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	479	!$ACC PRESENT(work, f_in)
[1106]	480	DO i = nxl_y, nxr_y
	481	DO k = nzb_y, nzt_y
	482	DO j = ys, ys + nyn_x - nys_x
[1111]	483	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	484	ENDDO
	485	ENDDO
	486	ENDDO
	487	ENDDO
	488	!$OMP END PARALLEL
	489
	490	!
	491	!-- Transpose array
[1318]	492	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3657]	493	#ifndef __cuda_aware_mpi
[3634]	494	!$ACC UPDATE HOST(work)
[3657]	495	#else
	496	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	497	#endif
[1106]	498	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	499	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	500	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	501	comm1dy, ierr )
[3657]	502	#ifndef __cuda_aware_mpi
[3634]	503	!$ACC UPDATE DEVICE(f_inv)
[3657]	504	#else
	505	!$ACC END HOST_DATA
	506	#endif
[1106]	507	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	508	#endif
	509
	510	ELSE
	511
	512	!
	513	!-- Reorder array f_in the same way as ALLTOALL did it
	514	!$OMP PARALLEL PRIVATE ( i, j, k )
	515	!$OMP DO
[3634]	516	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	517	!$ACC PRESENT(f_inv, f_in)
[1003]	518	DO i = nxl_y, nxr_y
	519	DO k = nzb_y, nzt_y
[1106]	520	DO j = 0, ny
	521	f_inv(j,k,i) = f_in(j,i,k)
[1]	522	ENDDO
	523	ENDDO
	524	ENDDO
[683]	525	!$OMP END PARALLEL
[1]	526
[1106]	527	ENDIF
[1]	528
	529	END SUBROUTINE transpose_yx
	530
	531
	532	!------------------------------------------------------------------------------!
	533	! Description:
	534	! ------------
[1682]	535	!> Transposition of input array (f_in) from y to x. For the input array, all
	536	!> elements along y reside on the same PE, while after transposition, all
	537	!> elements along x reside on the same PE.
	538	!> This is a direct transposition for arrays with indices in regular order
	539	!> (k,j,i) (cf. transpose_yx).
[1]	540	!------------------------------------------------------------------------------!
[1682]	541	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	542
[1682]	543
[1320]	544	USE cpulog, &
[3241]	545	ONLY: cpu_log, log_point_s
[1]	546
[1320]	547	USE indices, &
	548	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	549
	550	USE kinds
	551
[1324]	552	USE pegrid
[1320]	553
	554	USE transpose_indices, &
	555	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	556
[1]	557	IMPLICIT NONE
	558
[1682]	559	INTEGER(iwp) :: i !<
	560	INTEGER(iwp) :: j !<
	561	INTEGER(iwp) :: k !<
	562	INTEGER(iwp) :: l !<
	563	INTEGER(iwp) :: m !<
	564	INTEGER(iwp) :: xs !<
[1]	565
[1682]	566	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	567	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	568	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	569	REAL(wp) :: work(nnxnnynnz) !<
[1]	570	#if defined( __parallel )
	571
	572	!
	573	!-- Rearrange indices of input array in order to make data to be send
	574	!-- by MPI contiguous
[1003]	575	DO k = 1, nz
	576	DO j = nys, nyn
	577	DO i = nxl, nxr
[164]	578	f_inv(i,k,j) = f_in(k,j,i)
[1]	579	ENDDO
	580	ENDDO
	581	ENDDO
	582
	583	!
	584	!-- Transpose array
	585	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	586	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	587	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	588	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	589	comm1dx, ierr )
	590	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	591
	592	!
	593	!-- Reorder transposed array
	594	m = 0
	595	DO l = 0, pdims(1) - 1
	596	xs = 0 + l * nnx
[1003]	597	DO j = nys_x, nyn_x
	598	DO k = 1, nz
[1]	599	DO i = xs, xs + nnx - 1
	600	m = m + 1
[164]	601	f_out(i,j,k) = work(m)
[1]	602	ENDDO
	603	ENDDO
	604	ENDDO
	605	ENDDO
	606
	607	#endif
	608
	609	END SUBROUTINE transpose_yxd
	610
	611
	612	!------------------------------------------------------------------------------!
	613	! Description:
	614	! ------------
[1682]	615	!> Resorting data for the transposition from y to z. The transposition itself
	616	!> is carried out in transpose_yz
[1216]	617	!------------------------------------------------------------------------------!
[1682]	618	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	619
[1682]	620
[1320]	621	USE indices, &
	622	ONLY: ny
[1216]	623
[1320]	624	USE kinds
	625
	626	USE transpose_indices, &
	627	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	628
[1216]	629	IMPLICIT NONE
	630
[1682]	631	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	632	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	633
[1682]	634	INTEGER(iwp) :: i !<
	635	INTEGER(iwp) :: j !<
	636	INTEGER(iwp) :: k !<
[1216]	637
	638	!
	639	!-- Rearrange indices of input array in order to make data to be send
	640	!-- by MPI contiguous
	641	!$OMP PARALLEL PRIVATE ( i, j, k )
	642	!$OMP DO
[3634]	643	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	644	!$ACC PRESENT(f_inv, f_in)
[1216]	645	DO j = 0, ny
	646	DO k = nzb_y, nzt_y
	647	DO i = nxl_y, nxr_y
	648	f_inv(i,k,j) = f_in(j,i,k)
	649	ENDDO
	650	ENDDO
	651	ENDDO
	652	!$OMP END PARALLEL
	653
	654	END SUBROUTINE resort_for_yz
	655
	656
	657	!------------------------------------------------------------------------------!
	658	! Description:
	659	! ------------
[1682]	660	!> Transposition of input array (f_in) from y to z. For the input array, all
	661	!> elements along y reside on the same PE, while after transposition, all
	662	!> elements along z reside on the same PE.
[1]	663	!------------------------------------------------------------------------------!
[1682]	664	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	665
[1682]	666
[1320]	667	USE cpulog, &
	668	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	669
[1320]	670	USE indices, &
	671	ONLY: ny, nz
	672
	673	USE kinds
	674
[1324]	675	USE pegrid
[1320]	676
	677	USE transpose_indices, &
	678	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	679
[1]	680	IMPLICIT NONE
	681
[1682]	682	INTEGER(iwp) :: i !<
	683	INTEGER(iwp) :: j !<
	684	INTEGER(iwp) :: k !<
	685	INTEGER(iwp) :: l !<
	686	INTEGER(iwp) :: zs !<
[1]	687
[1682]	688	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	689	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	690
[1682]	691	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3634]	692	!$ACC DECLARE CREATE(work)
[1111]	693
[1320]	694
[1]	695	!
	696	!-- If the PE grid is one-dimensional along y, only local reordering
	697	!-- of the data is necessary and no transposition has to be done.
	698	IF ( pdims(1) == 1 ) THEN
[1106]	699
[683]	700	!$OMP PARALLEL PRIVATE ( i, j, k )
	701	!$OMP DO
[3634]	702	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	703	!$ACC PRESENT(f_out, f_inv)
[1003]	704	DO j = 0, ny
	705	DO k = nzb_y, nzt_y
	706	DO i = nxl_y, nxr_y
[164]	707	f_out(i,j,k) = f_inv(i,k,j)
[1]	708	ENDDO
	709	ENDDO
	710	ENDDO
[683]	711	!$OMP END PARALLEL
[1]	712
[1106]	713	ELSE
	714
	715	#if defined( __parallel )
[1]	716	!
[1106]	717	!-- Transpose array
[1318]	718	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3657]	719	#ifndef __cuda_aware_mpi
[3634]	720	!$ACC UPDATE HOST(f_inv)
[3657]	721	#else
	722	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	723	#endif
[1106]	724	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	725	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	726	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	727	comm1dx, ierr )
[3657]	728	#ifndef __cuda_aware_mpi
[3634]	729	!$ACC UPDATE DEVICE(work)
[3657]	730	#else
	731	!$ACC END HOST_DATA
	732	#endif
[1106]	733	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	734
	735	!
[1106]	736	!-- Reorder transposed array
[1111]	737	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	738	!$OMP DO
[1106]	739	DO l = 0, pdims(1) - 1
	740	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3634]	741	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	742	!$ACC PRESENT(f_out, work)
[1106]	743	DO j = nys_z, nyn_z
	744	DO k = zs, zs + nzt_y - nzb_y
	745	DO i = nxl_z, nxr_z
[1111]	746	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	747	ENDDO
[1]	748	ENDDO
	749	ENDDO
	750	ENDDO
[683]	751	!$OMP END PARALLEL
[1]	752	#endif
	753
[1106]	754	ENDIF
	755
[1]	756	END SUBROUTINE transpose_yz
	757
	758
	759	!------------------------------------------------------------------------------!
	760	! Description:
	761	! ------------
[1682]	762	!> Resorting data for the transposition from z to x. The transposition itself
	763	!> is carried out in transpose_zx
[1216]	764	!------------------------------------------------------------------------------!
[1682]	765	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	766
[1682]	767
[1320]	768	USE indices, &
	769	ONLY: nxl, nxr, nyn, nys, nz
[1216]	770
[1320]	771	USE kinds
	772
[1216]	773	IMPLICIT NONE
	774
[1682]	775	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	776	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	777
[1682]	778	INTEGER(iwp) :: i !<
	779	INTEGER(iwp) :: j !<
	780	INTEGER(iwp) :: k !<
[1216]	781
	782	!
	783	!-- Rearrange indices of input array in order to make data to be send
	784	!-- by MPI contiguous
	785	!$OMP PARALLEL PRIVATE ( i, j, k )
	786	!$OMP DO
[3634]	787	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	788	!$ACC PRESENT(f_in, f_inv)
[1216]	789	DO k = 1,nz
	790	DO i = nxl, nxr
	791	DO j = nys, nyn
	792	f_inv(j,i,k) = f_in(k,j,i)
	793	ENDDO
	794	ENDDO
	795	ENDDO
	796	!$OMP END PARALLEL
	797
	798	END SUBROUTINE resort_for_zx
	799
	800
	801	!------------------------------------------------------------------------------!
	802	! Description:
	803	! ------------
[1682]	804	!> Transposition of input array (f_in) from z to x. For the input array, all
	805	!> elements along z reside on the same PE, while after transposition, all
	806	!> elements along x reside on the same PE.
[1]	807	!------------------------------------------------------------------------------!
[1682]	808	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	809
[1682]	810
[1320]	811	USE cpulog, &
	812	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	813
[1320]	814	USE indices, &
	815	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	816
	817	USE kinds
	818
[1324]	819	USE pegrid
[1320]	820
	821	USE transpose_indices, &
	822	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	823
[1]	824	IMPLICIT NONE
	825
[1682]	826	INTEGER(iwp) :: i !<
	827	INTEGER(iwp) :: j !<
	828	INTEGER(iwp) :: k !<
	829	INTEGER(iwp) :: l !<
	830	INTEGER(iwp) :: xs !<
[1]	831
[1682]	832	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	833	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	834
[1682]	835	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3634]	836	!$ACC DECLARE CREATE(work)
[1]	837
[1320]	838
[1]	839	!
	840	!-- If the PE grid is one-dimensional along y, only local reordering
	841	!-- of the data is necessary and no transposition has to be done.
	842	IF ( pdims(1) == 1 ) THEN
[1106]	843
[683]	844	!$OMP PARALLEL PRIVATE ( i, j, k )
	845	!$OMP DO
[3634]	846	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	847	!$ACC PRESENT(f_out, f_inv)
[1003]	848	DO k = 1, nz
	849	DO i = nxl, nxr
	850	DO j = nys, nyn
[164]	851	f_out(i,j,k) = f_inv(j,i,k)
[1]	852	ENDDO
	853	ENDDO
	854	ENDDO
[683]	855	!$OMP END PARALLEL
[1]	856
[1106]	857	ELSE
	858
	859	#if defined( __parallel )
[1]	860	!
[1106]	861	!-- Transpose array
[1318]	862	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3657]	863	#ifndef __cuda_aware_mpi
[3634]	864	!$ACC UPDATE HOST(f_inv)
[3657]	865	#else
	866	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	867	#endif
[1106]	868	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	869	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	870	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	871	comm1dx, ierr )
[3657]	872	#ifndef __cuda_aware_mpi
[3634]	873	!$ACC UPDATE DEVICE(work)
[3657]	874	#else
	875	!$ACC END HOST_DATA
	876	#endif
[1106]	877	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	878
	879	!
[1106]	880	!-- Reorder transposed array
[1111]	881	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	882	!$OMP DO
[1106]	883	DO l = 0, pdims(1) - 1
	884	xs = 0 + l * nnx
[3634]	885	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	886	!$ACC PRESENT(f_out, work)
[1106]	887	DO k = nzb_x, nzt_x
	888	DO i = xs, xs + nnx - 1
	889	DO j = nys_x, nyn_x
[1111]	890	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	891	ENDDO
[1]	892	ENDDO
	893	ENDDO
	894	ENDDO
[683]	895	!$OMP END PARALLEL
[1]	896	#endif
	897
[1106]	898	ENDIF
	899
[1]	900	END SUBROUTINE transpose_zx
	901
	902
	903	!------------------------------------------------------------------------------!
	904	! Description:
	905	! ------------
[1682]	906	!> Resorting data after the transposition from z to y. The transposition itself
	907	!> is carried out in transpose_zy
[1216]	908	!------------------------------------------------------------------------------!
[1682]	909	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	910
[1682]	911
[1320]	912	USE indices, &
	913	ONLY: ny
[1216]	914
[1320]	915	USE kinds
	916
	917	USE transpose_indices, &
	918	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	919
[1216]	920	IMPLICIT NONE
	921
[1682]	922	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	923	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	924
	925
[1682]	926	INTEGER(iwp) :: i !<
	927	INTEGER(iwp) :: j !<
	928	INTEGER(iwp) :: k !<
[1216]	929
	930	!
	931	!-- Rearrange indices of input array in order to make data to be send
	932	!-- by MPI contiguous
	933	!$OMP PARALLEL PRIVATE ( i, j, k )
	934	!$OMP DO
[3634]	935	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	936	!$ACC PRESENT(f_out, f_inv)
[1216]	937	DO k = nzb_y, nzt_y
	938	DO j = 0, ny
	939	DO i = nxl_y, nxr_y
	940	f_out(j,i,k) = f_inv(i,k,j)
	941	ENDDO
	942	ENDDO
	943	ENDDO
	944	!$OMP END PARALLEL
	945
	946	END SUBROUTINE resort_for_zy
	947
	948
	949	!------------------------------------------------------------------------------!
[3241]	950	! Description:cpu_log_nowait
[1216]	951	! ------------
[1682]	952	!> Transposition of input array (f_in) from z to y. For the input array, all
	953	!> elements along z reside on the same PE, while after transposition, all
	954	!> elements along y reside on the same PE.
[1]	955	!------------------------------------------------------------------------------!
[1682]	956	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	957
[1682]	958
[1320]	959	USE cpulog, &
	960	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	961
[1320]	962	USE indices, &
	963	ONLY: ny, nz
	964
	965	USE kinds
	966
[1324]	967	USE pegrid
[1320]	968
	969	USE transpose_indices, &
	970	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	971
[1]	972	IMPLICIT NONE
	973
[1682]	974	INTEGER(iwp) :: i !<
	975	INTEGER(iwp) :: j !<
	976	INTEGER(iwp) :: k !<
	977	INTEGER(iwp) :: l !<
	978	INTEGER(iwp) :: zs !<
[1]	979
[1682]	980	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	981	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	982
[1682]	983	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3634]	984	!$ACC DECLARE CREATE(work)
[1111]	985
[1]	986	!
	987	!-- If the PE grid is one-dimensional along y, the array has only to be
	988	!-- reordered locally and therefore no transposition has to be done.
	989	IF ( pdims(1) /= 1 ) THEN
[1106]	990
	991	#if defined( __parallel )
[1]	992	!
	993	!-- Reorder input array for transposition
[1111]	994	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	995	!$OMP DO
[1]	996	DO l = 0, pdims(1) - 1
[1003]	997	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3634]	998	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	999	!$ACC PRESENT(work, f_in)
[1003]	1000	DO j = nys_z, nyn_z
	1001	DO k = zs, zs + nzt_y - nzb_y
	1002	DO i = nxl_z, nxr_z
[1111]	1003	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	1004	ENDDO
	1005	ENDDO
	1006	ENDDO
	1007	ENDDO
[683]	1008	!$OMP END PARALLEL
[1]	1009
	1010	!
	1011	!-- Transpose array
[1318]	1012	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3657]	1013	#ifndef __cuda_aware_mpi
[3634]	1014	!$ACC UPDATE HOST(work)
[3657]	1015	#else
	1016	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
	1017	#endif
[622]	1018	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	1019	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	1020	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	1021	comm1dx, ierr )
[3657]	1022	#ifndef __cuda_aware_mpi
[3634]	1023	!$ACC UPDATE DEVICE(f_inv)
[3657]	1024	#else
	1025	!$ACC END HOST_DATA
	1026	#endif
[1]	1027	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	1028	#endif
[1]	1029
	1030	ELSE
	1031	!
[1106]	1032	!-- Reorder the array in the same way like ALLTOALL did it
[683]	1033	!$OMP PARALLEL PRIVATE ( i, j, k )
	1034	!$OMP DO
[3634]	1035	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	1036	!$ACC PRESENT(f_inv, f_in)
[1003]	1037	DO k = nzb_y, nzt_y
	1038	DO j = 0, ny
	1039	DO i = nxl_y, nxr_y
[164]	1040	f_inv(i,k,j) = f_in(i,j,k)
	1041	ENDDO
	1042	ENDDO
	1043	ENDDO
[683]	1044	!$OMP END PARALLEL
[1106]	1045
	1046	ENDIF
	1047
[1]	1048	END SUBROUTINE transpose_zy
	1049
	1050
	1051	!------------------------------------------------------------------------------!
	1052	! Description:
	1053	! ------------
[1682]	1054	!> Transposition of input array (f_in) from z to y. For the input array, all
	1055	!> elements along z reside on the same PE, while after transposition, all
	1056	!> elements along y reside on the same PE.
	1057	!> This is a direct transposition for arrays with indices in regular order
	1058	!> (k,j,i) (cf. transpose_zy).
[1]	1059	!------------------------------------------------------------------------------!
[1682]	1060	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1061
[1682]	1062
[1320]	1063	USE cpulog, &
[3241]	1064	ONLY: cpu_log, log_point_s
[1]	1065
[1320]	1066	USE indices, &
	1067	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1068
	1069	USE kinds
	1070
[1324]	1071	USE pegrid
[1320]	1072
	1073	USE transpose_indices, &
[3241]	1074	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
[1320]	1075
[1]	1076	IMPLICIT NONE
	1077
[1682]	1078	INTEGER(iwp) :: i !<
	1079	INTEGER(iwp) :: j !<
	1080	INTEGER(iwp) :: k !<
	1081	INTEGER(iwp) :: l !<
	1082	INTEGER(iwp) :: m !<
	1083	INTEGER(iwp) :: ys !<
[1]	1084
[1682]	1085	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1086	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1087	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1088	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1089
[1]	1090	#if defined( __parallel )
	1091
	1092	!
	1093	!-- Rearrange indices of input array in order to make data to be send
	1094	!-- by MPI contiguous
[1003]	1095	DO i = nxl, nxr
	1096	DO j = nys, nyn
	1097	DO k = 1, nz
[164]	1098	f_inv(j,i,k) = f_in(k,j,i)
[1]	1099	ENDDO
	1100	ENDDO
	1101	ENDDO
	1102
	1103	!
	1104	!-- Move data to different array, because memory location of work1 is
	1105	!-- needed further below (work1 = work2).
	1106	!-- If the PE grid is one-dimensional along x, only local reordering
	1107	!-- of the data is necessary and no transposition has to be done.
	1108	IF ( pdims(2) == 1 ) THEN
[1003]	1109	DO k = 1, nz
	1110	DO i = nxl, nxr
	1111	DO j = nys, nyn
[164]	1112	f_out(j,i,k) = f_inv(j,i,k)
[1]	1113	ENDDO
	1114	ENDDO
	1115	ENDDO
	1116	RETURN
	1117	ENDIF
	1118
	1119	!
	1120	!-- Transpose array
	1121	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1122	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1123	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1124	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1125	comm1dy, ierr )
	1126	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1127
	1128	!
	1129	!-- Reorder transposed array
	1130	m = 0
	1131	DO l = 0, pdims(2) - 1
	1132	ys = 0 + l * nny
[1003]	1133	DO k = nzb_yd, nzt_yd
	1134	DO i = nxl_yd, nxr_yd
[1]	1135	DO j = ys, ys + nny - 1
	1136	m = m + 1
[164]	1137	f_out(j,i,k) = work(m)
[1]	1138	ENDDO
	1139	ENDDO
	1140	ENDDO
	1141	ENDDO
	1142
	1143	#endif
	1144
	1145	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |