Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 3647

Last change on this file since 3647 was 3634, checked in by knoop, 6 years ago
OpenACC port for SPEC
Property svn:keywords set to `Id`
File size: 31.9 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[2696]	3	! This file is part of the PALM model system.
[1036]	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[2718]	17	! Copyright 1997-2018 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2119]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 3634 2018-12-18 12:31:28Z kanani $
[3634]	27	! OpenACC port for SPEC
	28	!
	29	! 3241 2018-09-12 15:02:00Z raasch
[3241]	30	! unused variables removed
	31	!
	32	! 2718 2018-01-02 08:49:38Z maronga
[2716]	33	! Corrected "Former revisions" section
	34	!
	35	! 2696 2017-12-14 17:12:51Z kanani
	36	! Change in file header (GPL part)
[1321]	37	!
[2716]	38	! 2119 2017-01-17 16:51:50Z raasch
	39	!
[2119]	40	! 2118 2017-01-17 16:38:49Z raasch
	41	! OpenACC directives removed
	42	!
[2001]	43	! 2000 2016-08-20 18:09:15Z knoop
	44	! Forced header and separation lines into 80 columns
	45	!
[1683]	46	! 1682 2015-10-07 23:56:08Z knoop
	47	! Code annotations made doxygen readable
	48	!
[1325]	49	! 1324 2014-03-21 09:13:16Z suehring
	50	! Bugfix: ONLY statement for module pegrid removed
	51	!
[1321]	52	! 1320 2014-03-20 08:40:49Z raasch
[1320]	53	! ONLY-attribute added to USE-statements,
	54	! kind-parameters added to all INTEGER and REAL declaration statements,
	55	! kinds are defined in new module kinds,
	56	! old module precision_kind is removed,
	57	! revision history before 2012 removed,
	58	! comment fields (!:) to be used for variable explanations added to
	59	! all variable declaration statements
[198]	60	!
[1319]	61	! 1318 2014-03-17 13:35:16Z raasch
	62	! cpu_log_nowait parameter added to cpu measurements of the transpositions
	63	! required for solving the Poisson equation (poisfft),
	64	! module interfaces removed
	65	!
[1258]	66	! 1257 2013-11-08 15:18:40Z raasch
	67	! openacc loop and loop vector clauses removed
	68	!
[1217]	69	! 1216 2013-08-26 09:31:42Z raasch
	70	! re-sorting of the transposed / to be transposed arrays moved to separate
	71	! routines resort_for_...
	72	!
[1112]	73	! 1111 2013-03-08 23:54:10Z raasch
	74	! openACC directives added,
	75	! resorting data from/to work changed, work got 4 dimensions instead of 1
	76	!
[1107]	77	! 1106 2013-03-04 05:31:38Z raasch
	78	! preprocessor lines rearranged so that routines can also be used in serial
	79	! (non-parallel) mode
	80	!
[1093]	81	! 1092 2013-02-02 11:24:22Z raasch
	82	! unused variables removed
	83	!
[1037]	84	! 1036 2012-10-22 13:43:42Z raasch
	85	! code put under GPL (PALM 3.9)
	86	!
[1004]	87	! 1003 2012-09-14 14:35:53Z raasch
	88	! indices nxa, nya, etc. replaced by nx, ny, etc.
	89	!
[1]	90	! Revision 1.1 1997/07/24 11:25:18 raasch
	91	! Initial revision
	92	!
[1216]	93	!------------------------------------------------------------------------------!
	94	! Description:
	95	! ------------
[1682]	96	!> Resorting data for the transposition from x to y. The transposition itself
	97	!> is carried out in transpose_xy
[1216]	98	!------------------------------------------------------------------------------!
[1682]	99	SUBROUTINE resort_for_xy( f_in, f_inv )
	100
[1216]	101
[1320]	102	USE indices, &
	103	ONLY: nx
[1216]	104
[1320]	105	USE kinds
	106
	107	USE transpose_indices, &
[3241]	108	ONLY: nyn_x, nys_x, nzb_x, nzt_x
[1320]	109
[1216]	110	IMPLICIT NONE
	111
[1682]	112	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	113	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	114
	115
[1682]	116	INTEGER(iwp) :: i !<
	117	INTEGER(iwp) :: j !<
	118	INTEGER(iwp) :: k !<
[1]	119	!
[1216]	120	!-- Rearrange indices of input array in order to make data to be send
	121	!-- by MPI contiguous
	122	!$OMP PARALLEL PRIVATE ( i, j, k )
	123	!$OMP DO
[3634]	124	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	125	!$ACC PRESENT(f_inv, f_in)
[1216]	126	DO i = 0, nx
	127	DO k = nzb_x, nzt_x
	128	DO j = nys_x, nyn_x
	129	f_inv(j,k,i) = f_in(i,j,k)
	130	ENDDO
	131	ENDDO
	132	ENDDO
	133	!$OMP END PARALLEL
	134
	135	END SUBROUTINE resort_for_xy
	136
	137
	138	!------------------------------------------------------------------------------!
[1]	139	! Description:
	140	! ------------
[1682]	141	!> Transposition of input array (f_in) from x to y. For the input array, all
	142	!> elements along x reside on the same PE, while after transposition, all
	143	!> elements along y reside on the same PE.
[1]	144	!------------------------------------------------------------------------------!
[1682]	145	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	146
[1682]	147
[1320]	148	USE cpulog, &
	149	ONLY: cpu_log, cpu_log_nowait, log_point_s
	150
	151	USE indices, &
	152	ONLY: nx, ny
	153
	154	USE kinds
	155
[1]	156	USE pegrid
	157
[1320]	158	USE transpose_indices, &
	159	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	160
[1]	161	IMPLICIT NONE
	162
[1682]	163	INTEGER(iwp) :: i !<
	164	INTEGER(iwp) :: j !<
	165	INTEGER(iwp) :: k !<
	166	INTEGER(iwp) :: l !<
	167	INTEGER(iwp) :: ys !<
[1320]	168
[1682]	169	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	170	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1]	171
[1682]	172	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3634]	173	!$ACC DECLARE CREATE(work)
[1111]	174
	175
[1106]	176	IF ( numprocs /= 1 ) THEN
	177
	178	#if defined( __parallel )
[1]	179	!
[1106]	180	!-- Transpose array
[1318]	181	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3634]	182	!$ACC UPDATE HOST(f_inv)
[1106]	183	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	184	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	185	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	186	comm1dy, ierr )
[3634]	187	!$ACC UPDATE DEVICE(work)
[1106]	188	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	189
	190	!
[1106]	191	!-- Reorder transposed array
[1111]	192	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	193	!$OMP DO
[1106]	194	DO l = 0, pdims(2) - 1
	195	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3634]	196	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	197	!$ACC PRESENT(f_out, work)
[1106]	198	DO i = nxl_y, nxr_y
	199	DO k = nzb_y, nzt_y
	200	DO j = ys, ys + nyn_x - nys_x
[1111]	201	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	202	ENDDO
[1]	203	ENDDO
	204	ENDDO
	205	ENDDO
[683]	206	!$OMP END PARALLEL
[1]	207	#endif
	208
[1106]	209	ELSE
	210
	211	!
	212	!-- Reorder transposed array
	213	!$OMP PARALLEL PRIVATE ( i, j, k )
	214	!$OMP DO
[3634]	215	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	216	!$ACC PRESENT(f_out, f_inv)
[1106]	217	DO k = nzb_y, nzt_y
	218	DO i = nxl_y, nxr_y
	219	DO j = 0, ny
	220	f_out(j,i,k) = f_inv(j,k,i)
	221	ENDDO
	222	ENDDO
	223	ENDDO
	224	!$OMP END PARALLEL
	225
	226	ENDIF
	227
[1]	228	END SUBROUTINE transpose_xy
	229
	230
	231	!------------------------------------------------------------------------------!
	232	! Description:
	233	! ------------
[1682]	234	!> Resorting data after the transposition from x to z. The transposition itself
	235	!> is carried out in transpose_xz
[1216]	236	!------------------------------------------------------------------------------!
[1682]	237	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	238
[1682]	239
[1320]	240	USE indices, &
	241	ONLY: nxl, nxr, nyn, nys, nz
[1216]	242
[1320]	243	USE kinds
	244
[1216]	245	IMPLICIT NONE
	246
[1682]	247	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	248	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	249
[1682]	250	INTEGER(iwp) :: i !<
	251	INTEGER(iwp) :: j !<
	252	INTEGER(iwp) :: k !<
[1216]	253	!
	254	!-- Rearrange indices of input array in order to make data to be send
	255	!-- by MPI contiguous.
	256	!-- In case of parallel fft/transposition, scattered store is faster in
	257	!-- backward direction!!!
	258	!$OMP PARALLEL PRIVATE ( i, j, k )
	259	!$OMP DO
[3634]	260	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	261	!$ACC PRESENT(f_out, f_inv)
[1216]	262	DO k = 1, nz
	263	DO i = nxl, nxr
	264	DO j = nys, nyn
	265	f_out(k,j,i) = f_inv(j,i,k)
	266	ENDDO
	267	ENDDO
	268	ENDDO
	269	!$OMP END PARALLEL
	270
	271	END SUBROUTINE resort_for_xz
	272
	273
	274	!------------------------------------------------------------------------------!
	275	! Description:
	276	! ------------
[1682]	277	!> Transposition of input array (f_in) from x to z. For the input array, all
	278	!> elements along x reside on the same PE, while after transposition, all
	279	!> elements along z reside on the same PE.
[1]	280	!------------------------------------------------------------------------------!
[1682]	281	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	282
[1682]	283
[1320]	284	USE cpulog, &
	285	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	286
[1320]	287	USE indices, &
[3241]	288	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
[1320]	289
	290	USE kinds
	291
[1324]	292	USE pegrid
[1320]	293
	294	USE transpose_indices, &
	295	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	296
[1]	297	IMPLICIT NONE
	298
[1682]	299	INTEGER(iwp) :: i !<
	300	INTEGER(iwp) :: j !<
	301	INTEGER(iwp) :: k !<
	302	INTEGER(iwp) :: l !<
	303	INTEGER(iwp) :: xs !<
[1]	304
[1682]	305	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	306	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	307
[1682]	308	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3634]	309	!$ACC DECLARE CREATE(work)
[1111]	310
[1320]	311
[1]	312	!
	313	!-- If the PE grid is one-dimensional along y, the array has only to be
	314	!-- reordered locally and therefore no transposition has to be done.
	315	IF ( pdims(1) /= 1 ) THEN
[1106]	316
	317	#if defined( __parallel )
[1]	318	!
	319	!-- Reorder input array for transposition
[1111]	320	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	321	!$OMP DO
[1]	322	DO l = 0, pdims(1) - 1
	323	xs = 0 + l * nnx
[3634]	324	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	325	!$ACC PRESENT(work, f_in)
[1003]	326	DO k = nzb_x, nzt_x
[164]	327	DO i = xs, xs + nnx - 1
[1003]	328	DO j = nys_x, nyn_x
[1111]	329	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	330	ENDDO
	331	ENDDO
	332	ENDDO
	333	ENDDO
[683]	334	!$OMP END PARALLEL
[1]	335
	336	!
	337	!-- Transpose array
[1318]	338	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3634]	339	!$ACC UPDATE HOST(work)
[622]	340	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	341	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	342	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	343	comm1dx, ierr )
[3634]	344	!$ACC UPDATE DEVICE(f_inv)
[1]	345	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	346	#endif
	347
[1]	348	ELSE
[1106]	349
[1]	350	!
	351	!-- Reorder the array in a way that the z index is in first position
[683]	352	!$OMP PARALLEL PRIVATE ( i, j, k )
	353	!$OMP DO
[3634]	354	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	355	!$ACC PRESENT(f_inv, f_in)
[1003]	356	DO i = nxl, nxr
	357	DO j = nys, nyn
	358	DO k = 1, nz
[164]	359	f_inv(j,i,k) = f_in(i,j,k)
[1]	360	ENDDO
	361	ENDDO
	362	ENDDO
[683]	363	!$OMP END PARALLEL
[1]	364
[164]	365	ENDIF
	366
[1]	367	END SUBROUTINE transpose_xz
	368
	369
	370	!------------------------------------------------------------------------------!
	371	! Description:
	372	! ------------
[1682]	373	!> Resorting data after the transposition from y to x. The transposition itself
	374	!> is carried out in transpose_yx
[1216]	375	!------------------------------------------------------------------------------!
[1682]	376	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	377
[1682]	378
[1320]	379	USE indices, &
	380	ONLY: nx
[1216]	381
[1320]	382	USE kinds
	383
	384	USE transpose_indices, &
	385	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	386
[1216]	387	IMPLICIT NONE
	388
[1682]	389	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	390	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	391
	392
[1682]	393	INTEGER(iwp) :: i !<
	394	INTEGER(iwp) :: j !<
	395	INTEGER(iwp) :: k !<
[1216]	396	!
	397	!-- Rearrange indices of input array in order to make data to be send
	398	!-- by MPI contiguous
	399	!$OMP PARALLEL PRIVATE ( i, j, k )
	400	!$OMP DO
[3634]	401	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	402	!$ACC PRESENT(f_out, f_inv)
[1216]	403	DO i = 0, nx
	404	DO k = nzb_x, nzt_x
	405	DO j = nys_x, nyn_x
	406	f_out(i,j,k) = f_inv(j,k,i)
	407	ENDDO
	408	ENDDO
	409	ENDDO
	410	!$OMP END PARALLEL
	411
	412	END SUBROUTINE resort_for_yx
	413
	414
	415	!------------------------------------------------------------------------------!
	416	! Description:
	417	! ------------
[1682]	418	!> Transposition of input array (f_in) from y to x. For the input array, all
	419	!> elements along y reside on the same PE, while after transposition, all
	420	!> elements along x reside on the same PE.
[1]	421	!------------------------------------------------------------------------------!
[1682]	422	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	423
[1682]	424
[1320]	425	USE cpulog, &
	426	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	427
[1320]	428	USE indices, &
	429	ONLY: nx, ny
	430
	431	USE kinds
	432
[1324]	433	USE pegrid
[1320]	434
	435	USE transpose_indices, &
	436	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	437
[1]	438	IMPLICIT NONE
	439
[1682]	440	INTEGER(iwp) :: i !<
	441	INTEGER(iwp) :: j !<
	442	INTEGER(iwp) :: k !<
	443	INTEGER(iwp) :: l !<
	444	INTEGER(iwp) :: ys !<
[1]	445
[1682]	446	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	447	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	448
[1682]	449	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[3634]	450	!$ACC DECLARE CREATE(work)
[1111]	451
[1320]	452
[1106]	453	IF ( numprocs /= 1 ) THEN
	454
[1]	455	#if defined( __parallel )
	456	!
[1106]	457	!-- Reorder input array for transposition
[1111]	458	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	459	!$OMP DO
[1106]	460	DO l = 0, pdims(2) - 1
	461	ys = 0 + l * ( nyn_x - nys_x + 1 )
[3634]	462	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	463	!$ACC PRESENT(work, f_in)
[1106]	464	DO i = nxl_y, nxr_y
	465	DO k = nzb_y, nzt_y
	466	DO j = ys, ys + nyn_x - nys_x
[1111]	467	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	468	ENDDO
	469	ENDDO
	470	ENDDO
	471	ENDDO
	472	!$OMP END PARALLEL
	473
	474	!
	475	!-- Transpose array
[1318]	476	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3634]	477	!$ACC UPDATE HOST(work)
[1106]	478	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	479	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	480	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	481	comm1dy, ierr )
[3634]	482	!$ACC UPDATE DEVICE(f_inv)
[1106]	483	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	484	#endif
	485
	486	ELSE
	487
	488	!
	489	!-- Reorder array f_in the same way as ALLTOALL did it
	490	!$OMP PARALLEL PRIVATE ( i, j, k )
	491	!$OMP DO
[3634]	492	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	493	!$ACC PRESENT(f_inv, f_in)
[1003]	494	DO i = nxl_y, nxr_y
	495	DO k = nzb_y, nzt_y
[1106]	496	DO j = 0, ny
	497	f_inv(j,k,i) = f_in(j,i,k)
[1]	498	ENDDO
	499	ENDDO
	500	ENDDO
[683]	501	!$OMP END PARALLEL
[1]	502
[1106]	503	ENDIF
[1]	504
	505	END SUBROUTINE transpose_yx
	506
	507
	508	!------------------------------------------------------------------------------!
	509	! Description:
	510	! ------------
[1682]	511	!> Transposition of input array (f_in) from y to x. For the input array, all
	512	!> elements along y reside on the same PE, while after transposition, all
	513	!> elements along x reside on the same PE.
	514	!> This is a direct transposition for arrays with indices in regular order
	515	!> (k,j,i) (cf. transpose_yx).
[1]	516	!------------------------------------------------------------------------------!
[1682]	517	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	518
[1682]	519
[1320]	520	USE cpulog, &
[3241]	521	ONLY: cpu_log, log_point_s
[1]	522
[1320]	523	USE indices, &
	524	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	525
	526	USE kinds
	527
[1324]	528	USE pegrid
[1320]	529
	530	USE transpose_indices, &
	531	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	532
[1]	533	IMPLICIT NONE
	534
[1682]	535	INTEGER(iwp) :: i !<
	536	INTEGER(iwp) :: j !<
	537	INTEGER(iwp) :: k !<
	538	INTEGER(iwp) :: l !<
	539	INTEGER(iwp) :: m !<
	540	INTEGER(iwp) :: xs !<
[1]	541
[1682]	542	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	543	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	544	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	545	REAL(wp) :: work(nnxnnynnz) !<
[1]	546	#if defined( __parallel )
	547
	548	!
	549	!-- Rearrange indices of input array in order to make data to be send
	550	!-- by MPI contiguous
[1003]	551	DO k = 1, nz
	552	DO j = nys, nyn
	553	DO i = nxl, nxr
[164]	554	f_inv(i,k,j) = f_in(k,j,i)
[1]	555	ENDDO
	556	ENDDO
	557	ENDDO
	558
	559	!
	560	!-- Transpose array
	561	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	562	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	563	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	564	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	565	comm1dx, ierr )
	566	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	567
	568	!
	569	!-- Reorder transposed array
	570	m = 0
	571	DO l = 0, pdims(1) - 1
	572	xs = 0 + l * nnx
[1003]	573	DO j = nys_x, nyn_x
	574	DO k = 1, nz
[1]	575	DO i = xs, xs + nnx - 1
	576	m = m + 1
[164]	577	f_out(i,j,k) = work(m)
[1]	578	ENDDO
	579	ENDDO
	580	ENDDO
	581	ENDDO
	582
	583	#endif
	584
	585	END SUBROUTINE transpose_yxd
	586
	587
	588	!------------------------------------------------------------------------------!
	589	! Description:
	590	! ------------
[1682]	591	!> Resorting data for the transposition from y to z. The transposition itself
	592	!> is carried out in transpose_yz
[1216]	593	!------------------------------------------------------------------------------!
[1682]	594	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	595
[1682]	596
[1320]	597	USE indices, &
	598	ONLY: ny
[1216]	599
[1320]	600	USE kinds
	601
	602	USE transpose_indices, &
	603	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	604
[1216]	605	IMPLICIT NONE
	606
[1682]	607	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	608	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	609
[1682]	610	INTEGER(iwp) :: i !<
	611	INTEGER(iwp) :: j !<
	612	INTEGER(iwp) :: k !<
[1216]	613
	614	!
	615	!-- Rearrange indices of input array in order to make data to be send
	616	!-- by MPI contiguous
	617	!$OMP PARALLEL PRIVATE ( i, j, k )
	618	!$OMP DO
[3634]	619	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	620	!$ACC PRESENT(f_inv, f_in)
[1216]	621	DO j = 0, ny
	622	DO k = nzb_y, nzt_y
	623	DO i = nxl_y, nxr_y
	624	f_inv(i,k,j) = f_in(j,i,k)
	625	ENDDO
	626	ENDDO
	627	ENDDO
	628	!$OMP END PARALLEL
	629
	630	END SUBROUTINE resort_for_yz
	631
	632
	633	!------------------------------------------------------------------------------!
	634	! Description:
	635	! ------------
[1682]	636	!> Transposition of input array (f_in) from y to z. For the input array, all
	637	!> elements along y reside on the same PE, while after transposition, all
	638	!> elements along z reside on the same PE.
[1]	639	!------------------------------------------------------------------------------!
[1682]	640	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	641
[1682]	642
[1320]	643	USE cpulog, &
	644	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	645
[1320]	646	USE indices, &
	647	ONLY: ny, nz
	648
	649	USE kinds
	650
[1324]	651	USE pegrid
[1320]	652
	653	USE transpose_indices, &
	654	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	655
[1]	656	IMPLICIT NONE
	657
[1682]	658	INTEGER(iwp) :: i !<
	659	INTEGER(iwp) :: j !<
	660	INTEGER(iwp) :: k !<
	661	INTEGER(iwp) :: l !<
	662	INTEGER(iwp) :: zs !<
[1]	663
[1682]	664	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	665	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	666
[1682]	667	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3634]	668	!$ACC DECLARE CREATE(work)
[1111]	669
[1320]	670
[1]	671	!
	672	!-- If the PE grid is one-dimensional along y, only local reordering
	673	!-- of the data is necessary and no transposition has to be done.
	674	IF ( pdims(1) == 1 ) THEN
[1106]	675
[683]	676	!$OMP PARALLEL PRIVATE ( i, j, k )
	677	!$OMP DO
[3634]	678	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	679	!$ACC PRESENT(f_out, f_inv)
[1003]	680	DO j = 0, ny
	681	DO k = nzb_y, nzt_y
	682	DO i = nxl_y, nxr_y
[164]	683	f_out(i,j,k) = f_inv(i,k,j)
[1]	684	ENDDO
	685	ENDDO
	686	ENDDO
[683]	687	!$OMP END PARALLEL
[1]	688
[1106]	689	ELSE
	690
	691	#if defined( __parallel )
[1]	692	!
[1106]	693	!-- Transpose array
[1318]	694	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3634]	695	!$ACC UPDATE HOST(f_inv)
[1106]	696	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	697	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	698	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	699	comm1dx, ierr )
[3634]	700	!$ACC UPDATE DEVICE(work)
[1106]	701	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	702
	703	!
[1106]	704	!-- Reorder transposed array
[1111]	705	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	706	!$OMP DO
[1106]	707	DO l = 0, pdims(1) - 1
	708	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3634]	709	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	710	!$ACC PRESENT(f_out, work)
[1106]	711	DO j = nys_z, nyn_z
	712	DO k = zs, zs + nzt_y - nzb_y
	713	DO i = nxl_z, nxr_z
[1111]	714	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	715	ENDDO
[1]	716	ENDDO
	717	ENDDO
	718	ENDDO
[683]	719	!$OMP END PARALLEL
[1]	720	#endif
	721
[1106]	722	ENDIF
	723
[1]	724	END SUBROUTINE transpose_yz
	725
	726
	727	!------------------------------------------------------------------------------!
	728	! Description:
	729	! ------------
[1682]	730	!> Resorting data for the transposition from z to x. The transposition itself
	731	!> is carried out in transpose_zx
[1216]	732	!------------------------------------------------------------------------------!
[1682]	733	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	734
[1682]	735
[1320]	736	USE indices, &
	737	ONLY: nxl, nxr, nyn, nys, nz
[1216]	738
[1320]	739	USE kinds
	740
[1216]	741	IMPLICIT NONE
	742
[1682]	743	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	744	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	745
[1682]	746	INTEGER(iwp) :: i !<
	747	INTEGER(iwp) :: j !<
	748	INTEGER(iwp) :: k !<
[1216]	749
	750	!
	751	!-- Rearrange indices of input array in order to make data to be send
	752	!-- by MPI contiguous
	753	!$OMP PARALLEL PRIVATE ( i, j, k )
	754	!$OMP DO
[3634]	755	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	756	!$ACC PRESENT(f_in, f_inv)
[1216]	757	DO k = 1,nz
	758	DO i = nxl, nxr
	759	DO j = nys, nyn
	760	f_inv(j,i,k) = f_in(k,j,i)
	761	ENDDO
	762	ENDDO
	763	ENDDO
	764	!$OMP END PARALLEL
	765
	766	END SUBROUTINE resort_for_zx
	767
	768
	769	!------------------------------------------------------------------------------!
	770	! Description:
	771	! ------------
[1682]	772	!> Transposition of input array (f_in) from z to x. For the input array, all
	773	!> elements along z reside on the same PE, while after transposition, all
	774	!> elements along x reside on the same PE.
[1]	775	!------------------------------------------------------------------------------!
[1682]	776	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	777
[1682]	778
[1320]	779	USE cpulog, &
	780	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	781
[1320]	782	USE indices, &
	783	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	784
	785	USE kinds
	786
[1324]	787	USE pegrid
[1320]	788
	789	USE transpose_indices, &
	790	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	791
[1]	792	IMPLICIT NONE
	793
[1682]	794	INTEGER(iwp) :: i !<
	795	INTEGER(iwp) :: j !<
	796	INTEGER(iwp) :: k !<
	797	INTEGER(iwp) :: l !<
	798	INTEGER(iwp) :: xs !<
[1]	799
[1682]	800	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	801	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	802
[1682]	803	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[3634]	804	!$ACC DECLARE CREATE(work)
[1]	805
[1320]	806
[1]	807	!
	808	!-- If the PE grid is one-dimensional along y, only local reordering
	809	!-- of the data is necessary and no transposition has to be done.
	810	IF ( pdims(1) == 1 ) THEN
[1106]	811
[683]	812	!$OMP PARALLEL PRIVATE ( i, j, k )
	813	!$OMP DO
[3634]	814	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	815	!$ACC PRESENT(f_out, f_inv)
[1003]	816	DO k = 1, nz
	817	DO i = nxl, nxr
	818	DO j = nys, nyn
[164]	819	f_out(i,j,k) = f_inv(j,i,k)
[1]	820	ENDDO
	821	ENDDO
	822	ENDDO
[683]	823	!$OMP END PARALLEL
[1]	824
[1106]	825	ELSE
	826
	827	#if defined( __parallel )
[1]	828	!
[1106]	829	!-- Transpose array
[1318]	830	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3634]	831	!$ACC UPDATE HOST(f_inv)
[1106]	832	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	833	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	834	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	835	comm1dx, ierr )
[3634]	836	!$ACC UPDATE DEVICE(work)
[1106]	837	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	838
	839	!
[1106]	840	!-- Reorder transposed array
[1111]	841	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	842	!$OMP DO
[1106]	843	DO l = 0, pdims(1) - 1
	844	xs = 0 + l * nnx
[3634]	845	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	846	!$ACC PRESENT(f_out, work)
[1106]	847	DO k = nzb_x, nzt_x
	848	DO i = xs, xs + nnx - 1
	849	DO j = nys_x, nyn_x
[1111]	850	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	851	ENDDO
[1]	852	ENDDO
	853	ENDDO
	854	ENDDO
[683]	855	!$OMP END PARALLEL
[1]	856	#endif
	857
[1106]	858	ENDIF
	859
[1]	860	END SUBROUTINE transpose_zx
	861
	862
	863	!------------------------------------------------------------------------------!
	864	! Description:
	865	! ------------
[1682]	866	!> Resorting data after the transposition from z to y. The transposition itself
	867	!> is carried out in transpose_zy
[1216]	868	!------------------------------------------------------------------------------!
[1682]	869	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	870
[1682]	871
[1320]	872	USE indices, &
	873	ONLY: ny
[1216]	874
[1320]	875	USE kinds
	876
	877	USE transpose_indices, &
	878	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	879
[1216]	880	IMPLICIT NONE
	881
[1682]	882	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	883	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	884
	885
[1682]	886	INTEGER(iwp) :: i !<
	887	INTEGER(iwp) :: j !<
	888	INTEGER(iwp) :: k !<
[1216]	889
	890	!
	891	!-- Rearrange indices of input array in order to make data to be send
	892	!-- by MPI contiguous
	893	!$OMP PARALLEL PRIVATE ( i, j, k )
	894	!$OMP DO
[3634]	895	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	896	!$ACC PRESENT(f_out, f_inv)
[1216]	897	DO k = nzb_y, nzt_y
	898	DO j = 0, ny
	899	DO i = nxl_y, nxr_y
	900	f_out(j,i,k) = f_inv(i,k,j)
	901	ENDDO
	902	ENDDO
	903	ENDDO
	904	!$OMP END PARALLEL
	905
	906	END SUBROUTINE resort_for_zy
	907
	908
	909	!------------------------------------------------------------------------------!
[3241]	910	! Description:cpu_log_nowait
[1216]	911	! ------------
[1682]	912	!> Transposition of input array (f_in) from z to y. For the input array, all
	913	!> elements along z reside on the same PE, while after transposition, all
	914	!> elements along y reside on the same PE.
[1]	915	!------------------------------------------------------------------------------!
[1682]	916	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	917
[1682]	918
[1320]	919	USE cpulog, &
	920	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	921
[1320]	922	USE indices, &
	923	ONLY: ny, nz
	924
	925	USE kinds
	926
[1324]	927	USE pegrid
[1320]	928
	929	USE transpose_indices, &
	930	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	931
[1]	932	IMPLICIT NONE
	933
[1682]	934	INTEGER(iwp) :: i !<
	935	INTEGER(iwp) :: j !<
	936	INTEGER(iwp) :: k !<
	937	INTEGER(iwp) :: l !<
	938	INTEGER(iwp) :: zs !<
[1]	939
[1682]	940	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	941	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	942
[1682]	943	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[3634]	944	!$ACC DECLARE CREATE(work)
[1111]	945
[1]	946	!
	947	!-- If the PE grid is one-dimensional along y, the array has only to be
	948	!-- reordered locally and therefore no transposition has to be done.
	949	IF ( pdims(1) /= 1 ) THEN
[1106]	950
	951	#if defined( __parallel )
[1]	952	!
	953	!-- Reorder input array for transposition
[1111]	954	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	955	!$OMP DO
[1]	956	DO l = 0, pdims(1) - 1
[1003]	957	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[3634]	958	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	959	!$ACC PRESENT(work, f_in)
[1003]	960	DO j = nys_z, nyn_z
	961	DO k = zs, zs + nzt_y - nzb_y
	962	DO i = nxl_z, nxr_z
[1111]	963	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	964	ENDDO
	965	ENDDO
	966	ENDDO
	967	ENDDO
[683]	968	!$OMP END PARALLEL
[1]	969
	970	!
	971	!-- Transpose array
[1318]	972	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[3634]	973	!$ACC UPDATE HOST(work)
[622]	974	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	975	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	976	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	977	comm1dx, ierr )
[3634]	978	!$ACC UPDATE DEVICE(f_inv)
[1]	979	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	980	#endif
[1]	981
	982	ELSE
	983	!
[1106]	984	!-- Reorder the array in the same way like ALLTOALL did it
[683]	985	!$OMP PARALLEL PRIVATE ( i, j, k )
	986	!$OMP DO
[3634]	987	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
	988	!$ACC PRESENT(f_inv, f_in)
[1003]	989	DO k = nzb_y, nzt_y
	990	DO j = 0, ny
	991	DO i = nxl_y, nxr_y
[164]	992	f_inv(i,k,j) = f_in(i,j,k)
	993	ENDDO
	994	ENDDO
	995	ENDDO
[683]	996	!$OMP END PARALLEL
[1106]	997
	998	ENDIF
	999
[1]	1000	END SUBROUTINE transpose_zy
	1001
	1002
	1003	!------------------------------------------------------------------------------!
	1004	! Description:
	1005	! ------------
[1682]	1006	!> Transposition of input array (f_in) from z to y. For the input array, all
	1007	!> elements along z reside on the same PE, while after transposition, all
	1008	!> elements along y reside on the same PE.
	1009	!> This is a direct transposition for arrays with indices in regular order
	1010	!> (k,j,i) (cf. transpose_zy).
[1]	1011	!------------------------------------------------------------------------------!
[1682]	1012	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	1013
[1682]	1014
[1320]	1015	USE cpulog, &
[3241]	1016	ONLY: cpu_log, log_point_s
[1]	1017
[1320]	1018	USE indices, &
	1019	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1020
	1021	USE kinds
	1022
[1324]	1023	USE pegrid
[1320]	1024
	1025	USE transpose_indices, &
[3241]	1026	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
[1320]	1027
[1]	1028	IMPLICIT NONE
	1029
[1682]	1030	INTEGER(iwp) :: i !<
	1031	INTEGER(iwp) :: j !<
	1032	INTEGER(iwp) :: k !<
	1033	INTEGER(iwp) :: l !<
	1034	INTEGER(iwp) :: m !<
	1035	INTEGER(iwp) :: ys !<
[1]	1036
[1682]	1037	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1038	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1039	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1040	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1041
[1]	1042	#if defined( __parallel )
	1043
	1044	!
	1045	!-- Rearrange indices of input array in order to make data to be send
	1046	!-- by MPI contiguous
[1003]	1047	DO i = nxl, nxr
	1048	DO j = nys, nyn
	1049	DO k = 1, nz
[164]	1050	f_inv(j,i,k) = f_in(k,j,i)
[1]	1051	ENDDO
	1052	ENDDO
	1053	ENDDO
	1054
	1055	!
	1056	!-- Move data to different array, because memory location of work1 is
	1057	!-- needed further below (work1 = work2).
	1058	!-- If the PE grid is one-dimensional along x, only local reordering
	1059	!-- of the data is necessary and no transposition has to be done.
	1060	IF ( pdims(2) == 1 ) THEN
[1003]	1061	DO k = 1, nz
	1062	DO i = nxl, nxr
	1063	DO j = nys, nyn
[164]	1064	f_out(j,i,k) = f_inv(j,i,k)
[1]	1065	ENDDO
	1066	ENDDO
	1067	ENDDO
	1068	RETURN
	1069	ENDIF
	1070
	1071	!
	1072	!-- Transpose array
	1073	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1074	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1075	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1076	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1077	comm1dy, ierr )
	1078	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1079
	1080	!
	1081	!-- Reorder transposed array
	1082	m = 0
	1083	DO l = 0, pdims(2) - 1
	1084	ys = 0 + l * nny
[1003]	1085	DO k = nzb_yd, nzt_yd
	1086	DO i = nxl_yd, nxr_yd
[1]	1087	DO j = ys, ys + nny - 1
	1088	m = m + 1
[164]	1089	f_out(j,i,k) = work(m)
[1]	1090	ENDDO
	1091	ENDDO
	1092	ENDDO
	1093	ENDDO
	1094
	1095	#endif
	1096
	1097	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |