Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1320

Last change on this file since 1320 was 1320, checked in by raasch, 10 years ago
ONLY-attribute added to USE-statements, kind-parameters added to all INTEGER and REAL declaration statements, kinds are defined in new module kinds, old module precision_kind is removed, revision history before 2012 removed, comment fields (!:) to be used for variable explanations added to all variable declaration statements
Property svn:keywords set to `Id`
File size: 32.2 KB

Rev	Line
[1216]	1	SUBROUTINE resort_for_xy( f_in, f_inv )
[1]	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[1310]	17	! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1320]	22	! ONLY-attribute added to USE-statements,
	23	! kind-parameters added to all INTEGER and REAL declaration statements,
	24	! kinds are defined in new module kinds,
	25	! old module precision_kind is removed,
	26	! revision history before 2012 removed,
	27	! comment fields (!:) to be used for variable explanations added to
	28	! all variable declaration statements
[198]	29	!
	30	! Former revisions:
	31	! -----------------
	32	! $Id: transpose.f90 1320 2014-03-20 08:40:49Z raasch $
	33	!
[1319]	34	! 1318 2014-03-17 13:35:16Z raasch
	35	! cpu_log_nowait parameter added to cpu measurements of the transpositions
	36	! required for solving the Poisson equation (poisfft),
	37	! module interfaces removed
	38	!
[1258]	39	! 1257 2013-11-08 15:18:40Z raasch
	40	! openacc loop and loop vector clauses removed
	41	!
[1217]	42	! 1216 2013-08-26 09:31:42Z raasch
	43	! re-sorting of the transposed / to be transposed arrays moved to separate
	44	! routines resort_for_...
	45	!
[1112]	46	! 1111 2013-03-08 23:54:10Z raasch
	47	! openACC directives added,
	48	! resorting data from/to work changed, work got 4 dimensions instead of 1
	49	!
[1107]	50	! 1106 2013-03-04 05:31:38Z raasch
	51	! preprocessor lines rearranged so that routines can also be used in serial
	52	! (non-parallel) mode
	53	!
[1093]	54	! 1092 2013-02-02 11:24:22Z raasch
	55	! unused variables removed
	56	!
[1037]	57	! 1036 2012-10-22 13:43:42Z raasch
	58	! code put under GPL (PALM 3.9)
	59	!
[1004]	60	! 1003 2012-09-14 14:35:53Z raasch
	61	! indices nxa, nya, etc. replaced by nx, ny, etc.
	62	!
[1]	63	! Revision 1.1 1997/07/24 11:25:18 raasch
	64	! Initial revision
	65	!
[1216]	66	!------------------------------------------------------------------------------!
	67	! Description:
	68	! ------------
	69	! Resorting data for the transposition from x to y. The transposition itself
	70	! is carried out in transpose_xy
	71	!------------------------------------------------------------------------------!
	72
[1320]	73	USE indices, &
	74	ONLY: nx
[1216]	75
[1320]	76	USE kinds
	77
	78	USE transpose_indices, &
	79	ONLY: nxl_z, nxr_z, nyn_x, nyn_z, nys_x, nys_z, nzb_x, nzt_x
	80
[1216]	81	IMPLICIT NONE
	82
[1320]	83	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
	84	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
[1216]	85
	86
[1320]	87	INTEGER(iwp) :: i !:
	88	INTEGER(iwp) :: j !:
	89	INTEGER(iwp) :: k !:
[1]	90	!
[1216]	91	!-- Rearrange indices of input array in order to make data to be send
	92	!-- by MPI contiguous
	93	!$OMP PARALLEL PRIVATE ( i, j, k )
	94	!$OMP DO
	95	!$acc kernels present( f_in, f_inv )
	96	DO i = 0, nx
	97	DO k = nzb_x, nzt_x
	98	DO j = nys_x, nyn_x
	99	f_inv(j,k,i) = f_in(i,j,k)
	100	ENDDO
	101	ENDDO
	102	ENDDO
	103	!$acc end kernels
	104	!$OMP END PARALLEL
	105
	106	END SUBROUTINE resort_for_xy
	107
	108
	109	SUBROUTINE transpose_xy( f_inv, f_out )
	110
	111	!------------------------------------------------------------------------------!
[1]	112	! Description:
	113	! ------------
	114	! Transposition of input array (f_in) from x to y. For the input array, all
	115	! elements along x reside on the same PE, while after transposition, all
	116	! elements along y reside on the same PE.
	117	!------------------------------------------------------------------------------!
	118
[1320]	119	USE cpulog, &
	120	ONLY: cpu_log, cpu_log_nowait, log_point_s
	121
	122	USE indices, &
	123	ONLY: nx, ny
	124
	125	USE kinds
	126
[1]	127	USE pegrid
	128
[1320]	129	USE transpose_indices, &
	130	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	131
[1]	132	IMPLICIT NONE
	133
[1320]	134	INTEGER(iwp) :: i !:
	135	INTEGER(iwp) :: j !:
	136	INTEGER(iwp) :: k !:
	137	INTEGER(iwp) :: l !:
	138	INTEGER(iwp) :: ys !:
	139
	140	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
	141	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
[1]	142
[1320]	143	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !:
[1111]	144
	145
[1106]	146	IF ( numprocs /= 1 ) THEN
	147
	148	#if defined( __parallel )
[1]	149	!
[1106]	150	!-- Transpose array
[1318]	151	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	152	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	153	!$acc update host( f_inv )
	154	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	155	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	156	comm1dy, ierr )
	157	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	158
	159	!
[1106]	160	!-- Reorder transposed array
[1111]	161	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	162	!$OMP DO
[1216]	163	!$acc data copyin( work )
[1106]	164	DO l = 0, pdims(2) - 1
	165	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	166	!$acc kernels present( f_out, work )
[1106]	167	DO i = nxl_y, nxr_y
	168	DO k = nzb_y, nzt_y
	169	DO j = ys, ys + nyn_x - nys_x
[1111]	170	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	171	ENDDO
[1]	172	ENDDO
	173	ENDDO
[1111]	174	!$acc end kernels
[1]	175	ENDDO
[1216]	176	!$acc end data
[683]	177	!$OMP END PARALLEL
[1]	178	#endif
	179
[1106]	180	ELSE
	181
	182	!
	183	!-- Reorder transposed array
	184	!$OMP PARALLEL PRIVATE ( i, j, k )
	185	!$OMP DO
[1216]	186	!$acc kernels present( f_inv, f_out )
[1106]	187	DO k = nzb_y, nzt_y
	188	DO i = nxl_y, nxr_y
	189	DO j = 0, ny
	190	f_out(j,i,k) = f_inv(j,k,i)
	191	ENDDO
	192	ENDDO
	193	ENDDO
[1111]	194	!$acc end kernels
[1106]	195	!$OMP END PARALLEL
	196
	197	ENDIF
	198
[1]	199	END SUBROUTINE transpose_xy
	200
	201
[1216]	202	SUBROUTINE resort_for_xz( f_inv, f_out )
[1]	203
	204	!------------------------------------------------------------------------------!
	205	! Description:
	206	! ------------
[1216]	207	! Resorting data after the transposition from x to z. The transposition itself
	208	! is carried out in transpose_xz
	209	!------------------------------------------------------------------------------!
	210
[1320]	211	USE indices, &
	212	ONLY: nxl, nxr, nyn, nys, nz
[1216]	213
[1320]	214	USE kinds
	215
[1216]	216	IMPLICIT NONE
	217
[1320]	218	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
	219	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !:
[1216]	220
[1320]	221	INTEGER(iwp) :: i !:
	222	INTEGER(iwp) :: j !:
	223	INTEGER(iwp) :: k !:
[1216]	224	!
	225	!-- Rearrange indices of input array in order to make data to be send
	226	!-- by MPI contiguous.
	227	!-- In case of parallel fft/transposition, scattered store is faster in
	228	!-- backward direction!!!
	229	!$OMP PARALLEL PRIVATE ( i, j, k )
	230	!$OMP DO
	231	!$acc kernels present( f_inv, f_out )
	232	DO k = 1, nz
	233	DO i = nxl, nxr
	234	DO j = nys, nyn
	235	f_out(k,j,i) = f_inv(j,i,k)
	236	ENDDO
	237	ENDDO
	238	ENDDO
	239	!$acc end kernels
	240	!$OMP END PARALLEL
	241
	242	END SUBROUTINE resort_for_xz
	243
	244
	245	SUBROUTINE transpose_xz( f_in, f_inv )
	246
	247	!------------------------------------------------------------------------------!
	248	! Description:
	249	! ------------
[1]	250	! Transposition of input array (f_in) from x to z. For the input array, all
	251	! elements along x reside on the same PE, while after transposition, all
	252	! elements along z reside on the same PE.
	253	!------------------------------------------------------------------------------!
	254
[1320]	255	USE cpulog, &
	256	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	257
[1320]	258	USE indices, &
	259	ONLY: nnx, nx, nxl, nxr, ny, nyn, nys, nz
	260
	261	USE kinds
	262
	263	USE pegrid, &
	264	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	265	pdims, sendrecvcount_zx
	266
	267	USE transpose_indices, &
	268	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	269
[1]	270	IMPLICIT NONE
	271
[1320]	272	INTEGER(iwp) :: i !:
	273	INTEGER(iwp) :: j !:
	274	INTEGER(iwp) :: k !:
	275	INTEGER(iwp) :: l !:
	276	INTEGER(iwp) :: xs !:
[1]	277
[1320]	278	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
	279	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
[1]	280
[1320]	281	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !:
[1111]	282
[1320]	283
[1]	284	!
	285	!-- If the PE grid is one-dimensional along y, the array has only to be
	286	!-- reordered locally and therefore no transposition has to be done.
	287	IF ( pdims(1) /= 1 ) THEN
[1106]	288
	289	#if defined( __parallel )
[1]	290	!
	291	!-- Reorder input array for transposition
[1111]	292	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	293	!$OMP DO
[1216]	294	!$acc data copyout( work )
[1]	295	DO l = 0, pdims(1) - 1
	296	xs = 0 + l * nnx
[1111]	297	!$acc kernels present( f_in, work )
[1003]	298	DO k = nzb_x, nzt_x
[164]	299	DO i = xs, xs + nnx - 1
[1003]	300	DO j = nys_x, nyn_x
[1111]	301	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	302	ENDDO
	303	ENDDO
	304	ENDDO
[1111]	305	!$acc end kernels
[1]	306	ENDDO
[1216]	307	!$acc end data
[683]	308	!$OMP END PARALLEL
[1]	309
	310	!
	311	!-- Transpose array
[1318]	312	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]	313	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	314	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	315	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	316	comm1dx, ierr )
[1111]	317	!$acc update device( f_inv )
[1]	318	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	319	#endif
	320
[1]	321	ELSE
[1106]	322
[1]	323	!
	324	!-- Reorder the array in a way that the z index is in first position
[683]	325	!$OMP PARALLEL PRIVATE ( i, j, k )
	326	!$OMP DO
[1216]	327	!$acc kernels present( f_in, f_inv )
[1003]	328	DO i = nxl, nxr
	329	DO j = nys, nyn
	330	DO k = 1, nz
[164]	331	f_inv(j,i,k) = f_in(i,j,k)
[1]	332	ENDDO
	333	ENDDO
	334	ENDDO
[1111]	335	!$acc end kernels
[683]	336	!$OMP END PARALLEL
[1]	337
[164]	338	ENDIF
	339
[1]	340	END SUBROUTINE transpose_xz
	341
	342
[1216]	343	SUBROUTINE resort_for_yx( f_inv, f_out )
[1]	344
	345	!------------------------------------------------------------------------------!
	346	! Description:
	347	! ------------
[1216]	348	! Resorting data after the transposition from y to x. The transposition itself
	349	! is carried out in transpose_yx
	350	!------------------------------------------------------------------------------!
	351
[1320]	352	USE indices, &
	353	ONLY: nx
[1216]	354
[1320]	355	USE kinds
	356
	357	USE transpose_indices, &
	358	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	359
[1216]	360	IMPLICIT NONE
	361
[1320]	362	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
	363	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
[1216]	364
	365
[1320]	366	INTEGER(iwp) :: i !:
	367	INTEGER(iwp) :: j !:
	368	INTEGER(iwp) :: k !:
[1216]	369	!
	370	!-- Rearrange indices of input array in order to make data to be send
	371	!-- by MPI contiguous
	372	!$OMP PARALLEL PRIVATE ( i, j, k )
	373	!$OMP DO
	374	!$acc kernels present( f_inv, f_out )
	375	DO i = 0, nx
	376	DO k = nzb_x, nzt_x
	377	DO j = nys_x, nyn_x
	378	f_out(i,j,k) = f_inv(j,k,i)
	379	ENDDO
	380	ENDDO
	381	ENDDO
	382	!$acc end kernels
	383	!$OMP END PARALLEL
	384
	385	END SUBROUTINE resort_for_yx
	386
	387
	388	SUBROUTINE transpose_yx( f_in, f_inv )
	389
	390	!------------------------------------------------------------------------------!
	391	! Description:
	392	! ------------
[1]	393	! Transposition of input array (f_in) from y to x. For the input array, all
	394	! elements along y reside on the same PE, while after transposition, all
	395	! elements along x reside on the same PE.
	396	!------------------------------------------------------------------------------!
	397
[1320]	398	USE cpulog, &
	399	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	400
[1320]	401	USE indices, &
	402	ONLY: nx, ny
	403
	404	USE kinds
	405
	406	USE pegrid, &
	407	ONLY: collective_wait, comm1dy, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	408	numprocs, pdims, sendrecvcount_xy
	409
	410	USE transpose_indices, &
	411	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	412
[1]	413	IMPLICIT NONE
	414
[1320]	415	INTEGER(iwp) :: i !:
	416	INTEGER(iwp) :: j !:
	417	INTEGER(iwp) :: k !:
	418	INTEGER(iwp) :: l !:
	419	INTEGER(iwp) :: ys !:
[1]	420
[1320]	421	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
	422	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
[1111]	423
[1320]	424	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !:
[1111]	425
[1320]	426
[1106]	427	IF ( numprocs /= 1 ) THEN
	428
[1]	429	#if defined( __parallel )
	430	!
[1106]	431	!-- Reorder input array for transposition
[1111]	432	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	433	!$OMP DO
[1216]	434	!$acc data copyout( work )
[1106]	435	DO l = 0, pdims(2) - 1
	436	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	437	!$acc kernels present( f_in, work )
[1106]	438	DO i = nxl_y, nxr_y
	439	DO k = nzb_y, nzt_y
	440	DO j = ys, ys + nyn_x - nys_x
[1111]	441	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	442	ENDDO
	443	ENDDO
	444	ENDDO
[1111]	445	!$acc end kernels
[1106]	446	ENDDO
[1216]	447	!$acc end data
[1106]	448	!$OMP END PARALLEL
	449
	450	!
	451	!-- Transpose array
[1318]	452	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	453	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	454	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	455	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	456	comm1dy, ierr )
[1111]	457	!$acc update device( f_inv )
[1106]	458	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	459	#endif
	460
	461	ELSE
	462
	463	!
	464	!-- Reorder array f_in the same way as ALLTOALL did it
	465	!$OMP PARALLEL PRIVATE ( i, j, k )
	466	!$OMP DO
[1216]	467	!$acc kernels present( f_in, f_inv )
[1003]	468	DO i = nxl_y, nxr_y
	469	DO k = nzb_y, nzt_y
[1106]	470	DO j = 0, ny
	471	f_inv(j,k,i) = f_in(j,i,k)
[1]	472	ENDDO
	473	ENDDO
	474	ENDDO
[1111]	475	!$acc end kernels
[683]	476	!$OMP END PARALLEL
[1]	477
[1106]	478	ENDIF
[1]	479
	480	END SUBROUTINE transpose_yx
	481
	482
[1216]	483	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	484
	485	!------------------------------------------------------------------------------!
	486	! Description:
	487	! ------------
	488	! Transposition of input array (f_in) from y to x. For the input array, all
	489	! elements along y reside on the same PE, while after transposition, all
	490	! elements along x reside on the same PE.
	491	! This is a direct transposition for arrays with indices in regular order
	492	! (k,j,i) (cf. transpose_yx).
	493	!------------------------------------------------------------------------------!
	494
[1320]	495	USE cpulog, &
	496	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	497
[1320]	498	USE indices, &
	499	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	500
	501	USE kinds
	502
	503	USE pegrid, &
	504	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	505	pdims, sendrecvcount_xy
	506
	507	USE transpose_indices, &
	508	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	509
[1]	510	IMPLICIT NONE
	511
[1320]	512	INTEGER(iwp) :: i !:
	513	INTEGER(iwp) :: j !:
	514	INTEGER(iwp) :: k !:
	515	INTEGER(iwp) :: l !:
	516	INTEGER(iwp) :: m !:
	517	INTEGER(iwp) :: xs !:
[1]	518
[1320]	519	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !:
	520	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !:
	521	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
	522	REAL(wp) :: work(nnxnnynnz) !:
[1]	523	#if defined( __parallel )
	524
	525	!
	526	!-- Rearrange indices of input array in order to make data to be send
	527	!-- by MPI contiguous
[1003]	528	DO k = 1, nz
	529	DO j = nys, nyn
	530	DO i = nxl, nxr
[164]	531	f_inv(i,k,j) = f_in(k,j,i)
[1]	532	ENDDO
	533	ENDDO
	534	ENDDO
	535
	536	!
	537	!-- Transpose array
	538	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	539	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	540	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	541	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	542	comm1dx, ierr )
	543	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	544
	545	!
	546	!-- Reorder transposed array
	547	m = 0
	548	DO l = 0, pdims(1) - 1
	549	xs = 0 + l * nnx
[1003]	550	DO j = nys_x, nyn_x
	551	DO k = 1, nz
[1]	552	DO i = xs, xs + nnx - 1
	553	m = m + 1
[164]	554	f_out(i,j,k) = work(m)
[1]	555	ENDDO
	556	ENDDO
	557	ENDDO
	558	ENDDO
	559
	560	#endif
	561
	562	END SUBROUTINE transpose_yxd
	563
	564
[1216]	565	SUBROUTINE resort_for_yz( f_in, f_inv )
[1]	566
	567	!------------------------------------------------------------------------------!
	568	! Description:
	569	! ------------
[1216]	570	! Resorting data for the transposition from y to z. The transposition itself
	571	! is carried out in transpose_yz
	572	!------------------------------------------------------------------------------!
	573
[1320]	574	USE indices, &
	575	ONLY: ny
[1216]	576
[1320]	577	USE kinds
	578
	579	USE transpose_indices, &
	580	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	581
[1216]	582	IMPLICIT NONE
	583
[1320]	584	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
	585	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
[1216]	586
[1320]	587	INTEGER(iwp) :: i !:
	588	INTEGER(iwp) :: j !:
	589	INTEGER(iwp) :: k !:
[1216]	590
	591	!
	592	!-- Rearrange indices of input array in order to make data to be send
	593	!-- by MPI contiguous
	594	!$OMP PARALLEL PRIVATE ( i, j, k )
	595	!$OMP DO
	596	!$acc kernels present( f_in, f_inv )
	597	DO j = 0, ny
	598	DO k = nzb_y, nzt_y
	599	DO i = nxl_y, nxr_y
	600	f_inv(i,k,j) = f_in(j,i,k)
	601	ENDDO
	602	ENDDO
	603	ENDDO
	604	!$acc end kernels
	605	!$OMP END PARALLEL
	606
	607	END SUBROUTINE resort_for_yz
	608
	609
	610	SUBROUTINE transpose_yz( f_inv, f_out )
	611
	612	!------------------------------------------------------------------------------!
	613	! Description:
	614	! ------------
[1]	615	! Transposition of input array (f_in) from y to z. For the input array, all
	616	! elements along y reside on the same PE, while after transposition, all
	617	! elements along z reside on the same PE.
	618	!------------------------------------------------------------------------------!
	619
[1320]	620	USE cpulog, &
	621	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	622
[1320]	623	USE indices, &
	624	ONLY: ny, nz
	625
	626	USE kinds
	627
	628	USE pegrid, &
	629	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	630	pdims, sendrecvcount_yz
	631
	632	USE transpose_indices, &
	633	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	634
[1]	635	IMPLICIT NONE
	636
[1320]	637	INTEGER(iwp) :: i !:
	638	INTEGER(iwp) :: j !:
	639	INTEGER(iwp) :: k !:
	640	INTEGER(iwp) :: l !:
	641	INTEGER(iwp) :: zs !:
[1]	642
[1320]	643	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
	644	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !:
[1111]	645
[1320]	646	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !:
[1111]	647
[1320]	648
[1]	649	!
	650	!-- If the PE grid is one-dimensional along y, only local reordering
	651	!-- of the data is necessary and no transposition has to be done.
	652	IF ( pdims(1) == 1 ) THEN
[1106]	653
[683]	654	!$OMP PARALLEL PRIVATE ( i, j, k )
	655	!$OMP DO
[1216]	656	!$acc kernels present( f_inv, f_out )
[1003]	657	DO j = 0, ny
	658	DO k = nzb_y, nzt_y
	659	DO i = nxl_y, nxr_y
[164]	660	f_out(i,j,k) = f_inv(i,k,j)
[1]	661	ENDDO
	662	ENDDO
	663	ENDDO
[1111]	664	!$acc end kernels
[683]	665	!$OMP END PARALLEL
[1]	666
[1106]	667	ELSE
	668
	669	#if defined( __parallel )
[1]	670	!
[1106]	671	!-- Transpose array
[1318]	672	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	673	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	674	!$acc update host( f_inv )
	675	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	676	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	677	comm1dx, ierr )
	678	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	679
	680	!
[1106]	681	!-- Reorder transposed array
[1111]	682	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	683	!$OMP DO
[1216]	684	!$acc data copyin( work )
[1106]	685	DO l = 0, pdims(1) - 1
	686	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1216]	687	!$acc kernels present( f_out )
[1106]	688	DO j = nys_z, nyn_z
	689	DO k = zs, zs + nzt_y - nzb_y
	690	DO i = nxl_z, nxr_z
[1111]	691	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	692	ENDDO
[1]	693	ENDDO
	694	ENDDO
[1111]	695	!$acc end kernels
[1]	696	ENDDO
[1216]	697	!$acc end data
[683]	698	!$OMP END PARALLEL
[1]	699	#endif
	700
[1106]	701	ENDIF
	702
[1]	703	END SUBROUTINE transpose_yz
	704
	705
[1216]	706	SUBROUTINE resort_for_zx( f_in, f_inv )
[1]	707
	708	!------------------------------------------------------------------------------!
	709	! Description:
	710	! ------------
[1216]	711	! Resorting data for the transposition from z to x. The transposition itself
	712	! is carried out in transpose_zx
	713	!------------------------------------------------------------------------------!
	714
[1320]	715	USE indices, &
	716	ONLY: nxl, nxr, nyn, nys, nz
[1216]	717
[1320]	718	USE kinds
	719
[1216]	720	IMPLICIT NONE
	721
[1320]	722	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !:
	723	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
[1216]	724
[1320]	725	INTEGER(iwp) :: i !:
	726	INTEGER(iwp) :: j !:
	727	INTEGER(iwp) :: k !:
[1216]	728
	729	!
	730	!-- Rearrange indices of input array in order to make data to be send
	731	!-- by MPI contiguous
	732	!$OMP PARALLEL PRIVATE ( i, j, k )
	733	!$OMP DO
	734	!$acc kernels present( f_in, f_inv )
	735	DO k = 1,nz
	736	DO i = nxl, nxr
	737	DO j = nys, nyn
	738	f_inv(j,i,k) = f_in(k,j,i)
	739	ENDDO
	740	ENDDO
	741	ENDDO
	742	!$acc end kernels
	743	!$OMP END PARALLEL
	744
	745	END SUBROUTINE resort_for_zx
	746
	747
	748	SUBROUTINE transpose_zx( f_inv, f_out )
	749
	750	!------------------------------------------------------------------------------!
	751	! Description:
	752	! ------------
[1]	753	! Transposition of input array (f_in) from z to x. For the input array, all
	754	! elements along z reside on the same PE, while after transposition, all
	755	! elements along x reside on the same PE.
	756	!------------------------------------------------------------------------------!
	757
[1320]	758	USE cpulog, &
	759	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	760
[1320]	761	USE indices, &
	762	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	763
	764	USE kinds
	765
	766	USE pegrid, &
	767	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	768	pdims, sendrecvcount_zx
	769
	770	USE transpose_indices, &
	771	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	772
[1]	773	IMPLICIT NONE
	774
[1320]	775	INTEGER(iwp) :: i !:
	776	INTEGER(iwp) :: j !:
	777	INTEGER(iwp) :: k !:
	778	INTEGER(iwp) :: l !:
	779	INTEGER(iwp) :: xs !:
[1]	780
[1320]	781	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
	782	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
[1111]	783
[1320]	784	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !:
[1]	785
[1320]	786
[1]	787	!
	788	!-- If the PE grid is one-dimensional along y, only local reordering
	789	!-- of the data is necessary and no transposition has to be done.
	790	IF ( pdims(1) == 1 ) THEN
[1106]	791
[683]	792	!$OMP PARALLEL PRIVATE ( i, j, k )
	793	!$OMP DO
[1216]	794	!$acc kernels present( f_inv, f_out )
[1003]	795	DO k = 1, nz
	796	DO i = nxl, nxr
	797	DO j = nys, nyn
[164]	798	f_out(i,j,k) = f_inv(j,i,k)
[1]	799	ENDDO
	800	ENDDO
	801	ENDDO
[1111]	802	!$acc end kernels
[683]	803	!$OMP END PARALLEL
[1]	804
[1106]	805	ELSE
	806
	807	#if defined( __parallel )
[1]	808	!
[1106]	809	!-- Transpose array
[1318]	810	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	811	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	812	!$acc update host( f_inv )
	813	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	814	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	815	comm1dx, ierr )
	816	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	817
	818	!
[1106]	819	!-- Reorder transposed array
[1111]	820	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	821	!$OMP DO
[1216]	822	!$acc data copyin( work )
[1106]	823	DO l = 0, pdims(1) - 1
	824	xs = 0 + l * nnx
[1216]	825	!$acc kernels present( f_out )
[1106]	826	DO k = nzb_x, nzt_x
	827	DO i = xs, xs + nnx - 1
	828	DO j = nys_x, nyn_x
[1111]	829	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	830	ENDDO
[1]	831	ENDDO
	832	ENDDO
[1111]	833	!$acc end kernels
[1]	834	ENDDO
[1216]	835	!$acc end data
[683]	836	!$OMP END PARALLEL
[1]	837	#endif
	838
[1106]	839	ENDIF
	840
[1]	841	END SUBROUTINE transpose_zx
	842
	843
[1216]	844	SUBROUTINE resort_for_zy( f_inv, f_out )
[1]	845
	846	!------------------------------------------------------------------------------!
	847	! Description:
	848	! ------------
[1216]	849	! Resorting data after the transposition from z to y. The transposition itself
	850	! is carried out in transpose_zy
	851	!------------------------------------------------------------------------------!
	852
[1320]	853	USE indices, &
	854	ONLY: ny
[1216]	855
[1320]	856	USE kinds
	857
	858	USE transpose_indices, &
	859	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	860
[1216]	861	IMPLICIT NONE
	862
[1320]	863	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
	864	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
[1216]	865
	866
[1320]	867	INTEGER(iwp) :: i !:
	868	INTEGER(iwp) :: j !:
	869	INTEGER(iwp) :: k !:
[1216]	870
	871	!
	872	!-- Rearrange indices of input array in order to make data to be send
	873	!-- by MPI contiguous
	874	!$OMP PARALLEL PRIVATE ( i, j, k )
	875	!$OMP DO
	876	!$acc kernels present( f_inv, f_out )
	877	DO k = nzb_y, nzt_y
	878	DO j = 0, ny
	879	DO i = nxl_y, nxr_y
	880	f_out(j,i,k) = f_inv(i,k,j)
	881	ENDDO
	882	ENDDO
	883	ENDDO
	884	!$acc end kernels
	885	!$OMP END PARALLEL
	886
	887	END SUBROUTINE resort_for_zy
	888
	889
	890	SUBROUTINE transpose_zy( f_in, f_inv )
	891
	892	!------------------------------------------------------------------------------!
	893	! Description:
	894	! ------------
[1]	895	! Transposition of input array (f_in) from z to y. For the input array, all
	896	! elements along z reside on the same PE, while after transposition, all
	897	! elements along y reside on the same PE.
	898	!------------------------------------------------------------------------------!
	899
[1320]	900	USE cpulog, &
	901	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	902
[1320]	903	USE indices, &
	904	ONLY: ny, nz
	905
	906	USE kinds
	907
	908	USE pegrid, &
	909	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	910	pdims, sendrecvcount_yz
	911
	912	USE transpose_indices, &
	913	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	914
[1]	915	IMPLICIT NONE
	916
[1320]	917	INTEGER(iwp) :: i !:
	918	INTEGER(iwp) :: j !:
	919	INTEGER(iwp) :: k !:
	920	INTEGER(iwp) :: l !:
	921	INTEGER(iwp) :: zs !:
[1]	922
[1320]	923	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !:
	924	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
[1111]	925
[1320]	926	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !:
[1111]	927
[1]	928	!
	929	!-- If the PE grid is one-dimensional along y, the array has only to be
	930	!-- reordered locally and therefore no transposition has to be done.
	931	IF ( pdims(1) /= 1 ) THEN
[1106]	932
	933	#if defined( __parallel )
[1]	934	!
	935	!-- Reorder input array for transposition
[1111]	936	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	937	!$OMP DO
[1216]	938	!$acc data copyout( work )
[1]	939	DO l = 0, pdims(1) - 1
[1003]	940	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]	941	!$acc kernels present( f_in, work )
[1003]	942	DO j = nys_z, nyn_z
	943	DO k = zs, zs + nzt_y - nzb_y
	944	DO i = nxl_z, nxr_z
[1111]	945	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	946	ENDDO
	947	ENDDO
	948	ENDDO
[1111]	949	!$acc end kernels
[1]	950	ENDDO
[1216]	951	!$acc end data
[683]	952	!$OMP END PARALLEL
[1]	953
	954	!
	955	!-- Transpose array
[1318]	956	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]	957	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	958	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	959	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	960	comm1dx, ierr )
[1111]	961	!$acc update device( f_inv )
[1]	962	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	963	#endif
[1]	964
	965	ELSE
	966	!
[1106]	967	!-- Reorder the array in the same way like ALLTOALL did it
[683]	968	!$OMP PARALLEL PRIVATE ( i, j, k )
	969	!$OMP DO
[1216]	970	!$acc kernels present( f_in, f_inv )
[1003]	971	DO k = nzb_y, nzt_y
	972	DO j = 0, ny
	973	DO i = nxl_y, nxr_y
[164]	974	f_inv(i,k,j) = f_in(i,j,k)
	975	ENDDO
	976	ENDDO
	977	ENDDO
[1111]	978	!$acc end kernels
[683]	979	!$OMP END PARALLEL
[1106]	980
	981	ENDIF
	982
[1]	983	END SUBROUTINE transpose_zy
	984
	985
[1216]	986	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	987
	988	!------------------------------------------------------------------------------!
	989	! Description:
	990	! ------------
	991	! Transposition of input array (f_in) from z to y. For the input array, all
	992	! elements along z reside on the same PE, while after transposition, all
	993	! elements along y reside on the same PE.
	994	! This is a direct transposition for arrays with indices in regular order
	995	! (k,j,i) (cf. transpose_zy).
	996	!------------------------------------------------------------------------------!
	997
[1320]	998	USE cpulog, &
	999	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	1000
[1320]	1001	USE indices, &
	1002	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1003
	1004	USE kinds
	1005
	1006	USE pegrid, &
	1007	ONLY: collective_wait, comm1dy, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	1008	pdims, sendrecvcount_zyd
	1009
	1010	USE transpose_indices, &
	1011	ONLY: nxl_y, nxl_yd, nxr_y, nxr_yd, nzb_y, nzb_yd, nzt_y, nzt_yd
	1012
[1]	1013	IMPLICIT NONE
	1014
[1320]	1015	INTEGER(iwp) :: i !:
	1016	INTEGER(iwp) :: j !:
	1017	INTEGER(iwp) :: k !:
	1018	INTEGER(iwp) :: l !:
	1019	INTEGER(iwp) :: m !:
	1020	INTEGER(iwp) :: ys !:
[1]	1021
[1320]	1022	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !:
	1023	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
	1024	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !:
	1025	REAL(wp) :: work(nnxnnynnz) !:
	1026
[1]	1027	#if defined( __parallel )
	1028
	1029	!
	1030	!-- Rearrange indices of input array in order to make data to be send
	1031	!-- by MPI contiguous
[1003]	1032	DO i = nxl, nxr
	1033	DO j = nys, nyn
	1034	DO k = 1, nz
[164]	1035	f_inv(j,i,k) = f_in(k,j,i)
[1]	1036	ENDDO
	1037	ENDDO
	1038	ENDDO
	1039
	1040	!
	1041	!-- Move data to different array, because memory location of work1 is
	1042	!-- needed further below (work1 = work2).
	1043	!-- If the PE grid is one-dimensional along x, only local reordering
	1044	!-- of the data is necessary and no transposition has to be done.
	1045	IF ( pdims(2) == 1 ) THEN
[1003]	1046	DO k = 1, nz
	1047	DO i = nxl, nxr
	1048	DO j = nys, nyn
[164]	1049	f_out(j,i,k) = f_inv(j,i,k)
[1]	1050	ENDDO
	1051	ENDDO
	1052	ENDDO
	1053	RETURN
	1054	ENDIF
	1055
	1056	!
	1057	!-- Transpose array
	1058	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1059	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1060	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1061	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1062	comm1dy, ierr )
	1063	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1064
	1065	!
	1066	!-- Reorder transposed array
	1067	m = 0
	1068	DO l = 0, pdims(2) - 1
	1069	ys = 0 + l * nny
[1003]	1070	DO k = nzb_yd, nzt_yd
	1071	DO i = nxl_yd, nxr_yd
[1]	1072	DO j = ys, ys + nny - 1
	1073	m = m + 1
[164]	1074	f_out(j,i,k) = work(m)
[1]	1075	ENDDO
	1076	ENDDO
	1077	ENDDO
	1078	ENDDO
	1079
	1080	#endif
	1081
	1082	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |