Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1321

Last change on this file since 1321 was 1321, checked in by raasch, 10 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 32.2 KB

Rev	Line
[1216]	1	SUBROUTINE resort_for_xy( f_in, f_inv )
[1]	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[1310]	17	! Copyright 1997-2014 Leibniz Universitaet Hannover
[1036]	18	!--------------------------------------------------------------------------------!
	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
	23	!
	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 1321 2014-03-20 09:40:40Z raasch $
	27	!
	28	! 1320 2014-03-20 08:40:49Z raasch
[1320]	29	! ONLY-attribute added to USE-statements,
	30	! kind-parameters added to all INTEGER and REAL declaration statements,
	31	! kinds are defined in new module kinds,
	32	! old module precision_kind is removed,
	33	! revision history before 2012 removed,
	34	! comment fields (!:) to be used for variable explanations added to
	35	! all variable declaration statements
[198]	36	!
[1319]	37	! 1318 2014-03-17 13:35:16Z raasch
	38	! cpu_log_nowait parameter added to cpu measurements of the transpositions
	39	! required for solving the Poisson equation (poisfft),
	40	! module interfaces removed
	41	!
[1258]	42	! 1257 2013-11-08 15:18:40Z raasch
	43	! openacc loop and loop vector clauses removed
	44	!
[1217]	45	! 1216 2013-08-26 09:31:42Z raasch
	46	! re-sorting of the transposed / to be transposed arrays moved to separate
	47	! routines resort_for_...
	48	!
[1112]	49	! 1111 2013-03-08 23:54:10Z raasch
	50	! openACC directives added,
	51	! resorting data from/to work changed, work got 4 dimensions instead of 1
	52	!
[1107]	53	! 1106 2013-03-04 05:31:38Z raasch
	54	! preprocessor lines rearranged so that routines can also be used in serial
	55	! (non-parallel) mode
	56	!
[1093]	57	! 1092 2013-02-02 11:24:22Z raasch
	58	! unused variables removed
	59	!
[1037]	60	! 1036 2012-10-22 13:43:42Z raasch
	61	! code put under GPL (PALM 3.9)
	62	!
[1004]	63	! 1003 2012-09-14 14:35:53Z raasch
	64	! indices nxa, nya, etc. replaced by nx, ny, etc.
	65	!
[1]	66	! Revision 1.1 1997/07/24 11:25:18 raasch
	67	! Initial revision
	68	!
[1216]	69	!------------------------------------------------------------------------------!
	70	! Description:
	71	! ------------
	72	! Resorting data for the transposition from x to y. The transposition itself
	73	! is carried out in transpose_xy
	74	!------------------------------------------------------------------------------!
	75
[1320]	76	USE indices, &
	77	ONLY: nx
[1216]	78
[1320]	79	USE kinds
	80
	81	USE transpose_indices, &
	82	ONLY: nxl_z, nxr_z, nyn_x, nyn_z, nys_x, nys_z, nzb_x, nzt_x
	83
[1216]	84	IMPLICIT NONE
	85
[1320]	86	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
	87	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
[1216]	88
	89
[1320]	90	INTEGER(iwp) :: i !:
	91	INTEGER(iwp) :: j !:
	92	INTEGER(iwp) :: k !:
[1]	93	!
[1216]	94	!-- Rearrange indices of input array in order to make data to be send
	95	!-- by MPI contiguous
	96	!$OMP PARALLEL PRIVATE ( i, j, k )
	97	!$OMP DO
	98	!$acc kernels present( f_in, f_inv )
	99	DO i = 0, nx
	100	DO k = nzb_x, nzt_x
	101	DO j = nys_x, nyn_x
	102	f_inv(j,k,i) = f_in(i,j,k)
	103	ENDDO
	104	ENDDO
	105	ENDDO
	106	!$acc end kernels
	107	!$OMP END PARALLEL
	108
	109	END SUBROUTINE resort_for_xy
	110
	111
	112	SUBROUTINE transpose_xy( f_inv, f_out )
	113
	114	!------------------------------------------------------------------------------!
[1]	115	! Description:
	116	! ------------
	117	! Transposition of input array (f_in) from x to y. For the input array, all
	118	! elements along x reside on the same PE, while after transposition, all
	119	! elements along y reside on the same PE.
	120	!------------------------------------------------------------------------------!
	121
[1320]	122	USE cpulog, &
	123	ONLY: cpu_log, cpu_log_nowait, log_point_s
	124
	125	USE indices, &
	126	ONLY: nx, ny
	127
	128	USE kinds
	129
[1]	130	USE pegrid
	131
[1320]	132	USE transpose_indices, &
	133	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	134
[1]	135	IMPLICIT NONE
	136
[1320]	137	INTEGER(iwp) :: i !:
	138	INTEGER(iwp) :: j !:
	139	INTEGER(iwp) :: k !:
	140	INTEGER(iwp) :: l !:
	141	INTEGER(iwp) :: ys !:
	142
	143	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
	144	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
[1]	145
[1320]	146	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !:
[1111]	147
	148
[1106]	149	IF ( numprocs /= 1 ) THEN
	150
	151	#if defined( __parallel )
[1]	152	!
[1106]	153	!-- Transpose array
[1318]	154	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	155	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	156	!$acc update host( f_inv )
	157	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	158	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	159	comm1dy, ierr )
	160	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	161
	162	!
[1106]	163	!-- Reorder transposed array
[1111]	164	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	165	!$OMP DO
[1216]	166	!$acc data copyin( work )
[1106]	167	DO l = 0, pdims(2) - 1
	168	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	169	!$acc kernels present( f_out, work )
[1106]	170	DO i = nxl_y, nxr_y
	171	DO k = nzb_y, nzt_y
	172	DO j = ys, ys + nyn_x - nys_x
[1111]	173	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	174	ENDDO
[1]	175	ENDDO
	176	ENDDO
[1111]	177	!$acc end kernels
[1]	178	ENDDO
[1216]	179	!$acc end data
[683]	180	!$OMP END PARALLEL
[1]	181	#endif
	182
[1106]	183	ELSE
	184
	185	!
	186	!-- Reorder transposed array
	187	!$OMP PARALLEL PRIVATE ( i, j, k )
	188	!$OMP DO
[1216]	189	!$acc kernels present( f_inv, f_out )
[1106]	190	DO k = nzb_y, nzt_y
	191	DO i = nxl_y, nxr_y
	192	DO j = 0, ny
	193	f_out(j,i,k) = f_inv(j,k,i)
	194	ENDDO
	195	ENDDO
	196	ENDDO
[1111]	197	!$acc end kernels
[1106]	198	!$OMP END PARALLEL
	199
	200	ENDIF
	201
[1]	202	END SUBROUTINE transpose_xy
	203
	204
[1216]	205	SUBROUTINE resort_for_xz( f_inv, f_out )
[1]	206
	207	!------------------------------------------------------------------------------!
	208	! Description:
	209	! ------------
[1216]	210	! Resorting data after the transposition from x to z. The transposition itself
	211	! is carried out in transpose_xz
	212	!------------------------------------------------------------------------------!
	213
[1320]	214	USE indices, &
	215	ONLY: nxl, nxr, nyn, nys, nz
[1216]	216
[1320]	217	USE kinds
	218
[1216]	219	IMPLICIT NONE
	220
[1320]	221	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
	222	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !:
[1216]	223
[1320]	224	INTEGER(iwp) :: i !:
	225	INTEGER(iwp) :: j !:
	226	INTEGER(iwp) :: k !:
[1216]	227	!
	228	!-- Rearrange indices of input array in order to make data to be send
	229	!-- by MPI contiguous.
	230	!-- In case of parallel fft/transposition, scattered store is faster in
	231	!-- backward direction!!!
	232	!$OMP PARALLEL PRIVATE ( i, j, k )
	233	!$OMP DO
	234	!$acc kernels present( f_inv, f_out )
	235	DO k = 1, nz
	236	DO i = nxl, nxr
	237	DO j = nys, nyn
	238	f_out(k,j,i) = f_inv(j,i,k)
	239	ENDDO
	240	ENDDO
	241	ENDDO
	242	!$acc end kernels
	243	!$OMP END PARALLEL
	244
	245	END SUBROUTINE resort_for_xz
	246
	247
	248	SUBROUTINE transpose_xz( f_in, f_inv )
	249
	250	!------------------------------------------------------------------------------!
	251	! Description:
	252	! ------------
[1]	253	! Transposition of input array (f_in) from x to z. For the input array, all
	254	! elements along x reside on the same PE, while after transposition, all
	255	! elements along z reside on the same PE.
	256	!------------------------------------------------------------------------------!
	257
[1320]	258	USE cpulog, &
	259	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	260
[1320]	261	USE indices, &
	262	ONLY: nnx, nx, nxl, nxr, ny, nyn, nys, nz
	263
	264	USE kinds
	265
	266	USE pegrid, &
	267	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	268	pdims, sendrecvcount_zx
	269
	270	USE transpose_indices, &
	271	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	272
[1]	273	IMPLICIT NONE
	274
[1320]	275	INTEGER(iwp) :: i !:
	276	INTEGER(iwp) :: j !:
	277	INTEGER(iwp) :: k !:
	278	INTEGER(iwp) :: l !:
	279	INTEGER(iwp) :: xs !:
[1]	280
[1320]	281	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
	282	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
[1]	283
[1320]	284	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !:
[1111]	285
[1320]	286
[1]	287	!
	288	!-- If the PE grid is one-dimensional along y, the array has only to be
	289	!-- reordered locally and therefore no transposition has to be done.
	290	IF ( pdims(1) /= 1 ) THEN
[1106]	291
	292	#if defined( __parallel )
[1]	293	!
	294	!-- Reorder input array for transposition
[1111]	295	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	296	!$OMP DO
[1216]	297	!$acc data copyout( work )
[1]	298	DO l = 0, pdims(1) - 1
	299	xs = 0 + l * nnx
[1111]	300	!$acc kernels present( f_in, work )
[1003]	301	DO k = nzb_x, nzt_x
[164]	302	DO i = xs, xs + nnx - 1
[1003]	303	DO j = nys_x, nyn_x
[1111]	304	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	305	ENDDO
	306	ENDDO
	307	ENDDO
[1111]	308	!$acc end kernels
[1]	309	ENDDO
[1216]	310	!$acc end data
[683]	311	!$OMP END PARALLEL
[1]	312
	313	!
	314	!-- Transpose array
[1318]	315	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]	316	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	317	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	318	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	319	comm1dx, ierr )
[1111]	320	!$acc update device( f_inv )
[1]	321	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	322	#endif
	323
[1]	324	ELSE
[1106]	325
[1]	326	!
	327	!-- Reorder the array in a way that the z index is in first position
[683]	328	!$OMP PARALLEL PRIVATE ( i, j, k )
	329	!$OMP DO
[1216]	330	!$acc kernels present( f_in, f_inv )
[1003]	331	DO i = nxl, nxr
	332	DO j = nys, nyn
	333	DO k = 1, nz
[164]	334	f_inv(j,i,k) = f_in(i,j,k)
[1]	335	ENDDO
	336	ENDDO
	337	ENDDO
[1111]	338	!$acc end kernels
[683]	339	!$OMP END PARALLEL
[1]	340
[164]	341	ENDIF
	342
[1]	343	END SUBROUTINE transpose_xz
	344
	345
[1216]	346	SUBROUTINE resort_for_yx( f_inv, f_out )
[1]	347
	348	!------------------------------------------------------------------------------!
	349	! Description:
	350	! ------------
[1216]	351	! Resorting data after the transposition from y to x. The transposition itself
	352	! is carried out in transpose_yx
	353	!------------------------------------------------------------------------------!
	354
[1320]	355	USE indices, &
	356	ONLY: nx
[1216]	357
[1320]	358	USE kinds
	359
	360	USE transpose_indices, &
	361	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	362
[1216]	363	IMPLICIT NONE
	364
[1320]	365	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
	366	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
[1216]	367
	368
[1320]	369	INTEGER(iwp) :: i !:
	370	INTEGER(iwp) :: j !:
	371	INTEGER(iwp) :: k !:
[1216]	372	!
	373	!-- Rearrange indices of input array in order to make data to be send
	374	!-- by MPI contiguous
	375	!$OMP PARALLEL PRIVATE ( i, j, k )
	376	!$OMP DO
	377	!$acc kernels present( f_inv, f_out )
	378	DO i = 0, nx
	379	DO k = nzb_x, nzt_x
	380	DO j = nys_x, nyn_x
	381	f_out(i,j,k) = f_inv(j,k,i)
	382	ENDDO
	383	ENDDO
	384	ENDDO
	385	!$acc end kernels
	386	!$OMP END PARALLEL
	387
	388	END SUBROUTINE resort_for_yx
	389
	390
	391	SUBROUTINE transpose_yx( f_in, f_inv )
	392
	393	!------------------------------------------------------------------------------!
	394	! Description:
	395	! ------------
[1]	396	! Transposition of input array (f_in) from y to x. For the input array, all
	397	! elements along y reside on the same PE, while after transposition, all
	398	! elements along x reside on the same PE.
	399	!------------------------------------------------------------------------------!
	400
[1320]	401	USE cpulog, &
	402	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	403
[1320]	404	USE indices, &
	405	ONLY: nx, ny
	406
	407	USE kinds
	408
	409	USE pegrid, &
	410	ONLY: collective_wait, comm1dy, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	411	numprocs, pdims, sendrecvcount_xy
	412
	413	USE transpose_indices, &
	414	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	415
[1]	416	IMPLICIT NONE
	417
[1320]	418	INTEGER(iwp) :: i !:
	419	INTEGER(iwp) :: j !:
	420	INTEGER(iwp) :: k !:
	421	INTEGER(iwp) :: l !:
	422	INTEGER(iwp) :: ys !:
[1]	423
[1320]	424	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
	425	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
[1111]	426
[1320]	427	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !:
[1111]	428
[1320]	429
[1106]	430	IF ( numprocs /= 1 ) THEN
	431
[1]	432	#if defined( __parallel )
	433	!
[1106]	434	!-- Reorder input array for transposition
[1111]	435	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	436	!$OMP DO
[1216]	437	!$acc data copyout( work )
[1106]	438	DO l = 0, pdims(2) - 1
	439	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	440	!$acc kernels present( f_in, work )
[1106]	441	DO i = nxl_y, nxr_y
	442	DO k = nzb_y, nzt_y
	443	DO j = ys, ys + nyn_x - nys_x
[1111]	444	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	445	ENDDO
	446	ENDDO
	447	ENDDO
[1111]	448	!$acc end kernels
[1106]	449	ENDDO
[1216]	450	!$acc end data
[1106]	451	!$OMP END PARALLEL
	452
	453	!
	454	!-- Transpose array
[1318]	455	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	456	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	457	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	458	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	459	comm1dy, ierr )
[1111]	460	!$acc update device( f_inv )
[1106]	461	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	462	#endif
	463
	464	ELSE
	465
	466	!
	467	!-- Reorder array f_in the same way as ALLTOALL did it
	468	!$OMP PARALLEL PRIVATE ( i, j, k )
	469	!$OMP DO
[1216]	470	!$acc kernels present( f_in, f_inv )
[1003]	471	DO i = nxl_y, nxr_y
	472	DO k = nzb_y, nzt_y
[1106]	473	DO j = 0, ny
	474	f_inv(j,k,i) = f_in(j,i,k)
[1]	475	ENDDO
	476	ENDDO
	477	ENDDO
[1111]	478	!$acc end kernels
[683]	479	!$OMP END PARALLEL
[1]	480
[1106]	481	ENDIF
[1]	482
	483	END SUBROUTINE transpose_yx
	484
	485
[1216]	486	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	487
	488	!------------------------------------------------------------------------------!
	489	! Description:
	490	! ------------
	491	! Transposition of input array (f_in) from y to x. For the input array, all
	492	! elements along y reside on the same PE, while after transposition, all
	493	! elements along x reside on the same PE.
	494	! This is a direct transposition for arrays with indices in regular order
	495	! (k,j,i) (cf. transpose_yx).
	496	!------------------------------------------------------------------------------!
	497
[1320]	498	USE cpulog, &
	499	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	500
[1320]	501	USE indices, &
	502	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	503
	504	USE kinds
	505
	506	USE pegrid, &
	507	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	508	pdims, sendrecvcount_xy
	509
	510	USE transpose_indices, &
	511	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	512
[1]	513	IMPLICIT NONE
	514
[1320]	515	INTEGER(iwp) :: i !:
	516	INTEGER(iwp) :: j !:
	517	INTEGER(iwp) :: k !:
	518	INTEGER(iwp) :: l !:
	519	INTEGER(iwp) :: m !:
	520	INTEGER(iwp) :: xs !:
[1]	521
[1320]	522	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !:
	523	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !:
	524	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
	525	REAL(wp) :: work(nnxnnynnz) !:
[1]	526	#if defined( __parallel )
	527
	528	!
	529	!-- Rearrange indices of input array in order to make data to be send
	530	!-- by MPI contiguous
[1003]	531	DO k = 1, nz
	532	DO j = nys, nyn
	533	DO i = nxl, nxr
[164]	534	f_inv(i,k,j) = f_in(k,j,i)
[1]	535	ENDDO
	536	ENDDO
	537	ENDDO
	538
	539	!
	540	!-- Transpose array
	541	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	542	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	543	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	544	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	545	comm1dx, ierr )
	546	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	547
	548	!
	549	!-- Reorder transposed array
	550	m = 0
	551	DO l = 0, pdims(1) - 1
	552	xs = 0 + l * nnx
[1003]	553	DO j = nys_x, nyn_x
	554	DO k = 1, nz
[1]	555	DO i = xs, xs + nnx - 1
	556	m = m + 1
[164]	557	f_out(i,j,k) = work(m)
[1]	558	ENDDO
	559	ENDDO
	560	ENDDO
	561	ENDDO
	562
	563	#endif
	564
	565	END SUBROUTINE transpose_yxd
	566
	567
[1216]	568	SUBROUTINE resort_for_yz( f_in, f_inv )
[1]	569
	570	!------------------------------------------------------------------------------!
	571	! Description:
	572	! ------------
[1216]	573	! Resorting data for the transposition from y to z. The transposition itself
	574	! is carried out in transpose_yz
	575	!------------------------------------------------------------------------------!
	576
[1320]	577	USE indices, &
	578	ONLY: ny
[1216]	579
[1320]	580	USE kinds
	581
	582	USE transpose_indices, &
	583	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	584
[1216]	585	IMPLICIT NONE
	586
[1320]	587	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
	588	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
[1216]	589
[1320]	590	INTEGER(iwp) :: i !:
	591	INTEGER(iwp) :: j !:
	592	INTEGER(iwp) :: k !:
[1216]	593
	594	!
	595	!-- Rearrange indices of input array in order to make data to be send
	596	!-- by MPI contiguous
	597	!$OMP PARALLEL PRIVATE ( i, j, k )
	598	!$OMP DO
	599	!$acc kernels present( f_in, f_inv )
	600	DO j = 0, ny
	601	DO k = nzb_y, nzt_y
	602	DO i = nxl_y, nxr_y
	603	f_inv(i,k,j) = f_in(j,i,k)
	604	ENDDO
	605	ENDDO
	606	ENDDO
	607	!$acc end kernels
	608	!$OMP END PARALLEL
	609
	610	END SUBROUTINE resort_for_yz
	611
	612
	613	SUBROUTINE transpose_yz( f_inv, f_out )
	614
	615	!------------------------------------------------------------------------------!
	616	! Description:
	617	! ------------
[1]	618	! Transposition of input array (f_in) from y to z. For the input array, all
	619	! elements along y reside on the same PE, while after transposition, all
	620	! elements along z reside on the same PE.
	621	!------------------------------------------------------------------------------!
	622
[1320]	623	USE cpulog, &
	624	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	625
[1320]	626	USE indices, &
	627	ONLY: ny, nz
	628
	629	USE kinds
	630
	631	USE pegrid, &
	632	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	633	pdims, sendrecvcount_yz
	634
	635	USE transpose_indices, &
	636	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	637
[1]	638	IMPLICIT NONE
	639
[1320]	640	INTEGER(iwp) :: i !:
	641	INTEGER(iwp) :: j !:
	642	INTEGER(iwp) :: k !:
	643	INTEGER(iwp) :: l !:
	644	INTEGER(iwp) :: zs !:
[1]	645
[1320]	646	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
	647	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !:
[1111]	648
[1320]	649	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !:
[1111]	650
[1320]	651
[1]	652	!
	653	!-- If the PE grid is one-dimensional along y, only local reordering
	654	!-- of the data is necessary and no transposition has to be done.
	655	IF ( pdims(1) == 1 ) THEN
[1106]	656
[683]	657	!$OMP PARALLEL PRIVATE ( i, j, k )
	658	!$OMP DO
[1216]	659	!$acc kernels present( f_inv, f_out )
[1003]	660	DO j = 0, ny
	661	DO k = nzb_y, nzt_y
	662	DO i = nxl_y, nxr_y
[164]	663	f_out(i,j,k) = f_inv(i,k,j)
[1]	664	ENDDO
	665	ENDDO
	666	ENDDO
[1111]	667	!$acc end kernels
[683]	668	!$OMP END PARALLEL
[1]	669
[1106]	670	ELSE
	671
	672	#if defined( __parallel )
[1]	673	!
[1106]	674	!-- Transpose array
[1318]	675	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	676	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	677	!$acc update host( f_inv )
	678	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	679	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	680	comm1dx, ierr )
	681	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	682
	683	!
[1106]	684	!-- Reorder transposed array
[1111]	685	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	686	!$OMP DO
[1216]	687	!$acc data copyin( work )
[1106]	688	DO l = 0, pdims(1) - 1
	689	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1216]	690	!$acc kernels present( f_out )
[1106]	691	DO j = nys_z, nyn_z
	692	DO k = zs, zs + nzt_y - nzb_y
	693	DO i = nxl_z, nxr_z
[1111]	694	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	695	ENDDO
[1]	696	ENDDO
	697	ENDDO
[1111]	698	!$acc end kernels
[1]	699	ENDDO
[1216]	700	!$acc end data
[683]	701	!$OMP END PARALLEL
[1]	702	#endif
	703
[1106]	704	ENDIF
	705
[1]	706	END SUBROUTINE transpose_yz
	707
	708
[1216]	709	SUBROUTINE resort_for_zx( f_in, f_inv )
[1]	710
	711	!------------------------------------------------------------------------------!
	712	! Description:
	713	! ------------
[1216]	714	! Resorting data for the transposition from z to x. The transposition itself
	715	! is carried out in transpose_zx
	716	!------------------------------------------------------------------------------!
	717
[1320]	718	USE indices, &
	719	ONLY: nxl, nxr, nyn, nys, nz
[1216]	720
[1320]	721	USE kinds
	722
[1216]	723	IMPLICIT NONE
	724
[1320]	725	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !:
	726	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
[1216]	727
[1320]	728	INTEGER(iwp) :: i !:
	729	INTEGER(iwp) :: j !:
	730	INTEGER(iwp) :: k !:
[1216]	731
	732	!
	733	!-- Rearrange indices of input array in order to make data to be send
	734	!-- by MPI contiguous
	735	!$OMP PARALLEL PRIVATE ( i, j, k )
	736	!$OMP DO
	737	!$acc kernels present( f_in, f_inv )
	738	DO k = 1,nz
	739	DO i = nxl, nxr
	740	DO j = nys, nyn
	741	f_inv(j,i,k) = f_in(k,j,i)
	742	ENDDO
	743	ENDDO
	744	ENDDO
	745	!$acc end kernels
	746	!$OMP END PARALLEL
	747
	748	END SUBROUTINE resort_for_zx
	749
	750
	751	SUBROUTINE transpose_zx( f_inv, f_out )
	752
	753	!------------------------------------------------------------------------------!
	754	! Description:
	755	! ------------
[1]	756	! Transposition of input array (f_in) from z to x. For the input array, all
	757	! elements along z reside on the same PE, while after transposition, all
	758	! elements along x reside on the same PE.
	759	!------------------------------------------------------------------------------!
	760
[1320]	761	USE cpulog, &
	762	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	763
[1320]	764	USE indices, &
	765	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	766
	767	USE kinds
	768
	769	USE pegrid, &
	770	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	771	pdims, sendrecvcount_zx
	772
	773	USE transpose_indices, &
	774	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	775
[1]	776	IMPLICIT NONE
	777
[1320]	778	INTEGER(iwp) :: i !:
	779	INTEGER(iwp) :: j !:
	780	INTEGER(iwp) :: k !:
	781	INTEGER(iwp) :: l !:
	782	INTEGER(iwp) :: xs !:
[1]	783
[1320]	784	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
	785	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
[1111]	786
[1320]	787	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !:
[1]	788
[1320]	789
[1]	790	!
	791	!-- If the PE grid is one-dimensional along y, only local reordering
	792	!-- of the data is necessary and no transposition has to be done.
	793	IF ( pdims(1) == 1 ) THEN
[1106]	794
[683]	795	!$OMP PARALLEL PRIVATE ( i, j, k )
	796	!$OMP DO
[1216]	797	!$acc kernels present( f_inv, f_out )
[1003]	798	DO k = 1, nz
	799	DO i = nxl, nxr
	800	DO j = nys, nyn
[164]	801	f_out(i,j,k) = f_inv(j,i,k)
[1]	802	ENDDO
	803	ENDDO
	804	ENDDO
[1111]	805	!$acc end kernels
[683]	806	!$OMP END PARALLEL
[1]	807
[1106]	808	ELSE
	809
	810	#if defined( __parallel )
[1]	811	!
[1106]	812	!-- Transpose array
[1318]	813	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	814	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	815	!$acc update host( f_inv )
	816	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	817	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	818	comm1dx, ierr )
	819	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	820
	821	!
[1106]	822	!-- Reorder transposed array
[1111]	823	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	824	!$OMP DO
[1216]	825	!$acc data copyin( work )
[1106]	826	DO l = 0, pdims(1) - 1
	827	xs = 0 + l * nnx
[1216]	828	!$acc kernels present( f_out )
[1106]	829	DO k = nzb_x, nzt_x
	830	DO i = xs, xs + nnx - 1
	831	DO j = nys_x, nyn_x
[1111]	832	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	833	ENDDO
[1]	834	ENDDO
	835	ENDDO
[1111]	836	!$acc end kernels
[1]	837	ENDDO
[1216]	838	!$acc end data
[683]	839	!$OMP END PARALLEL
[1]	840	#endif
	841
[1106]	842	ENDIF
	843
[1]	844	END SUBROUTINE transpose_zx
	845
	846
[1216]	847	SUBROUTINE resort_for_zy( f_inv, f_out )
[1]	848
	849	!------------------------------------------------------------------------------!
	850	! Description:
	851	! ------------
[1216]	852	! Resorting data after the transposition from z to y. The transposition itself
	853	! is carried out in transpose_zy
	854	!------------------------------------------------------------------------------!
	855
[1320]	856	USE indices, &
	857	ONLY: ny
[1216]	858
[1320]	859	USE kinds
	860
	861	USE transpose_indices, &
	862	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	863
[1216]	864	IMPLICIT NONE
	865
[1320]	866	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
	867	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
[1216]	868
	869
[1320]	870	INTEGER(iwp) :: i !:
	871	INTEGER(iwp) :: j !:
	872	INTEGER(iwp) :: k !:
[1216]	873
	874	!
	875	!-- Rearrange indices of input array in order to make data to be send
	876	!-- by MPI contiguous
	877	!$OMP PARALLEL PRIVATE ( i, j, k )
	878	!$OMP DO
	879	!$acc kernels present( f_inv, f_out )
	880	DO k = nzb_y, nzt_y
	881	DO j = 0, ny
	882	DO i = nxl_y, nxr_y
	883	f_out(j,i,k) = f_inv(i,k,j)
	884	ENDDO
	885	ENDDO
	886	ENDDO
	887	!$acc end kernels
	888	!$OMP END PARALLEL
	889
	890	END SUBROUTINE resort_for_zy
	891
	892
	893	SUBROUTINE transpose_zy( f_in, f_inv )
	894
	895	!------------------------------------------------------------------------------!
	896	! Description:
	897	! ------------
[1]	898	! Transposition of input array (f_in) from z to y. For the input array, all
	899	! elements along z reside on the same PE, while after transposition, all
	900	! elements along y reside on the same PE.
	901	!------------------------------------------------------------------------------!
	902
[1320]	903	USE cpulog, &
	904	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	905
[1320]	906	USE indices, &
	907	ONLY: ny, nz
	908
	909	USE kinds
	910
	911	USE pegrid, &
	912	ONLY: collective_wait, comm1dx, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	913	pdims, sendrecvcount_yz
	914
	915	USE transpose_indices, &
	916	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	917
[1]	918	IMPLICIT NONE
	919
[1320]	920	INTEGER(iwp) :: i !:
	921	INTEGER(iwp) :: j !:
	922	INTEGER(iwp) :: k !:
	923	INTEGER(iwp) :: l !:
	924	INTEGER(iwp) :: zs !:
[1]	925
[1320]	926	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !:
	927	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
[1111]	928
[1320]	929	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !:
[1111]	930
[1]	931	!
	932	!-- If the PE grid is one-dimensional along y, the array has only to be
	933	!-- reordered locally and therefore no transposition has to be done.
	934	IF ( pdims(1) /= 1 ) THEN
[1106]	935
	936	#if defined( __parallel )
[1]	937	!
	938	!-- Reorder input array for transposition
[1111]	939	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	940	!$OMP DO
[1216]	941	!$acc data copyout( work )
[1]	942	DO l = 0, pdims(1) - 1
[1003]	943	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]	944	!$acc kernels present( f_in, work )
[1003]	945	DO j = nys_z, nyn_z
	946	DO k = zs, zs + nzt_y - nzb_y
	947	DO i = nxl_z, nxr_z
[1111]	948	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	949	ENDDO
	950	ENDDO
	951	ENDDO
[1111]	952	!$acc end kernels
[1]	953	ENDDO
[1216]	954	!$acc end data
[683]	955	!$OMP END PARALLEL
[1]	956
	957	!
	958	!-- Transpose array
[1318]	959	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]	960	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	961	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	962	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	963	comm1dx, ierr )
[1111]	964	!$acc update device( f_inv )
[1]	965	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	966	#endif
[1]	967
	968	ELSE
	969	!
[1106]	970	!-- Reorder the array in the same way like ALLTOALL did it
[683]	971	!$OMP PARALLEL PRIVATE ( i, j, k )
	972	!$OMP DO
[1216]	973	!$acc kernels present( f_in, f_inv )
[1003]	974	DO k = nzb_y, nzt_y
	975	DO j = 0, ny
	976	DO i = nxl_y, nxr_y
[164]	977	f_inv(i,k,j) = f_in(i,j,k)
	978	ENDDO
	979	ENDDO
	980	ENDDO
[1111]	981	!$acc end kernels
[683]	982	!$OMP END PARALLEL
[1106]	983
	984	ENDIF
	985
[1]	986	END SUBROUTINE transpose_zy
	987
	988
[1216]	989	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	990
	991	!------------------------------------------------------------------------------!
	992	! Description:
	993	! ------------
	994	! Transposition of input array (f_in) from z to y. For the input array, all
	995	! elements along z reside on the same PE, while after transposition, all
	996	! elements along y reside on the same PE.
	997	! This is a direct transposition for arrays with indices in regular order
	998	! (k,j,i) (cf. transpose_zy).
	999	!------------------------------------------------------------------------------!
	1000
[1320]	1001	USE cpulog, &
	1002	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	1003
[1320]	1004	USE indices, &
	1005	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1006
	1007	USE kinds
	1008
	1009	USE pegrid, &
	1010	ONLY: collective_wait, comm1dy, comm2d, ierr, MPI_DOUBLE_PRECISION, &
	1011	pdims, sendrecvcount_zyd
	1012
	1013	USE transpose_indices, &
	1014	ONLY: nxl_y, nxl_yd, nxr_y, nxr_yd, nzb_y, nzb_yd, nzt_y, nzt_yd
	1015
[1]	1016	IMPLICIT NONE
	1017
[1320]	1018	INTEGER(iwp) :: i !:
	1019	INTEGER(iwp) :: j !:
	1020	INTEGER(iwp) :: k !:
	1021	INTEGER(iwp) :: l !:
	1022	INTEGER(iwp) :: m !:
	1023	INTEGER(iwp) :: ys !:
[1]	1024
[1320]	1025	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !:
	1026	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
	1027	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !:
	1028	REAL(wp) :: work(nnxnnynnz) !:
	1029
[1]	1030	#if defined( __parallel )
	1031
	1032	!
	1033	!-- Rearrange indices of input array in order to make data to be send
	1034	!-- by MPI contiguous
[1003]	1035	DO i = nxl, nxr
	1036	DO j = nys, nyn
	1037	DO k = 1, nz
[164]	1038	f_inv(j,i,k) = f_in(k,j,i)
[1]	1039	ENDDO
	1040	ENDDO
	1041	ENDDO
	1042
	1043	!
	1044	!-- Move data to different array, because memory location of work1 is
	1045	!-- needed further below (work1 = work2).
	1046	!-- If the PE grid is one-dimensional along x, only local reordering
	1047	!-- of the data is necessary and no transposition has to be done.
	1048	IF ( pdims(2) == 1 ) THEN
[1003]	1049	DO k = 1, nz
	1050	DO i = nxl, nxr
	1051	DO j = nys, nyn
[164]	1052	f_out(j,i,k) = f_inv(j,i,k)
[1]	1053	ENDDO
	1054	ENDDO
	1055	ENDDO
	1056	RETURN
	1057	ENDIF
	1058
	1059	!
	1060	!-- Transpose array
	1061	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1062	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1063	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1064	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1065	comm1dy, ierr )
	1066	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1067
	1068	!
	1069	!-- Reorder transposed array
	1070	m = 0
	1071	DO l = 0, pdims(2) - 1
	1072	ys = 0 + l * nny
[1003]	1073	DO k = nzb_yd, nzt_yd
	1074	DO i = nxl_yd, nxr_yd
[1]	1075	DO j = ys, ys + nny - 1
	1076	m = m + 1
[164]	1077	f_out(j,i,k) = work(m)
[1]	1078	ENDDO
	1079	ENDDO
	1080	ENDDO
	1081	ENDDO
	1082
	1083	#endif
	1084
	1085	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |