Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1998

Last change on this file since 1998 was 1818, checked in by maronga, 9 years ago
last commit documented / copyright update
Property svn:keywords set to `Id`
File size: 31.2 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[1036]	2	!--------------------------------------------------------------------------------!
	3	! This file is part of PALM.
	4	!
	5	! PALM is free software: you can redistribute it and/or modify it under the terms
	6	! of the GNU General Public License as published by the Free Software Foundation,
	7	! either version 3 of the License, or (at your option) any later version.
	8	!
	9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	10	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	11	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	12	!
	13	! You should have received a copy of the GNU General Public License along with
	14	! PALM. If not, see <http://www.gnu.org/licenses/>.
	15	!
[1818]	16	! Copyright 1997-2016 Leibniz Universitaet Hannover
[1036]	17	!--------------------------------------------------------------------------------!
	18	!
[484]	19	! Current revisions:
[1]	20	! -----------------
[1321]	21	!
[1683]	22	!
[1321]	23	! Former revisions:
	24	! -----------------
	25	! $Id: transpose.f90 1818 2016-04-06 15:53:27Z knoop $
	26	!
[1683]	27	! 1682 2015-10-07 23:56:08Z knoop
	28	! Code annotations made doxygen readable
	29	!
[1325]	30	! 1324 2014-03-21 09:13:16Z suehring
	31	! Bugfix: ONLY statement for module pegrid removed
	32	!
[1321]	33	! 1320 2014-03-20 08:40:49Z raasch
[1320]	34	! ONLY-attribute added to USE-statements,
	35	! kind-parameters added to all INTEGER and REAL declaration statements,
	36	! kinds are defined in new module kinds,
	37	! old module precision_kind is removed,
	38	! revision history before 2012 removed,
	39	! comment fields (!:) to be used for variable explanations added to
	40	! all variable declaration statements
[198]	41	!
[1319]	42	! 1318 2014-03-17 13:35:16Z raasch
	43	! cpu_log_nowait parameter added to cpu measurements of the transpositions
	44	! required for solving the Poisson equation (poisfft),
	45	! module interfaces removed
	46	!
[1258]	47	! 1257 2013-11-08 15:18:40Z raasch
	48	! openacc loop and loop vector clauses removed
	49	!
[1217]	50	! 1216 2013-08-26 09:31:42Z raasch
	51	! re-sorting of the transposed / to be transposed arrays moved to separate
	52	! routines resort_for_...
	53	!
[1112]	54	! 1111 2013-03-08 23:54:10Z raasch
	55	! openACC directives added,
	56	! resorting data from/to work changed, work got 4 dimensions instead of 1
	57	!
[1107]	58	! 1106 2013-03-04 05:31:38Z raasch
	59	! preprocessor lines rearranged so that routines can also be used in serial
	60	! (non-parallel) mode
	61	!
[1093]	62	! 1092 2013-02-02 11:24:22Z raasch
	63	! unused variables removed
	64	!
[1037]	65	! 1036 2012-10-22 13:43:42Z raasch
	66	! code put under GPL (PALM 3.9)
	67	!
[1004]	68	! 1003 2012-09-14 14:35:53Z raasch
	69	! indices nxa, nya, etc. replaced by nx, ny, etc.
	70	!
[1]	71	! Revision 1.1 1997/07/24 11:25:18 raasch
	72	! Initial revision
	73	!
[1216]	74	!------------------------------------------------------------------------------!
	75	! Description:
	76	! ------------
[1682]	77	!> Resorting data for the transposition from x to y. The transposition itself
	78	!> is carried out in transpose_xy
[1216]	79	!------------------------------------------------------------------------------!
[1682]	80	SUBROUTINE resort_for_xy( f_in, f_inv )
	81
[1216]	82
[1320]	83	USE indices, &
	84	ONLY: nx
[1216]	85
[1320]	86	USE kinds
	87
	88	USE transpose_indices, &
	89	ONLY: nxl_z, nxr_z, nyn_x, nyn_z, nys_x, nys_z, nzb_x, nzt_x
	90
[1216]	91	IMPLICIT NONE
	92
[1682]	93	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	94	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	95
	96
[1682]	97	INTEGER(iwp) :: i !<
	98	INTEGER(iwp) :: j !<
	99	INTEGER(iwp) :: k !<
[1]	100	!
[1216]	101	!-- Rearrange indices of input array in order to make data to be send
	102	!-- by MPI contiguous
	103	!$OMP PARALLEL PRIVATE ( i, j, k )
	104	!$OMP DO
	105	!$acc kernels present( f_in, f_inv )
	106	DO i = 0, nx
	107	DO k = nzb_x, nzt_x
	108	DO j = nys_x, nyn_x
	109	f_inv(j,k,i) = f_in(i,j,k)
	110	ENDDO
	111	ENDDO
	112	ENDDO
	113	!$acc end kernels
	114	!$OMP END PARALLEL
	115
	116	END SUBROUTINE resort_for_xy
	117
	118
	119	!------------------------------------------------------------------------------!
[1]	120	! Description:
	121	! ------------
[1682]	122	!> Transposition of input array (f_in) from x to y. For the input array, all
	123	!> elements along x reside on the same PE, while after transposition, all
	124	!> elements along y reside on the same PE.
[1]	125	!------------------------------------------------------------------------------!
[1682]	126	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	127
[1682]	128
[1320]	129	USE cpulog, &
	130	ONLY: cpu_log, cpu_log_nowait, log_point_s
	131
	132	USE indices, &
	133	ONLY: nx, ny
	134
	135	USE kinds
	136
[1]	137	USE pegrid
	138
[1320]	139	USE transpose_indices, &
	140	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	141
[1]	142	IMPLICIT NONE
	143
[1682]	144	INTEGER(iwp) :: i !<
	145	INTEGER(iwp) :: j !<
	146	INTEGER(iwp) :: k !<
	147	INTEGER(iwp) :: l !<
	148	INTEGER(iwp) :: ys !<
[1320]	149
[1682]	150	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	151	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1]	152
[1682]	153	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[1111]	154
	155
[1106]	156	IF ( numprocs /= 1 ) THEN
	157
	158	#if defined( __parallel )
[1]	159	!
[1106]	160	!-- Transpose array
[1318]	161	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	162	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	163	!$acc update host( f_inv )
	164	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	165	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	166	comm1dy, ierr )
	167	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	168
	169	!
[1106]	170	!-- Reorder transposed array
[1111]	171	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	172	!$OMP DO
[1216]	173	!$acc data copyin( work )
[1106]	174	DO l = 0, pdims(2) - 1
	175	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	176	!$acc kernels present( f_out, work )
[1106]	177	DO i = nxl_y, nxr_y
	178	DO k = nzb_y, nzt_y
	179	DO j = ys, ys + nyn_x - nys_x
[1111]	180	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	181	ENDDO
[1]	182	ENDDO
	183	ENDDO
[1111]	184	!$acc end kernels
[1]	185	ENDDO
[1216]	186	!$acc end data
[683]	187	!$OMP END PARALLEL
[1]	188	#endif
	189
[1106]	190	ELSE
	191
	192	!
	193	!-- Reorder transposed array
	194	!$OMP PARALLEL PRIVATE ( i, j, k )
	195	!$OMP DO
[1216]	196	!$acc kernels present( f_inv, f_out )
[1106]	197	DO k = nzb_y, nzt_y
	198	DO i = nxl_y, nxr_y
	199	DO j = 0, ny
	200	f_out(j,i,k) = f_inv(j,k,i)
	201	ENDDO
	202	ENDDO
	203	ENDDO
[1111]	204	!$acc end kernels
[1106]	205	!$OMP END PARALLEL
	206
	207	ENDIF
	208
[1]	209	END SUBROUTINE transpose_xy
	210
	211
	212	!------------------------------------------------------------------------------!
	213	! Description:
	214	! ------------
[1682]	215	!> Resorting data after the transposition from x to z. The transposition itself
	216	!> is carried out in transpose_xz
[1216]	217	!------------------------------------------------------------------------------!
[1682]	218	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	219
[1682]	220
[1320]	221	USE indices, &
	222	ONLY: nxl, nxr, nyn, nys, nz
[1216]	223
[1320]	224	USE kinds
	225
[1216]	226	IMPLICIT NONE
	227
[1682]	228	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	229	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	230
[1682]	231	INTEGER(iwp) :: i !<
	232	INTEGER(iwp) :: j !<
	233	INTEGER(iwp) :: k !<
[1216]	234	!
	235	!-- Rearrange indices of input array in order to make data to be send
	236	!-- by MPI contiguous.
	237	!-- In case of parallel fft/transposition, scattered store is faster in
	238	!-- backward direction!!!
	239	!$OMP PARALLEL PRIVATE ( i, j, k )
	240	!$OMP DO
	241	!$acc kernels present( f_inv, f_out )
	242	DO k = 1, nz
	243	DO i = nxl, nxr
	244	DO j = nys, nyn
	245	f_out(k,j,i) = f_inv(j,i,k)
	246	ENDDO
	247	ENDDO
	248	ENDDO
	249	!$acc end kernels
	250	!$OMP END PARALLEL
	251
	252	END SUBROUTINE resort_for_xz
	253
	254
	255	!------------------------------------------------------------------------------!
	256	! Description:
	257	! ------------
[1682]	258	!> Transposition of input array (f_in) from x to z. For the input array, all
	259	!> elements along x reside on the same PE, while after transposition, all
	260	!> elements along z reside on the same PE.
[1]	261	!------------------------------------------------------------------------------!
[1682]	262	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	263
[1682]	264
[1320]	265	USE cpulog, &
	266	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	267
[1320]	268	USE indices, &
	269	ONLY: nnx, nx, nxl, nxr, ny, nyn, nys, nz
	270
	271	USE kinds
	272
[1324]	273	USE pegrid
[1320]	274
	275	USE transpose_indices, &
	276	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	277
[1]	278	IMPLICIT NONE
	279
[1682]	280	INTEGER(iwp) :: i !<
	281	INTEGER(iwp) :: j !<
	282	INTEGER(iwp) :: k !<
	283	INTEGER(iwp) :: l !<
	284	INTEGER(iwp) :: xs !<
[1]	285
[1682]	286	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	287	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	288
[1682]	289	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[1111]	290
[1320]	291
[1]	292	!
	293	!-- If the PE grid is one-dimensional along y, the array has only to be
	294	!-- reordered locally and therefore no transposition has to be done.
	295	IF ( pdims(1) /= 1 ) THEN
[1106]	296
	297	#if defined( __parallel )
[1]	298	!
	299	!-- Reorder input array for transposition
[1111]	300	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	301	!$OMP DO
[1216]	302	!$acc data copyout( work )
[1]	303	DO l = 0, pdims(1) - 1
	304	xs = 0 + l * nnx
[1111]	305	!$acc kernels present( f_in, work )
[1003]	306	DO k = nzb_x, nzt_x
[164]	307	DO i = xs, xs + nnx - 1
[1003]	308	DO j = nys_x, nyn_x
[1111]	309	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	310	ENDDO
	311	ENDDO
	312	ENDDO
[1111]	313	!$acc end kernels
[1]	314	ENDDO
[1216]	315	!$acc end data
[683]	316	!$OMP END PARALLEL
[1]	317
	318	!
	319	!-- Transpose array
[1318]	320	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]	321	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	322	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	323	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	324	comm1dx, ierr )
[1111]	325	!$acc update device( f_inv )
[1]	326	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	327	#endif
	328
[1]	329	ELSE
[1106]	330
[1]	331	!
	332	!-- Reorder the array in a way that the z index is in first position
[683]	333	!$OMP PARALLEL PRIVATE ( i, j, k )
	334	!$OMP DO
[1216]	335	!$acc kernels present( f_in, f_inv )
[1003]	336	DO i = nxl, nxr
	337	DO j = nys, nyn
	338	DO k = 1, nz
[164]	339	f_inv(j,i,k) = f_in(i,j,k)
[1]	340	ENDDO
	341	ENDDO
	342	ENDDO
[1111]	343	!$acc end kernels
[683]	344	!$OMP END PARALLEL
[1]	345
[164]	346	ENDIF
	347
[1]	348	END SUBROUTINE transpose_xz
	349
	350
	351	!------------------------------------------------------------------------------!
	352	! Description:
	353	! ------------
[1682]	354	!> Resorting data after the transposition from y to x. The transposition itself
	355	!> is carried out in transpose_yx
[1216]	356	!------------------------------------------------------------------------------!
[1682]	357	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	358
[1682]	359
[1320]	360	USE indices, &
	361	ONLY: nx
[1216]	362
[1320]	363	USE kinds
	364
	365	USE transpose_indices, &
	366	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	367
[1216]	368	IMPLICIT NONE
	369
[1682]	370	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	371	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	372
	373
[1682]	374	INTEGER(iwp) :: i !<
	375	INTEGER(iwp) :: j !<
	376	INTEGER(iwp) :: k !<
[1216]	377	!
	378	!-- Rearrange indices of input array in order to make data to be send
	379	!-- by MPI contiguous
	380	!$OMP PARALLEL PRIVATE ( i, j, k )
	381	!$OMP DO
	382	!$acc kernels present( f_inv, f_out )
	383	DO i = 0, nx
	384	DO k = nzb_x, nzt_x
	385	DO j = nys_x, nyn_x
	386	f_out(i,j,k) = f_inv(j,k,i)
	387	ENDDO
	388	ENDDO
	389	ENDDO
	390	!$acc end kernels
	391	!$OMP END PARALLEL
	392
	393	END SUBROUTINE resort_for_yx
	394
	395
	396	!------------------------------------------------------------------------------!
	397	! Description:
	398	! ------------
[1682]	399	!> Transposition of input array (f_in) from y to x. For the input array, all
	400	!> elements along y reside on the same PE, while after transposition, all
	401	!> elements along x reside on the same PE.
[1]	402	!------------------------------------------------------------------------------!
[1682]	403	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	404
[1682]	405
[1320]	406	USE cpulog, &
	407	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	408
[1320]	409	USE indices, &
	410	ONLY: nx, ny
	411
	412	USE kinds
	413
[1324]	414	USE pegrid
[1320]	415
	416	USE transpose_indices, &
	417	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	418
[1]	419	IMPLICIT NONE
	420
[1682]	421	INTEGER(iwp) :: i !<
	422	INTEGER(iwp) :: j !<
	423	INTEGER(iwp) :: k !<
	424	INTEGER(iwp) :: l !<
	425	INTEGER(iwp) :: ys !<
[1]	426
[1682]	427	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	428	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	429
[1682]	430	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[1111]	431
[1320]	432
[1106]	433	IF ( numprocs /= 1 ) THEN
	434
[1]	435	#if defined( __parallel )
	436	!
[1106]	437	!-- Reorder input array for transposition
[1111]	438	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	439	!$OMP DO
[1216]	440	!$acc data copyout( work )
[1106]	441	DO l = 0, pdims(2) - 1
	442	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	443	!$acc kernels present( f_in, work )
[1106]	444	DO i = nxl_y, nxr_y
	445	DO k = nzb_y, nzt_y
	446	DO j = ys, ys + nyn_x - nys_x
[1111]	447	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	448	ENDDO
	449	ENDDO
	450	ENDDO
[1111]	451	!$acc end kernels
[1106]	452	ENDDO
[1216]	453	!$acc end data
[1106]	454	!$OMP END PARALLEL
	455
	456	!
	457	!-- Transpose array
[1318]	458	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	459	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	460	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	461	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	462	comm1dy, ierr )
[1111]	463	!$acc update device( f_inv )
[1106]	464	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	465	#endif
	466
	467	ELSE
	468
	469	!
	470	!-- Reorder array f_in the same way as ALLTOALL did it
	471	!$OMP PARALLEL PRIVATE ( i, j, k )
	472	!$OMP DO
[1216]	473	!$acc kernels present( f_in, f_inv )
[1003]	474	DO i = nxl_y, nxr_y
	475	DO k = nzb_y, nzt_y
[1106]	476	DO j = 0, ny
	477	f_inv(j,k,i) = f_in(j,i,k)
[1]	478	ENDDO
	479	ENDDO
	480	ENDDO
[1111]	481	!$acc end kernels
[683]	482	!$OMP END PARALLEL
[1]	483
[1106]	484	ENDIF
[1]	485
	486	END SUBROUTINE transpose_yx
	487
	488
	489	!------------------------------------------------------------------------------!
	490	! Description:
	491	! ------------
[1682]	492	!> Transposition of input array (f_in) from y to x. For the input array, all
	493	!> elements along y reside on the same PE, while after transposition, all
	494	!> elements along x reside on the same PE.
	495	!> This is a direct transposition for arrays with indices in regular order
	496	!> (k,j,i) (cf. transpose_yx).
[1]	497	!------------------------------------------------------------------------------!
[1682]	498	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	499
[1682]	500
[1320]	501	USE cpulog, &
	502	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	503
[1320]	504	USE indices, &
	505	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	506
	507	USE kinds
	508
[1324]	509	USE pegrid
[1320]	510
	511	USE transpose_indices, &
	512	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	513
[1]	514	IMPLICIT NONE
	515
[1682]	516	INTEGER(iwp) :: i !<
	517	INTEGER(iwp) :: j !<
	518	INTEGER(iwp) :: k !<
	519	INTEGER(iwp) :: l !<
	520	INTEGER(iwp) :: m !<
	521	INTEGER(iwp) :: xs !<
[1]	522
[1682]	523	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	524	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	525	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	526	REAL(wp) :: work(nnxnnynnz) !<
[1]	527	#if defined( __parallel )
	528
	529	!
	530	!-- Rearrange indices of input array in order to make data to be send
	531	!-- by MPI contiguous
[1003]	532	DO k = 1, nz
	533	DO j = nys, nyn
	534	DO i = nxl, nxr
[164]	535	f_inv(i,k,j) = f_in(k,j,i)
[1]	536	ENDDO
	537	ENDDO
	538	ENDDO
	539
	540	!
	541	!-- Transpose array
	542	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	543	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	544	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	545	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	546	comm1dx, ierr )
	547	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	548
	549	!
	550	!-- Reorder transposed array
	551	m = 0
	552	DO l = 0, pdims(1) - 1
	553	xs = 0 + l * nnx
[1003]	554	DO j = nys_x, nyn_x
	555	DO k = 1, nz
[1]	556	DO i = xs, xs + nnx - 1
	557	m = m + 1
[164]	558	f_out(i,j,k) = work(m)
[1]	559	ENDDO
	560	ENDDO
	561	ENDDO
	562	ENDDO
	563
	564	#endif
	565
	566	END SUBROUTINE transpose_yxd
	567
	568
	569	!------------------------------------------------------------------------------!
	570	! Description:
	571	! ------------
[1682]	572	!> Resorting data for the transposition from y to z. The transposition itself
	573	!> is carried out in transpose_yz
[1216]	574	!------------------------------------------------------------------------------!
[1682]	575	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	576
[1682]	577
[1320]	578	USE indices, &
	579	ONLY: ny
[1216]	580
[1320]	581	USE kinds
	582
	583	USE transpose_indices, &
	584	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	585
[1216]	586	IMPLICIT NONE
	587
[1682]	588	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	589	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	590
[1682]	591	INTEGER(iwp) :: i !<
	592	INTEGER(iwp) :: j !<
	593	INTEGER(iwp) :: k !<
[1216]	594
	595	!
	596	!-- Rearrange indices of input array in order to make data to be send
	597	!-- by MPI contiguous
	598	!$OMP PARALLEL PRIVATE ( i, j, k )
	599	!$OMP DO
	600	!$acc kernels present( f_in, f_inv )
	601	DO j = 0, ny
	602	DO k = nzb_y, nzt_y
	603	DO i = nxl_y, nxr_y
	604	f_inv(i,k,j) = f_in(j,i,k)
	605	ENDDO
	606	ENDDO
	607	ENDDO
	608	!$acc end kernels
	609	!$OMP END PARALLEL
	610
	611	END SUBROUTINE resort_for_yz
	612
	613
	614	!------------------------------------------------------------------------------!
	615	! Description:
	616	! ------------
[1682]	617	!> Transposition of input array (f_in) from y to z. For the input array, all
	618	!> elements along y reside on the same PE, while after transposition, all
	619	!> elements along z reside on the same PE.
[1]	620	!------------------------------------------------------------------------------!
[1682]	621	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	622
[1682]	623
[1320]	624	USE cpulog, &
	625	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	626
[1320]	627	USE indices, &
	628	ONLY: ny, nz
	629
	630	USE kinds
	631
[1324]	632	USE pegrid
[1320]	633
	634	USE transpose_indices, &
	635	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	636
[1]	637	IMPLICIT NONE
	638
[1682]	639	INTEGER(iwp) :: i !<
	640	INTEGER(iwp) :: j !<
	641	INTEGER(iwp) :: k !<
	642	INTEGER(iwp) :: l !<
	643	INTEGER(iwp) :: zs !<
[1]	644
[1682]	645	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	646	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	647
[1682]	648	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[1111]	649
[1320]	650
[1]	651	!
	652	!-- If the PE grid is one-dimensional along y, only local reordering
	653	!-- of the data is necessary and no transposition has to be done.
	654	IF ( pdims(1) == 1 ) THEN
[1106]	655
[683]	656	!$OMP PARALLEL PRIVATE ( i, j, k )
	657	!$OMP DO
[1216]	658	!$acc kernels present( f_inv, f_out )
[1003]	659	DO j = 0, ny
	660	DO k = nzb_y, nzt_y
	661	DO i = nxl_y, nxr_y
[164]	662	f_out(i,j,k) = f_inv(i,k,j)
[1]	663	ENDDO
	664	ENDDO
	665	ENDDO
[1111]	666	!$acc end kernels
[683]	667	!$OMP END PARALLEL
[1]	668
[1106]	669	ELSE
	670
	671	#if defined( __parallel )
[1]	672	!
[1106]	673	!-- Transpose array
[1318]	674	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	675	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	676	!$acc update host( f_inv )
	677	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	678	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	679	comm1dx, ierr )
	680	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	681
	682	!
[1106]	683	!-- Reorder transposed array
[1111]	684	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	685	!$OMP DO
[1216]	686	!$acc data copyin( work )
[1106]	687	DO l = 0, pdims(1) - 1
	688	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1216]	689	!$acc kernels present( f_out )
[1106]	690	DO j = nys_z, nyn_z
	691	DO k = zs, zs + nzt_y - nzb_y
	692	DO i = nxl_z, nxr_z
[1111]	693	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	694	ENDDO
[1]	695	ENDDO
	696	ENDDO
[1111]	697	!$acc end kernels
[1]	698	ENDDO
[1216]	699	!$acc end data
[683]	700	!$OMP END PARALLEL
[1]	701	#endif
	702
[1106]	703	ENDIF
	704
[1]	705	END SUBROUTINE transpose_yz
	706
	707
	708	!------------------------------------------------------------------------------!
	709	! Description:
	710	! ------------
[1682]	711	!> Resorting data for the transposition from z to x. The transposition itself
	712	!> is carried out in transpose_zx
[1216]	713	!------------------------------------------------------------------------------!
[1682]	714	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	715
[1682]	716
[1320]	717	USE indices, &
	718	ONLY: nxl, nxr, nyn, nys, nz
[1216]	719
[1320]	720	USE kinds
	721
[1216]	722	IMPLICIT NONE
	723
[1682]	724	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	725	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	726
[1682]	727	INTEGER(iwp) :: i !<
	728	INTEGER(iwp) :: j !<
	729	INTEGER(iwp) :: k !<
[1216]	730
	731	!
	732	!-- Rearrange indices of input array in order to make data to be send
	733	!-- by MPI contiguous
	734	!$OMP PARALLEL PRIVATE ( i, j, k )
	735	!$OMP DO
	736	!$acc kernels present( f_in, f_inv )
	737	DO k = 1,nz
	738	DO i = nxl, nxr
	739	DO j = nys, nyn
	740	f_inv(j,i,k) = f_in(k,j,i)
	741	ENDDO
	742	ENDDO
	743	ENDDO
	744	!$acc end kernels
	745	!$OMP END PARALLEL
	746
	747	END SUBROUTINE resort_for_zx
	748
	749
	750	!------------------------------------------------------------------------------!
	751	! Description:
	752	! ------------
[1682]	753	!> Transposition of input array (f_in) from z to x. For the input array, all
	754	!> elements along z reside on the same PE, while after transposition, all
	755	!> elements along x reside on the same PE.
[1]	756	!------------------------------------------------------------------------------!
[1682]	757	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	758
[1682]	759
[1320]	760	USE cpulog, &
	761	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	762
[1320]	763	USE indices, &
	764	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	765
	766	USE kinds
	767
[1324]	768	USE pegrid
[1320]	769
	770	USE transpose_indices, &
	771	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	772
[1]	773	IMPLICIT NONE
	774
[1682]	775	INTEGER(iwp) :: i !<
	776	INTEGER(iwp) :: j !<
	777	INTEGER(iwp) :: k !<
	778	INTEGER(iwp) :: l !<
	779	INTEGER(iwp) :: xs !<
[1]	780
[1682]	781	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	782	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	783
[1682]	784	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[1]	785
[1320]	786
[1]	787	!
	788	!-- If the PE grid is one-dimensional along y, only local reordering
	789	!-- of the data is necessary and no transposition has to be done.
	790	IF ( pdims(1) == 1 ) THEN
[1106]	791
[683]	792	!$OMP PARALLEL PRIVATE ( i, j, k )
	793	!$OMP DO
[1216]	794	!$acc kernels present( f_inv, f_out )
[1003]	795	DO k = 1, nz
	796	DO i = nxl, nxr
	797	DO j = nys, nyn
[164]	798	f_out(i,j,k) = f_inv(j,i,k)
[1]	799	ENDDO
	800	ENDDO
	801	ENDDO
[1111]	802	!$acc end kernels
[683]	803	!$OMP END PARALLEL
[1]	804
[1106]	805	ELSE
	806
	807	#if defined( __parallel )
[1]	808	!
[1106]	809	!-- Transpose array
[1318]	810	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	811	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	812	!$acc update host( f_inv )
	813	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	814	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	815	comm1dx, ierr )
	816	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	817
	818	!
[1106]	819	!-- Reorder transposed array
[1111]	820	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	821	!$OMP DO
[1216]	822	!$acc data copyin( work )
[1106]	823	DO l = 0, pdims(1) - 1
	824	xs = 0 + l * nnx
[1216]	825	!$acc kernels present( f_out )
[1106]	826	DO k = nzb_x, nzt_x
	827	DO i = xs, xs + nnx - 1
	828	DO j = nys_x, nyn_x
[1111]	829	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	830	ENDDO
[1]	831	ENDDO
	832	ENDDO
[1111]	833	!$acc end kernels
[1]	834	ENDDO
[1216]	835	!$acc end data
[683]	836	!$OMP END PARALLEL
[1]	837	#endif
	838
[1106]	839	ENDIF
	840
[1]	841	END SUBROUTINE transpose_zx
	842
	843
	844	!------------------------------------------------------------------------------!
	845	! Description:
	846	! ------------
[1682]	847	!> Resorting data after the transposition from z to y. The transposition itself
	848	!> is carried out in transpose_zy
[1216]	849	!------------------------------------------------------------------------------!
[1682]	850	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	851
[1682]	852
[1320]	853	USE indices, &
	854	ONLY: ny
[1216]	855
[1320]	856	USE kinds
	857
	858	USE transpose_indices, &
	859	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	860
[1216]	861	IMPLICIT NONE
	862
[1682]	863	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	864	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	865
	866
[1682]	867	INTEGER(iwp) :: i !<
	868	INTEGER(iwp) :: j !<
	869	INTEGER(iwp) :: k !<
[1216]	870
	871	!
	872	!-- Rearrange indices of input array in order to make data to be send
	873	!-- by MPI contiguous
	874	!$OMP PARALLEL PRIVATE ( i, j, k )
	875	!$OMP DO
	876	!$acc kernels present( f_inv, f_out )
	877	DO k = nzb_y, nzt_y
	878	DO j = 0, ny
	879	DO i = nxl_y, nxr_y
	880	f_out(j,i,k) = f_inv(i,k,j)
	881	ENDDO
	882	ENDDO
	883	ENDDO
	884	!$acc end kernels
	885	!$OMP END PARALLEL
	886
	887	END SUBROUTINE resort_for_zy
	888
	889
	890	!------------------------------------------------------------------------------!
	891	! Description:
	892	! ------------
[1682]	893	!> Transposition of input array (f_in) from z to y. For the input array, all
	894	!> elements along z reside on the same PE, while after transposition, all
	895	!> elements along y reside on the same PE.
[1]	896	!------------------------------------------------------------------------------!
[1682]	897	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	898
[1682]	899
[1320]	900	USE cpulog, &
	901	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	902
[1320]	903	USE indices, &
	904	ONLY: ny, nz
	905
	906	USE kinds
	907
[1324]	908	USE pegrid
[1320]	909
	910	USE transpose_indices, &
	911	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	912
[1]	913	IMPLICIT NONE
	914
[1682]	915	INTEGER(iwp) :: i !<
	916	INTEGER(iwp) :: j !<
	917	INTEGER(iwp) :: k !<
	918	INTEGER(iwp) :: l !<
	919	INTEGER(iwp) :: zs !<
[1]	920
[1682]	921	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	922	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	923
[1682]	924	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[1111]	925
[1]	926	!
	927	!-- If the PE grid is one-dimensional along y, the array has only to be
	928	!-- reordered locally and therefore no transposition has to be done.
	929	IF ( pdims(1) /= 1 ) THEN
[1106]	930
	931	#if defined( __parallel )
[1]	932	!
	933	!-- Reorder input array for transposition
[1111]	934	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	935	!$OMP DO
[1216]	936	!$acc data copyout( work )
[1]	937	DO l = 0, pdims(1) - 1
[1003]	938	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]	939	!$acc kernels present( f_in, work )
[1003]	940	DO j = nys_z, nyn_z
	941	DO k = zs, zs + nzt_y - nzb_y
	942	DO i = nxl_z, nxr_z
[1111]	943	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	944	ENDDO
	945	ENDDO
	946	ENDDO
[1111]	947	!$acc end kernels
[1]	948	ENDDO
[1216]	949	!$acc end data
[683]	950	!$OMP END PARALLEL
[1]	951
	952	!
	953	!-- Transpose array
[1318]	954	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]	955	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	956	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	957	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	958	comm1dx, ierr )
[1111]	959	!$acc update device( f_inv )
[1]	960	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	961	#endif
[1]	962
	963	ELSE
	964	!
[1106]	965	!-- Reorder the array in the same way like ALLTOALL did it
[683]	966	!$OMP PARALLEL PRIVATE ( i, j, k )
	967	!$OMP DO
[1216]	968	!$acc kernels present( f_in, f_inv )
[1003]	969	DO k = nzb_y, nzt_y
	970	DO j = 0, ny
	971	DO i = nxl_y, nxr_y
[164]	972	f_inv(i,k,j) = f_in(i,j,k)
	973	ENDDO
	974	ENDDO
	975	ENDDO
[1111]	976	!$acc end kernels
[683]	977	!$OMP END PARALLEL
[1106]	978
	979	ENDIF
	980
[1]	981	END SUBROUTINE transpose_zy
	982
	983
	984	!------------------------------------------------------------------------------!
	985	! Description:
	986	! ------------
[1682]	987	!> Transposition of input array (f_in) from z to y. For the input array, all
	988	!> elements along z reside on the same PE, while after transposition, all
	989	!> elements along y reside on the same PE.
	990	!> This is a direct transposition for arrays with indices in regular order
	991	!> (k,j,i) (cf. transpose_zy).
[1]	992	!------------------------------------------------------------------------------!
[1682]	993	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	994
[1682]	995
[1320]	996	USE cpulog, &
	997	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	998
[1320]	999	USE indices, &
	1000	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1001
	1002	USE kinds
	1003
[1324]	1004	USE pegrid
[1320]	1005
	1006	USE transpose_indices, &
	1007	ONLY: nxl_y, nxl_yd, nxr_y, nxr_yd, nzb_y, nzb_yd, nzt_y, nzt_yd
	1008
[1]	1009	IMPLICIT NONE
	1010
[1682]	1011	INTEGER(iwp) :: i !<
	1012	INTEGER(iwp) :: j !<
	1013	INTEGER(iwp) :: k !<
	1014	INTEGER(iwp) :: l !<
	1015	INTEGER(iwp) :: m !<
	1016	INTEGER(iwp) :: ys !<
[1]	1017
[1682]	1018	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1019	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1020	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1021	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1022
[1]	1023	#if defined( __parallel )
	1024
	1025	!
	1026	!-- Rearrange indices of input array in order to make data to be send
	1027	!-- by MPI contiguous
[1003]	1028	DO i = nxl, nxr
	1029	DO j = nys, nyn
	1030	DO k = 1, nz
[164]	1031	f_inv(j,i,k) = f_in(k,j,i)
[1]	1032	ENDDO
	1033	ENDDO
	1034	ENDDO
	1035
	1036	!
	1037	!-- Move data to different array, because memory location of work1 is
	1038	!-- needed further below (work1 = work2).
	1039	!-- If the PE grid is one-dimensional along x, only local reordering
	1040	!-- of the data is necessary and no transposition has to be done.
	1041	IF ( pdims(2) == 1 ) THEN
[1003]	1042	DO k = 1, nz
	1043	DO i = nxl, nxr
	1044	DO j = nys, nyn
[164]	1045	f_out(j,i,k) = f_inv(j,i,k)
[1]	1046	ENDDO
	1047	ENDDO
	1048	ENDDO
	1049	RETURN
	1050	ENDIF
	1051
	1052	!
	1053	!-- Transpose array
	1054	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1055	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1056	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1057	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1058	comm1dy, ierr )
	1059	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1060
	1061	!
	1062	!-- Reorder transposed array
	1063	m = 0
	1064	DO l = 0, pdims(2) - 1
	1065	ys = 0 + l * nny
[1003]	1066	DO k = nzb_yd, nzt_yd
	1067	DO i = nxl_yd, nxr_yd
[1]	1068	DO j = ys, ys + nny - 1
	1069	m = m + 1
[164]	1070	f_out(j,i,k) = work(m)
[1]	1071	ENDDO
	1072	ENDDO
	1073	ENDDO
	1074	ENDDO
	1075
	1076	#endif
	1077
	1078	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |