Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 2034

Last change on this file since 2034 was 2001, checked in by knoop, 8 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 31.2 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[1036]	3	! This file is part of PALM.
	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[1818]	17	! Copyright 1997-2016 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[1321]	22	!
[2001]	23	!
[1321]	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 2001 2016-08-20 18:41:22Z gronemeier $
	27	!
[2001]	28	! 2000 2016-08-20 18:09:15Z knoop
	29	! Forced header and separation lines into 80 columns
	30	!
[1683]	31	! 1682 2015-10-07 23:56:08Z knoop
	32	! Code annotations made doxygen readable
	33	!
[1325]	34	! 1324 2014-03-21 09:13:16Z suehring
	35	! Bugfix: ONLY statement for module pegrid removed
	36	!
[1321]	37	! 1320 2014-03-20 08:40:49Z raasch
[1320]	38	! ONLY-attribute added to USE-statements,
	39	! kind-parameters added to all INTEGER and REAL declaration statements,
	40	! kinds are defined in new module kinds,
	41	! old module precision_kind is removed,
	42	! revision history before 2012 removed,
	43	! comment fields (!:) to be used for variable explanations added to
	44	! all variable declaration statements
[198]	45	!
[1319]	46	! 1318 2014-03-17 13:35:16Z raasch
	47	! cpu_log_nowait parameter added to cpu measurements of the transpositions
	48	! required for solving the Poisson equation (poisfft),
	49	! module interfaces removed
	50	!
[1258]	51	! 1257 2013-11-08 15:18:40Z raasch
	52	! openacc loop and loop vector clauses removed
	53	!
[1217]	54	! 1216 2013-08-26 09:31:42Z raasch
	55	! re-sorting of the transposed / to be transposed arrays moved to separate
	56	! routines resort_for_...
	57	!
[1112]	58	! 1111 2013-03-08 23:54:10Z raasch
	59	! openACC directives added,
	60	! resorting data from/to work changed, work got 4 dimensions instead of 1
	61	!
[1107]	62	! 1106 2013-03-04 05:31:38Z raasch
	63	! preprocessor lines rearranged so that routines can also be used in serial
	64	! (non-parallel) mode
	65	!
[1093]	66	! 1092 2013-02-02 11:24:22Z raasch
	67	! unused variables removed
	68	!
[1037]	69	! 1036 2012-10-22 13:43:42Z raasch
	70	! code put under GPL (PALM 3.9)
	71	!
[1004]	72	! 1003 2012-09-14 14:35:53Z raasch
	73	! indices nxa, nya, etc. replaced by nx, ny, etc.
	74	!
[1]	75	! Revision 1.1 1997/07/24 11:25:18 raasch
	76	! Initial revision
	77	!
[1216]	78	!------------------------------------------------------------------------------!
	79	! Description:
	80	! ------------
[1682]	81	!> Resorting data for the transposition from x to y. The transposition itself
	82	!> is carried out in transpose_xy
[1216]	83	!------------------------------------------------------------------------------!
[1682]	84	SUBROUTINE resort_for_xy( f_in, f_inv )
	85
[1216]	86
[1320]	87	USE indices, &
	88	ONLY: nx
[1216]	89
[1320]	90	USE kinds
	91
	92	USE transpose_indices, &
	93	ONLY: nxl_z, nxr_z, nyn_x, nyn_z, nys_x, nys_z, nzb_x, nzt_x
	94
[1216]	95	IMPLICIT NONE
	96
[1682]	97	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	98	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	99
	100
[1682]	101	INTEGER(iwp) :: i !<
	102	INTEGER(iwp) :: j !<
	103	INTEGER(iwp) :: k !<
[1]	104	!
[1216]	105	!-- Rearrange indices of input array in order to make data to be send
	106	!-- by MPI contiguous
	107	!$OMP PARALLEL PRIVATE ( i, j, k )
	108	!$OMP DO
	109	!$acc kernels present( f_in, f_inv )
	110	DO i = 0, nx
	111	DO k = nzb_x, nzt_x
	112	DO j = nys_x, nyn_x
	113	f_inv(j,k,i) = f_in(i,j,k)
	114	ENDDO
	115	ENDDO
	116	ENDDO
	117	!$acc end kernels
	118	!$OMP END PARALLEL
	119
	120	END SUBROUTINE resort_for_xy
	121
	122
	123	!------------------------------------------------------------------------------!
[1]	124	! Description:
	125	! ------------
[1682]	126	!> Transposition of input array (f_in) from x to y. For the input array, all
	127	!> elements along x reside on the same PE, while after transposition, all
	128	!> elements along y reside on the same PE.
[1]	129	!------------------------------------------------------------------------------!
[1682]	130	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	131
[1682]	132
[1320]	133	USE cpulog, &
	134	ONLY: cpu_log, cpu_log_nowait, log_point_s
	135
	136	USE indices, &
	137	ONLY: nx, ny
	138
	139	USE kinds
	140
[1]	141	USE pegrid
	142
[1320]	143	USE transpose_indices, &
	144	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	145
[1]	146	IMPLICIT NONE
	147
[1682]	148	INTEGER(iwp) :: i !<
	149	INTEGER(iwp) :: j !<
	150	INTEGER(iwp) :: k !<
	151	INTEGER(iwp) :: l !<
	152	INTEGER(iwp) :: ys !<
[1320]	153
[1682]	154	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	155	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1]	156
[1682]	157	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[1111]	158
	159
[1106]	160	IF ( numprocs /= 1 ) THEN
	161
	162	#if defined( __parallel )
[1]	163	!
[1106]	164	!-- Transpose array
[1318]	165	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	166	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	167	!$acc update host( f_inv )
	168	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	169	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	170	comm1dy, ierr )
	171	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	172
	173	!
[1106]	174	!-- Reorder transposed array
[1111]	175	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	176	!$OMP DO
[1216]	177	!$acc data copyin( work )
[1106]	178	DO l = 0, pdims(2) - 1
	179	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	180	!$acc kernels present( f_out, work )
[1106]	181	DO i = nxl_y, nxr_y
	182	DO k = nzb_y, nzt_y
	183	DO j = ys, ys + nyn_x - nys_x
[1111]	184	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	185	ENDDO
[1]	186	ENDDO
	187	ENDDO
[1111]	188	!$acc end kernels
[1]	189	ENDDO
[1216]	190	!$acc end data
[683]	191	!$OMP END PARALLEL
[1]	192	#endif
	193
[1106]	194	ELSE
	195
	196	!
	197	!-- Reorder transposed array
	198	!$OMP PARALLEL PRIVATE ( i, j, k )
	199	!$OMP DO
[1216]	200	!$acc kernels present( f_inv, f_out )
[1106]	201	DO k = nzb_y, nzt_y
	202	DO i = nxl_y, nxr_y
	203	DO j = 0, ny
	204	f_out(j,i,k) = f_inv(j,k,i)
	205	ENDDO
	206	ENDDO
	207	ENDDO
[1111]	208	!$acc end kernels
[1106]	209	!$OMP END PARALLEL
	210
	211	ENDIF
	212
[1]	213	END SUBROUTINE transpose_xy
	214
	215
	216	!------------------------------------------------------------------------------!
	217	! Description:
	218	! ------------
[1682]	219	!> Resorting data after the transposition from x to z. The transposition itself
	220	!> is carried out in transpose_xz
[1216]	221	!------------------------------------------------------------------------------!
[1682]	222	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	223
[1682]	224
[1320]	225	USE indices, &
	226	ONLY: nxl, nxr, nyn, nys, nz
[1216]	227
[1320]	228	USE kinds
	229
[1216]	230	IMPLICIT NONE
	231
[1682]	232	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	233	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	234
[1682]	235	INTEGER(iwp) :: i !<
	236	INTEGER(iwp) :: j !<
	237	INTEGER(iwp) :: k !<
[1216]	238	!
	239	!-- Rearrange indices of input array in order to make data to be send
	240	!-- by MPI contiguous.
	241	!-- In case of parallel fft/transposition, scattered store is faster in
	242	!-- backward direction!!!
	243	!$OMP PARALLEL PRIVATE ( i, j, k )
	244	!$OMP DO
	245	!$acc kernels present( f_inv, f_out )
	246	DO k = 1, nz
	247	DO i = nxl, nxr
	248	DO j = nys, nyn
	249	f_out(k,j,i) = f_inv(j,i,k)
	250	ENDDO
	251	ENDDO
	252	ENDDO
	253	!$acc end kernels
	254	!$OMP END PARALLEL
	255
	256	END SUBROUTINE resort_for_xz
	257
	258
	259	!------------------------------------------------------------------------------!
	260	! Description:
	261	! ------------
[1682]	262	!> Transposition of input array (f_in) from x to z. For the input array, all
	263	!> elements along x reside on the same PE, while after transposition, all
	264	!> elements along z reside on the same PE.
[1]	265	!------------------------------------------------------------------------------!
[1682]	266	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	267
[1682]	268
[1320]	269	USE cpulog, &
	270	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	271
[1320]	272	USE indices, &
	273	ONLY: nnx, nx, nxl, nxr, ny, nyn, nys, nz
	274
	275	USE kinds
	276
[1324]	277	USE pegrid
[1320]	278
	279	USE transpose_indices, &
	280	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	281
[1]	282	IMPLICIT NONE
	283
[1682]	284	INTEGER(iwp) :: i !<
	285	INTEGER(iwp) :: j !<
	286	INTEGER(iwp) :: k !<
	287	INTEGER(iwp) :: l !<
	288	INTEGER(iwp) :: xs !<
[1]	289
[1682]	290	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	291	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	292
[1682]	293	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[1111]	294
[1320]	295
[1]	296	!
	297	!-- If the PE grid is one-dimensional along y, the array has only to be
	298	!-- reordered locally and therefore no transposition has to be done.
	299	IF ( pdims(1) /= 1 ) THEN
[1106]	300
	301	#if defined( __parallel )
[1]	302	!
	303	!-- Reorder input array for transposition
[1111]	304	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	305	!$OMP DO
[1216]	306	!$acc data copyout( work )
[1]	307	DO l = 0, pdims(1) - 1
	308	xs = 0 + l * nnx
[1111]	309	!$acc kernels present( f_in, work )
[1003]	310	DO k = nzb_x, nzt_x
[164]	311	DO i = xs, xs + nnx - 1
[1003]	312	DO j = nys_x, nyn_x
[1111]	313	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	314	ENDDO
	315	ENDDO
	316	ENDDO
[1111]	317	!$acc end kernels
[1]	318	ENDDO
[1216]	319	!$acc end data
[683]	320	!$OMP END PARALLEL
[1]	321
	322	!
	323	!-- Transpose array
[1318]	324	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]	325	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	326	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	327	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	328	comm1dx, ierr )
[1111]	329	!$acc update device( f_inv )
[1]	330	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	331	#endif
	332
[1]	333	ELSE
[1106]	334
[1]	335	!
	336	!-- Reorder the array in a way that the z index is in first position
[683]	337	!$OMP PARALLEL PRIVATE ( i, j, k )
	338	!$OMP DO
[1216]	339	!$acc kernels present( f_in, f_inv )
[1003]	340	DO i = nxl, nxr
	341	DO j = nys, nyn
	342	DO k = 1, nz
[164]	343	f_inv(j,i,k) = f_in(i,j,k)
[1]	344	ENDDO
	345	ENDDO
	346	ENDDO
[1111]	347	!$acc end kernels
[683]	348	!$OMP END PARALLEL
[1]	349
[164]	350	ENDIF
	351
[1]	352	END SUBROUTINE transpose_xz
	353
	354
	355	!------------------------------------------------------------------------------!
	356	! Description:
	357	! ------------
[1682]	358	!> Resorting data after the transposition from y to x. The transposition itself
	359	!> is carried out in transpose_yx
[1216]	360	!------------------------------------------------------------------------------!
[1682]	361	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	362
[1682]	363
[1320]	364	USE indices, &
	365	ONLY: nx
[1216]	366
[1320]	367	USE kinds
	368
	369	USE transpose_indices, &
	370	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	371
[1216]	372	IMPLICIT NONE
	373
[1682]	374	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	375	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	376
	377
[1682]	378	INTEGER(iwp) :: i !<
	379	INTEGER(iwp) :: j !<
	380	INTEGER(iwp) :: k !<
[1216]	381	!
	382	!-- Rearrange indices of input array in order to make data to be send
	383	!-- by MPI contiguous
	384	!$OMP PARALLEL PRIVATE ( i, j, k )
	385	!$OMP DO
	386	!$acc kernels present( f_inv, f_out )
	387	DO i = 0, nx
	388	DO k = nzb_x, nzt_x
	389	DO j = nys_x, nyn_x
	390	f_out(i,j,k) = f_inv(j,k,i)
	391	ENDDO
	392	ENDDO
	393	ENDDO
	394	!$acc end kernels
	395	!$OMP END PARALLEL
	396
	397	END SUBROUTINE resort_for_yx
	398
	399
	400	!------------------------------------------------------------------------------!
	401	! Description:
	402	! ------------
[1682]	403	!> Transposition of input array (f_in) from y to x. For the input array, all
	404	!> elements along y reside on the same PE, while after transposition, all
	405	!> elements along x reside on the same PE.
[1]	406	!------------------------------------------------------------------------------!
[1682]	407	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	408
[1682]	409
[1320]	410	USE cpulog, &
	411	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	412
[1320]	413	USE indices, &
	414	ONLY: nx, ny
	415
	416	USE kinds
	417
[1324]	418	USE pegrid
[1320]	419
	420	USE transpose_indices, &
	421	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	422
[1]	423	IMPLICIT NONE
	424
[1682]	425	INTEGER(iwp) :: i !<
	426	INTEGER(iwp) :: j !<
	427	INTEGER(iwp) :: k !<
	428	INTEGER(iwp) :: l !<
	429	INTEGER(iwp) :: ys !<
[1]	430
[1682]	431	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	432	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	433
[1682]	434	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[1111]	435
[1320]	436
[1106]	437	IF ( numprocs /= 1 ) THEN
	438
[1]	439	#if defined( __parallel )
	440	!
[1106]	441	!-- Reorder input array for transposition
[1111]	442	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	443	!$OMP DO
[1216]	444	!$acc data copyout( work )
[1106]	445	DO l = 0, pdims(2) - 1
	446	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	447	!$acc kernels present( f_in, work )
[1106]	448	DO i = nxl_y, nxr_y
	449	DO k = nzb_y, nzt_y
	450	DO j = ys, ys + nyn_x - nys_x
[1111]	451	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	452	ENDDO
	453	ENDDO
	454	ENDDO
[1111]	455	!$acc end kernels
[1106]	456	ENDDO
[1216]	457	!$acc end data
[1106]	458	!$OMP END PARALLEL
	459
	460	!
	461	!-- Transpose array
[1318]	462	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	463	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	464	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	465	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	466	comm1dy, ierr )
[1111]	467	!$acc update device( f_inv )
[1106]	468	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	469	#endif
	470
	471	ELSE
	472
	473	!
	474	!-- Reorder array f_in the same way as ALLTOALL did it
	475	!$OMP PARALLEL PRIVATE ( i, j, k )
	476	!$OMP DO
[1216]	477	!$acc kernels present( f_in, f_inv )
[1003]	478	DO i = nxl_y, nxr_y
	479	DO k = nzb_y, nzt_y
[1106]	480	DO j = 0, ny
	481	f_inv(j,k,i) = f_in(j,i,k)
[1]	482	ENDDO
	483	ENDDO
	484	ENDDO
[1111]	485	!$acc end kernels
[683]	486	!$OMP END PARALLEL
[1]	487
[1106]	488	ENDIF
[1]	489
	490	END SUBROUTINE transpose_yx
	491
	492
	493	!------------------------------------------------------------------------------!
	494	! Description:
	495	! ------------
[1682]	496	!> Transposition of input array (f_in) from y to x. For the input array, all
	497	!> elements along y reside on the same PE, while after transposition, all
	498	!> elements along x reside on the same PE.
	499	!> This is a direct transposition for arrays with indices in regular order
	500	!> (k,j,i) (cf. transpose_yx).
[1]	501	!------------------------------------------------------------------------------!
[1682]	502	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	503
[1682]	504
[1320]	505	USE cpulog, &
	506	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	507
[1320]	508	USE indices, &
	509	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	510
	511	USE kinds
	512
[1324]	513	USE pegrid
[1320]	514
	515	USE transpose_indices, &
	516	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	517
[1]	518	IMPLICIT NONE
	519
[1682]	520	INTEGER(iwp) :: i !<
	521	INTEGER(iwp) :: j !<
	522	INTEGER(iwp) :: k !<
	523	INTEGER(iwp) :: l !<
	524	INTEGER(iwp) :: m !<
	525	INTEGER(iwp) :: xs !<
[1]	526
[1682]	527	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	528	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	529	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	530	REAL(wp) :: work(nnxnnynnz) !<
[1]	531	#if defined( __parallel )
	532
	533	!
	534	!-- Rearrange indices of input array in order to make data to be send
	535	!-- by MPI contiguous
[1003]	536	DO k = 1, nz
	537	DO j = nys, nyn
	538	DO i = nxl, nxr
[164]	539	f_inv(i,k,j) = f_in(k,j,i)
[1]	540	ENDDO
	541	ENDDO
	542	ENDDO
	543
	544	!
	545	!-- Transpose array
	546	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	547	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	548	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	549	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	550	comm1dx, ierr )
	551	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	552
	553	!
	554	!-- Reorder transposed array
	555	m = 0
	556	DO l = 0, pdims(1) - 1
	557	xs = 0 + l * nnx
[1003]	558	DO j = nys_x, nyn_x
	559	DO k = 1, nz
[1]	560	DO i = xs, xs + nnx - 1
	561	m = m + 1
[164]	562	f_out(i,j,k) = work(m)
[1]	563	ENDDO
	564	ENDDO
	565	ENDDO
	566	ENDDO
	567
	568	#endif
	569
	570	END SUBROUTINE transpose_yxd
	571
	572
	573	!------------------------------------------------------------------------------!
	574	! Description:
	575	! ------------
[1682]	576	!> Resorting data for the transposition from y to z. The transposition itself
	577	!> is carried out in transpose_yz
[1216]	578	!------------------------------------------------------------------------------!
[1682]	579	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	580
[1682]	581
[1320]	582	USE indices, &
	583	ONLY: ny
[1216]	584
[1320]	585	USE kinds
	586
	587	USE transpose_indices, &
	588	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	589
[1216]	590	IMPLICIT NONE
	591
[1682]	592	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	593	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	594
[1682]	595	INTEGER(iwp) :: i !<
	596	INTEGER(iwp) :: j !<
	597	INTEGER(iwp) :: k !<
[1216]	598
	599	!
	600	!-- Rearrange indices of input array in order to make data to be send
	601	!-- by MPI contiguous
	602	!$OMP PARALLEL PRIVATE ( i, j, k )
	603	!$OMP DO
	604	!$acc kernels present( f_in, f_inv )
	605	DO j = 0, ny
	606	DO k = nzb_y, nzt_y
	607	DO i = nxl_y, nxr_y
	608	f_inv(i,k,j) = f_in(j,i,k)
	609	ENDDO
	610	ENDDO
	611	ENDDO
	612	!$acc end kernels
	613	!$OMP END PARALLEL
	614
	615	END SUBROUTINE resort_for_yz
	616
	617
	618	!------------------------------------------------------------------------------!
	619	! Description:
	620	! ------------
[1682]	621	!> Transposition of input array (f_in) from y to z. For the input array, all
	622	!> elements along y reside on the same PE, while after transposition, all
	623	!> elements along z reside on the same PE.
[1]	624	!------------------------------------------------------------------------------!
[1682]	625	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	626
[1682]	627
[1320]	628	USE cpulog, &
	629	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	630
[1320]	631	USE indices, &
	632	ONLY: ny, nz
	633
	634	USE kinds
	635
[1324]	636	USE pegrid
[1320]	637
	638	USE transpose_indices, &
	639	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	640
[1]	641	IMPLICIT NONE
	642
[1682]	643	INTEGER(iwp) :: i !<
	644	INTEGER(iwp) :: j !<
	645	INTEGER(iwp) :: k !<
	646	INTEGER(iwp) :: l !<
	647	INTEGER(iwp) :: zs !<
[1]	648
[1682]	649	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	650	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	651
[1682]	652	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[1111]	653
[1320]	654
[1]	655	!
	656	!-- If the PE grid is one-dimensional along y, only local reordering
	657	!-- of the data is necessary and no transposition has to be done.
	658	IF ( pdims(1) == 1 ) THEN
[1106]	659
[683]	660	!$OMP PARALLEL PRIVATE ( i, j, k )
	661	!$OMP DO
[1216]	662	!$acc kernels present( f_inv, f_out )
[1003]	663	DO j = 0, ny
	664	DO k = nzb_y, nzt_y
	665	DO i = nxl_y, nxr_y
[164]	666	f_out(i,j,k) = f_inv(i,k,j)
[1]	667	ENDDO
	668	ENDDO
	669	ENDDO
[1111]	670	!$acc end kernels
[683]	671	!$OMP END PARALLEL
[1]	672
[1106]	673	ELSE
	674
	675	#if defined( __parallel )
[1]	676	!
[1106]	677	!-- Transpose array
[1318]	678	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	679	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	680	!$acc update host( f_inv )
	681	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	682	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	683	comm1dx, ierr )
	684	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	685
	686	!
[1106]	687	!-- Reorder transposed array
[1111]	688	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	689	!$OMP DO
[1216]	690	!$acc data copyin( work )
[1106]	691	DO l = 0, pdims(1) - 1
	692	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1216]	693	!$acc kernels present( f_out )
[1106]	694	DO j = nys_z, nyn_z
	695	DO k = zs, zs + nzt_y - nzb_y
	696	DO i = nxl_z, nxr_z
[1111]	697	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	698	ENDDO
[1]	699	ENDDO
	700	ENDDO
[1111]	701	!$acc end kernels
[1]	702	ENDDO
[1216]	703	!$acc end data
[683]	704	!$OMP END PARALLEL
[1]	705	#endif
	706
[1106]	707	ENDIF
	708
[1]	709	END SUBROUTINE transpose_yz
	710
	711
	712	!------------------------------------------------------------------------------!
	713	! Description:
	714	! ------------
[1682]	715	!> Resorting data for the transposition from z to x. The transposition itself
	716	!> is carried out in transpose_zx
[1216]	717	!------------------------------------------------------------------------------!
[1682]	718	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	719
[1682]	720
[1320]	721	USE indices, &
	722	ONLY: nxl, nxr, nyn, nys, nz
[1216]	723
[1320]	724	USE kinds
	725
[1216]	726	IMPLICIT NONE
	727
[1682]	728	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	729	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	730
[1682]	731	INTEGER(iwp) :: i !<
	732	INTEGER(iwp) :: j !<
	733	INTEGER(iwp) :: k !<
[1216]	734
	735	!
	736	!-- Rearrange indices of input array in order to make data to be send
	737	!-- by MPI contiguous
	738	!$OMP PARALLEL PRIVATE ( i, j, k )
	739	!$OMP DO
	740	!$acc kernels present( f_in, f_inv )
	741	DO k = 1,nz
	742	DO i = nxl, nxr
	743	DO j = nys, nyn
	744	f_inv(j,i,k) = f_in(k,j,i)
	745	ENDDO
	746	ENDDO
	747	ENDDO
	748	!$acc end kernels
	749	!$OMP END PARALLEL
	750
	751	END SUBROUTINE resort_for_zx
	752
	753
	754	!------------------------------------------------------------------------------!
	755	! Description:
	756	! ------------
[1682]	757	!> Transposition of input array (f_in) from z to x. For the input array, all
	758	!> elements along z reside on the same PE, while after transposition, all
	759	!> elements along x reside on the same PE.
[1]	760	!------------------------------------------------------------------------------!
[1682]	761	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	762
[1682]	763
[1320]	764	USE cpulog, &
	765	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	766
[1320]	767	USE indices, &
	768	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	769
	770	USE kinds
	771
[1324]	772	USE pegrid
[1320]	773
	774	USE transpose_indices, &
	775	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	776
[1]	777	IMPLICIT NONE
	778
[1682]	779	INTEGER(iwp) :: i !<
	780	INTEGER(iwp) :: j !<
	781	INTEGER(iwp) :: k !<
	782	INTEGER(iwp) :: l !<
	783	INTEGER(iwp) :: xs !<
[1]	784
[1682]	785	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	786	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	787
[1682]	788	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[1]	789
[1320]	790
[1]	791	!
	792	!-- If the PE grid is one-dimensional along y, only local reordering
	793	!-- of the data is necessary and no transposition has to be done.
	794	IF ( pdims(1) == 1 ) THEN
[1106]	795
[683]	796	!$OMP PARALLEL PRIVATE ( i, j, k )
	797	!$OMP DO
[1216]	798	!$acc kernels present( f_inv, f_out )
[1003]	799	DO k = 1, nz
	800	DO i = nxl, nxr
	801	DO j = nys, nyn
[164]	802	f_out(i,j,k) = f_inv(j,i,k)
[1]	803	ENDDO
	804	ENDDO
	805	ENDDO
[1111]	806	!$acc end kernels
[683]	807	!$OMP END PARALLEL
[1]	808
[1106]	809	ELSE
	810
	811	#if defined( __parallel )
[1]	812	!
[1106]	813	!-- Transpose array
[1318]	814	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	815	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	816	!$acc update host( f_inv )
	817	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	818	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	819	comm1dx, ierr )
	820	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	821
	822	!
[1106]	823	!-- Reorder transposed array
[1111]	824	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	825	!$OMP DO
[1216]	826	!$acc data copyin( work )
[1106]	827	DO l = 0, pdims(1) - 1
	828	xs = 0 + l * nnx
[1216]	829	!$acc kernels present( f_out )
[1106]	830	DO k = nzb_x, nzt_x
	831	DO i = xs, xs + nnx - 1
	832	DO j = nys_x, nyn_x
[1111]	833	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	834	ENDDO
[1]	835	ENDDO
	836	ENDDO
[1111]	837	!$acc end kernels
[1]	838	ENDDO
[1216]	839	!$acc end data
[683]	840	!$OMP END PARALLEL
[1]	841	#endif
	842
[1106]	843	ENDIF
	844
[1]	845	END SUBROUTINE transpose_zx
	846
	847
	848	!------------------------------------------------------------------------------!
	849	! Description:
	850	! ------------
[1682]	851	!> Resorting data after the transposition from z to y. The transposition itself
	852	!> is carried out in transpose_zy
[1216]	853	!------------------------------------------------------------------------------!
[1682]	854	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	855
[1682]	856
[1320]	857	USE indices, &
	858	ONLY: ny
[1216]	859
[1320]	860	USE kinds
	861
	862	USE transpose_indices, &
	863	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	864
[1216]	865	IMPLICIT NONE
	866
[1682]	867	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	868	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	869
	870
[1682]	871	INTEGER(iwp) :: i !<
	872	INTEGER(iwp) :: j !<
	873	INTEGER(iwp) :: k !<
[1216]	874
	875	!
	876	!-- Rearrange indices of input array in order to make data to be send
	877	!-- by MPI contiguous
	878	!$OMP PARALLEL PRIVATE ( i, j, k )
	879	!$OMP DO
	880	!$acc kernels present( f_inv, f_out )
	881	DO k = nzb_y, nzt_y
	882	DO j = 0, ny
	883	DO i = nxl_y, nxr_y
	884	f_out(j,i,k) = f_inv(i,k,j)
	885	ENDDO
	886	ENDDO
	887	ENDDO
	888	!$acc end kernels
	889	!$OMP END PARALLEL
	890
	891	END SUBROUTINE resort_for_zy
	892
	893
	894	!------------------------------------------------------------------------------!
	895	! Description:
	896	! ------------
[1682]	897	!> Transposition of input array (f_in) from z to y. For the input array, all
	898	!> elements along z reside on the same PE, while after transposition, all
	899	!> elements along y reside on the same PE.
[1]	900	!------------------------------------------------------------------------------!
[1682]	901	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	902
[1682]	903
[1320]	904	USE cpulog, &
	905	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	906
[1320]	907	USE indices, &
	908	ONLY: ny, nz
	909
	910	USE kinds
	911
[1324]	912	USE pegrid
[1320]	913
	914	USE transpose_indices, &
	915	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	916
[1]	917	IMPLICIT NONE
	918
[1682]	919	INTEGER(iwp) :: i !<
	920	INTEGER(iwp) :: j !<
	921	INTEGER(iwp) :: k !<
	922	INTEGER(iwp) :: l !<
	923	INTEGER(iwp) :: zs !<
[1]	924
[1682]	925	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	926	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	927
[1682]	928	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[1111]	929
[1]	930	!
	931	!-- If the PE grid is one-dimensional along y, the array has only to be
	932	!-- reordered locally and therefore no transposition has to be done.
	933	IF ( pdims(1) /= 1 ) THEN
[1106]	934
	935	#if defined( __parallel )
[1]	936	!
	937	!-- Reorder input array for transposition
[1111]	938	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	939	!$OMP DO
[1216]	940	!$acc data copyout( work )
[1]	941	DO l = 0, pdims(1) - 1
[1003]	942	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]	943	!$acc kernels present( f_in, work )
[1003]	944	DO j = nys_z, nyn_z
	945	DO k = zs, zs + nzt_y - nzb_y
	946	DO i = nxl_z, nxr_z
[1111]	947	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	948	ENDDO
	949	ENDDO
	950	ENDDO
[1111]	951	!$acc end kernels
[1]	952	ENDDO
[1216]	953	!$acc end data
[683]	954	!$OMP END PARALLEL
[1]	955
	956	!
	957	!-- Transpose array
[1318]	958	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]	959	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	960	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	961	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	962	comm1dx, ierr )
[1111]	963	!$acc update device( f_inv )
[1]	964	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	965	#endif
[1]	966
	967	ELSE
	968	!
[1106]	969	!-- Reorder the array in the same way like ALLTOALL did it
[683]	970	!$OMP PARALLEL PRIVATE ( i, j, k )
	971	!$OMP DO
[1216]	972	!$acc kernels present( f_in, f_inv )
[1003]	973	DO k = nzb_y, nzt_y
	974	DO j = 0, ny
	975	DO i = nxl_y, nxr_y
[164]	976	f_inv(i,k,j) = f_in(i,j,k)
	977	ENDDO
	978	ENDDO
	979	ENDDO
[1111]	980	!$acc end kernels
[683]	981	!$OMP END PARALLEL
[1106]	982
	983	ENDIF
	984
[1]	985	END SUBROUTINE transpose_zy
	986
	987
	988	!------------------------------------------------------------------------------!
	989	! Description:
	990	! ------------
[1682]	991	!> Transposition of input array (f_in) from z to y. For the input array, all
	992	!> elements along z reside on the same PE, while after transposition, all
	993	!> elements along y reside on the same PE.
	994	!> This is a direct transposition for arrays with indices in regular order
	995	!> (k,j,i) (cf. transpose_zy).
[1]	996	!------------------------------------------------------------------------------!
[1682]	997	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	998
[1682]	999
[1320]	1000	USE cpulog, &
	1001	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	1002
[1320]	1003	USE indices, &
	1004	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1005
	1006	USE kinds
	1007
[1324]	1008	USE pegrid
[1320]	1009
	1010	USE transpose_indices, &
	1011	ONLY: nxl_y, nxl_yd, nxr_y, nxr_yd, nzb_y, nzb_yd, nzt_y, nzt_yd
	1012
[1]	1013	IMPLICIT NONE
	1014
[1682]	1015	INTEGER(iwp) :: i !<
	1016	INTEGER(iwp) :: j !<
	1017	INTEGER(iwp) :: k !<
	1018	INTEGER(iwp) :: l !<
	1019	INTEGER(iwp) :: m !<
	1020	INTEGER(iwp) :: ys !<
[1]	1021
[1682]	1022	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1023	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1024	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1025	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1026
[1]	1027	#if defined( __parallel )
	1028
	1029	!
	1030	!-- Rearrange indices of input array in order to make data to be send
	1031	!-- by MPI contiguous
[1003]	1032	DO i = nxl, nxr
	1033	DO j = nys, nyn
	1034	DO k = 1, nz
[164]	1035	f_inv(j,i,k) = f_in(k,j,i)
[1]	1036	ENDDO
	1037	ENDDO
	1038	ENDDO
	1039
	1040	!
	1041	!-- Move data to different array, because memory location of work1 is
	1042	!-- needed further below (work1 = work2).
	1043	!-- If the PE grid is one-dimensional along x, only local reordering
	1044	!-- of the data is necessary and no transposition has to be done.
	1045	IF ( pdims(2) == 1 ) THEN
[1003]	1046	DO k = 1, nz
	1047	DO i = nxl, nxr
	1048	DO j = nys, nyn
[164]	1049	f_out(j,i,k) = f_inv(j,i,k)
[1]	1050	ENDDO
	1051	ENDDO
	1052	ENDDO
	1053	RETURN
	1054	ENDIF
	1055
	1056	!
	1057	!-- Transpose array
	1058	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1059	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1060	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1061	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1062	comm1dy, ierr )
	1063	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1064
	1065	!
	1066	!-- Reorder transposed array
	1067	m = 0
	1068	DO l = 0, pdims(2) - 1
	1069	ys = 0 + l * nny
[1003]	1070	DO k = nzb_yd, nzt_yd
	1071	DO i = nxl_yd, nxr_yd
[1]	1072	DO j = ys, ys + nny - 1
	1073	m = m + 1
[164]	1074	f_out(j,i,k) = work(m)
[1]	1075	ENDDO
	1076	ENDDO
	1077	ENDDO
	1078	ENDDO
	1079
	1080	#endif
	1081
	1082	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |