Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 2000

Last change on this file since 2000 was 2000, checked in by knoop, 8 years ago
Forced header and separation lines into 80 columns
Property svn:keywords set to `Id`
File size: 31.2 KB

Rev	Line
[1682]	1	!> @file transpose.f90
[2000]	2	!------------------------------------------------------------------------------!
[1036]	3	! This file is part of PALM.
	4	!
[2000]	5	! PALM is free software: you can redistribute it and/or modify it under the
	6	! terms of the GNU General Public License as published by the Free Software
	7	! Foundation, either version 3 of the License, or (at your option) any later
	8	! version.
[1036]	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
[1818]	17	! Copyright 1997-2016 Leibniz Universitaet Hannover
[2000]	18	!------------------------------------------------------------------------------!
[1036]	19	!
[484]	20	! Current revisions:
[1]	21	! -----------------
[2000]	22	! Forced header and separation lines into 80 columns
[1321]	23	!
	24	! Former revisions:
	25	! -----------------
	26	! $Id: transpose.f90 2000 2016-08-20 18:09:15Z knoop $
	27	!
[1683]	28	! 1682 2015-10-07 23:56:08Z knoop
	29	! Code annotations made doxygen readable
	30	!
[1325]	31	! 1324 2014-03-21 09:13:16Z suehring
	32	! Bugfix: ONLY statement for module pegrid removed
	33	!
[1321]	34	! 1320 2014-03-20 08:40:49Z raasch
[1320]	35	! ONLY-attribute added to USE-statements,
	36	! kind-parameters added to all INTEGER and REAL declaration statements,
	37	! kinds are defined in new module kinds,
	38	! old module precision_kind is removed,
	39	! revision history before 2012 removed,
	40	! comment fields (!:) to be used for variable explanations added to
	41	! all variable declaration statements
[198]	42	!
[1319]	43	! 1318 2014-03-17 13:35:16Z raasch
	44	! cpu_log_nowait parameter added to cpu measurements of the transpositions
	45	! required for solving the Poisson equation (poisfft),
	46	! module interfaces removed
	47	!
[1258]	48	! 1257 2013-11-08 15:18:40Z raasch
	49	! openacc loop and loop vector clauses removed
	50	!
[1217]	51	! 1216 2013-08-26 09:31:42Z raasch
	52	! re-sorting of the transposed / to be transposed arrays moved to separate
	53	! routines resort_for_...
	54	!
[1112]	55	! 1111 2013-03-08 23:54:10Z raasch
	56	! openACC directives added,
	57	! resorting data from/to work changed, work got 4 dimensions instead of 1
	58	!
[1107]	59	! 1106 2013-03-04 05:31:38Z raasch
	60	! preprocessor lines rearranged so that routines can also be used in serial
	61	! (non-parallel) mode
	62	!
[1093]	63	! 1092 2013-02-02 11:24:22Z raasch
	64	! unused variables removed
	65	!
[1037]	66	! 1036 2012-10-22 13:43:42Z raasch
	67	! code put under GPL (PALM 3.9)
	68	!
[1004]	69	! 1003 2012-09-14 14:35:53Z raasch
	70	! indices nxa, nya, etc. replaced by nx, ny, etc.
	71	!
[1]	72	! Revision 1.1 1997/07/24 11:25:18 raasch
	73	! Initial revision
	74	!
[1216]	75	!------------------------------------------------------------------------------!
	76	! Description:
	77	! ------------
[1682]	78	!> Resorting data for the transposition from x to y. The transposition itself
	79	!> is carried out in transpose_xy
[1216]	80	!------------------------------------------------------------------------------!
[1682]	81	SUBROUTINE resort_for_xy( f_in, f_inv )
	82
[1216]	83
[1320]	84	USE indices, &
	85	ONLY: nx
[1216]	86
[1320]	87	USE kinds
	88
	89	USE transpose_indices, &
	90	ONLY: nxl_z, nxr_z, nyn_x, nyn_z, nys_x, nys_z, nzb_x, nzt_x
	91
[1216]	92	IMPLICIT NONE
	93
[1682]	94	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	95	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1216]	96
	97
[1682]	98	INTEGER(iwp) :: i !<
	99	INTEGER(iwp) :: j !<
	100	INTEGER(iwp) :: k !<
[1]	101	!
[1216]	102	!-- Rearrange indices of input array in order to make data to be send
	103	!-- by MPI contiguous
	104	!$OMP PARALLEL PRIVATE ( i, j, k )
	105	!$OMP DO
	106	!$acc kernels present( f_in, f_inv )
	107	DO i = 0, nx
	108	DO k = nzb_x, nzt_x
	109	DO j = nys_x, nyn_x
	110	f_inv(j,k,i) = f_in(i,j,k)
	111	ENDDO
	112	ENDDO
	113	ENDDO
	114	!$acc end kernels
	115	!$OMP END PARALLEL
	116
	117	END SUBROUTINE resort_for_xy
	118
	119
	120	!------------------------------------------------------------------------------!
[1]	121	! Description:
	122	! ------------
[1682]	123	!> Transposition of input array (f_in) from x to y. For the input array, all
	124	!> elements along x reside on the same PE, while after transposition, all
	125	!> elements along y reside on the same PE.
[1]	126	!------------------------------------------------------------------------------!
[1682]	127	SUBROUTINE transpose_xy( f_inv, f_out )
[1]	128
[1682]	129
[1320]	130	USE cpulog, &
	131	ONLY: cpu_log, cpu_log_nowait, log_point_s
	132
	133	USE indices, &
	134	ONLY: nx, ny
	135
	136	USE kinds
	137
[1]	138	USE pegrid
	139
[1320]	140	USE transpose_indices, &
	141	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	142
[1]	143	IMPLICIT NONE
	144
[1682]	145	INTEGER(iwp) :: i !<
	146	INTEGER(iwp) :: j !<
	147	INTEGER(iwp) :: k !<
	148	INTEGER(iwp) :: l !<
	149	INTEGER(iwp) :: ys !<
[1320]	150
[1682]	151	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	152	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1]	153
[1682]	154	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[1111]	155
	156
[1106]	157	IF ( numprocs /= 1 ) THEN
	158
	159	#if defined( __parallel )
[1]	160	!
[1106]	161	!-- Transpose array
[1318]	162	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	163	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	164	!$acc update host( f_inv )
	165	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
	166	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
[1106]	167	comm1dy, ierr )
	168	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	169
	170	!
[1106]	171	!-- Reorder transposed array
[1111]	172	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	173	!$OMP DO
[1216]	174	!$acc data copyin( work )
[1106]	175	DO l = 0, pdims(2) - 1
	176	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	177	!$acc kernels present( f_out, work )
[1106]	178	DO i = nxl_y, nxr_y
	179	DO k = nzb_y, nzt_y
	180	DO j = ys, ys + nyn_x - nys_x
[1111]	181	f_out(j,i,k) = work(j-ys+1,k,i,l)
[1106]	182	ENDDO
[1]	183	ENDDO
	184	ENDDO
[1111]	185	!$acc end kernels
[1]	186	ENDDO
[1216]	187	!$acc end data
[683]	188	!$OMP END PARALLEL
[1]	189	#endif
	190
[1106]	191	ELSE
	192
	193	!
	194	!-- Reorder transposed array
	195	!$OMP PARALLEL PRIVATE ( i, j, k )
	196	!$OMP DO
[1216]	197	!$acc kernels present( f_inv, f_out )
[1106]	198	DO k = nzb_y, nzt_y
	199	DO i = nxl_y, nxr_y
	200	DO j = 0, ny
	201	f_out(j,i,k) = f_inv(j,k,i)
	202	ENDDO
	203	ENDDO
	204	ENDDO
[1111]	205	!$acc end kernels
[1106]	206	!$OMP END PARALLEL
	207
	208	ENDIF
	209
[1]	210	END SUBROUTINE transpose_xy
	211
	212
	213	!------------------------------------------------------------------------------!
	214	! Description:
	215	! ------------
[1682]	216	!> Resorting data after the transposition from x to z. The transposition itself
	217	!> is carried out in transpose_xz
[1216]	218	!------------------------------------------------------------------------------!
[1682]	219	SUBROUTINE resort_for_xz( f_inv, f_out )
[1216]	220
[1682]	221
[1320]	222	USE indices, &
	223	ONLY: nxl, nxr, nyn, nys, nz
[1216]	224
[1320]	225	USE kinds
	226
[1216]	227	IMPLICIT NONE
	228
[1682]	229	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	230	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
[1216]	231
[1682]	232	INTEGER(iwp) :: i !<
	233	INTEGER(iwp) :: j !<
	234	INTEGER(iwp) :: k !<
[1216]	235	!
	236	!-- Rearrange indices of input array in order to make data to be send
	237	!-- by MPI contiguous.
	238	!-- In case of parallel fft/transposition, scattered store is faster in
	239	!-- backward direction!!!
	240	!$OMP PARALLEL PRIVATE ( i, j, k )
	241	!$OMP DO
	242	!$acc kernels present( f_inv, f_out )
	243	DO k = 1, nz
	244	DO i = nxl, nxr
	245	DO j = nys, nyn
	246	f_out(k,j,i) = f_inv(j,i,k)
	247	ENDDO
	248	ENDDO
	249	ENDDO
	250	!$acc end kernels
	251	!$OMP END PARALLEL
	252
	253	END SUBROUTINE resort_for_xz
	254
	255
	256	!------------------------------------------------------------------------------!
	257	! Description:
	258	! ------------
[1682]	259	!> Transposition of input array (f_in) from x to z. For the input array, all
	260	!> elements along x reside on the same PE, while after transposition, all
	261	!> elements along z reside on the same PE.
[1]	262	!------------------------------------------------------------------------------!
[1682]	263	SUBROUTINE transpose_xz( f_in, f_inv )
[1]	264
[1682]	265
[1320]	266	USE cpulog, &
	267	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	268
[1320]	269	USE indices, &
	270	ONLY: nnx, nx, nxl, nxr, ny, nyn, nys, nz
	271
	272	USE kinds
	273
[1324]	274	USE pegrid
[1320]	275
	276	USE transpose_indices, &
	277	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	278
[1]	279	IMPLICIT NONE
	280
[1682]	281	INTEGER(iwp) :: i !<
	282	INTEGER(iwp) :: j !<
	283	INTEGER(iwp) :: k !<
	284	INTEGER(iwp) :: l !<
	285	INTEGER(iwp) :: xs !<
[1]	286
[1682]	287	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	288	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1]	289
[1682]	290	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[1111]	291
[1320]	292
[1]	293	!
	294	!-- If the PE grid is one-dimensional along y, the array has only to be
	295	!-- reordered locally and therefore no transposition has to be done.
	296	IF ( pdims(1) /= 1 ) THEN
[1106]	297
	298	#if defined( __parallel )
[1]	299	!
	300	!-- Reorder input array for transposition
[1111]	301	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	302	!$OMP DO
[1216]	303	!$acc data copyout( work )
[1]	304	DO l = 0, pdims(1) - 1
	305	xs = 0 + l * nnx
[1111]	306	!$acc kernels present( f_in, work )
[1003]	307	DO k = nzb_x, nzt_x
[164]	308	DO i = xs, xs + nnx - 1
[1003]	309	DO j = nys_x, nyn_x
[1111]	310	work(j,i-xs+1,k,l) = f_in(i,j,k)
[1]	311	ENDDO
	312	ENDDO
	313	ENDDO
[1111]	314	!$acc end kernels
[1]	315	ENDDO
[1216]	316	!$acc end data
[683]	317	!$OMP END PARALLEL
[1]	318
	319	!
	320	!-- Transpose array
[1318]	321	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]	322	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	323	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
	324	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
[1]	325	comm1dx, ierr )
[1111]	326	!$acc update device( f_inv )
[1]	327	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	328	#endif
	329
[1]	330	ELSE
[1106]	331
[1]	332	!
	333	!-- Reorder the array in a way that the z index is in first position
[683]	334	!$OMP PARALLEL PRIVATE ( i, j, k )
	335	!$OMP DO
[1216]	336	!$acc kernels present( f_in, f_inv )
[1003]	337	DO i = nxl, nxr
	338	DO j = nys, nyn
	339	DO k = 1, nz
[164]	340	f_inv(j,i,k) = f_in(i,j,k)
[1]	341	ENDDO
	342	ENDDO
	343	ENDDO
[1111]	344	!$acc end kernels
[683]	345	!$OMP END PARALLEL
[1]	346
[164]	347	ENDIF
	348
[1]	349	END SUBROUTINE transpose_xz
	350
	351
	352	!------------------------------------------------------------------------------!
	353	! Description:
	354	! ------------
[1682]	355	!> Resorting data after the transposition from y to x. The transposition itself
	356	!> is carried out in transpose_yx
[1216]	357	!------------------------------------------------------------------------------!
[1682]	358	SUBROUTINE resort_for_yx( f_inv, f_out )
[1216]	359
[1682]	360
[1320]	361	USE indices, &
	362	ONLY: nx
[1216]	363
[1320]	364	USE kinds
	365
	366	USE transpose_indices, &
	367	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	368
[1216]	369	IMPLICIT NONE
	370
[1682]	371	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
	372	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1216]	373
	374
[1682]	375	INTEGER(iwp) :: i !<
	376	INTEGER(iwp) :: j !<
	377	INTEGER(iwp) :: k !<
[1216]	378	!
	379	!-- Rearrange indices of input array in order to make data to be send
	380	!-- by MPI contiguous
	381	!$OMP PARALLEL PRIVATE ( i, j, k )
	382	!$OMP DO
	383	!$acc kernels present( f_inv, f_out )
	384	DO i = 0, nx
	385	DO k = nzb_x, nzt_x
	386	DO j = nys_x, nyn_x
	387	f_out(i,j,k) = f_inv(j,k,i)
	388	ENDDO
	389	ENDDO
	390	ENDDO
	391	!$acc end kernels
	392	!$OMP END PARALLEL
	393
	394	END SUBROUTINE resort_for_yx
	395
	396
	397	!------------------------------------------------------------------------------!
	398	! Description:
	399	! ------------
[1682]	400	!> Transposition of input array (f_in) from y to x. For the input array, all
	401	!> elements along y reside on the same PE, while after transposition, all
	402	!> elements along x reside on the same PE.
[1]	403	!------------------------------------------------------------------------------!
[1682]	404	SUBROUTINE transpose_yx( f_in, f_inv )
[1]	405
[1682]	406
[1320]	407	USE cpulog, &
	408	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	409
[1320]	410	USE indices, &
	411	ONLY: nx, ny
	412
	413	USE kinds
	414
[1324]	415	USE pegrid
[1320]	416
	417	USE transpose_indices, &
	418	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
	419
[1]	420	IMPLICIT NONE
	421
[1682]	422	INTEGER(iwp) :: i !<
	423	INTEGER(iwp) :: j !<
	424	INTEGER(iwp) :: k !<
	425	INTEGER(iwp) :: l !<
	426	INTEGER(iwp) :: ys !<
[1]	427
[1682]	428	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	429	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
[1111]	430
[1682]	431	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
[1111]	432
[1320]	433
[1106]	434	IF ( numprocs /= 1 ) THEN
	435
[1]	436	#if defined( __parallel )
	437	!
[1106]	438	!-- Reorder input array for transposition
[1111]	439	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
[683]	440	!$OMP DO
[1216]	441	!$acc data copyout( work )
[1106]	442	DO l = 0, pdims(2) - 1
	443	ys = 0 + l * ( nyn_x - nys_x + 1 )
[1111]	444	!$acc kernels present( f_in, work )
[1106]	445	DO i = nxl_y, nxr_y
	446	DO k = nzb_y, nzt_y
	447	DO j = ys, ys + nyn_x - nys_x
[1111]	448	work(j-ys+1,k,i,l) = f_in(j,i,k)
[1106]	449	ENDDO
	450	ENDDO
	451	ENDDO
[1111]	452	!$acc end kernels
[1106]	453	ENDDO
[1216]	454	!$acc end data
[1106]	455	!$OMP END PARALLEL
	456
	457	!
	458	!-- Transpose array
[1318]	459	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	460	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	461	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
	462	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
[1106]	463	comm1dy, ierr )
[1111]	464	!$acc update device( f_inv )
[1106]	465	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	466	#endif
	467
	468	ELSE
	469
	470	!
	471	!-- Reorder array f_in the same way as ALLTOALL did it
	472	!$OMP PARALLEL PRIVATE ( i, j, k )
	473	!$OMP DO
[1216]	474	!$acc kernels present( f_in, f_inv )
[1003]	475	DO i = nxl_y, nxr_y
	476	DO k = nzb_y, nzt_y
[1106]	477	DO j = 0, ny
	478	f_inv(j,k,i) = f_in(j,i,k)
[1]	479	ENDDO
	480	ENDDO
	481	ENDDO
[1111]	482	!$acc end kernels
[683]	483	!$OMP END PARALLEL
[1]	484
[1106]	485	ENDIF
[1]	486
	487	END SUBROUTINE transpose_yx
	488
	489
	490	!------------------------------------------------------------------------------!
	491	! Description:
	492	! ------------
[1682]	493	!> Transposition of input array (f_in) from y to x. For the input array, all
	494	!> elements along y reside on the same PE, while after transposition, all
	495	!> elements along x reside on the same PE.
	496	!> This is a direct transposition for arrays with indices in regular order
	497	!> (k,j,i) (cf. transpose_yx).
[1]	498	!------------------------------------------------------------------------------!
[1682]	499	SUBROUTINE transpose_yxd( f_in, f_out )
[1]	500
[1682]	501
[1320]	502	USE cpulog, &
	503	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	504
[1320]	505	USE indices, &
	506	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
	507
	508	USE kinds
	509
[1324]	510	USE pegrid
[1320]	511
	512	USE transpose_indices, &
	513	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	514
[1]	515	IMPLICIT NONE
	516
[1682]	517	INTEGER(iwp) :: i !<
	518	INTEGER(iwp) :: j !<
	519	INTEGER(iwp) :: k !<
	520	INTEGER(iwp) :: l !<
	521	INTEGER(iwp) :: m !<
	522	INTEGER(iwp) :: xs !<
[1]	523
[1682]	524	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	525	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
	526	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
	527	REAL(wp) :: work(nnxnnynnz) !<
[1]	528	#if defined( __parallel )
	529
	530	!
	531	!-- Rearrange indices of input array in order to make data to be send
	532	!-- by MPI contiguous
[1003]	533	DO k = 1, nz
	534	DO j = nys, nyn
	535	DO i = nxl, nxr
[164]	536	f_inv(i,k,j) = f_in(k,j,i)
[1]	537	ENDDO
	538	ENDDO
	539	ENDDO
	540
	541	!
	542	!-- Transpose array
	543	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	544	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	545	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
[164]	546	work(1), sendrecvcount_xy, MPI_REAL, &
[1]	547	comm1dx, ierr )
	548	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	549
	550	!
	551	!-- Reorder transposed array
	552	m = 0
	553	DO l = 0, pdims(1) - 1
	554	xs = 0 + l * nnx
[1003]	555	DO j = nys_x, nyn_x
	556	DO k = 1, nz
[1]	557	DO i = xs, xs + nnx - 1
	558	m = m + 1
[164]	559	f_out(i,j,k) = work(m)
[1]	560	ENDDO
	561	ENDDO
	562	ENDDO
	563	ENDDO
	564
	565	#endif
	566
	567	END SUBROUTINE transpose_yxd
	568
	569
	570	!------------------------------------------------------------------------------!
	571	! Description:
	572	! ------------
[1682]	573	!> Resorting data for the transposition from y to z. The transposition itself
	574	!> is carried out in transpose_yz
[1216]	575	!------------------------------------------------------------------------------!
[1682]	576	SUBROUTINE resort_for_yz( f_in, f_inv )
[1216]	577
[1682]	578
[1320]	579	USE indices, &
	580	ONLY: ny
[1216]	581
[1320]	582	USE kinds
	583
	584	USE transpose_indices, &
	585	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	586
[1216]	587	IMPLICIT NONE
	588
[1682]	589	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
	590	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1216]	591
[1682]	592	INTEGER(iwp) :: i !<
	593	INTEGER(iwp) :: j !<
	594	INTEGER(iwp) :: k !<
[1216]	595
	596	!
	597	!-- Rearrange indices of input array in order to make data to be send
	598	!-- by MPI contiguous
	599	!$OMP PARALLEL PRIVATE ( i, j, k )
	600	!$OMP DO
	601	!$acc kernels present( f_in, f_inv )
	602	DO j = 0, ny
	603	DO k = nzb_y, nzt_y
	604	DO i = nxl_y, nxr_y
	605	f_inv(i,k,j) = f_in(j,i,k)
	606	ENDDO
	607	ENDDO
	608	ENDDO
	609	!$acc end kernels
	610	!$OMP END PARALLEL
	611
	612	END SUBROUTINE resort_for_yz
	613
	614
	615	!------------------------------------------------------------------------------!
	616	! Description:
	617	! ------------
[1682]	618	!> Transposition of input array (f_in) from y to z. For the input array, all
	619	!> elements along y reside on the same PE, while after transposition, all
	620	!> elements along z reside on the same PE.
[1]	621	!------------------------------------------------------------------------------!
[1682]	622	SUBROUTINE transpose_yz( f_inv, f_out )
[1]	623
[1682]	624
[1320]	625	USE cpulog, &
	626	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	627
[1320]	628	USE indices, &
	629	ONLY: ny, nz
	630
	631	USE kinds
	632
[1324]	633	USE pegrid
[1320]	634
	635	USE transpose_indices, &
	636	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	637
[1]	638	IMPLICIT NONE
	639
[1682]	640	INTEGER(iwp) :: i !<
	641	INTEGER(iwp) :: j !<
	642	INTEGER(iwp) :: k !<
	643	INTEGER(iwp) :: l !<
	644	INTEGER(iwp) :: zs !<
[1]	645
[1682]	646	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	647	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
[1111]	648
[1682]	649	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[1111]	650
[1320]	651
[1]	652	!
	653	!-- If the PE grid is one-dimensional along y, only local reordering
	654	!-- of the data is necessary and no transposition has to be done.
	655	IF ( pdims(1) == 1 ) THEN
[1106]	656
[683]	657	!$OMP PARALLEL PRIVATE ( i, j, k )
	658	!$OMP DO
[1216]	659	!$acc kernels present( f_inv, f_out )
[1003]	660	DO j = 0, ny
	661	DO k = nzb_y, nzt_y
	662	DO i = nxl_y, nxr_y
[164]	663	f_out(i,j,k) = f_inv(i,k,j)
[1]	664	ENDDO
	665	ENDDO
	666	ENDDO
[1111]	667	!$acc end kernels
[683]	668	!$OMP END PARALLEL
[1]	669
[1106]	670	ELSE
	671
	672	#if defined( __parallel )
[1]	673	!
[1106]	674	!-- Transpose array
[1318]	675	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	676	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	677	!$acc update host( f_inv )
	678	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
	679	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
[1106]	680	comm1dx, ierr )
	681	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	682
	683	!
[1106]	684	!-- Reorder transposed array
[1111]	685	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	686	!$OMP DO
[1216]	687	!$acc data copyin( work )
[1106]	688	DO l = 0, pdims(1) - 1
	689	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1216]	690	!$acc kernels present( f_out )
[1106]	691	DO j = nys_z, nyn_z
	692	DO k = zs, zs + nzt_y - nzb_y
	693	DO i = nxl_z, nxr_z
[1111]	694	f_out(i,j,k) = work(i,k-zs+1,j,l)
[1106]	695	ENDDO
[1]	696	ENDDO
	697	ENDDO
[1111]	698	!$acc end kernels
[1]	699	ENDDO
[1216]	700	!$acc end data
[683]	701	!$OMP END PARALLEL
[1]	702	#endif
	703
[1106]	704	ENDIF
	705
[1]	706	END SUBROUTINE transpose_yz
	707
	708
	709	!------------------------------------------------------------------------------!
	710	! Description:
	711	! ------------
[1682]	712	!> Resorting data for the transposition from z to x. The transposition itself
	713	!> is carried out in transpose_zx
[1216]	714	!------------------------------------------------------------------------------!
[1682]	715	SUBROUTINE resort_for_zx( f_in, f_inv )
[1216]	716
[1682]	717
[1320]	718	USE indices, &
	719	ONLY: nxl, nxr, nyn, nys, nz
[1216]	720
[1320]	721	USE kinds
	722
[1216]	723	IMPLICIT NONE
	724
[1682]	725	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	726	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
[1216]	727
[1682]	728	INTEGER(iwp) :: i !<
	729	INTEGER(iwp) :: j !<
	730	INTEGER(iwp) :: k !<
[1216]	731
	732	!
	733	!-- Rearrange indices of input array in order to make data to be send
	734	!-- by MPI contiguous
	735	!$OMP PARALLEL PRIVATE ( i, j, k )
	736	!$OMP DO
	737	!$acc kernels present( f_in, f_inv )
	738	DO k = 1,nz
	739	DO i = nxl, nxr
	740	DO j = nys, nyn
	741	f_inv(j,i,k) = f_in(k,j,i)
	742	ENDDO
	743	ENDDO
	744	ENDDO
	745	!$acc end kernels
	746	!$OMP END PARALLEL
	747
	748	END SUBROUTINE resort_for_zx
	749
	750
	751	!------------------------------------------------------------------------------!
	752	! Description:
	753	! ------------
[1682]	754	!> Transposition of input array (f_in) from z to x. For the input array, all
	755	!> elements along z reside on the same PE, while after transposition, all
	756	!> elements along x reside on the same PE.
[1]	757	!------------------------------------------------------------------------------!
[1682]	758	SUBROUTINE transpose_zx( f_inv, f_out )
[1]	759
[1682]	760
[1320]	761	USE cpulog, &
	762	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	763
[1320]	764	USE indices, &
	765	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
	766
	767	USE kinds
	768
[1324]	769	USE pegrid
[1320]	770
	771	USE transpose_indices, &
	772	ONLY: nyn_x, nys_x, nzb_x, nzt_x
	773
[1]	774	IMPLICIT NONE
	775
[1682]	776	INTEGER(iwp) :: i !<
	777	INTEGER(iwp) :: j !<
	778	INTEGER(iwp) :: k !<
	779	INTEGER(iwp) :: l !<
	780	INTEGER(iwp) :: xs !<
[1]	781
[1682]	782	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	783	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
[1111]	784
[1682]	785	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
[1]	786
[1320]	787
[1]	788	!
	789	!-- If the PE grid is one-dimensional along y, only local reordering
	790	!-- of the data is necessary and no transposition has to be done.
	791	IF ( pdims(1) == 1 ) THEN
[1106]	792
[683]	793	!$OMP PARALLEL PRIVATE ( i, j, k )
	794	!$OMP DO
[1216]	795	!$acc kernels present( f_inv, f_out )
[1003]	796	DO k = 1, nz
	797	DO i = nxl, nxr
	798	DO j = nys, nyn
[164]	799	f_out(i,j,k) = f_inv(j,i,k)
[1]	800	ENDDO
	801	ENDDO
	802	ENDDO
[1111]	803	!$acc end kernels
[683]	804	!$OMP END PARALLEL
[1]	805
[1106]	806	ELSE
	807
	808	#if defined( __parallel )
[1]	809	!
[1106]	810	!-- Transpose array
[1318]	811	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[1106]	812	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	813	!$acc update host( f_inv )
	814	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
	815	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
[1106]	816	comm1dx, ierr )
	817	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1]	818
	819	!
[1106]	820	!-- Reorder transposed array
[1111]	821	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
[683]	822	!$OMP DO
[1216]	823	!$acc data copyin( work )
[1106]	824	DO l = 0, pdims(1) - 1
	825	xs = 0 + l * nnx
[1216]	826	!$acc kernels present( f_out )
[1106]	827	DO k = nzb_x, nzt_x
	828	DO i = xs, xs + nnx - 1
	829	DO j = nys_x, nyn_x
[1111]	830	f_out(i,j,k) = work(j,i-xs+1,k,l)
[1106]	831	ENDDO
[1]	832	ENDDO
	833	ENDDO
[1111]	834	!$acc end kernels
[1]	835	ENDDO
[1216]	836	!$acc end data
[683]	837	!$OMP END PARALLEL
[1]	838	#endif
	839
[1106]	840	ENDIF
	841
[1]	842	END SUBROUTINE transpose_zx
	843
	844
	845	!------------------------------------------------------------------------------!
	846	! Description:
	847	! ------------
[1682]	848	!> Resorting data after the transposition from z to y. The transposition itself
	849	!> is carried out in transpose_zy
[1216]	850	!------------------------------------------------------------------------------!
[1682]	851	SUBROUTINE resort_for_zy( f_inv, f_out )
[1216]	852
[1682]	853
[1320]	854	USE indices, &
	855	ONLY: ny
[1216]	856
[1320]	857	USE kinds
	858
	859	USE transpose_indices, &
	860	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
	861
[1216]	862	IMPLICIT NONE
	863
[1682]	864	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
	865	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
[1216]	866
	867
[1682]	868	INTEGER(iwp) :: i !<
	869	INTEGER(iwp) :: j !<
	870	INTEGER(iwp) :: k !<
[1216]	871
	872	!
	873	!-- Rearrange indices of input array in order to make data to be send
	874	!-- by MPI contiguous
	875	!$OMP PARALLEL PRIVATE ( i, j, k )
	876	!$OMP DO
	877	!$acc kernels present( f_inv, f_out )
	878	DO k = nzb_y, nzt_y
	879	DO j = 0, ny
	880	DO i = nxl_y, nxr_y
	881	f_out(j,i,k) = f_inv(i,k,j)
	882	ENDDO
	883	ENDDO
	884	ENDDO
	885	!$acc end kernels
	886	!$OMP END PARALLEL
	887
	888	END SUBROUTINE resort_for_zy
	889
	890
	891	!------------------------------------------------------------------------------!
	892	! Description:
	893	! ------------
[1682]	894	!> Transposition of input array (f_in) from z to y. For the input array, all
	895	!> elements along z reside on the same PE, while after transposition, all
	896	!> elements along y reside on the same PE.
[1]	897	!------------------------------------------------------------------------------!
[1682]	898	SUBROUTINE transpose_zy( f_in, f_inv )
[1]	899
[1682]	900
[1320]	901	USE cpulog, &
	902	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	903
[1320]	904	USE indices, &
	905	ONLY: ny, nz
	906
	907	USE kinds
	908
[1324]	909	USE pegrid
[1320]	910
	911	USE transpose_indices, &
	912	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
	913
[1]	914	IMPLICIT NONE
	915
[1682]	916	INTEGER(iwp) :: i !<
	917	INTEGER(iwp) :: j !<
	918	INTEGER(iwp) :: k !<
	919	INTEGER(iwp) :: l !<
	920	INTEGER(iwp) :: zs !<
[1]	921
[1682]	922	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
	923	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
[1111]	924
[1682]	925	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
[1111]	926
[1]	927	!
	928	!-- If the PE grid is one-dimensional along y, the array has only to be
	929	!-- reordered locally and therefore no transposition has to be done.
	930	IF ( pdims(1) /= 1 ) THEN
[1106]	931
	932	#if defined( __parallel )
[1]	933	!
	934	!-- Reorder input array for transposition
[1111]	935	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
[683]	936	!$OMP DO
[1216]	937	!$acc data copyout( work )
[1]	938	DO l = 0, pdims(1) - 1
[1003]	939	zs = 1 + l * ( nzt_y - nzb_y + 1 )
[1111]	940	!$acc kernels present( f_in, work )
[1003]	941	DO j = nys_z, nyn_z
	942	DO k = zs, zs + nzt_y - nzb_y
	943	DO i = nxl_z, nxr_z
[1111]	944	work(i,k-zs+1,j,l) = f_in(i,j,k)
[1]	945	ENDDO
	946	ENDDO
	947	ENDDO
[1111]	948	!$acc end kernels
[1]	949	ENDDO
[1216]	950	!$acc end data
[683]	951	!$OMP END PARALLEL
[1]	952
	953	!
	954	!-- Transpose array
[1318]	955	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
[622]	956	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1111]	957	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
	958	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
[1]	959	comm1dx, ierr )
[1111]	960	!$acc update device( f_inv )
[1]	961	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
[1106]	962	#endif
[1]	963
	964	ELSE
	965	!
[1106]	966	!-- Reorder the array in the same way like ALLTOALL did it
[683]	967	!$OMP PARALLEL PRIVATE ( i, j, k )
	968	!$OMP DO
[1216]	969	!$acc kernels present( f_in, f_inv )
[1003]	970	DO k = nzb_y, nzt_y
	971	DO j = 0, ny
	972	DO i = nxl_y, nxr_y
[164]	973	f_inv(i,k,j) = f_in(i,j,k)
	974	ENDDO
	975	ENDDO
	976	ENDDO
[1111]	977	!$acc end kernels
[683]	978	!$OMP END PARALLEL
[1106]	979
	980	ENDIF
	981
[1]	982	END SUBROUTINE transpose_zy
	983
	984
	985	!------------------------------------------------------------------------------!
	986	! Description:
	987	! ------------
[1682]	988	!> Transposition of input array (f_in) from z to y. For the input array, all
	989	!> elements along z reside on the same PE, while after transposition, all
	990	!> elements along y reside on the same PE.
	991	!> This is a direct transposition for arrays with indices in regular order
	992	!> (k,j,i) (cf. transpose_zy).
[1]	993	!------------------------------------------------------------------------------!
[1682]	994	SUBROUTINE transpose_zyd( f_in, f_out )
[1]	995
[1682]	996
[1320]	997	USE cpulog, &
	998	ONLY: cpu_log, cpu_log_nowait, log_point_s
[1]	999
[1320]	1000	USE indices, &
	1001	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
	1002
	1003	USE kinds
	1004
[1324]	1005	USE pegrid
[1320]	1006
	1007	USE transpose_indices, &
	1008	ONLY: nxl_y, nxl_yd, nxr_y, nxr_yd, nzb_y, nzb_yd, nzt_y, nzt_yd
	1009
[1]	1010	IMPLICIT NONE
	1011
[1682]	1012	INTEGER(iwp) :: i !<
	1013	INTEGER(iwp) :: j !<
	1014	INTEGER(iwp) :: k !<
	1015	INTEGER(iwp) :: l !<
	1016	INTEGER(iwp) :: m !<
	1017	INTEGER(iwp) :: ys !<
[1]	1018
[1682]	1019	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
	1020	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
	1021	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
	1022	REAL(wp) :: work(nnxnnynnz) !<
[1320]	1023
[1]	1024	#if defined( __parallel )
	1025
	1026	!
	1027	!-- Rearrange indices of input array in order to make data to be send
	1028	!-- by MPI contiguous
[1003]	1029	DO i = nxl, nxr
	1030	DO j = nys, nyn
	1031	DO k = 1, nz
[164]	1032	f_inv(j,i,k) = f_in(k,j,i)
[1]	1033	ENDDO
	1034	ENDDO
	1035	ENDDO
	1036
	1037	!
	1038	!-- Move data to different array, because memory location of work1 is
	1039	!-- needed further below (work1 = work2).
	1040	!-- If the PE grid is one-dimensional along x, only local reordering
	1041	!-- of the data is necessary and no transposition has to be done.
	1042	IF ( pdims(2) == 1 ) THEN
[1003]	1043	DO k = 1, nz
	1044	DO i = nxl, nxr
	1045	DO j = nys, nyn
[164]	1046	f_out(j,i,k) = f_inv(j,i,k)
[1]	1047	ENDDO
	1048	ENDDO
	1049	ENDDO
	1050	RETURN
	1051	ENDIF
	1052
	1053	!
	1054	!-- Transpose array
	1055	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
[622]	1056	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
[1]	1057	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
[164]	1058	work(1), sendrecvcount_zyd, MPI_REAL, &
[1]	1059	comm1dy, ierr )
	1060	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	1061
	1062	!
	1063	!-- Reorder transposed array
	1064	m = 0
	1065	DO l = 0, pdims(2) - 1
	1066	ys = 0 + l * nny
[1003]	1067	DO k = nzb_yd, nzt_yd
	1068	DO i = nxl_yd, nxr_yd
[1]	1069	DO j = ys, ys + nny - 1
	1070	m = m + 1
[164]	1071	f_out(j,i,k) = work(m)
[1]	1072	ENDDO
	1073	ENDDO
	1074	ENDDO
	1075	ENDDO
	1076
	1077	#endif
	1078
	1079	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |