Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 748

Last change on this file since 748 was 668, checked in by suehring, 14 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 31.5 KB

Rev	Line
[1]	1	MODULE poisfft_hybrid_mod
	2	!------------------------------------------------------------------------------
	3	!
[257]	4	! Current revisions:
[1]	5	! -----------------
	6	!
	7	! Former revisions:
	8	! -----------------
[3]	9	! $Id: poisfft_hybrid.f90 668 2010-12-23 13:22:58Z raasch $
[392]	10	!
[668]	11	! 667 2010-12-23 12:06:00Z suehring/gryschka
	12	! ddzu replaced by ddzu_pres due to changes in zu(0)
	13	!
[482]	14	! 415 2009-12-15 10:26:23Z raasch
	15	! Dimension of array stat in cascade changed to prevent type problems with
	16	! mpi2 libraries
	17	!
[392]	18	! 274 2009-03-26 15:11:21Z heinze
	19	! Output of messages replaced by message handling routine.
	20	!
	21	! Feb. 2007
[3]	22	! RCS Log replace by Id keyword, revision history cleaned up
	23	!
[1]	24	! Revision 1.11 2004/04/30 12:43:14 raasch
	25	! Renaming of fft routines, additional argument in calls of fft_y_m
	26	!
	27	! Revision 1.2 2002/12/19 16:08:31 raasch
	28	! Preprocessor directive KKMP introduced (OMP does NOT work),
	29	! array tri will be a shared array in OpenMP loop, to get better cache
	30	! utilization, the i index (x-direction) will be executed in stride
	31	! "istride" as outer loop and in a shorter inner loop,
	32	! overlapping of computation and communication realized by new routine
	33	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
	34	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
	35	!
	36	!
	37	! Description:
	38	! ------------
	39	! Solution of the Poisson equation with a 2D spectral method.
	40	! Hybrid version for parallel computers using a 1D domain decomposition,
	41	! realized with MPI, along x and parallelization with OPEN-MP along y
	42	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
	43	! optimization is realized by overlapping of computation and communication
	44	! and by simultaneously executing as many communication calls as switches
	45	! per logical partition (LPAR) are available. This version comes into
	46	! effect if more than one node is used and if the environment variable
	47	! tasks_per_node is set in a way that it can be divided by switch_per_lpar
	48	! without remainder.
	49	!
	50	! WARNING: In case of OpenMP, there are problems with allocating large
	51	! arrays in parallel regions.
	52	!
	53	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
	54	!------------------------------------------------------------------------------!
	55
	56	USE fft_xy
	57	USE indices
	58	USE pegrid
	59
	60	IMPLICIT NONE
	61
	62	PRIVATE
	63	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
	64
	65	INTEGER, PARAMETER :: switch_per_lpar = 2
	66
	67	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
	68	nxl_p, nxr_p, & ! partial x dimension
	69	nys_a, nyn_a, & ! total y dimension
	70	nys_p, nyn_p, & ! partial y dimension
	71
	72	npe_s, & ! total number of PEs for solver
	73	nwords, & ! number of points to be exchanged
	74	! with MPI_ALLTOALL
	75	n_omp_threads ! number of OpenMP threads
	76
	77	!
	78	!-- Variables for multi node version (cluster version) using routine
	79	!-- poisfft_hybrid_nodes
	80	INTEGER, SAVE :: comm_nodes, & ! communicator nodes
	81	comm_node_all, & ! communicator all PEs node version
	82	comm_tasks, & ! communicator tasks
	83	me, me_node, me_task,& ! identity of this PE
	84	nodes, & ! number of nodes
	85	tasks_per_logical_node = -1 ! default: no cluster (multi node version off)
	86
	87
	88	!
	89	!-- Public interfaces
	90	INTERFACE poisfft_hybrid_ini
	91	MODULE PROCEDURE poisfft_hybrid_ini
	92	END INTERFACE poisfft_hybrid_ini
	93
	94	INTERFACE poisfft_hybrid
	95	MODULE PROCEDURE poisfft_hybrid
	96	END INTERFACE poisfft_hybrid
	97
	98	!
	99	!-- Private interfaces
	100	INTERFACE poisfft_hybrid_omp
	101	MODULE PROCEDURE poisfft_hybrid_omp
	102	END INTERFACE poisfft_hybrid_omp
	103
	104	INTERFACE poisfft_hybrid_omp_vec
	105	MODULE PROCEDURE poisfft_hybrid_omp_vec
	106	END INTERFACE poisfft_hybrid_omp_vec
	107
	108	INTERFACE poisfft_hybrid_nodes
	109	MODULE PROCEDURE poisfft_hybrid_nodes
	110	END INTERFACE poisfft_hybrid_nodes
	111
	112	INTERFACE tridia_hybrid
	113	MODULE PROCEDURE tridia_hybrid
	114	END INTERFACE tridia_hybrid
	115
	116	INTERFACE cascade
	117	MODULE PROCEDURE cascade
	118	END INTERFACE cascade
	119
	120	CONTAINS
	121
	122
 SUBROUTINE poisfft_hybrid_ini

!------------------------------------------------------------------------------!
! Initialization of the hybrid Poisson solver.
!
! Sets the module-level index bounds (nxl_a .. nyn_p), the number of PEs used
! by the solver (npe_s) and the message length for MPI_ALLTOALL (nwords),
! determines the number of OpenMP threads, initializes the 1d-FFT routines and
! decides whether the multi node version (poisfft_hybrid_nodes) is used. In
! the latter case the communicators comm_node_all, comm_nodes and comm_tasks
! are created.
!------------------------------------------------------------------------------!

    USE control_parameters
    USE pegrid

    IMPLICIT NONE

    CHARACTER(LEN=8)      ::  cdummy
    INTEGER               ::  idummy, istat, tpln
    INTEGER, DIMENSION(2) ::  dims

    LOGICAL, DIMENSION(2) ::  period, re_dims


!
!-- Set the internal index values for the hybrid solver
#if defined( __parallel )
    npe_s = pdims(1)
#else
    npe_s = 1
#endif
    nxl_a = 0
    nxr_a = nx
    nxl_p = 0
    nxr_p = ( ( nx+1 ) / npe_s ) - 1
    nys_a = nys
    nyn_a = nyn
    nys_p = 0
    nyn_p = ( ( ny+1 ) / npe_s ) - 1

    nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )

#if defined( __KKMP )
    CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
!
!-- An unset or invalid OMP_NUM_THREADS must not yield a thread count < 1,
!-- because n_omp_threads dimensions the array tri in the solver routines and
!-- thread number + 1 is used as index into it. Fall back to one thread.
    READ ( cdummy, '(I8)', IOSTAT=istat )  n_omp_threads
    IF ( istat /= 0  .OR.  n_omp_threads < 1 )  n_omp_threads = 1

    IF ( n_omp_threads > 1 )  THEN
       WRITE( message_string, * ) 'Number of OpenMP threads = ', &
                                  n_omp_threads
       CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
    ENDIF
#else
    n_omp_threads = 1
#endif
!
!-- Initialize the one-dimensional FFT routines
    CALL fft_init

!
!-- Setup for multi node version (poisfft_hybrid_nodes)
    IF ( n_omp_threads == 1  .AND.                                   &
         ( host(1:4) == 'ibmh'  .OR.  host(1:4) == 'ibmb' ) )  THEN

       IF ( tasks_per_node /= -9999 )  THEN
!
!--       Multi node version requires that the available number of
!--       switches per logical partition must be an integral divisor
!--       of the chosen number of tasks per node
          IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 )  THEN
!
!--          Set the switch which decides about usage of the multi node
!--          version. The PEs must fill the nodes x tasks grid created
!--          below completely, therefore numprocs has to be a multiple of
!--          the number of tasks per logical node. Otherwise the standard
!--          version is kept.
             tpln = tasks_per_node / switch_per_lpar
             IF ( tpln > 1  .AND.  numprocs > tasks_per_node )  THEN
                IF ( MOD( numprocs, tpln ) == 0 )  THEN
                   tasks_per_logical_node = tpln
                ENDIF
             ENDIF

             IF ( tasks_per_logical_node > -1 )  THEN

                WRITE( message_string, * ) 'running optimized ',         &
                                           'multinode version',          &
                                           '&switch_per_lpar = ',        &
                                           switch_per_lpar,              &
                                           '&tasks_per_lpar = ',         &
                                           tasks_per_node,               &
                                           'tasks_per_logical_node = ',  &
                                           tasks_per_logical_node
                CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )

             ENDIF

          ENDIF
       ENDIF
    ENDIF

!
!-- Determine sub-topologies for multi node version
    IF ( tasks_per_logical_node >= 2 )  THEN

#if defined( __parallel )
       nodes   = ( numprocs + tasks_per_logical_node - 1 ) / &
                 tasks_per_logical_node
       dims(1) = nodes
       dims(2) = tasks_per_logical_node
!
!--    Non-cyclic in both directions. Assigned here instead of in the
!--    declaration to avoid an implicit SAVE attribute.
       period  = .FALSE.

       CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
                             comm_node_all, istat )
       CALL MPI_COMM_RANK( comm_node_all, me, istat )

!
!--    Communicator along the first (node) dimension
       re_dims(1) = .TRUE.
       re_dims(2) = .FALSE.
       CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
       CALL MPI_COMM_RANK( comm_nodes, me_node, istat )

!
!--    Communicator along the second (task) dimension
       re_dims(1) = .FALSE.
       re_dims(2) = .TRUE.
       CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
       CALL MPI_COMM_RANK( comm_tasks, me_task, istat )

!      write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
!                 tasks_per_logical_node
#else
       message_string = 'parallel environment (MPI) required'
       CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
#endif
    ENDIF

 END SUBROUTINE poisfft_hybrid_ini
	240
	241
 SUBROUTINE poisfft_hybrid( ar )

!------------------------------------------------------------------------------!
! Driver of the hybrid Poisson solver: selects the solver variant.
!   host starting with 'nec'      --> poisfft_hybrid_omp_vec
!   tasks_per_logical_node == -1  --> poisfft_hybrid_omp
!   otherwise                     --> poisfft_hybrid_nodes
! ar is handed over unchanged to the selected routine.
!------------------------------------------------------------------------------!

    USE control_parameters
    USE interfaces

    IMPLICIT NONE

    REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

    IF ( host(1:3) == 'nec' )  THEN
       CALL poisfft_hybrid_omp_vec( ar )
    ELSEIF ( tasks_per_logical_node /= -1 )  THEN
       CALL poisfft_hybrid_nodes( ar )
    ELSE
       CALL poisfft_hybrid_omp( ar )
    ENDIF

 END SUBROUTINE poisfft_hybrid
	262
	263
 SUBROUTINE poisfft_hybrid_omp ( ar )

!------------------------------------------------------------------------------!
! Standard version of the hybrid solver.
! Sequence: forward fft along y, transposition (MPI_ALLTOALL on comm2d or
! plain copy in the serial case), forward fft along x, tridia_hybrid for each
! j, backward fft along x, transposition back, backward fft along y.
! ar is overwritten with the result.
!------------------------------------------------------------------------------!

    USE cpulog
    USE interfaces

    IMPLICIT NONE

    INTEGER, PARAMETER ::  istride = 4  ! stride of i loop
    INTEGER ::  i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread

    REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

    REAL, DIMENSION(0:nx)         ::  fftx_ar
    REAL, DIMENSION(0:ny,istride) ::  ffty_ar

    REAL, DIMENSION(0:nx,nz) ::  tri_ar

    REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) ::  work1, work2
#if defined( __KKMP )
    INTEGER ::  omp_get_thread_num
    REAL, DIMENSION(:,:,:,:), ALLOCATABLE ::  tri
    ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
#else
    REAL, DIMENSION(5,0:nx,0:nz-1,1) ::  tri
#endif


    CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )

    CALL cpu_log( log_point_s(7), 'fft_y', 'start' )

!
!-- Copy up to istride columns along y into ffty_ar, transform them and
!-- distribute the results to the send buffer work1 (one j-slab per PE)
!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
!$OMP DO
    DO  iouter = nxl_p, nxr_p, istride  ! stride loop, better cache
       iei = MIN( iouter+istride-1, nxr_p )
       DO  k = 1, nz

          DO  i = iouter, iei
             ii = nxl + i
             ir = i - iouter + 1

             ffty_ar(nys_a:nyn_a,ir) = ar(k,nys_a:nyn_a,ii)

             CALL fft_y( ffty_ar(:,ir), 'forward' )
          ENDDO

!
!--       m runs over the complete y range while (j,n) address the slabs
          m = nys_a
          DO  n = 1, npe_s
             DO  j = nys_p, nyn_p
                work1(iouter:iei,k,j,n) = ffty_ar(m,1:iei-iouter+1)
                m = m + 1
             ENDDO
          ENDDO

       ENDDO
    ENDDO
!$OMP END PARALLEL

    CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

#if defined( __parallel )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )

    CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       comm2d, istat )

    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
    work2 = work1
#endif

    CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )

!
!-- For every j: gather complete x-lines, transform, solve the tridiagonal
!-- systems, transform back and scatter to work2 again. With __KKMP every
!-- thread uses its own slice of tri.
#if defined( __KKMP )
!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
!$OMP DO
    DO  j = nys_p, nyn_p
       jthread = omp_get_thread_num() + 1
#else
    DO  j = nys_p, nyn_p
       jthread = 1
#endif
       DO  k = 1, nz

          m = nxl_a
          DO  n = 1, npe_s
             fftx_ar(m:m+nxr_p-nxl_p) = work2(nxl_p:nxr_p,k,j,n)
             m = m + nxr_p - nxl_p + 1
          ENDDO

          CALL fft_x( fftx_ar, 'forward' )

          tri_ar(nxl_a:nxr_a,k) = fftx_ar(nxl_a:nxr_a)

       ENDDO

!
!--    Index of this j within the total domain
       jj = myid * (nyn_p-nys_p+1) + j
       CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))

       DO  k = 1, nz
          fftx_ar(nxl_a:nxr_a) = tri_ar(nxl_a:nxr_a,k)

          CALL fft_x( fftx_ar, 'backward' )

          m = nxl_a
          DO  n = 1, npe_s
             work2(nxl_p:nxr_p,k,j,n) = fftx_ar(m:m+nxr_p-nxl_p)
             m = m + nxr_p - nxl_p + 1
          ENDDO

       ENDDO
    ENDDO
#if defined( __KKMP )
!$OMP END PARALLEL
#endif

    CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )

#if defined( __parallel )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)

    CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       comm2d, istat )

    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
    work1 = work2
#endif

    CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )

!
!-- Collect the y-lines from work1, transform backward and store on ar
!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
!$OMP DO
    DO  iouter = nxl_p, nxr_p, istride
       iei = MIN( iouter+istride-1, nxr_p )
       DO  k = 1, nz

          m = nys_a
          DO  n = 1, npe_s
             DO  j = nys_p, nyn_p
                ffty_ar(m,1:iei-iouter+1) = work1(iouter:iei,k,j,n)
                m = m + 1
             ENDDO
          ENDDO

          DO  i = iouter, iei
             ii = nxl + i
             ir = i - iouter + 1
             CALL fft_y( ffty_ar(:,ir), 'backward' )

             ar(k,nys_a:nyn_a,ii) = ffty_ar(nys_a:nyn_a,ir)
          ENDDO

       ENDDO
    ENDDO
!$OMP END PARALLEL

    CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

    CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )

#if defined( __KKMP )
    DEALLOCATE( tri )
#endif

 END SUBROUTINE poisfft_hybrid_omp
	454
	455
 SUBROUTINE poisfft_hybrid_omp_vec ( ar )

!------------------------------------------------------------------------------!
! Variant of poisfft_hybrid_omp which uses the multiple-line FFT routines
! fft_y_m and fft_x_m (called from poisfft_hybrid if host starts with 'nec').
! Sequence: forward fft along y, transposition (MPI_ALLTOALL on comm2d or
! plain copy in the serial case), forward fft along x, tridia_hybrid for each
! j, backward fft along x, transposition back, backward fft along y.
! ar is overwritten with the result.
!------------------------------------------------------------------------------!

    USE cpulog
    USE interfaces

    IMPLICIT NONE

!
!-- istat is needed by the MPI calls only
#if defined( __parallel )
    INTEGER ::  i, istat, j, jj, k, m, n, jthread
#else
    INTEGER ::  i, j, jj, k, m, n, jthread
#endif

    REAL, DIMENSION(0:nx,nz) ::  tri_ar

    REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

    REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) ::  ffty_ar3
    REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) ::  fftx_ar3

    REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) ::  work1, work2
#if defined( __KKMP )
    INTEGER ::  omp_get_thread_num
    REAL, DIMENSION(:,:,:,:), ALLOCATABLE ::  tri
    ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
#else
    REAL, DIMENSION(5,0:nx,0:nz-1,1) ::  tri
#endif


    CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )

    CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )

!
!-- Store grid points to be transformed on ffty_ar3 (y index first), do the
!-- fft for all k of one i at once and sample the results on the 4d-array
!-- work1
!$OMP PARALLEL PRIVATE (i,j,k,m,n)
!$OMP DO
    DO  i = nxl_p, nxr_p

       DO  j = nys_a, nyn_a
          DO  k = 1, nz
             ffty_ar3(j,k,i) = ar(k,j,i+nxl)
          ENDDO
       ENDDO

       CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
    ENDDO

!$OMP DO
    DO  k = 1, nz
!
!--    m runs over the complete y range while (j,n) address the slabs
       m = nys_a
       DO  n = 1, npe_s
          DO  j = nys_p, nyn_p
             DO  i = nxl_p, nxr_p
                work1(i,k,j,n) = ffty_ar3(m,k,i)
             ENDDO
             m = m+1
          ENDDO
       ENDDO
    ENDDO
!$OMP END PARALLEL

    CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )

#if defined( __parallel )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       comm2d, istat )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
    work2 = work1
#endif

    CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )

!
!-- For every j: gather complete x-lines, transform, solve the tridiagonal
!-- systems, transform back and scatter to work2 again. With __KKMP every
!-- thread uses its own slice of tri.
#if defined( __KKMP )
!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
!$OMP DO
    DO  j = nys_p, nyn_p
       jthread = omp_get_thread_num() + 1
#else
    DO  j = nys_p, nyn_p
       jthread = 1
#endif
       DO  k = 1, nz

          m = nxl_a
          DO  n = 1, npe_s
             DO  i = nxl_p, nxr_p
                fftx_ar3(m,k,j) = work2(i,k,j,n)
                m = m+1
             ENDDO
          ENDDO
       ENDDO

       CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )

       DO  k = 1, nz
          DO  i = nxl_a, nxr_a
             tri_ar(i,k) = fftx_ar3(i,k,j)
          ENDDO
       ENDDO

!
!--    Index of this j within the total domain
       jj = myid * (nyn_p-nys_p+1) + j
       CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))

       DO  k = 1, nz
          DO  i = nxl_a, nxr_a
             fftx_ar3(i,k,j) = tri_ar (i,k)
          ENDDO
       ENDDO

       CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )

       DO  k = 1, nz
          m = nxl_a
          DO  n = 1, npe_s
             DO  i = nxl_p, nxr_p
                work2(i,k,j,n) = fftx_ar3(m,k,j)
                m = m+1
             ENDDO
          ENDDO
       ENDDO

    ENDDO
#if defined( __KKMP )
!$OMP END PARALLEL
#endif

    CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )

#if defined( __parallel )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
    CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       comm2d, istat )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
    work1 = work2
#endif

    CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )

!
!-- Collect the y-lines from work1, transform backward and store on ar
!$OMP PARALLEL PRIVATE (i,j,k,m,n)
!$OMP DO
    DO  k = 1, nz
       m = nys_a
       DO  n = 1, npe_s
          DO  j = nys_p, nyn_p
             DO  i = nxl_p, nxr_p
                ffty_ar3(m,k,i) = work1(i,k,j,n)
             ENDDO
             m = m+1
          ENDDO
       ENDDO
    ENDDO

!$OMP DO
    DO  i = nxl_p, nxr_p
       CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
       DO  j = nys_a, nyn_a
          DO  k = 1, nz
             ar(k,j,i+nxl) = ffty_ar3(j,k,i)
          ENDDO
       ENDDO
    ENDDO
!$OMP END PARALLEL

    CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )

    CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )

#if defined( __KKMP )
    DEALLOCATE( tri )
#endif

 END SUBROUTINE poisfft_hybrid_omp_vec
	634
	635
 SUBROUTINE poisfft_hybrid_nodes ( ar )

!------------------------------------------------------------------------------!
! Multi node version of the hybrid solver (used if tasks_per_logical_node has
! been set by poisfft_hybrid_ini).
! The transposition is split into an exchange within comm_tasks followed by
! an exchange within comm_nodes. The exchange within comm_nodes, the fft
! along x and the tridiagonal solver are carried out for each j separately;
! the calls of cascade before and after the first comm_nodes exchange order
! the tasks of the first node.
! ar is overwritten with the result.
!------------------------------------------------------------------------------!

    USE cpulog
    USE interfaces

    IMPLICIT NONE

    INTEGER, PARAMETER ::  istride = 4  ! stride of i loop
    INTEGER ::  i, iei, ii, iouter, ir, istat, j, jj, k, m, &
                n, nn, nt, nw1, nw2

    REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

    REAL, DIMENSION(0:nx)         ::  fftx_ar
    REAL, DIMENSION(0:ny,istride) ::  ffty_ar

    REAL, DIMENSION(0:nx,nz) ::  tri_ar

    REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
                    nodes,nys_p:nyn_p) ::  work1,work2
    REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri


    CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )

    CALL cpu_log( log_point_s(7), 'fft_y', 'start' )

!
!-- Store grid points to be transformed on a 1d-array, do the fft
!-- and sample the results on the 5d-array work1
    DO  iouter = nxl_p, nxr_p, istride  ! stride loop, better cache
       iei = MIN( iouter+istride-1, nxr_p )
       DO  k = 1, nz

          DO  i = iouter, iei
             ii = nxl + i
             ir = i - iouter + 1

             ffty_ar(nys_a:nyn_a,ir) = ar(k,nys_a:nyn_a,ii)

             CALL fft_y( ffty_ar(:,ir), 'forward' )
          ENDDO

!
!--       m runs over the complete y range while (j,nt,nn) address the slabs
          m = nys_a
          DO  nn = 1, nodes
             DO  nt = 1, tasks_per_logical_node
                DO  j = nys_p, nyn_p
                   DO  i = iouter, iei
                      ir = i - iouter + 1
                      work1(i,k,nt,nn,j) = ffty_ar(m,ir)
                   ENDDO
                   m = m+1
                ENDDO
             ENDDO
          ENDDO

       ENDDO
    ENDDO

    CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

!
!-- Message lengths: nw1 for the exchange within comm_tasks, nw2 for the
!-- exchange within comm_nodes. Both are constant during this call.
    nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
    nw2 = nw1 * SIZE( work1, 3 )

    CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
    DO  nn = 1, nodes
       DO  j = nys_p, nyn_p
#if defined( __parallel )
          CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
                             work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
                             comm_tasks, istat )
#endif
       ENDDO
    ENDDO
    CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )


    DO  j = nys_p, nyn_p

       CALL cascade( 1, j, nys_p, nyn_p )
       CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
#if defined( __parallel )
       CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
                          work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
                          comm_nodes, istat )
#endif
       CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
       CALL cascade( 2, j, nys_p, nyn_p )

       CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
       DO  k = 1, nz

!
!--       Gather one complete x-line from all nodes and tasks
          m = nxl_a
          DO  nn = 1, nodes
             DO  nt = 1, tasks_per_logical_node
                DO  i = nxl_p, nxr_p
                   fftx_ar(m) = work1(i,k,nt,nn,j)
                   m = m+1
                ENDDO
             ENDDO
          ENDDO

          CALL fft_x( fftx_ar, 'forward' )

          tri_ar(nxl_a:nxr_a,k) = fftx_ar(nxl_a:nxr_a)

       ENDDO

!
!--    Index of this j within the total domain
       jj = myid * (nyn_p-nys_p+1) + j
       CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )

       DO  k = 1, nz
          fftx_ar(nxl_a:nxr_a) = tri_ar(nxl_a:nxr_a,k)

          CALL fft_x( fftx_ar, 'backward' )

          m = nxl_a
          DO  nn = 1, nodes
             DO  nt = 1, tasks_per_logical_node
                DO  i = nxl_p, nxr_p
                   work1(i,k,nt,nn,j) = fftx_ar(m)
                   m = m+1
                ENDDO
             ENDDO
          ENDDO
       ENDDO

       CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
       CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
#if defined( __parallel )
       CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
                          work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
                          comm_nodes, istat )
#endif
       CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )

    ENDDO

    CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
    DO  nn = 1, nodes
       DO  j = nys_p, nyn_p
#if defined( __parallel )
          CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
                             work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
                             comm_tasks, istat )
#endif
       ENDDO
    ENDDO
    CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )

    CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )

!
!-- Collect the y-lines from work1, transform backward and store on ar
    DO  iouter = nxl_p, nxr_p, istride
       iei = MIN( iouter+istride-1, nxr_p )
       DO  k = 1, nz

          m = nys_a
          DO  nn = 1, nodes
             DO  nt = 1, tasks_per_logical_node
                DO  j = nys_p, nyn_p
                   DO  i = iouter, iei
                      ir = i - iouter + 1
                      ffty_ar(m,ir) = work1(i,k,nt,nn,j)
                   ENDDO
                   m = m+1
                ENDDO
             ENDDO
          ENDDO

          DO  i = iouter, iei
             ii = nxl + i
             ir = i - iouter + 1
             CALL fft_y( ffty_ar(:,ir), 'backward' )

             ar(k,nys_a:nyn_a,ii) = ffty_ar(nys_a:nyn_a,ir)
          ENDDO

       ENDDO
    ENDDO

    CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

    CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )

 END SUBROUTINE poisfft_hybrid_nodes
	829
	830
	831
 SUBROUTINE tridia_hybrid( j, ar, tri )

!------------------------------------------------------------------------------!
! Solves, for one index j, the tridiagonal systems along the third (k)
! dimension for all i = 0..nx (Thomas algorithm).
!
!   j    index along y; for j > (ny+1)/2 the matrix is set up with ny+1-j
!   ar   right hand side on input, solution on output
!   tri  work array for the matrix: 1 = main diagonal, 2 and 3 = constant
!        off-diagonal coefficients, 4 and 5 = results of the splitting
!------------------------------------------------------------------------------!

    USE arrays_3d
    USE control_parameters
    USE grid_variables

    IMPLICIT NONE

    INTEGER, INTENT(IN) ::  j
    INTEGER ::  k, nnyh
    REAL, DIMENSION(0:nx,nz), INTENT(INOUT)       ::  ar
    REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) ::  tri

    nnyh = (ny+1) / 2

    tri = 0.0
!
!-- Define constant elements of the tridiagonal matrix (independent of i)
    DO  k = 0, nz-1
       tri(2,:,k) = ddzu_pres(k+1) * ddzw(k+1)
       tri(3,:,k) = ddzu_pres(k+2) * ddzw(k+1)
    ENDDO

    IF ( j <= nnyh )  THEN
       CALL maketri_hybrid( j )
    ELSE
       CALL maketri_hybrid( ny+1-j)
    ENDIF
    CALL zerleg_hybrid
    CALL substi_hybrid( ar, tri )

 CONTAINS

    SUBROUTINE maketri_hybrid( j )

!----------------------------------------------------------------------!
!                         maketri                                      !
!                                                                      !
!       computes the i- and j-dependent component of the matrix        !
!----------------------------------------------------------------------!

       USE constants

       IMPLICIT NONE

       INTEGER, INTENT(IN) ::  j
       INTEGER ::  i, k, nnxh
       REAL    ::  a, c, ly

       REAL, DIMENSION(0:nx) ::  l


       nnxh = (nx+1) / 2
!
!--    Provide the tridiagonal matrix for solution of the Poisson equation
!--    in Fourier space. The coefficients are computed following the method
!--    of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
!--    Siano's original version.
!--    The contribution of the y direction does not depend on i.
       ly = 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
                               REAL( ny+1 ) ) ) / ( dy * dy )

       DO  i = 0, nx
          IF ( i < nnxh )  THEN
             l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
                                       REAL( nx+1 ) ) ) / ( dx * dx ) + ly
          ELSE
!
!--          i >= nnxh: the index mirrored at nx+1 is used
             l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
                                       REAL( nx+1 ) ) ) / ( dx * dx ) + ly
          ENDIF
       ENDDO

       DO  k = 0, nz-1
          a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
          c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
          DO  i = 0, nx
             tri(1,i,k) = a + c - l(i)
          ENDDO
       ENDDO
!
!--    Modification of the main diagonal in the first and last row,
!--    depending on the boundary condition switches ibc_p_b and ibc_p_t
       IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
          DO  i = 0, nx
             tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
          ENDDO
       ENDIF
       IF ( ibc_p_t == 1 )  THEN
          DO  i = 0, nx
             tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
          ENDDO
       ENDIF

    END SUBROUTINE maketri_hybrid


    SUBROUTINE zerleg_hybrid

!----------------------------------------------------------------------!
!                          zerleg                                      !
!                                                                      !
!       Splitting of the tridiagonal matrix (Thomas algorithm)         !
!----------------------------------------------------------------------!

       USE indices

       IMPLICIT NONE

       INTEGER ::  i, k

!
!--    Splitting: tri(4,:,:) receives the modified main diagonal,
!--    tri(5,:,:) the elimination factors
       DO  i = 0, nx
          tri(4,i,0) = tri(1,i,0)
       ENDDO
       DO  k = 1, nz-1
          DO  i = 0, nx
             tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
             tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
          ENDDO
       ENDDO

    END SUBROUTINE zerleg_hybrid

    SUBROUTINE substi_hybrid( ar, tri )

!----------------------------------------------------------------------!
!                          substi                                      !
!                                                                      !
!       Substitution (Forward and Backward) (Thomas algorithm)         !
!----------------------------------------------------------------------!

       IMPLICIT NONE

       INTEGER ::  i, k
       REAL, DIMENSION(0:nx,nz)       ::  ar
       REAL, DIMENSION(0:nx,0:nz-1)   ::  ar1
       REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri

!
!--    Forward substitution (ar is indexed 1..nz, ar1 and tri 0..nz-1)
       DO  i = 0, nx
          ar1(i,0) = ar(i,1)
       ENDDO
       DO  k = 1, nz - 1
          DO  i = 0, nx
             ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
          ENDDO
       ENDDO

!
!--    Backward substitution
       DO  i = 0, nx
          ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
       ENDDO
       DO  k = nz-2, 0, -1
          DO  i = 0, nx
             ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
                         / tri(4,i,k)
          ENDDO
       ENDDO

    END SUBROUTINE substi_hybrid

 END SUBROUTINE tridia_hybrid
	1000
	1001
 SUBROUTINE cascade( loca, j, nys_p, nyn_p )

!------------------------------------------------------------------------------!
! Exchange of one-word messages between the tasks (communicator comm_tasks)
! of the first node (me_node == 0). All other PEs return immediately.
!
!   loca = 1  (called before the alltoall within comm_nodes):
!             every task except task 0 does a send/receive with task
!             me_task-1; task 0 receives, for j > nys_p, the message which
!             the last task has sent with loca = 2 for the preceding j.
!   loca = 2  (called after that alltoall):
!             every task except the last one does a send/receive with task
!             me_task+1; the last task sends, for j < nyn_p, a non-blocking
!             message to task 0.
!------------------------------------------------------------------------------!

    USE cpulog

    IMPLICIT NONE

    INTEGER, INTENT(IN) ::  j, loca, nyn_p, nys_p

    INTEGER       ::  ierr, req, reqa(1)
    INTEGER, SAVE ::  tag = 10
#if defined( __parallel )
    INTEGER, DIMENSION(MPI_STATUS_SIZE)   ::  stat
    INTEGER, DIMENSION(MPI_STATUS_SIZE,1) ::  stata
!
!-- Request of the last non-blocking send. It is kept between calls, because
!-- the send has to be completed before the request handle may be re-used.
    INTEGER, SAVE ::  send_req = MPI_REQUEST_NULL
#endif

    REAL       ::  buf, buf1
!
!-- Buffer of the non-blocking send. It must stay valid until the send is
!-- completed, i.e. beyond the end of this routine, and therefore is saved.
    REAL, SAVE ::  send_buf = 1.0


    buf  = 1.0
    buf1 = 1.1
    IF ( me_node == 0 )  THEN  ! first node only

       SELECT CASE ( loca )

          CASE ( 1 )  ! before alltoall

             IF( me_task > 0 )  THEN  ! all tasks except the first one
#if defined( __parallel )
                CALL MPI_SENDRECV( buf,  1, MPI_REAL, me_task-1, 0, &
                                   buf1, 1, MPI_REAL, me_task-1, 0, &
                                   comm_tasks, stat, ierr )
#endif
             ELSEIF ( j > nys_p )  THEN  ! first task, except for the first j
                req = 0
                tag = MOD( tag-10, 10 ) + 10
#if defined( __parallel )
                CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
                                tag, comm_tasks, req, ierr )
                reqa = req
                CALL MPI_WAITALL( 1, reqa, stata, ierr )
#endif
             ENDIF

          CASE ( 2 )  ! after alltoall

             IF ( me_task < tasks_per_logical_node-1 )  THEN  ! all but last
#if defined( __parallel )
                CALL MPI_SENDRECV( buf,  1, MPI_REAL, me_task+1, 0, &
                                   buf1, 1, MPI_REAL, me_task+1, 0, &
                                   comm_tasks, stat, ierr)
#endif
             ELSEIF ( j < nyn_p )  THEN  ! last task, except for the last j
                tag = MOD( tag-10, 10 ) + 10
#if defined( __parallel )
!
!--             Complete the send of the preceding j before the buffer and
!--             the request are used again (returns at once for
!--             MPI_REQUEST_NULL). The matching receive is posted by task 0
!--             with loca = 1 for the current j, i.e. before the chain of
!--             send/receives which precedes this statement has started.
                CALL MPI_WAIT( send_req, stat, ierr )
                CALL MPI_ISEND( send_buf, 1, MPI_REAL, 0, tag, comm_tasks, &
                                send_req, ierr )
#endif
             ENDIF

       END SELECT

    ENDIF

 END SUBROUTINE cascade
	1065
	1066	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |