Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 413

Last change on this file since 413 was 412, checked in by raasch, 15 years ago
data assignments for dvrp arrays switched back to old settings due to runtime problems; type problem in mpi_waitall (poisfft_hybrid) fixed for mpi2 libraries
Property svn:keywords set to `Id`
File size: 31.3 KB

Rev	Line
[1]	1	MODULE poisfft_hybrid_mod
	2	!------------------------------------------------------------------------------
	3	!
[257]	4	! Current revisions:
[1]	5	! -----------------
[412]	6	! Dimension of array stat in cascade changed to prevent type problems with
	7	! mpi2 libraries
[1]	8	!
	9	! Former revisions:
	10	! -----------------
[3]	11	! $Id: poisfft_hybrid.f90 412 2009-12-14 07:14:46Z heinze $
[392]	12	!
	13	! 274 2009-03-26 15:11:21Z heinze
	14	! Output of messages replaced by message handling routine.
	15	!
	16	! Feb. 2007
[3]	17	! RCS Log replaced by Id keyword, revision history cleaned up
	18	!
[1]	19	! Revision 1.11 2004/04/30 12:43:14 raasch
	20	! Renaming of fft routines, additional argument in calls of fft_y_m
	21	!
	22	! Revision 1.2 2002/12/19 16:08:31 raasch
	23	! Preprocessor directive KKMP introduced (OMP does NOT work),
	24	! array tri will be a shared array in OpenMP loop, to get better cache
	25	! utilization, the i index (x-direction) will be executed in stride
	26	! "istride" as outer loop and in a shorter inner loop,
	27	! overlapping of computation and communication realized by new routine
	28	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
	29	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
	30	!
	31	!
	32	! Description:
	33	! ------------
	34	! Solution of the Poisson equation with a 2D spectral method.
	35	! Hybrid version for parallel computers using a 1D domain decomposition,
	36	! realized with MPI, along x and parallelization with OPEN-MP along y
	37	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
	38	! optimization is realized by overlapping of computation and communication
	39	! and by simultaneously executing as many communication calls as switches
	40	! per logical partition (LPAR) are available. This version comes into
	41	! effect if more than one node is used and if the environment variable
	42	! tasks_per_node is set in a way that it can be divided by switch_per_lpar
	43	! without remainder.
	44	!
	45	! WARNING: In case of OpenMP, there are problems with allocating large
	46	! arrays in parallel regions.
	47	!
	48	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
	49	!------------------------------------------------------------------------------!
	50
	51	USE fft_xy
	52	USE indices
	53	USE pegrid
	54
	55	IMPLICIT NONE
	56
	57	PRIVATE
	58	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
	59
	60	INTEGER, PARAMETER :: switch_per_lpar = 2
	61
	62	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
	63	nxl_p, nxr_p, & ! partial x dimension
	64	nys_a, nyn_a, & ! total y dimension
	65	nys_p, nyn_p, & ! partial y dimension
	66
	67	npe_s, & ! total number of PEs for solver
	68	nwords, & ! number of points to be exchanged
	69	! with MPI_ALLTOALL
	70	n_omp_threads ! number of OpenMP threads
	71
	72	!
	73	!-- Variables for multi node version (cluster version) using routine
	74	!-- poisfft_hybrid_nodes
	75	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
	76	comm_node_all, & ! communicater all PEs node version
	77	comm_tasks, & ! communicater tasks
	78	me, me_node, me_task,& ! identity of this PE
	79	nodes, & ! number of nodes
	80	tasks_per_logical_node = -1 ! default no cluster
	81
	82
	83	!
	84	!-- Public interfaces
	85	INTERFACE poisfft_hybrid_ini
	86	MODULE PROCEDURE poisfft_hybrid_ini
	87	END INTERFACE poisfft_hybrid_ini
	88
	89	INTERFACE poisfft_hybrid
	90	MODULE PROCEDURE poisfft_hybrid
	91	END INTERFACE poisfft_hybrid
	92
	93	!
	94	!-- Private interfaces
	95	INTERFACE poisfft_hybrid_omp
	96	MODULE PROCEDURE poisfft_hybrid_omp
	97	END INTERFACE poisfft_hybrid_omp
	98
	99	INTERFACE poisfft_hybrid_omp_vec
	100	MODULE PROCEDURE poisfft_hybrid_omp_vec
	101	END INTERFACE poisfft_hybrid_omp_vec
	102
	103	INTERFACE poisfft_hybrid_nodes
	104	MODULE PROCEDURE poisfft_hybrid_nodes
	105	END INTERFACE poisfft_hybrid_nodes
	106
	107	INTERFACE tridia_hybrid
	108	MODULE PROCEDURE tridia_hybrid
	109	END INTERFACE tridia_hybrid
	110
	111	INTERFACE cascade
	112	MODULE PROCEDURE cascade
	113	END INTERFACE cascade
	114
	115	CONTAINS
	116
	117
 SUBROUTINE poisfft_hybrid_ini
!
!-- Initialization of the hybrid Poisson solver. Sets the internal index
!-- bounds, determines the number of OpenMP threads, initializes the
!-- one-dimensional FFT routines and, if the multi node version
!-- (poisfft_hybrid_nodes) is selected, creates its communicators.

    USE control_parameters
    USE pegrid

    IMPLICIT NONE

    CHARACTER(LEN=8) ::  cdummy
    INTEGER ::  idummy, ios, istat
    INTEGER, DIMENSION(2) ::  dims

    LOGICAL, DIMENSION(2) ::  period, re_dims
#if defined( __KKMP )
    INTEGER ::  omp_get_max_threads
#endif


!
!-- Set the internal index values for the hybrid solver
#if defined( __parallel )
    npe_s = pdims(1)
#else
    npe_s = 1
#endif
    nxl_a = 0
    nxr_a = nx
    nxl_p = 0
    nxr_p = ( ( nx+1 ) / npe_s ) - 1
    nys_a = nys
    nyn_a = nyn
    nys_p = 0
    nyn_p = ( ( ny+1 ) / npe_s ) - 1

    nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )

#if defined( __KKMP )
!
!-- Take the number of threads from the environment. If OMP_NUM_THREADS is
!-- not set or cannot be read as a positive integer, fall back to the number
!-- of threads reported by the OpenMP runtime. Otherwise the thread-private
!-- workspaces of the solver would be allocated with a wrong (zero) size.
    cdummy = ' '
    CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
    READ ( cdummy, '(I8)', IOSTAT=ios )  n_omp_threads
    IF ( ios /= 0  .OR.  n_omp_threads < 1 )  THEN
       n_omp_threads = omp_get_max_threads()
    ENDIF
    IF ( n_omp_threads > 1 )  THEN
       WRITE( message_string, * ) 'Number of OpenMP threads = ', &
                                  n_omp_threads
       CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
    ENDIF
#else
    n_omp_threads = 1
#endif
!
!-- Initialize the one-dimensional FFT routines
    CALL fft_init

!
!-- Setup for multi node version (poisfft_hybrid_nodes)
    IF ( n_omp_threads == 1  .AND.                                   &
         ( host(1:4) == 'ibmh'  .OR.  host(1:4) == 'ibmb' ) )  THEN

       IF ( tasks_per_node /= -9999 )  THEN
!
!--       Multi node version requires that the available number of
!--       switches per logical partition must be an integral divisor
!--       of the chosen number of tasks per node
          IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 )  THEN
!
!--          Set the switch which decides about usage of the multi node
!--          version
             IF ( tasks_per_node / switch_per_lpar > 1  .AND.        &
                  numprocs > tasks_per_node )  THEN
                tasks_per_logical_node = tasks_per_node / switch_per_lpar
             ENDIF

             IF ( tasks_per_logical_node > -1 )  THEN

                WRITE( message_string, * ) 'running optimized ', &
                                           'multinode version', &
                                           '&switch_per_lpar = ', &
                                           switch_per_lpar, &
                                           '&tasks_per_lpar = ', &
                                           tasks_per_node, &
                                           'tasks_per_logical_node = ', &
                                           tasks_per_logical_node
                CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )

             ENDIF

          ENDIF
       ENDIF
    ENDIF

!
!-- Determine sub-topologies for multi node version
    IF ( tasks_per_logical_node >= 2 )  THEN

#if defined( __parallel )
       nodes   = ( numprocs + tasks_per_logical_node - 1 ) / &
                 tasks_per_logical_node
       dims(1) = nodes
       dims(2) = tasks_per_logical_node

!
!--    Non-periodic 2d topology (nodes x tasks per logical node). The flags
!--    are assigned here instead of in the declaration to avoid an implicit
!--    SAVE attribute.
       period = .FALSE.
       CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
                             comm_node_all, istat )
       CALL MPI_COMM_RANK( comm_node_all, me, istat )

!
!--    Sub-communicator along the first (node) dimension
       re_dims(1) = .TRUE.
       re_dims(2) = .FALSE.
       CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
       CALL MPI_COMM_RANK( comm_nodes, me_node, istat )

!
!--    Sub-communicator along the second (task) dimension
       re_dims(1) = .FALSE.
       re_dims(2) = .TRUE.
       CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
       CALL MPI_COMM_RANK( comm_tasks, me_task, istat )

!      write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
!                 tasks_per_logical_node
#else
       message_string = 'parallel environment (MPI) required'
       CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
#endif
    ENDIF

 END SUBROUTINE poisfft_hybrid_ini
	235
	236
 SUBROUTINE poisfft_hybrid( ar )
!
!-- Driver of the hybrid Poisson solver. Selects the solver variant:
!-- the vector version on hosts whose name starts with 'nec', the multi
!-- node version if tasks_per_logical_node has been set by
!-- poisfft_hybrid_ini, and the OpenMP version otherwise.

    USE control_parameters
    USE interfaces

    IMPLICIT NONE

    REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

    IF ( host(1:3) == 'nec' )  THEN
       CALL poisfft_hybrid_omp_vec( ar )
    ELSEIF ( tasks_per_logical_node /= -1 )  THEN
       CALL poisfft_hybrid_nodes( ar )
    ELSE
       CALL poisfft_hybrid_omp( ar )
    ENDIF

 END SUBROUTINE poisfft_hybrid
	257
	258
 SUBROUTINE poisfft_hybrid_omp ( ar )
!
!-- Solver variant using MPI_ALLTOALL transpositions and OpenMP loops.
!-- Sequence: forward FFT along y, transposition, forward FFT along x,
!-- solution of the tridiagonal systems (tridia_hybrid), backward FFT along
!-- x, back-transposition, backward FFT along y. The result overwrites ar.

    USE cpulog
    USE interfaces

    IMPLICIT NONE

    INTEGER, PARAMETER ::  istride = 4   ! number of x-columns per block

    REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

    INTEGER ::  col, i, i0, i1, ig, istat, j, jglob, k, n, pos, tid

    REAL, DIMENSION(0:nx)         ::  xline    ! one line along x
    REAL, DIMENSION(0:nx,nz)      ::  slab     ! x-z slab given to tridia_hybrid
    REAL, DIMENSION(0:ny,istride) ::  ylines   ! up to istride lines along y

    REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) ::  work1, work2
#if defined( __KKMP )
    INTEGER ::  omp_get_thread_num
    REAL, DIMENSION(:,:,:,:), ALLOCATABLE ::  tri
#else
    REAL, DIMENSION(5,0:nx,0:nz-1,1) ::  tri
#endif


#if defined( __KKMP )
!
!-- The last dimension of tri is indexed by the OpenMP thread number
    ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
#endif

    CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )

    CALL cpu_log( log_point_s(7), 'fft_y', 'start' )

!
!-- Forward FFT along y. The x-index is processed in blocks of istride
!-- columns; the transformed data are sorted into work1 for the transposition.
!$OMP PARALLEL PRIVATE (i,i0,ig,col,i1,j,k,pos,n,ylines)
!$OMP DO
    DO  i0 = nxl_p, nxr_p, istride
       i1 = MIN( i0+istride-1, nxr_p )
       DO  k = 1, nz

          DO  i = i0, i1
             ig  = nxl + i
             col = i - i0 + 1
             ylines(nys_a:nyn_a,col) = ar(k,nys_a:nyn_a,ig)
             CALL fft_y( ylines(:,col), 'forward' )
          ENDDO

!
!--       pos runs through the complete y-range, split among the PEs
          pos = nys_a
          DO  n = 1, npe_s
             DO  j = nys_p, nyn_p
                work1(i0:i1,k,j,n) = ylines(pos,1:i1-i0+1)
                pos = pos + 1
             ENDDO
          ENDDO

       ENDDO
    ENDDO
!$OMP END PARALLEL

    CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

#if defined( __parallel )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )

    CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       comm2d, istat )

    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
    work2 = work1
#endif

    CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )

!
!-- For every local y-index: forward FFT along x, tridiagonal solver,
!-- backward FFT along x
#if defined( __KKMP )
!$OMP PARALLEL PRIVATE (i,j,jglob,k,pos,n,xline,slab,tid)
!$OMP DO
#endif
    DO  j = nys_p, nyn_p
#if defined( __KKMP )
       tid = omp_get_thread_num() + 1
#else
       tid = 1
#endif
       DO  k = 1, nz

!
!--       Gather the pieces received from all PEs into one line along x
          pos = nxl_a
          DO  n = 1, npe_s
             xline(pos:pos+nxr_p-nxl_p) = work2(:,k,j,n)
             pos = pos + nxr_p - nxl_p + 1
          ENDDO

          CALL fft_x( xline, 'forward' )

          slab(nxl_a:nxr_a,k) = xline(nxl_a:nxr_a)

       ENDDO

!
!--    Index of this line within the complete y-range
       jglob = myid * (nyn_p-nys_p+1) + j
       CALL tridia_hybrid( jglob, slab, tri(:,:,:,tid) )

       DO  k = 1, nz

          xline(nxl_a:nxr_a) = slab(nxl_a:nxr_a,k)

          CALL fft_x( xline, 'backward' )

          pos = nxl_a
          DO  n = 1, npe_s
             work2(:,k,j,n) = xline(pos:pos+nxr_p-nxl_p)
             pos = pos + nxr_p - nxl_p + 1
          ENDDO

       ENDDO
    ENDDO
#if defined( __KKMP )
!$OMP END PARALLEL
#endif

    CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )

#if defined( __parallel )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)

    CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       comm2d, istat )

    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
    work1 = work2
#endif

    CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )

!
!-- Backward FFT along y, result is stored on ar
!$OMP PARALLEL PRIVATE (i,i0,ig,col,i1,j,k,pos,n,ylines)
!$OMP DO
    DO  i0 = nxl_p, nxr_p, istride
       i1 = MIN( i0+istride-1, nxr_p )
       DO  k = 1, nz

          pos = nys_a
          DO  n = 1, npe_s
             DO  j = nys_p, nyn_p
                ylines(pos,1:i1-i0+1) = work1(i0:i1,k,j,n)
                pos = pos + 1
             ENDDO
          ENDDO

          DO  i = i0, i1
             ig  = nxl + i
             col = i - i0 + 1
             CALL fft_y( ylines(:,col), 'backward' )
             ar(k,nys_a:nyn_a,ig) = ylines(nys_a:nyn_a,col)
          ENDDO

       ENDDO
    ENDDO
!$OMP END PARALLEL

    CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

    CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )

#if defined( __KKMP )
    DEALLOCATE( tri )
#endif

 END SUBROUTINE poisfft_hybrid_omp
	449
	450
 SUBROUTINE poisfft_hybrid_omp_vec ( ar )
!
!-- Solver variant which calls the multi-line FFT routines fft_y_m and
!-- fft_x_m on complete 2d-slabs. The sequence of steps is the same as in
!-- poisfft_hybrid_omp. The result overwrites ar.

    USE cpulog
    USE interfaces

    IMPLICIT NONE

    REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

    INTEGER ::  i, istat, j, jglob, k, n, pos, tid

    REAL, DIMENSION(0:nx,nz) ::  slab   ! x-z slab given to tridia_hybrid

    REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) ::  ffty_ar3
    REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) ::  fftx_ar3

    REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) ::  work1, work2
#if defined( __KKMP )
    INTEGER ::  omp_get_thread_num
    REAL, DIMENSION(:,:,:,:), ALLOCATABLE ::  tri
#else
    REAL, DIMENSION(5,0:nx,0:nz-1,1) ::  tri
#endif


#if defined( __KKMP )
!
!-- The last dimension of tri is indexed by the OpenMP thread number
    ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
#endif

    CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )

    CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )

!
!-- Copy ar (index order k,j,i) to ffty_ar3 (index order j,k,i), transform
!-- each y-z slab and sort the results into work1
!$OMP PARALLEL PRIVATE (i,j,k,pos,n)
!$OMP DO
    DO  i = nxl_p, nxr_p
       ffty_ar3(nys_a:nyn_a,1:nz,i) = TRANSPOSE( ar(1:nz,nys_a:nyn_a,i+nxl) )
       CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
    ENDDO

!$OMP DO
    DO  k = 1, nz
       pos = nys_a
       DO  n = 1, npe_s
          DO  j = nys_p, nyn_p
             work1(:,k,j,n) = ffty_ar3(pos,k,:)
             pos = pos + 1
          ENDDO
       ENDDO
    ENDDO
!$OMP END PARALLEL

    CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )

#if defined( __parallel )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       comm2d, istat )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
    work2 = work1
#endif

    CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )

!
!-- For every local y-index: forward FFT along x, tridiagonal solver,
!-- backward FFT along x
#if defined( __KKMP )
!$OMP PARALLEL PRIVATE (i,j,jglob,k,pos,n,slab,tid)
!$OMP DO
#endif
    DO  j = nys_p, nyn_p
#if defined( __KKMP )
       tid = omp_get_thread_num() + 1
#else
       tid = 1
#endif
!
!--    Gather the pieces received from all PEs into complete lines along x
       DO  k = 1, nz
          pos = nxl_a
          DO  n = 1, npe_s
             fftx_ar3(pos:pos+nxr_p-nxl_p,k,j) = work2(:,k,j,n)
             pos = pos + nxr_p - nxl_p + 1
          ENDDO
       ENDDO

       CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )

       slab(nxl_a:nxr_a,:) = fftx_ar3(nxl_a:nxr_a,:,j)

!
!--    Index of this line within the complete y-range
       jglob = myid * (nyn_p-nys_p+1) + j
       CALL tridia_hybrid( jglob, slab, tri(:,:,:,tid) )

       fftx_ar3(nxl_a:nxr_a,:,j) = slab(nxl_a:nxr_a,:)

       CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )

       DO  k = 1, nz
          pos = nxl_a
          DO  n = 1, npe_s
             work2(:,k,j,n) = fftx_ar3(pos:pos+nxr_p-nxl_p,k,j)
             pos = pos + nxr_p - nxl_p + 1
          ENDDO
       ENDDO

    ENDDO
#if defined( __KKMP )
!$OMP END PARALLEL
#endif

    CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )

#if defined( __parallel )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
    nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
    CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
                       comm2d, istat )
    CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
    work1 = work2
#endif

    CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )

!
!-- Resort the data, backward FFT along y and storage of the result on ar
!$OMP PARALLEL PRIVATE (i,j,k,pos,n)
!$OMP DO
    DO  k = 1, nz
       pos = nys_a
       DO  n = 1, npe_s
          DO  j = nys_p, nyn_p
             ffty_ar3(pos,k,:) = work1(:,k,j,n)
             pos = pos + 1
          ENDDO
       ENDDO
    ENDDO

!$OMP DO
    DO  i = nxl_p, nxr_p
       CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
       ar(1:nz,nys_a:nyn_a,i+nxl) = TRANSPOSE( ffty_ar3(nys_a:nyn_a,1:nz,i) )
    ENDDO
!$OMP END PARALLEL

    CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )

    CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )

#if defined( __KKMP )
    DEALLOCATE( tri )
#endif

 END SUBROUTINE poisfft_hybrid_omp_vec
	629
	630
 SUBROUTINE poisfft_hybrid_nodes ( ar )
!
!-- Multi node solver variant. The transposition is split into an exchange
!-- among the tasks (comm_tasks) followed, separately for every local
!-- y-index, by an exchange among the nodes (comm_nodes). The latter is
!-- bracketed by calls of cascade and is followed directly by the
!-- computations (FFT along x, tridiagonal solver) for that y-index.
!-- The result overwrites ar.

    USE cpulog
    USE interfaces

    IMPLICIT NONE

    INTEGER, PARAMETER ::  istride = 4   ! number of x-columns per block

    REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

    INTEGER ::  col, i, i0, i1, ig, inode, istat, itask, j, jglob, k, &
                nw1, nw2, pos

    REAL, DIMENSION(0:nx)         ::  xline    ! one line along x
    REAL, DIMENSION(0:nx,nz)      ::  slab     ! x-z slab given to tridia_hybrid
    REAL, DIMENSION(0:ny,istride) ::  ylines   ! up to istride lines along y

    REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
                    nodes,nys_p:nyn_p) ::  work1, work2
    REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri


    CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )

    CALL cpu_log( log_point_s(7), 'fft_y', 'start' )

!
!-- Forward FFT along y. The x-index is processed in blocks of istride
!-- columns; the transformed data are sorted into work1.
    DO  i0 = nxl_p, nxr_p, istride
       i1 = MIN( i0+istride-1, nxr_p )
       DO  k = 1, nz

          DO  i = i0, i1
             ig  = nxl + i
             col = i - i0 + 1
             ylines(nys_a:nyn_a,col) = ar(k,nys_a:nyn_a,ig)
             CALL fft_y( ylines(:,col), 'forward' )
          ENDDO

          pos = nys_a
          DO  inode = 1, nodes
             DO  itask = 1, tasks_per_logical_node
                DO  j = nys_p, nyn_p
                   work1(i0:i1,k,itask,inode,j) = ylines(pos,1:i1-i0+1)
                   pos = pos + 1
                ENDDO
             ENDDO
          ENDDO

       ENDDO
    ENDDO

    CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

!
!-- Message lengths: nw1 words per task, nw2 words per node
    nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
    nw2 = nw1 * SIZE( work1, 3 )

    CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
    DO  inode = 1, nodes
       DO  j = nys_p, nyn_p
#if defined( __parallel )
          CALL MPI_ALLTOALL( work1(nxl_p,1,1,inode,j), nw1, MPI_REAL, &
                             work2(nxl_p,1,1,inode,j), nw1, MPI_REAL, &
                             comm_tasks, istat )
#endif
       ENDDO
    ENDDO
    CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )


    DO  j = nys_p, nyn_p

       CALL cascade( 1, j, nys_p, nyn_p )
       CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
#if defined( __parallel )
       CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
                          work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
                          comm_nodes, istat )
#endif
       CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
       CALL cascade( 2, j, nys_p, nyn_p )

       CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
       DO  k = 1, nz

!
!--       Gather the received pieces into one line along x
          pos = nxl_a
          DO  inode = 1, nodes
             DO  itask = 1, tasks_per_logical_node
                xline(pos:pos+nxr_p-nxl_p) = work1(:,k,itask,inode,j)
                pos = pos + nxr_p - nxl_p + 1
             ENDDO
          ENDDO

          CALL fft_x( xline, 'forward' )

          slab(nxl_a:nxr_a,k) = xline(nxl_a:nxr_a)

       ENDDO

!
!--    Index of this line within the complete y-range
       jglob = myid * (nyn_p-nys_p+1) + j
       CALL tridia_hybrid( jglob, slab, tri )

       DO  k = 1, nz

          xline(nxl_a:nxr_a) = slab(nxl_a:nxr_a,k)

          CALL fft_x( xline, 'backward' )

          pos = nxl_a
          DO  inode = 1, nodes
             DO  itask = 1, tasks_per_logical_node
                work1(:,k,itask,inode,j) = xline(pos:pos+nxr_p-nxl_p)
                pos = pos + nxr_p - nxl_p + 1
             ENDDO
          ENDDO

       ENDDO

       CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
       CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
#if defined( __parallel )
       CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
                          work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
                          comm_nodes, istat )
#endif
       CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )

    ENDDO

    CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
    DO  inode = 1, nodes
       DO  j = nys_p, nyn_p
#if defined( __parallel )
          CALL MPI_ALLTOALL( work2(nxl_p,1,1,inode,j), nw1, MPI_REAL, &
                             work1(nxl_p,1,1,inode,j), nw1, MPI_REAL, &
                             comm_tasks, istat )
#endif
       ENDDO
    ENDDO
    CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )

    CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )

!
!-- Backward FFT along y, result is stored on ar
    DO  i0 = nxl_p, nxr_p, istride
       i1 = MIN( i0+istride-1, nxr_p )
       DO  k = 1, nz

          pos = nys_a
          DO  inode = 1, nodes
             DO  itask = 1, tasks_per_logical_node
                DO  j = nys_p, nyn_p
                   ylines(pos,1:i1-i0+1) = work1(i0:i1,k,itask,inode,j)
                   pos = pos + 1
                ENDDO
             ENDDO
          ENDDO

          DO  i = i0, i1
             ig  = nxl + i
             col = i - i0 + 1
             CALL fft_y( ylines(:,col), 'backward' )
             ar(k,nys_a:nyn_a,ig) = ylines(nys_a:nyn_a,col)
          ENDDO

       ENDDO
    ENDDO

    CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

    CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )

 END SUBROUTINE poisfft_hybrid_nodes
	824
	825
	826
 SUBROUTINE tridia_hybrid( j, ar, tri )
!
!-- Sets up and solves, for one y-index j and all x-indices 0..nx, the
!-- tridiagonal systems along z. On input ar holds the right hand sides,
!-- on output the solutions. tri is work space for the matrix coefficients
!-- (1: main diagonal, 2 and 3: off-diagonals, 4 and 5: factors of the
!-- splitting).

    USE arrays_3d
    USE control_parameters
    USE grid_variables

    IMPLICIT NONE

    INTEGER ::  j
    REAL, DIMENSION(0:nx,nz)       ::  ar
    REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri

    INTEGER ::  k


    tri = 0.0
!
!-- Define constant elements of the tridiagonal matrix.
    DO  k = 0, nz-1
       tri(2,:,k) = ddzu(k+1) * ddzw(k+1)
       tri(3,:,k) = ddzu(k+2) * ddzw(k+1)
    ENDDO

!
!-- Indices above (ny+1)/2 are mapped to ny+1-j
    IF ( j > ( ny + 1 ) / 2 )  THEN
       CALL maketri_hybrid( ny+1-j )
    ELSE
       CALL maketri_hybrid( j )
    ENDIF
    CALL zerleg_hybrid
    CALL substi_hybrid( ar, tri )

 CONTAINS

    SUBROUTINE maketri_hybrid( j )

!----------------------------------------------------------------------!
!                         maketri                                      !
!                                                                      !
!       computes the i- and j-dependent component of the matrix        !
!----------------------------------------------------------------------!

       USE constants

       IMPLICIT NONE

       INTEGER ::  j

       INTEGER ::  i, iw, k, nnxh
       REAL    ::  a, c, ly

       REAL, DIMENSION(0:nx) ::  l


       nnxh = ( nx + 1 ) / 2
!
!--    Provide the tridiagonal matrix for solution of the Poisson equation
!--    in Fourier space. The coefficients are computed following the method
!--    of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
!--    Siano's original version.
!--    The contribution of the y-direction does not depend on i.
       ly = 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
                               REAL( ny+1 ) ) ) / ( dy * dy )

       DO  i = 0, nx
!
!--       Indices from nnxh onwards are mapped to nx+1-i
          IF ( i < nnxh )  THEN
             iw = i
          ELSE
             iw = nx + 1 - i
          ENDIF
          l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * iw ) / &
                                    REAL( nx+1 ) ) ) / ( dx * dx ) + ly
       ENDDO

       DO  k = 0, nz-1
          a = -1.0 * ddzu(k+2) * ddzw(k+1)
          c = -1.0 * ddzu(k+1) * ddzw(k+1)
          tri(1,:,k) = a + c - l
       ENDDO

!
!--    Modification of the main diagonal at the bottom and top level,
!--    depending on the pressure boundary condition switches
       IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
          tri(1,:,0) = tri(1,:,0) + tri(2,:,0)
       ENDIF
       IF ( ibc_p_t == 1 )  THEN
          tri(1,:,nz-1) = tri(1,:,nz-1) + tri(3,:,nz-1)
       ENDIF

    END SUBROUTINE maketri_hybrid


    SUBROUTINE zerleg_hybrid

!----------------------------------------------------------------------!
!                          zerleg                                      !
!                                                                      !
!       Splitting of the tridiagonal matrix (Thomas algorithm)         !
!----------------------------------------------------------------------!

       USE indices

       IMPLICIT NONE

       INTEGER ::  k

!
!--    Splitting, carried out for all x-indices at once
       tri(4,:,0) = tri(1,:,0)
       DO  k = 1, nz-1
          tri(5,:,k) = tri(2,:,k) / tri(4,:,k-1)
          tri(4,:,k) = tri(1,:,k) - tri(3,:,k-1) * tri(5,:,k)
       ENDDO

    END SUBROUTINE zerleg_hybrid

    SUBROUTINE substi_hybrid( ar, tri )

!----------------------------------------------------------------------!
!                          substi                                      !
!                                                                      !
!       Substitution (Forward and Backward) (Thomas algorithm)         !
!----------------------------------------------------------------------!

       IMPLICIT NONE

       REAL, DIMENSION(0:nx,nz)       ::  ar
       REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri

       INTEGER ::  k
       REAL, DIMENSION(0:nx,0:nz-1) ::  ar1

!
!--    Forward substitution
       ar1(:,0) = ar(:,1)
       DO  k = 1, nz-1
          ar1(:,k) = ar(:,k+1) - tri(5,:,k) * ar1(:,k-1)
       ENDDO

!
!--    Backward substitution
       ar(:,nz) = ar1(:,nz-1) / tri(4,:,nz-1)
       DO  k = nz-2, 0, -1
          ar(:,k+1) = ( ar1(:,k) - tri(3,:,k) * ar(:,k+2) ) &
                      / tri(4,:,k)
       ENDDO

    END SUBROUTINE substi_hybrid

 END SUBROUTINE tridia_hybrid
	995
	996
 SUBROUTINE cascade( loca, j, nys_p, nyn_p )
!
!-- Exchanges small synchronization messages among the tasks of the first
!-- node (me_node == 0) around the node-wise MPI_ALLTOALL in
!-- poisfft_hybrid_nodes. All other nodes return immediately.
!--
!-- loca  = 1: call before the alltoall. Every task except the first one
!--            exchanges a message with its predecessor; the first task
!--            waits for a message of the last task unless j is the first
!--            index (nys_p).
!-- loca  = 2: call after the alltoall. Every task except the last one
!--            exchanges a message with its successor; the last task sends
!--            a message to the first task unless j is the last index
!--            (nyn_p).
!-- j          current y-index of the calling loop
!-- nys_p,
!-- nyn_p      first and last y-index of the calling loop
!--
!-- NOTE(review): ierr is not declared locally; it is presumably provided
!-- by a module used at module level (pegrid) - verify.

    USE cpulog

    IMPLICIT NONE

    INTEGER ::  j, loca, nyn_p, nys_p, req, reqa(1)
    INTEGER, SAVE ::  tag = 10
#if defined( __parallel )
    INTEGER, DIMENSION(MPI_STATUS_SIZE,1) ::  stat
#endif

    REAL ::  buf, buf1


    buf  = 1.0
    buf1 = 1.1
    IF ( me_node == 0 )  THEN  ! first node only

       SELECT CASE ( loca )

          CASE ( 1 )  ! before alltoall

             IF ( me_task > 0 )  THEN  ! first task does not wait
#if defined( __parallel )
                CALL MPI_SENDRECV( buf,  1, MPI_REAL, me_task-1, 0, &
                                   buf1, 1, MPI_REAL, me_task-1, 0, &
                                   comm_tasks, stat, ierr )
#endif
             ELSEIF ( j > nys_p )  THEN
                req = 0
                tag = MOD( tag-10, 10 ) + 10
#if defined( __parallel )
                CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
                                tag, comm_tasks, req, ierr )
                reqa = req
                CALL MPI_WAITALL( 1, reqa, stat, ierr )
#endif
             ENDIF

          CASE ( 2 )  ! after alltoall

             IF ( me_task < tasks_per_logical_node-1 )  THEN  ! last task
#if defined( __parallel )
                CALL MPI_SENDRECV( buf,  1, MPI_REAL, me_task+1, 0, &
                                   buf1, 1, MPI_REAL, me_task+1, 0, &
                                   comm_tasks, stat, ierr )
#endif
             ELSEIF ( j < nyn_p )  THEN
                req = 0
                tag = MOD( tag-10, 10 ) + 10
#if defined( __parallel )
                CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
                                ierr )
!
!--             The send must be completed before this routine returns:
!--             buf is a local variable and the request handle would
!--             otherwise never be released. The completion is done in the
!--             same way as for the receive above.
                reqa = req
                CALL MPI_WAITALL( 1, reqa, stat, ierr )
#endif
             ENDIF

       END SELECT

    ENDIF

 END SUBROUTINE cascade
	1059
	1060	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |