Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 260

Last change on this file since 260 was 257, checked in by heinze, 16 years ago
Output of messages replaced by message handling routine
Property svn:keywords set to `Id`
File size: 31.2 KB

Rev	Line
[1]	1	MODULE poisfft_hybrid_mod
	2	!------------------------------------------------------------------------------
	3	!
[257]	4	! Current revisions:
[1]	5	! -----------------
[257]	6	! Output of messages replaced by message handling routine.
[1]	7	!
[257]	8	!
[1]	9	! Former revisions:
	10	! -----------------
[3]	11	! $Id: poisfft_hybrid.f90 257 2009-03-11 15:17:42Z raasch $
	12	! RCS Log replace by Id keyword, revision history cleaned up
	13	!
[1]	14	! Revision 1.11 2004/04/30 12:43:14 raasch
	15	! Renaming of fft routines, additional argument in calls of fft_y_m
	16	!
	17	! Revision 1.2 2002/12/19 16:08:31 raasch
	18	! Preprocessor directive KKMP introduced (OMP does NOT work),
	19	! array tri will be a shared array in OpenMP loop, to get better cache
	20	! utilization, the i index (x-direction) will be executed in stride
	21	! "istride" as outer loop and in a shorter inner loop,
	22	! overlapping of computation and communication realized by new routine
	23	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
	24	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
	25	!
	26	!
	27	! Description:
	28	! ------------
	29	! Solution of the Poisson equation with a 2D spectral method.
	30	! Hybrid version for parallel computers using a 1D domain decomposition,
	31	! realized with MPI, along x and parallelization with OPEN-MP along y
	32	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
	33	! optimization is realized by overlapping of computation and communication
	34	! and by simultaneously executing as many communication calls as switches
	35	! per logical partition (LPAR) are available. This version comes into
	36	! effect if more than one node is used and if the environment variable
	37	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
	38	! without any rest.
	39	!
	40	! WARNING: In case of OpenMP, there are problems with allocating large
	41	! arrays in parallel regions.
	42	!
	43	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
	44	!------------------------------------------------------------------------------!
	45
	46	USE fft_xy
	47	USE indices
	48	USE pegrid
	49
	50	IMPLICIT NONE
	51
	52	PRIVATE
	53	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
	54
	55	INTEGER, PARAMETER :: switch_per_lpar = 2
	56
	57	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
	58	nxl_p, nxr_p, & ! partial x dimension
	59	nys_a, nyn_a, & ! total y dimension
	60	nys_p, nyn_p, & ! partial y dimension
	61
	62	npe_s, & ! total number of PEs for solver
	63	nwords, & ! number of points to be exchanged
	64	! with MPI_ALLTOALL
	65	n_omp_threads ! number of OpenMP threads
	66
	67	!
	68	!-- Variables for multi node version (cluster version) using routine
	69	!-- poisfft_hybrid_nodes
	70	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
	71	comm_node_all, & ! communicater all PEs node version
	72	comm_tasks, & ! communicater tasks
	73	me, me_node, me_task,& ! identity of this PE
	74	nodes, & ! number of nodes
	75	tasks_per_logical_node = -1 ! default no cluster
	76
	77
	78	!
	79	!-- Public interfaces
	80	INTERFACE poisfft_hybrid_ini
	81	MODULE PROCEDURE poisfft_hybrid_ini
	82	END INTERFACE poisfft_hybrid_ini
	83
	84	INTERFACE poisfft_hybrid
	85	MODULE PROCEDURE poisfft_hybrid
	86	END INTERFACE poisfft_hybrid
	87
	88	!
	89	!-- Private interfaces
	90	INTERFACE poisfft_hybrid_omp
	91	MODULE PROCEDURE poisfft_hybrid_omp
	92	END INTERFACE poisfft_hybrid_omp
	93
	94	INTERFACE poisfft_hybrid_omp_vec
	95	MODULE PROCEDURE poisfft_hybrid_omp_vec
	96	END INTERFACE poisfft_hybrid_omp_vec
	97
	98	INTERFACE poisfft_hybrid_nodes
	99	MODULE PROCEDURE poisfft_hybrid_nodes
	100	END INTERFACE poisfft_hybrid_nodes
	101
	102	INTERFACE tridia_hybrid
	103	MODULE PROCEDURE tridia_hybrid
	104	END INTERFACE tridia_hybrid
	105
	106	INTERFACE cascade
	107	MODULE PROCEDURE cascade
	108	END INTERFACE cascade
	109
	110	CONTAINS
	111
	112
	113	SUBROUTINE poisfft_hybrid_ini
	114
	115	USE control_parameters
	116	USE pegrid
	117
	118	IMPLICIT NONE
	119
	120	CHARACTER(LEN=8) :: cdummy
	121	INTEGER :: idummy, istat
	122	INTEGER, DIMENSION(2) :: coords, dims
	123
	124	LOGICAL, DIMENSION(2) :: period = .false., re_dims
	125
	126
	127	!
	128	!-- Set the internal index values for the hybrid solver
	129	#if defined( __parallel )
	130	npe_s = pdims(1)
	131	#else
	132	npe_s = 1
	133	#endif
	134	nxl_a = 0
	135	nxr_a = nx
	136	nxl_p = 0
	137	nxr_p = ( ( nx+1 ) / npe_s ) - 1
	138	nys_a = nys
	139	nyn_a = nyn
	140	nys_p = 0
	141	nyn_p = ( ( ny+1 ) / npe_s ) - 1
	142
	143	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
	144
	145	#if defined( __KKMP )
	146	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
	147	READ ( cdummy, '(I8)' ) n_omp_threads
[257]	148	IF ( n_omp_threads > 1 ) THEN
	149	WRITE( message_string, * ) 'Number of OpenMP threads = ', &
	150	n_omp_threads
	151	CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
[1]	152	ENDIF
	153	#else
	154	n_omp_threads = 1
	155	#endif
	156	!
	157	!-- Initialize the one-dimensional FFT routines
	158	CALL fft_init
	159
	160	!
	161	!-- Setup for multi node version (poisfft_hybrid_nodes)
	162	IF ( n_omp_threads == 1 .AND. &
	163	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
	164
	165	IF ( tasks_per_node /= -9999 ) THEN
	166	!
	167	!-- Multi node version requires that the available number of
	168	!-- switches per logical partition must be an integral divisor
	169	!-- of the chosen number of tasks per node
	170	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
	171	!
	172	!-- Set the switch which decides about usage of the multi node
	173	!-- version
	174	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
	175	numprocs > tasks_per_node ) THEN
	176	tasks_per_logical_node = tasks_per_node / switch_per_lpar
	177	ENDIF
	178
[257]	179	IF ( tasks_per_logical_node > -1 ) THEN
	180
	181	WRITE( message_string, * ) 'running optimized ', &
	182	'multinode version', &
	183	'&switch_per_lpar = ', &
	184	switch_per_lpar, &
	185	'&tasks_per_lpar = ', &
	186	tasks_per_node, &
	187	'tasks_per_logical_node = ', &
	188	tasks_per_logical_node
	189	CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )
	190
[1]	191	ENDIF
	192
	193	ENDIF
	194	ENDIF
	195	ENDIF
	196
	197	!
	198	!-- Determine sub-topologies for multi node version
	199	IF ( tasks_per_logical_node >= 2 ) THEN
	200
	201	#if defined( __parallel )
	202	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
	203	tasks_per_logical_node
	204	dims(1) = nodes
	205	dims(2) = tasks_per_logical_node
	206
	207	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
	208	comm_node_all, istat )
	209	CALL MPI_COMM_RANK( comm_node_all, me, istat )
	210
	211	re_dims(1) = .TRUE.
	212	re_dims(2) = .FALSE.
	213	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
	214	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
	215
	216	re_dims(1) = .FALSE.
	217	re_dims(2) = .TRUE.
	218	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
	219	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
	220
	221	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
	222	! tasks_per_logical_node
	223	#else
[257]	224	message_string = 'parallel environment (MPI) required'
	225	CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
[1]	226	#endif
	227	ENDIF
	228
	229	END SUBROUTINE poisfft_hybrid_ini
	230
	231
	232	SUBROUTINE poisfft_hybrid( ar )
	233
	234	USE control_parameters
	235	USE interfaces
	236
	237	IMPLICIT NONE
	238
	239	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	240
	241	IF ( host(1:3) == 'nec' ) THEN
	242	CALL poisfft_hybrid_omp_vec( ar )
	243	ELSE
	244	IF ( tasks_per_logical_node == -1 ) THEN
	245	CALL poisfft_hybrid_omp( ar )
	246	ELSE
	247	CALL poisfft_hybrid_nodes( ar )
	248	ENDIF
	249	ENDIF
	250
	251	END SUBROUTINE poisfft_hybrid
	252
	253
	254	SUBROUTINE poisfft_hybrid_omp ( ar )
	255
	256	USE cpulog
	257	USE interfaces
	258
	259	IMPLICIT NONE
	260
	261	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	262	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	263
	264	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	265
	266	REAL, DIMENSION(0:nx) :: fftx_ar
	267	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	268
	269	REAL, DIMENSION(0:nx,nz) :: tri_ar
	270
	271	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	272	#if defined( __KKMP )
	273	INTEGER :: omp_get_thread_num
	274	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	275	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	276	#else
	277	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	278	#endif
	279
	280
	281	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
	282
	283	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	284
	285	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	286	!$OMP DO
	287	!
	288	!-- Store grid points to be transformed on a 1d-array, do the fft
	289	!-- and sample the results on a 4d-array
	290	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	291	iei = MIN( iouter+istride-1, nxr_p )
	292	DO k = 1, nz
	293
	294	DO i = iouter, iei
	295	ii = nxl + i
	296	ir = i - iouter + 1
	297
	298	DO j = nys_a, nyn_a
	299	ffty_ar(j,ir) = ar(k,j,ii)
	300	ENDDO
	301
	302	CALL fft_y( ffty_ar(:,ir), 'forward' )
	303	ENDDO
	304
	305	m = nys_a
	306	DO n = 1, npe_s
	307	DO j = nys_p, nyn_p
	308	DO i = iouter, iei
	309	ir = i - iouter + 1
	310	work1(i,k,j,n) = ffty_ar(m,ir)
	311	ENDDO
	312	m = m+1
	313	ENDDO
	314	ENDDO
	315
	316	ENDDO
	317	ENDDO
	318	!$OMP END PARALLEL
	319
	320	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	321
	322	#if defined( __parallel )
	323	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	324
	325	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	326	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	327	comm2d, istat )
	328
	329	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	330	#else
	331	work2 = work1
	332	#endif
	333
	334	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	335
	336	#if defined( __KKMP )
	337	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
	338	!$OMP DO
	339	DO j = nys_p, nyn_p
	340	jthread = omp_get_thread_num() + 1
	341	#else
	342	DO j = nys_p, nyn_p
	343	jthread = 1
	344	#endif
	345	DO k = 1, nz
	346
	347	m = nxl_a
	348	DO n = 1, npe_s
	349	DO i = nxl_p, nxr_p
	350	fftx_ar(m) = work2(i,k,j,n)
	351	m = m+1
	352	ENDDO
	353	ENDDO
	354
	355	CALL fft_x( fftx_ar, 'forward' )
	356
	357	DO i = nxl_a, nxr_a
	358	tri_ar(i,k) = fftx_ar(i)
	359	ENDDO
	360
	361	ENDDO
	362
	363	jj = myid * (nyn_p-nys_p+1) + j
	364	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	365
	366	DO k = 1, nz
	367	DO i = nxl_a, nxr_a
	368	fftx_ar(i) = tri_ar (i,k)
	369	ENDDO
	370
	371	CALL fft_x( fftx_ar, 'backward' )
	372
	373	m = nxl_a
	374	DO n = 1, npe_s
	375	DO i = nxl_p, nxr_p
	376	work2(i,k,j,n) = fftx_ar(m)
	377	m = m+1
	378	ENDDO
	379	ENDDO
	380
	381	ENDDO
	382	ENDDO
	383	#if defined( __KKMP )
	384	!$OMP END PARALLEL
	385	#endif
	386
	387	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	388
	389	#if defined( __parallel )
	390	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	391	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	392
	393	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	394	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	395	comm2d, istat )
	396
	397	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	398	#else
	399	work1 = work2
	400	#endif
	401
	402	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	403
	404	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	405	!$OMP DO
	406	DO iouter = nxl_p, nxr_p, istride
	407	iei = MIN( iouter+istride-1, nxr_p )
	408	DO k = 1, nz
	409
	410	m = nys_a
	411	DO n = 1, npe_s
	412	DO j = nys_p, nyn_p
	413	DO i = iouter, iei
	414	ir = i - iouter + 1
	415	ffty_ar(m,ir) = work1 (i,k,j,n)
	416	ENDDO
	417	m = m+1
	418	ENDDO
	419	ENDDO
	420
	421	DO i = iouter, iei
	422	ii = nxl + i
	423	ir = i - iouter + 1
	424	CALL fft_y( ffty_ar(:,ir), 'backward' )
	425
	426	DO j = nys_a, nyn_a
	427	ar(k,j,ii) = ffty_ar(j,ir)
	428	ENDDO
	429	ENDDO
	430
	431	ENDDO
	432	ENDDO
	433	!$OMP END PARALLEL
	434
	435	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	436
	437	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
	438
	439	#if defined( __KKMP )
	440	DEALLOCATE( tri )
	441	#endif
	442
	443	END SUBROUTINE poisfft_hybrid_omp
	444
	445
	446	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
	447
	448	USE cpulog
	449	USE interfaces
	450
	451	IMPLICIT NONE
	452
	453	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	454	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	455
	456	REAL, DIMENSION(0:nx,nz) :: tri_ar
	457
	458	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	459
	460	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
	461	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
	462
	463	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	464	#if defined( __KKMP )
	465	INTEGER :: omp_get_thread_num
	466	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	467	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	468	#else
	469	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	470	#endif
	471
	472
	473	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
	474
	475	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
	476
	477	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
	478	!$OMP DO
	479	!
	480	!-- Store grid points to be transformed on a 1d-array, do the fft
	481	!-- and sample the results on a 4d-array
	482	DO i = nxl_p, nxr_p
	483
	484	DO j = nys_a, nyn_a
	485	DO k = 1, nz
	486	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
	487	ENDDO
	488	ENDDO
	489
	490	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
	491	ENDDO
	492
	493	!$OMP DO
	494	DO k = 1, nz
	495	m = nys_a
	496	DO n = 1, npe_s
	497	DO j = nys_p, nyn_p
	498	DO i = nxl_p, nxr_p
	499	work1(i,k,j,n) = ffty_ar3(m,k,i)
	500	ENDDO
	501	m = m+1
	502	ENDDO
	503	ENDDO
	504	ENDDO
	505	!$OMP END PARALLEL
	506
	507	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
	508
	509	#if defined( __parallel )
	510	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	511	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	512	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	513	comm2d, istat )
	514	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	515	#else
	516	work2 = work1
	517	#endif
	518
	519	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
	520
	521	#if defined( __KKMP )
	522	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
	523	!$OMP DO
	524	DO j = nys_p, nyn_p
	525	jthread = omp_get_thread_num() + 1
	526	#else
	527	DO j = nys_p, nyn_p
	528	jthread = 1
	529	#endif
	530	DO k = 1, nz
	531
	532	m = nxl_a
	533	DO n = 1, npe_s
	534	DO i = nxl_p, nxr_p
	535	fftx_ar3(m,k,j) = work2(i,k,j,n)
	536	m = m+1
	537	ENDDO
	538	ENDDO
	539	ENDDO
	540
	541	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
	542
	543	DO k = 1, nz
	544	DO i = nxl_a, nxr_a
	545	tri_ar(i,k) = fftx_ar3(i,k,j)
	546	ENDDO
	547	ENDDO
	548
	549	jj = myid * (nyn_p-nys_p+1) + j
	550	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	551
	552	DO k = 1, nz
	553	DO i = nxl_a, nxr_a
	554	fftx_ar3(i,k,j) = tri_ar (i,k)
	555	ENDDO
	556	ENDDO
	557
	558	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
	559
	560	DO k = 1, nz
	561	m = nxl_a
	562	DO n = 1, npe_s
	563	DO i = nxl_p, nxr_p
	564	work2(i,k,j,n) = fftx_ar3(m,k,j)
	565	m = m+1
	566	ENDDO
	567	ENDDO
	568	ENDDO
	569
	570	ENDDO
	571	#if defined( __KKMP )
	572	!$OMP END PARALLEL
	573	#endif
	574
	575	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
	576
	577	#if defined( __parallel )
	578	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	579	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	580	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	581	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	582	comm2d, istat )
	583	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	584	#else
	585	work1 = work2
	586	#endif
	587
	588	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
	589
	590	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
	591	!$OMP DO
	592	DO k = 1, nz
	593	m = nys_a
	594	DO n = 1, npe_s
	595	DO j = nys_p, nyn_p
	596	DO i = nxl_p, nxr_p
	597	ffty_ar3(m,k,i) = work1(i,k,j,n)
	598	ENDDO
	599	m = m+1
	600	ENDDO
	601	ENDDO
	602	ENDDO
	603
	604	!$OMP DO
	605	DO i = nxl_p, nxr_p
	606	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
	607	DO j = nys_a, nyn_a
	608	DO k = 1, nz
	609	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
	610	ENDDO
	611	ENDDO
	612	ENDDO
	613	!$OMP END PARALLEL
	614
	615	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
	616
	617	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
	618
	619	#if defined( __KKMP )
	620	DEALLOCATE( tri )
	621	#endif
	622
	623	END SUBROUTINE poisfft_hybrid_omp_vec
	624
	625
	626	SUBROUTINE poisfft_hybrid_nodes ( ar )
	627
	628	USE cpulog
	629	USE interfaces
	630
	631	IMPLICIT NONE
	632
	633	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	634	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
	635	n, nn, nt, nw1, nw2
	636
	637	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	638
	639	REAL, DIMENSION(0:nx) :: fftx_ar
	640	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	641
	642	REAL, DIMENSION(0:nx,nz) :: tri_ar
	643
	644	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
	645	nodes,nys_p:nyn_p) :: work1,work2
	646	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	647
	648
	649	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
	650
	651	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	652
	653	!
	654	!-- Store grid points to be transformed on a 1d-array, do the fft
	655	!-- and sample the results on a 4d-array
	656	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	657	iei = MIN( iouter+istride-1, nxr_p )
	658	DO k = 1, nz
	659
	660	DO i = iouter, iei
	661	ii = nxl + i
	662	ir = i - iouter + 1
	663
	664	DO j = nys_a, nyn_a
	665	ffty_ar(j,ir) = ar(k,j,ii)
	666	ENDDO
	667
	668	CALL fft_y( ffty_ar(:,ir), 'forward' )
	669	ENDDO
	670
	671	m = nys_a
	672	DO nn = 1, nodes
	673	DO nt = 1, tasks_per_logical_node
	674	DO j = nys_p, nyn_p
	675	DO i = iouter, iei
	676	ir = i - iouter + 1
	677	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
	678	ENDDO
	679	m = m+1
	680	ENDDO
	681	ENDDO
	682	ENDDO
	683
	684	ENDDO
	685	ENDDO
	686
	687	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	688
	689	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	690	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
	691	DO nn = 1, nodes
	692	DO j = nys_p, nyn_p
	693	#if defined( __parallel )
	694	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	695	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	696	comm_tasks, istat )
	697	#endif
	698	ENDDO
	699	ENDDO
	700	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	701
	702
	703	DO j = nys_p, nyn_p
	704
	705	CALL cascade( 1, j, nys_p, nyn_p )
	706	nw2 = nw1 * SIZE( work1, 3 )
	707	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
	708	#if defined( __parallel )
	709	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	710	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	711	comm_nodes, istat )
	712	#endif
	713	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
	714	CALL cascade( 2, j, nys_p, nyn_p )
	715
	716	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	717	DO k = 1, nz
	718
	719	m = nxl_a
	720	DO nn = 1, nodes
	721	DO nt = 1, tasks_per_logical_node
	722	DO i = nxl_p, nxr_p
	723	fftx_ar(m) = work1(i,k,nt,nn,j)
	724	m = m+1
	725	ENDDO
	726	ENDDO
	727	ENDDO
	728
	729	CALL fft_x( fftx_ar, 'forward' )
	730
	731	DO i = nxl_a, nxr_a
	732	tri_ar(i,k) = fftx_ar(i)
	733	ENDDO
	734
	735	ENDDO
	736
	737	jj = myid * (nyn_p-nys_p+1) + j
	738	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
	739
	740	DO k = 1, nz
	741	DO i = nxl_a, nxr_a
	742	fftx_ar(i) = tri_ar(i,k)
	743	ENDDO
	744
	745	CALL fft_x( fftx_ar, 'backward' )
	746
	747	m = nxl_a
	748	DO nn = 1, nodes
	749	DO nt = 1, tasks_per_logical_node
	750	DO i = nxl_p, nxr_p
	751	work1(i,k,nt,nn,j) = fftx_ar(m)
	752	m = m+1
	753	ENDDO
	754	ENDDO
	755	ENDDO
	756	ENDDO
	757
	758	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	759	nw2 = nw1 * SIZE( work1, 3 )
	760	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
	761	#if defined( __parallel )
	762	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	763	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	764	comm_nodes, istat )
	765	#endif
	766	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
	767
	768	ENDDO
	769
	770	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	771	DO nn = 1, nodes
	772	DO j = nys_p, nyn_p
	773	#if defined( __parallel )
	774	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	775	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	776	comm_tasks, istat )
	777	#endif
	778	ENDDO
	779	ENDDO
	780	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	781
	782	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	783
	784	DO iouter = nxl_p, nxr_p, istride
	785	iei = MIN( iouter+istride-1, nxr_p )
	786	DO k = 1, nz
	787
	788	m = nys_a
	789	DO nn = 1, nodes
	790	DO nt = 1, tasks_per_logical_node
	791	DO j = nys_p, nyn_p
	792	DO i = iouter, iei
	793	ir = i - iouter + 1
	794	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
	795	ENDDO
	796	m = m+1
	797	ENDDO
	798	ENDDO
	799	ENDDO
	800
	801	DO i = iouter, iei
	802	ii = nxl + i
	803	ir = i - iouter + 1
	804	CALL fft_y( ffty_ar(:,ir), 'backward' )
	805
	806	DO j = nys_a, nyn_a
	807	ar(k,j,ii) = ffty_ar(j,ir)
	808	ENDDO
	809	ENDDO
	810
	811	ENDDO
	812	ENDDO
	813
	814	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	815
	816	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
	817
	818	END SUBROUTINE poisfft_hybrid_nodes
	819
	820
	821
	822	SUBROUTINE tridia_hybrid( j, ar, tri )
	823
	824	USE arrays_3d
	825	USE control_parameters
	826	USE grid_variables
	827
	828	IMPLICIT NONE
	829
	830	INTEGER :: i, j, k, nnyh
	831	REAL, DIMENSION(0:nx,nz) :: ar
	832	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	833	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	834
	835	nnyh = (ny+1) / 2
	836
	837	tri = 0.0
	838	!
	839	!-- Define constant elements of the tridiagonal matrix.
	840	DO k = 0, nz-1
	841	DO i = 0,nx
	842	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	843	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	844	ENDDO
	845	ENDDO
	846
	847	IF ( j <= nnyh ) THEN
	848	CALL maketri_hybrid( j )
	849	ELSE
	850	CALL maketri_hybrid( ny+1-j)
	851	ENDIF
	852	CALL zerleg_hybrid
	853	CALL substi_hybrid( ar, tri )
	854
	855	CONTAINS
	856
	857	SUBROUTINE maketri_hybrid( j )
	858
	859	!----------------------------------------------------------------------!
	860	! maketri !
	861	! !
	862	! computes the i- and j-dependent component of the matrix !
	863	!----------------------------------------------------------------------!
	864
	865	USE constants
	866
	867	IMPLICIT NONE
	868
	869	INTEGER :: i, j, k, nnxh
	870	REAL :: a, c
	871
	872	REAL, DIMENSION(0:nx) :: l
	873
	874
	875	nnxh = (nx+1) / 2
	876	!
	877	!-- Provide the tridiagonal matrix for solution of the Poisson equation
	878	!-- in Fourier space. The coefficients are computed following the method
	879	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
	880	!-- Siano's original version.
	881	DO i = 0,nx
	882	IF ( i >= 0 .AND. i < nnxh ) THEN
	883	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	884	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	885	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	886	FLOAT( ny+1 ) ) ) / ( dy * dy )
	887	ELSEIF ( i == nnxh ) THEN
	888	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	889	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	890	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	891	FLOAT(ny+1) ) ) / ( dy * dy )
	892	ELSE
	893	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	894	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	895	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	896	FLOAT( ny+1 ) ) ) / ( dy * dy )
	897	ENDIF
	898	ENDDO
	899
	900	DO k = 0,nz-1
	901	DO i = 0, nx
	902	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	903	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	904	tri(1,i,k) = a + c - l(i)
	905	ENDDO
	906	ENDDO
	907	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	908	DO i = 0,nx
	909	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	910	ENDDO
	911	ENDIF
	912	IF ( ibc_p_t == 1 ) THEN
	913	DO i = 0,nx
	914	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	915	ENDDO
	916	ENDIF
	917
	918	END SUBROUTINE maketri_hybrid
	919
	920
	921	SUBROUTINE zerleg_hybrid
	922
	923	!----------------------------------------------------------------------!
	924	! zerleg !
	925	! !
	926	! Splitting of the tridiagonal matrix (Thomas algorithm) !
	927	!----------------------------------------------------------------------!
	928
	929	USE indices
	930
	931	IMPLICIT NONE
	932
	933	INTEGER :: i, k
	934
	935	!
	936	!-- Splitting
	937	DO i = 0, nx
	938	tri(4,i,0) = tri(1,i,0)
	939	ENDDO
	940	DO k = 1, nz-1
	941	DO i = 0,nx
	942	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	943	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	944	ENDDO
	945	ENDDO
	946
	947	END SUBROUTINE zerleg_hybrid
	948
	949	SUBROUTINE substi_hybrid( ar, tri )
	950
	951	!----------------------------------------------------------------------!
	952	! substi !
	953	! !
	954	! Substitution (Forward and Backward) (Thomas algorithm) !
	955	!----------------------------------------------------------------------!
	956
	957	IMPLICIT NONE
	958
	959	INTEGER :: i, j, k
	960	REAL, DIMENSION(0:nx,nz) :: ar
	961	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	962	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	963
	964	!
	965	!-- Forward substitution
	966	DO i = 0, nx
	967	ar1(i,0) = ar(i,1)
	968	ENDDO
	969	DO k = 1, nz - 1
	970	DO i = 0,nx
	971	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	972	ENDDO
	973	ENDDO
	974
	975	!
	976	!-- Backward substitution
	977	DO i = 0,nx
	978	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	979	ENDDO
	980	DO k = nz-2, 0, -1
	981	DO i = 0,nx
	982	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	983	/ tri(4,i,k)
	984	ENDDO
	985	ENDDO
	986
	987	END SUBROUTINE substi_hybrid
	988
	989	END SUBROUTINE tridia_hybrid
	990
	991
	992	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
	993
	994	USE cpulog
	995
	996	IMPLICIT NONE
	997
	998	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
	999	INTEGER, SAVE :: tag = 10
	1000	#if defined( __parallel )
	1001	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
	1002	#endif
	1003
	1004	REAL :: buf, buf1
	1005
	1006
	1007	buf = 1.0
	1008	buf1 = 1.1
	1009	IF ( me_node == 0 ) THEN ! first node only
	1010
	1011	SELECT CASE ( loca )
	1012
	1013	CASE ( 1 ) ! before alltoall
	1014
	1015	IF( me_task > 0 ) THEN ! first task does not wait
	1016	#if defined( __parallel )
	1017	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
	1018	buf1, 1, MPI_REAL, me_task-1, 0, &
	1019	comm_tasks, stat,ierr )
	1020	#endif
	1021	ELSEIF ( j > nys_p ) THEN
	1022	req = 0
	1023	tag = MOD( tag-10, 10 ) + 10
	1024	#if defined( __parallel )
	1025	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
	1026	tag, comm_tasks, req, ierr )
	1027	reqa = req
	1028	CALL MPI_WAITALL( 1, reqa, stat, ierr )
	1029	#endif
	1030	ENDIF
	1031
	1032	CASE ( 2 ) ! after alltoall
	1033
	1034	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
	1035	#if defined( __parallel )
	1036	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
	1037	buf1, 1, MPI_REAL, me_task+1, 0, &
	1038	comm_tasks, stat, ierr)
	1039	#endif
	1040	ELSEIF ( j < nyn_p ) THEN
	1041	req = 0
	1042	tag = MOD( tag-10, 10 ) + 10
	1043	#if defined( __parallel )
	1044	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
	1045	ierr )
	1046	#endif
	1047	ENDIF
	1048
	1049	END SELECT
	1050
	1051	ENDIF
	1052
	1053	END SUBROUTINE cascade
	1054
	1055	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |