Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 9

Last change on this file since 9 was 4, checked in by raasch, 18 years ago
Id keyword set as property for all *.f90 files
Property svn:keywords set to `Id`
File size: 30.7 KB

Rev	Line
[1]	1	MODULE poisfft_hybrid_mod
	2	!------------------------------------------------------------------------------
	3	!
	4	! Actual revisions:
	5	! -----------------
	6	!
	7	!
	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft_hybrid.f90 4 2007-02-13 11:33:16Z raasch $
	11	! RCS Log replace by Id keyword, revision history cleaned up
	12	!
[1]	13	! Revision 1.11 2004/04/30 12:43:14 raasch
	14	! Renaming of fft routines, additional argument in calls of fft_y_m
	15	!
	16	! Revision 1.2 2002/12/19 16:08:31 raasch
	17	! Preprocessor directive KKMP introduced (OMP does NOT work),
	18	! array tri will be a shared array in OpenMP loop, to get better cache
	19	! utilization, the i index (x-direction) will be executed in stride
	20	! "istride" as outer loop and in a shorter inner loop,
	21	! overlapping of computation and communication realized by new routine
	22	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
	23	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
	24	!
	25	!
	26	! Description:
	27	! ------------
	28	! Solution of the Poisson equation with a 2D spectral method.
	29	! Hybrid version for parallel computers using a 1D domain decomposition,
	30	! realized with MPI, along x and parallelization with OPEN-MP along y
	31	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
	32	! optimization is realized by overlapping of computation and communication
	33	! and by simultaneously executing as many communication calls as switches
	34	! per logical partition (LPAR) are available. This version comes into
	35	! effect if more than one node is used and if the environment variable
	36	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
	37	! without any rest.
	38	!
	39	! WARNING: In case of OpenMP, there are problems with allocating large
	40	! arrays in parallel regions.
	41	!
	42	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
	43	!------------------------------------------------------------------------------!
	44
	45	USE fft_xy
	46	USE indices
	47	USE pegrid
	48
	49	IMPLICIT NONE
	50
	51	PRIVATE
	52	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
	53
	54	INTEGER, PARAMETER :: switch_per_lpar = 2
	55
	56	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
	57	nxl_p, nxr_p, & ! partial x dimension
	58	nys_a, nyn_a, & ! total y dimension
	59	nys_p, nyn_p, & ! partial y dimension
	60
	61	npe_s, & ! total number of PEs for solver
	62	nwords, & ! number of points to be exchanged
	63	! with MPI_ALLTOALL
	64	n_omp_threads ! number of OpenMP threads
	65
	66	!
	67	!-- Variables for multi node version (cluster version) using routine
	68	!-- poisfft_hybrid_nodes
	69	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
	70	comm_node_all, & ! communicater all PEs node version
	71	comm_tasks, & ! communicater tasks
	72	me, me_node, me_task,& ! identity of this PE
	73	nodes, & ! number of nodes
	74	tasks_per_logical_node = -1 ! default no cluster
	75
	76
	77	!
	78	!-- Public interfaces
	79	INTERFACE poisfft_hybrid_ini
	80	MODULE PROCEDURE poisfft_hybrid_ini
	81	END INTERFACE poisfft_hybrid_ini
	82
	83	INTERFACE poisfft_hybrid
	84	MODULE PROCEDURE poisfft_hybrid
	85	END INTERFACE poisfft_hybrid
	86
	87	!
	88	!-- Private interfaces
	89	INTERFACE poisfft_hybrid_omp
	90	MODULE PROCEDURE poisfft_hybrid_omp
	91	END INTERFACE poisfft_hybrid_omp
	92
	93	INTERFACE poisfft_hybrid_omp_vec
	94	MODULE PROCEDURE poisfft_hybrid_omp_vec
	95	END INTERFACE poisfft_hybrid_omp_vec
	96
	97	INTERFACE poisfft_hybrid_nodes
	98	MODULE PROCEDURE poisfft_hybrid_nodes
	99	END INTERFACE poisfft_hybrid_nodes
	100
	101	INTERFACE tridia_hybrid
	102	MODULE PROCEDURE tridia_hybrid
	103	END INTERFACE tridia_hybrid
	104
	105	INTERFACE cascade
	106	MODULE PROCEDURE cascade
	107	END INTERFACE cascade
	108
	109	CONTAINS
	110
	111
	112	SUBROUTINE poisfft_hybrid_ini
	113
	114	USE control_parameters
	115	USE pegrid
	116
	117	IMPLICIT NONE
	118
	119	CHARACTER(LEN=8) :: cdummy
	120	INTEGER :: idummy, istat
	121	INTEGER, DIMENSION(2) :: coords, dims
	122
	123	LOGICAL, DIMENSION(2) :: period = .false., re_dims
	124
	125
	126	!
	127	!-- Set the internal index values for the hybrid solver
	128	#if defined( __parallel )
	129	npe_s = pdims(1)
	130	#else
	131	npe_s = 1
	132	#endif
	133	nxl_a = 0
	134	nxr_a = nx
	135	nxl_p = 0
	136	nxr_p = ( ( nx+1 ) / npe_s ) - 1
	137	nys_a = nys
	138	nyn_a = nyn
	139	nys_p = 0
	140	nyn_p = ( ( ny+1 ) / npe_s ) - 1
	141
	142	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
	143
	144	#if defined( __KKMP )
	145	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
	146	READ ( cdummy, '(I8)' ) n_omp_threads
	147	IF ( myid == 0 .AND. n_omp_threads > 1 ) THEN
	148	PRINT, '** poisfft_hybrid_ini: Number of OpenMP threads = ', &
	149	n_omp_threads
	150	ENDIF
	151	#else
	152	n_omp_threads = 1
	153	#endif
	154	!
	155	!-- Initialize the one-dimensional FFT routines
	156	CALL fft_init
	157
	158	!
	159	!-- Setup for multi node version (poisfft_hybrid_nodes)
	160	IF ( n_omp_threads == 1 .AND. &
	161	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
	162
	163	IF ( tasks_per_node /= -9999 ) THEN
	164	!
	165	!-- Multi node version requires that the available number of
	166	!-- switches per logical partition must be an integral divisor
	167	!-- of the chosen number of tasks per node
	168	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
	169	!
	170	!-- Set the switch which decides about usage of the multi node
	171	!-- version
	172	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
	173	numprocs > tasks_per_node ) THEN
	174	tasks_per_logical_node = tasks_per_node / switch_per_lpar
	175	ENDIF
	176
	177	IF ( myid == 0 .AND. tasks_per_logical_node > -1 ) THEN
	178	PRINT, '** poisfft_hybrid_ini: running optimized ', &
	179	'multinode version'
	180	PRINT*, ' switch_per_lpar = ', switch_per_lpar
	181	PRINT*, ' tasks_per_lpar = ', tasks_per_node
	182	PRINT*, ' tasks_per_logical_node = ', &
	183	tasks_per_logical_node
	184	ENDIF
	185
	186	ENDIF
	187	ENDIF
	188	ENDIF
	189
	190	!
	191	!-- Determine sub-topologies for multi node version
	192	IF ( tasks_per_logical_node >= 2 ) THEN
	193
	194	#if defined( __parallel )
	195	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
	196	tasks_per_logical_node
	197	dims(1) = nodes
	198	dims(2) = tasks_per_logical_node
	199
	200	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
	201	comm_node_all, istat )
	202	CALL MPI_COMM_RANK( comm_node_all, me, istat )
	203
	204	re_dims(1) = .TRUE.
	205	re_dims(2) = .FALSE.
	206	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
	207	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
	208
	209	re_dims(1) = .FALSE.
	210	re_dims(2) = .TRUE.
	211	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
	212	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
	213
	214	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
	215	! tasks_per_logical_node
	216	#else
	217	PRINT*, '+++ poisfft_hybrid_ini: parallel environment (MPI) required'
	218	CALL local_stop
	219	#endif
	220	ENDIF
	221
	222	END SUBROUTINE poisfft_hybrid_ini
	223
	224
	225	SUBROUTINE poisfft_hybrid( ar )
	226
	227	USE control_parameters
	228	USE interfaces
	229
	230	IMPLICIT NONE
	231
	232	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	233
	234	IF ( host(1:3) == 'nec' ) THEN
	235	CALL poisfft_hybrid_omp_vec( ar )
	236	ELSE
	237	IF ( tasks_per_logical_node == -1 ) THEN
	238	CALL poisfft_hybrid_omp( ar )
	239	ELSE
	240	CALL poisfft_hybrid_nodes( ar )
	241	ENDIF
	242	ENDIF
	243
	244	END SUBROUTINE poisfft_hybrid
	245
	246
	247	SUBROUTINE poisfft_hybrid_omp ( ar )
	248
	249	USE cpulog
	250	USE interfaces
	251
	252	IMPLICIT NONE
	253
	254	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	255	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	256
	257	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	258
	259	REAL, DIMENSION(0:nx) :: fftx_ar
	260	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	261
	262	REAL, DIMENSION(0:nx,nz) :: tri_ar
	263
	264	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	265	#if defined( __KKMP )
	266	INTEGER :: omp_get_thread_num
	267	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	268	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	269	#else
	270	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	271	#endif
	272
	273
	274	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
	275
	276	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	277
	278	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	279	!$OMP DO
	280	!
	281	!-- Store grid points to be transformed on a 1d-array, do the fft
	282	!-- and sample the results on a 4d-array
	283	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	284	iei = MIN( iouter+istride-1, nxr_p )
	285	DO k = 1, nz
	286
	287	DO i = iouter, iei
	288	ii = nxl + i
	289	ir = i - iouter + 1
	290
	291	DO j = nys_a, nyn_a
	292	ffty_ar(j,ir) = ar(k,j,ii)
	293	ENDDO
	294
	295	CALL fft_y( ffty_ar(:,ir), 'forward' )
	296	ENDDO
	297
	298	m = nys_a
	299	DO n = 1, npe_s
	300	DO j = nys_p, nyn_p
	301	DO i = iouter, iei
	302	ir = i - iouter + 1
	303	work1(i,k,j,n) = ffty_ar(m,ir)
	304	ENDDO
	305	m = m+1
	306	ENDDO
	307	ENDDO
	308
	309	ENDDO
	310	ENDDO
	311	!$OMP END PARALLEL
	312
	313	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	314
	315	#if defined( __parallel )
	316	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	317
	318	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	319	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	320	comm2d, istat )
	321
	322	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	323	#else
	324	work2 = work1
	325	#endif
	326
	327	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	328
	329	#if defined( __KKMP )
	330	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
	331	!$OMP DO
	332	DO j = nys_p, nyn_p
	333	jthread = omp_get_thread_num() + 1
	334	#else
	335	DO j = nys_p, nyn_p
	336	jthread = 1
	337	#endif
	338	DO k = 1, nz
	339
	340	m = nxl_a
	341	DO n = 1, npe_s
	342	DO i = nxl_p, nxr_p
	343	fftx_ar(m) = work2(i,k,j,n)
	344	m = m+1
	345	ENDDO
	346	ENDDO
	347
	348	CALL fft_x( fftx_ar, 'forward' )
	349
	350	DO i = nxl_a, nxr_a
	351	tri_ar(i,k) = fftx_ar(i)
	352	ENDDO
	353
	354	ENDDO
	355
	356	jj = myid * (nyn_p-nys_p+1) + j
	357	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	358
	359	DO k = 1, nz
	360	DO i = nxl_a, nxr_a
	361	fftx_ar(i) = tri_ar (i,k)
	362	ENDDO
	363
	364	CALL fft_x( fftx_ar, 'backward' )
	365
	366	m = nxl_a
	367	DO n = 1, npe_s
	368	DO i = nxl_p, nxr_p
	369	work2(i,k,j,n) = fftx_ar(m)
	370	m = m+1
	371	ENDDO
	372	ENDDO
	373
	374	ENDDO
	375	ENDDO
	376	#if defined( __KKMP )
	377	!$OMP END PARALLEL
	378	#endif
	379
	380	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	381
	382	#if defined( __parallel )
	383	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	384	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	385
	386	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	387	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	388	comm2d, istat )
	389
	390	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	391	#else
	392	work1 = work2
	393	#endif
	394
	395	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	396
	397	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	398	!$OMP DO
	399	DO iouter = nxl_p, nxr_p, istride
	400	iei = MIN( iouter+istride-1, nxr_p )
	401	DO k = 1, nz
	402
	403	m = nys_a
	404	DO n = 1, npe_s
	405	DO j = nys_p, nyn_p
	406	DO i = iouter, iei
	407	ir = i - iouter + 1
	408	ffty_ar(m,ir) = work1 (i,k,j,n)
	409	ENDDO
	410	m = m+1
	411	ENDDO
	412	ENDDO
	413
	414	DO i = iouter, iei
	415	ii = nxl + i
	416	ir = i - iouter + 1
	417	CALL fft_y( ffty_ar(:,ir), 'backward' )
	418
	419	DO j = nys_a, nyn_a
	420	ar(k,j,ii) = ffty_ar(j,ir)
	421	ENDDO
	422	ENDDO
	423
	424	ENDDO
	425	ENDDO
	426	!$OMP END PARALLEL
	427
	428	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	429
	430	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
	431
	432	#if defined( __KKMP )
	433	DEALLOCATE( tri )
	434	#endif
	435
	436	END SUBROUTINE poisfft_hybrid_omp
	437
	438
	439	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
	440
	441	USE cpulog
	442	USE interfaces
	443
	444	IMPLICIT NONE
	445
	446	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	447	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	448
	449	REAL, DIMENSION(0:nx,nz) :: tri_ar
	450
	451	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	452
	453	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
	454	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
	455
	456	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	457	#if defined( __KKMP )
	458	INTEGER :: omp_get_thread_num
	459	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	460	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	461	#else
	462	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	463	#endif
	464
	465
	466	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
	467
	468	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
	469
	470	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
	471	!$OMP DO
	472	!
	473	!-- Store grid points to be transformed on a 1d-array, do the fft
	474	!-- and sample the results on a 4d-array
	475	DO i = nxl_p, nxr_p
	476
	477	DO j = nys_a, nyn_a
	478	DO k = 1, nz
	479	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
	480	ENDDO
	481	ENDDO
	482
	483	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
	484	ENDDO
	485
	486	!$OMP DO
	487	DO k = 1, nz
	488	m = nys_a
	489	DO n = 1, npe_s
	490	DO j = nys_p, nyn_p
	491	DO i = nxl_p, nxr_p
	492	work1(i,k,j,n) = ffty_ar3(m,k,i)
	493	ENDDO
	494	m = m+1
	495	ENDDO
	496	ENDDO
	497	ENDDO
	498	!$OMP END PARALLEL
	499
	500	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
	501
	502	#if defined( __parallel )
	503	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	504	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	505	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	506	comm2d, istat )
	507	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	508	#else
	509	work2 = work1
	510	#endif
	511
	512	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
	513
	514	#if defined( __KKMP )
	515	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
	516	!$OMP DO
	517	DO j = nys_p, nyn_p
	518	jthread = omp_get_thread_num() + 1
	519	#else
	520	DO j = nys_p, nyn_p
	521	jthread = 1
	522	#endif
	523	DO k = 1, nz
	524
	525	m = nxl_a
	526	DO n = 1, npe_s
	527	DO i = nxl_p, nxr_p
	528	fftx_ar3(m,k,j) = work2(i,k,j,n)
	529	m = m+1
	530	ENDDO
	531	ENDDO
	532	ENDDO
	533
	534	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
	535
	536	DO k = 1, nz
	537	DO i = nxl_a, nxr_a
	538	tri_ar(i,k) = fftx_ar3(i,k,j)
	539	ENDDO
	540	ENDDO
	541
	542	jj = myid * (nyn_p-nys_p+1) + j
	543	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	544
	545	DO k = 1, nz
	546	DO i = nxl_a, nxr_a
	547	fftx_ar3(i,k,j) = tri_ar (i,k)
	548	ENDDO
	549	ENDDO
	550
	551	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
	552
	553	DO k = 1, nz
	554	m = nxl_a
	555	DO n = 1, npe_s
	556	DO i = nxl_p, nxr_p
	557	work2(i,k,j,n) = fftx_ar3(m,k,j)
	558	m = m+1
	559	ENDDO
	560	ENDDO
	561	ENDDO
	562
	563	ENDDO
	564	#if defined( __KKMP )
	565	!$OMP END PARALLEL
	566	#endif
	567
	568	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
	569
	570	#if defined( __parallel )
	571	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	572	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	573	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	574	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	575	comm2d, istat )
	576	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	577	#else
	578	work1 = work2
	579	#endif
	580
	581	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
	582
	583	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
	584	!$OMP DO
	585	DO k = 1, nz
	586	m = nys_a
	587	DO n = 1, npe_s
	588	DO j = nys_p, nyn_p
	589	DO i = nxl_p, nxr_p
	590	ffty_ar3(m,k,i) = work1(i,k,j,n)
	591	ENDDO
	592	m = m+1
	593	ENDDO
	594	ENDDO
	595	ENDDO
	596
	597	!$OMP DO
	598	DO i = nxl_p, nxr_p
	599	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
	600	DO j = nys_a, nyn_a
	601	DO k = 1, nz
	602	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
	603	ENDDO
	604	ENDDO
	605	ENDDO
	606	!$OMP END PARALLEL
	607
	608	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
	609
	610	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
	611
	612	#if defined( __KKMP )
	613	DEALLOCATE( tri )
	614	#endif
	615
	616	END SUBROUTINE poisfft_hybrid_omp_vec
	617
	618
	619	SUBROUTINE poisfft_hybrid_nodes ( ar )
	620
	621	USE cpulog
	622	USE interfaces
	623
	624	IMPLICIT NONE
	625
	626	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	627	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
	628	n, nn, nt, nw1, nw2
	629
	630	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	631
	632	REAL, DIMENSION(0:nx) :: fftx_ar
	633	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	634
	635	REAL, DIMENSION(0:nx,nz) :: tri_ar
	636
	637	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
	638	nodes,nys_p:nyn_p) :: work1,work2
	639	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	640
	641
	642	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
	643
	644	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	645
	646	!
	647	!-- Store grid points to be transformed on a 1d-array, do the fft
	648	!-- and sample the results on a 4d-array
	649	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	650	iei = MIN( iouter+istride-1, nxr_p )
	651	DO k = 1, nz
	652
	653	DO i = iouter, iei
	654	ii = nxl + i
	655	ir = i - iouter + 1
	656
	657	DO j = nys_a, nyn_a
	658	ffty_ar(j,ir) = ar(k,j,ii)
	659	ENDDO
	660
	661	CALL fft_y( ffty_ar(:,ir), 'forward' )
	662	ENDDO
	663
	664	m = nys_a
	665	DO nn = 1, nodes
	666	DO nt = 1, tasks_per_logical_node
	667	DO j = nys_p, nyn_p
	668	DO i = iouter, iei
	669	ir = i - iouter + 1
	670	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
	671	ENDDO
	672	m = m+1
	673	ENDDO
	674	ENDDO
	675	ENDDO
	676
	677	ENDDO
	678	ENDDO
	679
	680	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	681
	682	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	683	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
	684	DO nn = 1, nodes
	685	DO j = nys_p, nyn_p
	686	#if defined( __parallel )
	687	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	688	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	689	comm_tasks, istat )
	690	#endif
	691	ENDDO
	692	ENDDO
	693	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	694
	695
	696	DO j = nys_p, nyn_p
	697
	698	CALL cascade( 1, j, nys_p, nyn_p )
	699	nw2 = nw1 * SIZE( work1, 3 )
	700	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
	701	#if defined( __parallel )
	702	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	703	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	704	comm_nodes, istat )
	705	#endif
	706	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
	707	CALL cascade( 2, j, nys_p, nyn_p )
	708
	709	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	710	DO k = 1, nz
	711
	712	m = nxl_a
	713	DO nn = 1, nodes
	714	DO nt = 1, tasks_per_logical_node
	715	DO i = nxl_p, nxr_p
	716	fftx_ar(m) = work1(i,k,nt,nn,j)
	717	m = m+1
	718	ENDDO
	719	ENDDO
	720	ENDDO
	721
	722	CALL fft_x( fftx_ar, 'forward' )
	723
	724	DO i = nxl_a, nxr_a
	725	tri_ar(i,k) = fftx_ar(i)
	726	ENDDO
	727
	728	ENDDO
	729
	730	jj = myid * (nyn_p-nys_p+1) + j
	731	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
	732
	733	DO k = 1, nz
	734	DO i = nxl_a, nxr_a
	735	fftx_ar(i) = tri_ar(i,k)
	736	ENDDO
	737
	738	CALL fft_x( fftx_ar, 'backward' )
	739
	740	m = nxl_a
	741	DO nn = 1, nodes
	742	DO nt = 1, tasks_per_logical_node
	743	DO i = nxl_p, nxr_p
	744	work1(i,k,nt,nn,j) = fftx_ar(m)
	745	m = m+1
	746	ENDDO
	747	ENDDO
	748	ENDDO
	749	ENDDO
	750
	751	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	752	nw2 = nw1 * SIZE( work1, 3 )
	753	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
	754	#if defined( __parallel )
	755	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	756	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	757	comm_nodes, istat )
	758	#endif
	759	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
	760
	761	ENDDO
	762
	763	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	764	DO nn = 1, nodes
	765	DO j = nys_p, nyn_p
	766	#if defined( __parallel )
	767	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	768	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	769	comm_tasks, istat )
	770	#endif
	771	ENDDO
	772	ENDDO
	773	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	774
	775	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	776
	777	DO iouter = nxl_p, nxr_p, istride
	778	iei = MIN( iouter+istride-1, nxr_p )
	779	DO k = 1, nz
	780
	781	m = nys_a
	782	DO nn = 1, nodes
	783	DO nt = 1, tasks_per_logical_node
	784	DO j = nys_p, nyn_p
	785	DO i = iouter, iei
	786	ir = i - iouter + 1
	787	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
	788	ENDDO
	789	m = m+1
	790	ENDDO
	791	ENDDO
	792	ENDDO
	793
	794	DO i = iouter, iei
	795	ii = nxl + i
	796	ir = i - iouter + 1
	797	CALL fft_y( ffty_ar(:,ir), 'backward' )
	798
	799	DO j = nys_a, nyn_a
	800	ar(k,j,ii) = ffty_ar(j,ir)
	801	ENDDO
	802	ENDDO
	803
	804	ENDDO
	805	ENDDO
	806
	807	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	808
	809	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
	810
	811	END SUBROUTINE poisfft_hybrid_nodes
	812
	813
	814
	815	SUBROUTINE tridia_hybrid( j, ar, tri )
	816
	817	USE arrays_3d
	818	USE control_parameters
	819	USE grid_variables
	820
	821	IMPLICIT NONE
	822
	823	INTEGER :: i, j, k, nnyh
	824	REAL, DIMENSION(0:nx,nz) :: ar
	825	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	826	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	827
	828	nnyh = (ny+1) / 2
	829
	830	tri = 0.0
	831	!
	832	!-- Define constant elements of the tridiagonal matrix.
	833	DO k = 0, nz-1
	834	DO i = 0,nx
	835	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	836	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	837	ENDDO
	838	ENDDO
	839
	840	IF ( j <= nnyh ) THEN
	841	CALL maketri_hybrid( j )
	842	ELSE
	843	CALL maketri_hybrid( ny+1-j)
	844	ENDIF
	845	CALL zerleg_hybrid
	846	CALL substi_hybrid( ar, tri )
	847
	848	CONTAINS
	849
	850	SUBROUTINE maketri_hybrid( j )
	851
	852	!----------------------------------------------------------------------!
	853	! maketri !
	854	! !
	855	! computes the i- and j-dependent component of the matrix !
	856	!----------------------------------------------------------------------!
	857
	858	USE constants
	859
	860	IMPLICIT NONE
	861
	862	INTEGER :: i, j, k, nnxh
	863	REAL :: a, c
	864
	865	REAL, DIMENSION(0:nx) :: l
	866
	867
	868	nnxh = (nx+1) / 2
	869	!
	870	!-- Provide the tridiagonal matrix for solution of the Poisson equation
	871	!-- in Fourier space. The coefficients are computed following the method
	872	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
	873	!-- Siano's original version.
	874	DO i = 0,nx
	875	IF ( i >= 0 .AND. i < nnxh ) THEN
	876	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	877	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	878	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	879	FLOAT( ny+1 ) ) ) / ( dy * dy )
	880	ELSEIF ( i == nnxh ) THEN
	881	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	882	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	883	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	884	FLOAT(ny+1) ) ) / ( dy * dy )
	885	ELSE
	886	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	887	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	888	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	889	FLOAT( ny+1 ) ) ) / ( dy * dy )
	890	ENDIF
	891	ENDDO
	892
	893	DO k = 0,nz-1
	894	DO i = 0, nx
	895	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	896	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	897	tri(1,i,k) = a + c - l(i)
	898	ENDDO
	899	ENDDO
	900	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	901	DO i = 0,nx
	902	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	903	ENDDO
	904	ENDIF
	905	IF ( ibc_p_t == 1 ) THEN
	906	DO i = 0,nx
	907	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	908	ENDDO
	909	ENDIF
	910
	911	END SUBROUTINE maketri_hybrid
	912
	913
	914	SUBROUTINE zerleg_hybrid
	915
	916	!----------------------------------------------------------------------!
	917	! zerleg !
	918	! !
	919	! Splitting of the tridiagonal matrix (Thomas algorithm) !
	920	!----------------------------------------------------------------------!
	921
	922	USE indices
	923
	924	IMPLICIT NONE
	925
	926	INTEGER :: i, k
	927
	928	!
	929	!-- Splitting
	930	DO i = 0, nx
	931	tri(4,i,0) = tri(1,i,0)
	932	ENDDO
	933	DO k = 1, nz-1
	934	DO i = 0,nx
	935	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	936	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	937	ENDDO
	938	ENDDO
	939
	940	END SUBROUTINE zerleg_hybrid
	941
	942	SUBROUTINE substi_hybrid( ar, tri )
	943
	944	!----------------------------------------------------------------------!
	945	! substi !
	946	! !
	947	! Substitution (Forward and Backward) (Thomas algorithm) !
	948	!----------------------------------------------------------------------!
	949
	950	IMPLICIT NONE
	951
	952	INTEGER :: i, j, k
	953	REAL, DIMENSION(0:nx,nz) :: ar
	954	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	955	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	956
	957	!
	958	!-- Forward substitution
	959	DO i = 0, nx
	960	ar1(i,0) = ar(i,1)
	961	ENDDO
	962	DO k = 1, nz - 1
	963	DO i = 0,nx
	964	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	965	ENDDO
	966	ENDDO
	967
	968	!
	969	!-- Backward substitution
	970	DO i = 0,nx
	971	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	972	ENDDO
	973	DO k = nz-2, 0, -1
	974	DO i = 0,nx
	975	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	976	/ tri(4,i,k)
	977	ENDDO
	978	ENDDO
	979
	980	END SUBROUTINE substi_hybrid
	981
	982	END SUBROUTINE tridia_hybrid
	983
	984
	985	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
	986
	987	USE cpulog
	988
	989	IMPLICIT NONE
	990
	991	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
	992	INTEGER, SAVE :: tag = 10
	993	#if defined( __parallel )
	994	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
	995	#endif
	996
	997	REAL :: buf, buf1
	998
	999
	1000	buf = 1.0
	1001	buf1 = 1.1
	1002	IF ( me_node == 0 ) THEN ! first node only
	1003
	1004	SELECT CASE ( loca )
	1005
	1006	CASE ( 1 ) ! before alltoall
	1007
	1008	IF( me_task > 0 ) THEN ! first task does not wait
	1009	#if defined( __parallel )
	1010	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
	1011	buf1, 1, MPI_REAL, me_task-1, 0, &
	1012	comm_tasks, stat,ierr )
	1013	#endif
	1014	ELSEIF ( j > nys_p ) THEN
	1015	req = 0
	1016	tag = MOD( tag-10, 10 ) + 10
	1017	#if defined( __parallel )
	1018	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
	1019	tag, comm_tasks, req, ierr )
	1020	reqa = req
	1021	CALL MPI_WAITALL( 1, reqa, stat, ierr )
	1022	#endif
	1023	ENDIF
	1024
	1025	CASE ( 2 ) ! after alltoall
	1026
	1027	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
	1028	#if defined( __parallel )
	1029	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
	1030	buf1, 1, MPI_REAL, me_task+1, 0, &
	1031	comm_tasks, stat, ierr)
	1032	#endif
	1033	ELSEIF ( j < nyn_p ) THEN
	1034	req = 0
	1035	tag = MOD( tag-10, 10 ) + 10
	1036	#if defined( __parallel )
	1037	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
	1038	ierr )
	1039	#endif
	1040	ENDIF
	1041
	1042	END SELECT
	1043
	1044	ENDIF
	1045
	1046	END SUBROUTINE cascade
	1047
	1048	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |