Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 1

Last change on this file since 1 was 1, checked in by raasch, 17 years ago
Initial repository layout and content
File size: 31.7 KB

Rev	Line
[1]	1	MODULE poisfft_hybrid_mod
	2	!------------------------------------------------------------------------------
	3	!
	4	! Actual revisions:
	5	! -----------------
	6	!
	7	!
	8	! Former revisions:
	9	! -----------------
	10	! $Log: poisfft_hybrid.f90,v $
	11	! Revision 1.11 2004/04/30 12:43:14 raasch
	12	! Renaming of fft routines, additional argument in calls of fft_y_m
	13	!
	14	! Revision 1.10 2003/08/01 08:49:30 raasch
	15	! Error removed: reqa=req after calling routine MPI_IRECV (cascade)
	16	!
	17	! Revision 1.9 2003/05/09 14:50:47 raasch
	18	! Parameters in calls of MPI_ALLTOALL, MPI_WAITALL changed in order to fulfill
	19	! f95 type and rank requirements
	20	!
	21	! Revision 1.8 2003/03/16 11:17:04 raasch
	22	! +module pegrid in poisfft_hybrid_ini
	23	!
	24	! Revision 1.7 2003/03/16 09:45:52 raasch
	25	! Two underscores (_) are placed in front of all define-strings
	26	!
	27	! Revision 1.6 2003/03/16 09:11:42 raasch
	28	! Fortran error removed
	29	!
	30	! Revision 1.5 2003/03/14 13:45:53 raasch
	31	! Loop optimization in maketri_hybrid
	32	!
	33	! Revision 1.4 2003/03/12 16:37:22 raasch
	34	! Vectorized version (poisfft_hybrid_omp_vec) added, arguments ar and tri
	35	! passed to routine substi_hybrid (otherwise no vectorization and very
	36	! bad performance on NEC), reading of environment variable tasks_per_node
	37	! moved to routine parin
	38	!
	39	! Revision 1.3 2002/12/20 15:18:02 raasch
	40	! Variable statement restricted for parallel environment in subroutine
	41	! cascade
	42	!
	43	! Revision 1.2 2002/12/19 16:08:31 raasch
	44	! Preprocessor directive KKMP introduced (OMP does NOT work),
	45	! array tri will be a shared array in OpenMP loop, to get better cache
	46	! utilization, the i index (x-direction) will be executed in stride
	47	! "istride" as outer loop and in a shorter inner loop,
	48	! overlapping of computation and communication realized by new routine
	49	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
	50	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
	51	!
	52	!
	53	! Description:
	54	! ------------
	55	! Solution of the Poisson equation with a 2D spectral method.
	56	! Hybrid version for parallel computers using a 1D domain decomposition,
	57	! realized with MPI, along x and parallelization with OPEN-MP along y
	58	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
	59	! optimization is realized by overlapping of computation and communication
	60	! and by simultaneously executing as many communication calls as switches
	61	! per logical partition (LPAR) are available. This version comes into
	62	! effect if more than one node is used and if the environment variable
	63	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
	64	! without any rest.
	65	!
	66	! WARNING: In case of OpenMP, there are problems with allocating large
	67	! arrays in parallel regions.
	68	!
	69	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
	70	!------------------------------------------------------------------------------!
	71
	72	USE fft_xy
	73	USE indices
	74	USE pegrid
	75
	76	IMPLICIT NONE
	77
	78	PRIVATE
	79	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
	80
	81	INTEGER, PARAMETER :: switch_per_lpar = 2
	82
	83	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
	84	nxl_p, nxr_p, & ! partial x dimension
	85	nys_a, nyn_a, & ! total y dimension
	86	nys_p, nyn_p, & ! partial y dimension
	87
	88	npe_s, & ! total number of PEs for solver
	89	nwords, & ! number of points to be exchanged
	90	! with MPI_ALLTOALL
	91	n_omp_threads ! number of OpenMP threads
	92
	93	!
	94	!-- Variables for multi node version (cluster version) using routine
	95	!-- poisfft_hybrid_nodes
	96	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
	97	comm_node_all, & ! communicater all PEs node version
	98	comm_tasks, & ! communicater tasks
	99	me, me_node, me_task,& ! identity of this PE
	100	nodes, & ! number of nodes
	101	tasks_per_logical_node = -1 ! default no cluster
	102
	103
	104	!
	105	!-- Public interfaces
	106	INTERFACE poisfft_hybrid_ini
	107	MODULE PROCEDURE poisfft_hybrid_ini
	108	END INTERFACE poisfft_hybrid_ini
	109
	110	INTERFACE poisfft_hybrid
	111	MODULE PROCEDURE poisfft_hybrid
	112	END INTERFACE poisfft_hybrid
	113
	114	!
	115	!-- Private interfaces
	116	INTERFACE poisfft_hybrid_omp
	117	MODULE PROCEDURE poisfft_hybrid_omp
	118	END INTERFACE poisfft_hybrid_omp
	119
	120	INTERFACE poisfft_hybrid_omp_vec
	121	MODULE PROCEDURE poisfft_hybrid_omp_vec
	122	END INTERFACE poisfft_hybrid_omp_vec
	123
	124	INTERFACE poisfft_hybrid_nodes
	125	MODULE PROCEDURE poisfft_hybrid_nodes
	126	END INTERFACE poisfft_hybrid_nodes
	127
	128	INTERFACE tridia_hybrid
	129	MODULE PROCEDURE tridia_hybrid
	130	END INTERFACE tridia_hybrid
	131
	132	INTERFACE cascade
	133	MODULE PROCEDURE cascade
	134	END INTERFACE cascade
	135
	136	CONTAINS
	137
	138
	139	SUBROUTINE poisfft_hybrid_ini
	140
	141	USE control_parameters
	142	USE pegrid
	143
	144	IMPLICIT NONE
	145
	146	CHARACTER(LEN=8) :: cdummy
	147	INTEGER :: idummy, istat
	148	INTEGER, DIMENSION(2) :: coords, dims
	149
	150	LOGICAL, DIMENSION(2) :: period = .false., re_dims
	151
	152
	153	!
	154	!-- Set the internal index values for the hybrid solver
	155	#if defined( __parallel )
	156	npe_s = pdims(1)
	157	#else
	158	npe_s = 1
	159	#endif
	160	nxl_a = 0
	161	nxr_a = nx
	162	nxl_p = 0
	163	nxr_p = ( ( nx+1 ) / npe_s ) - 1
	164	nys_a = nys
	165	nyn_a = nyn
	166	nys_p = 0
	167	nyn_p = ( ( ny+1 ) / npe_s ) - 1
	168
	169	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
	170
	171	#if defined( __KKMP )
	172	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
	173	READ ( cdummy, '(I8)' ) n_omp_threads
	174	IF ( myid == 0 .AND. n_omp_threads > 1 ) THEN
	175	PRINT, '** poisfft_hybrid_ini: Number of OpenMP threads = ', &
	176	n_omp_threads
	177	ENDIF
	178	#else
	179	n_omp_threads = 1
	180	#endif
	181	!
	182	!-- Initialize the one-dimensional FFT routines
	183	CALL fft_init
	184
	185	!
	186	!-- Setup for multi node version (poisfft_hybrid_nodes)
	187	IF ( n_omp_threads == 1 .AND. &
	188	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
	189
	190	IF ( tasks_per_node /= -9999 ) THEN
	191	!
	192	!-- Multi node version requires that the available number of
	193	!-- switches per logical partition must be an integral divisor
	194	!-- of the chosen number of tasks per node
	195	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
	196	!
	197	!-- Set the switch which decides about usage of the multi node
	198	!-- version
	199	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
	200	numprocs > tasks_per_node ) THEN
	201	tasks_per_logical_node = tasks_per_node / switch_per_lpar
	202	ENDIF
	203
	204	IF ( myid == 0 .AND. tasks_per_logical_node > -1 ) THEN
	205	PRINT, '** poisfft_hybrid_ini: running optimized ', &
	206	'multinode version'
	207	PRINT*, ' switch_per_lpar = ', switch_per_lpar
	208	PRINT*, ' tasks_per_lpar = ', tasks_per_node
	209	PRINT*, ' tasks_per_logical_node = ', &
	210	tasks_per_logical_node
	211	ENDIF
	212
	213	ENDIF
	214	ENDIF
	215	ENDIF
	216
	217	!
	218	!-- Determine sub-topologies for multi node version
	219	IF ( tasks_per_logical_node >= 2 ) THEN
	220
	221	#if defined( __parallel )
	222	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
	223	tasks_per_logical_node
	224	dims(1) = nodes
	225	dims(2) = tasks_per_logical_node
	226
	227	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
	228	comm_node_all, istat )
	229	CALL MPI_COMM_RANK( comm_node_all, me, istat )
	230
	231	re_dims(1) = .TRUE.
	232	re_dims(2) = .FALSE.
	233	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
	234	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
	235
	236	re_dims(1) = .FALSE.
	237	re_dims(2) = .TRUE.
	238	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
	239	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
	240
	241	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
	242	! tasks_per_logical_node
	243	#else
	244	PRINT*, '+++ poisfft_hybrid_ini: parallel environment (MPI) required'
	245	CALL local_stop
	246	#endif
	247	ENDIF
	248
	249	END SUBROUTINE poisfft_hybrid_ini
	250
	251
	252	SUBROUTINE poisfft_hybrid( ar )
	253
	254	USE control_parameters
	255	USE interfaces
	256
	257	IMPLICIT NONE
	258
	259	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	260
	261	IF ( host(1:3) == 'nec' ) THEN
	262	CALL poisfft_hybrid_omp_vec( ar )
	263	ELSE
	264	IF ( tasks_per_logical_node == -1 ) THEN
	265	CALL poisfft_hybrid_omp( ar )
	266	ELSE
	267	CALL poisfft_hybrid_nodes( ar )
	268	ENDIF
	269	ENDIF
	270
	271	END SUBROUTINE poisfft_hybrid
	272
	273
	274	SUBROUTINE poisfft_hybrid_omp ( ar )
	275
	276	USE cpulog
	277	USE interfaces
	278
	279	IMPLICIT NONE
	280
	281	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	282	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	283
	284	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	285
	286	REAL, DIMENSION(0:nx) :: fftx_ar
	287	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	288
	289	REAL, DIMENSION(0:nx,nz) :: tri_ar
	290
	291	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	292	#if defined( __KKMP )
	293	INTEGER :: omp_get_thread_num
	294	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	295	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	296	#else
	297	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	298	#endif
	299
	300
	301	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
	302
	303	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	304
	305	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	306	!$OMP DO
	307	!
	308	!-- Store grid points to be transformed on a 1d-array, do the fft
	309	!-- and sample the results on a 4d-array
	310	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	311	iei = MIN( iouter+istride-1, nxr_p )
	312	DO k = 1, nz
	313
	314	DO i = iouter, iei
	315	ii = nxl + i
	316	ir = i - iouter + 1
	317
	318	DO j = nys_a, nyn_a
	319	ffty_ar(j,ir) = ar(k,j,ii)
	320	ENDDO
	321
	322	CALL fft_y( ffty_ar(:,ir), 'forward' )
	323	ENDDO
	324
	325	m = nys_a
	326	DO n = 1, npe_s
	327	DO j = nys_p, nyn_p
	328	DO i = iouter, iei
	329	ir = i - iouter + 1
	330	work1(i,k,j,n) = ffty_ar(m,ir)
	331	ENDDO
	332	m = m+1
	333	ENDDO
	334	ENDDO
	335
	336	ENDDO
	337	ENDDO
	338	!$OMP END PARALLEL
	339
	340	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	341
	342	#if defined( __parallel )
	343	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	344
	345	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	346	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	347	comm2d, istat )
	348
	349	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	350	#else
	351	work2 = work1
	352	#endif
	353
	354	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	355
	356	#if defined( __KKMP )
	357	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
	358	!$OMP DO
	359	DO j = nys_p, nyn_p
	360	jthread = omp_get_thread_num() + 1
	361	#else
	362	DO j = nys_p, nyn_p
	363	jthread = 1
	364	#endif
	365	DO k = 1, nz
	366
	367	m = nxl_a
	368	DO n = 1, npe_s
	369	DO i = nxl_p, nxr_p
	370	fftx_ar(m) = work2(i,k,j,n)
	371	m = m+1
	372	ENDDO
	373	ENDDO
	374
	375	CALL fft_x( fftx_ar, 'forward' )
	376
	377	DO i = nxl_a, nxr_a
	378	tri_ar(i,k) = fftx_ar(i)
	379	ENDDO
	380
	381	ENDDO
	382
	383	jj = myid * (nyn_p-nys_p+1) + j
	384	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	385
	386	DO k = 1, nz
	387	DO i = nxl_a, nxr_a
	388	fftx_ar(i) = tri_ar (i,k)
	389	ENDDO
	390
	391	CALL fft_x( fftx_ar, 'backward' )
	392
	393	m = nxl_a
	394	DO n = 1, npe_s
	395	DO i = nxl_p, nxr_p
	396	work2(i,k,j,n) = fftx_ar(m)
	397	m = m+1
	398	ENDDO
	399	ENDDO
	400
	401	ENDDO
	402	ENDDO
	403	#if defined( __KKMP )
	404	!$OMP END PARALLEL
	405	#endif
	406
	407	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	408
	409	#if defined( __parallel )
	410	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	411	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	412
	413	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	414	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	415	comm2d, istat )
	416
	417	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	418	#else
	419	work1 = work2
	420	#endif
	421
	422	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	423
	424	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	425	!$OMP DO
	426	DO iouter = nxl_p, nxr_p, istride
	427	iei = MIN( iouter+istride-1, nxr_p )
	428	DO k = 1, nz
	429
	430	m = nys_a
	431	DO n = 1, npe_s
	432	DO j = nys_p, nyn_p
	433	DO i = iouter, iei
	434	ir = i - iouter + 1
	435	ffty_ar(m,ir) = work1 (i,k,j,n)
	436	ENDDO
	437	m = m+1
	438	ENDDO
	439	ENDDO
	440
	441	DO i = iouter, iei
	442	ii = nxl + i
	443	ir = i - iouter + 1
	444	CALL fft_y( ffty_ar(:,ir), 'backward' )
	445
	446	DO j = nys_a, nyn_a
	447	ar(k,j,ii) = ffty_ar(j,ir)
	448	ENDDO
	449	ENDDO
	450
	451	ENDDO
	452	ENDDO
	453	!$OMP END PARALLEL
	454
	455	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	456
	457	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
	458
	459	#if defined( __KKMP )
	460	DEALLOCATE( tri )
	461	#endif
	462
	463	END SUBROUTINE poisfft_hybrid_omp
	464
	465
	466	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
	467
	468	USE cpulog
	469	USE interfaces
	470
	471	IMPLICIT NONE
	472
	473	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	474	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	475
	476	REAL, DIMENSION(0:nx,nz) :: tri_ar
	477
	478	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	479
	480	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
	481	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
	482
	483	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	484	#if defined( __KKMP )
	485	INTEGER :: omp_get_thread_num
	486	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	487	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	488	#else
	489	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	490	#endif
	491
	492
	493	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
	494
	495	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
	496
	497	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
	498	!$OMP DO
	499	!
	500	!-- Store grid points to be transformed on a 1d-array, do the fft
	501	!-- and sample the results on a 4d-array
	502	DO i = nxl_p, nxr_p
	503
	504	DO j = nys_a, nyn_a
	505	DO k = 1, nz
	506	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
	507	ENDDO
	508	ENDDO
	509
	510	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
	511	ENDDO
	512
	513	!$OMP DO
	514	DO k = 1, nz
	515	m = nys_a
	516	DO n = 1, npe_s
	517	DO j = nys_p, nyn_p
	518	DO i = nxl_p, nxr_p
	519	work1(i,k,j,n) = ffty_ar3(m,k,i)
	520	ENDDO
	521	m = m+1
	522	ENDDO
	523	ENDDO
	524	ENDDO
	525	!$OMP END PARALLEL
	526
	527	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
	528
	529	#if defined( __parallel )
	530	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	531	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	532	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	533	comm2d, istat )
	534	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	535	#else
	536	work2 = work1
	537	#endif
	538
	539	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
	540
	541	#if defined( __KKMP )
	542	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
	543	!$OMP DO
	544	DO j = nys_p, nyn_p
	545	jthread = omp_get_thread_num() + 1
	546	#else
	547	DO j = nys_p, nyn_p
	548	jthread = 1
	549	#endif
	550	DO k = 1, nz
	551
	552	m = nxl_a
	553	DO n = 1, npe_s
	554	DO i = nxl_p, nxr_p
	555	fftx_ar3(m,k,j) = work2(i,k,j,n)
	556	m = m+1
	557	ENDDO
	558	ENDDO
	559	ENDDO
	560
	561	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
	562
	563	DO k = 1, nz
	564	DO i = nxl_a, nxr_a
	565	tri_ar(i,k) = fftx_ar3(i,k,j)
	566	ENDDO
	567	ENDDO
	568
	569	jj = myid * (nyn_p-nys_p+1) + j
	570	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	571
	572	DO k = 1, nz
	573	DO i = nxl_a, nxr_a
	574	fftx_ar3(i,k,j) = tri_ar (i,k)
	575	ENDDO
	576	ENDDO
	577
	578	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
	579
	580	DO k = 1, nz
	581	m = nxl_a
	582	DO n = 1, npe_s
	583	DO i = nxl_p, nxr_p
	584	work2(i,k,j,n) = fftx_ar3(m,k,j)
	585	m = m+1
	586	ENDDO
	587	ENDDO
	588	ENDDO
	589
	590	ENDDO
	591	#if defined( __KKMP )
	592	!$OMP END PARALLEL
	593	#endif
	594
	595	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
	596
	597	#if defined( __parallel )
	598	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	599	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	600	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	601	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	602	comm2d, istat )
	603	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	604	#else
	605	work1 = work2
	606	#endif
	607
	608	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
	609
	610	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
	611	!$OMP DO
	612	DO k = 1, nz
	613	m = nys_a
	614	DO n = 1, npe_s
	615	DO j = nys_p, nyn_p
	616	DO i = nxl_p, nxr_p
	617	ffty_ar3(m,k,i) = work1(i,k,j,n)
	618	ENDDO
	619	m = m+1
	620	ENDDO
	621	ENDDO
	622	ENDDO
	623
	624	!$OMP DO
	625	DO i = nxl_p, nxr_p
	626	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
	627	DO j = nys_a, nyn_a
	628	DO k = 1, nz
	629	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
	630	ENDDO
	631	ENDDO
	632	ENDDO
	633	!$OMP END PARALLEL
	634
	635	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
	636
	637	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
	638
	639	#if defined( __KKMP )
	640	DEALLOCATE( tri )
	641	#endif
	642
	643	END SUBROUTINE poisfft_hybrid_omp_vec
	644
	645
	646	SUBROUTINE poisfft_hybrid_nodes ( ar )
	647
	648	USE cpulog
	649	USE interfaces
	650
	651	IMPLICIT NONE
	652
	653	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	654	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
	655	n, nn, nt, nw1, nw2
	656
	657	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	658
	659	REAL, DIMENSION(0:nx) :: fftx_ar
	660	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	661
	662	REAL, DIMENSION(0:nx,nz) :: tri_ar
	663
	664	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
	665	nodes,nys_p:nyn_p) :: work1,work2
	666	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	667
	668
	669	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
	670
	671	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	672
	673	!
	674	!-- Store grid points to be transformed on a 1d-array, do the fft
	675	!-- and sample the results on a 4d-array
	676	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	677	iei = MIN( iouter+istride-1, nxr_p )
	678	DO k = 1, nz
	679
	680	DO i = iouter, iei
	681	ii = nxl + i
	682	ir = i - iouter + 1
	683
	684	DO j = nys_a, nyn_a
	685	ffty_ar(j,ir) = ar(k,j,ii)
	686	ENDDO
	687
	688	CALL fft_y( ffty_ar(:,ir), 'forward' )
	689	ENDDO
	690
	691	m = nys_a
	692	DO nn = 1, nodes
	693	DO nt = 1, tasks_per_logical_node
	694	DO j = nys_p, nyn_p
	695	DO i = iouter, iei
	696	ir = i - iouter + 1
	697	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
	698	ENDDO
	699	m = m+1
	700	ENDDO
	701	ENDDO
	702	ENDDO
	703
	704	ENDDO
	705	ENDDO
	706
	707	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	708
	709	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	710	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
	711	DO nn = 1, nodes
	712	DO j = nys_p, nyn_p
	713	#if defined( __parallel )
	714	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	715	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	716	comm_tasks, istat )
	717	#endif
	718	ENDDO
	719	ENDDO
	720	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	721
	722
	723	DO j = nys_p, nyn_p
	724
	725	CALL cascade( 1, j, nys_p, nyn_p )
	726	nw2 = nw1 * SIZE( work1, 3 )
	727	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
	728	#if defined( __parallel )
	729	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	730	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	731	comm_nodes, istat )
	732	#endif
	733	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
	734	CALL cascade( 2, j, nys_p, nyn_p )
	735
	736	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	737	DO k = 1, nz
	738
	739	m = nxl_a
	740	DO nn = 1, nodes
	741	DO nt = 1, tasks_per_logical_node
	742	DO i = nxl_p, nxr_p
	743	fftx_ar(m) = work1(i,k,nt,nn,j)
	744	m = m+1
	745	ENDDO
	746	ENDDO
	747	ENDDO
	748
	749	CALL fft_x( fftx_ar, 'forward' )
	750
	751	DO i = nxl_a, nxr_a
	752	tri_ar(i,k) = fftx_ar(i)
	753	ENDDO
	754
	755	ENDDO
	756
	757	jj = myid * (nyn_p-nys_p+1) + j
	758	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
	759
	760	DO k = 1, nz
	761	DO i = nxl_a, nxr_a
	762	fftx_ar(i) = tri_ar(i,k)
	763	ENDDO
	764
	765	CALL fft_x( fftx_ar, 'backward' )
	766
	767	m = nxl_a
	768	DO nn = 1, nodes
	769	DO nt = 1, tasks_per_logical_node
	770	DO i = nxl_p, nxr_p
	771	work1(i,k,nt,nn,j) = fftx_ar(m)
	772	m = m+1
	773	ENDDO
	774	ENDDO
	775	ENDDO
	776	ENDDO
	777
	778	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	779	nw2 = nw1 * SIZE( work1, 3 )
	780	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
	781	#if defined( __parallel )
	782	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	783	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	784	comm_nodes, istat )
	785	#endif
	786	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
	787
	788	ENDDO
	789
	790	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	791	DO nn = 1, nodes
	792	DO j = nys_p, nyn_p
	793	#if defined( __parallel )
	794	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	795	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	796	comm_tasks, istat )
	797	#endif
	798	ENDDO
	799	ENDDO
	800	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	801
	802	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	803
	804	DO iouter = nxl_p, nxr_p, istride
	805	iei = MIN( iouter+istride-1, nxr_p )
	806	DO k = 1, nz
	807
	808	m = nys_a
	809	DO nn = 1, nodes
	810	DO nt = 1, tasks_per_logical_node
	811	DO j = nys_p, nyn_p
	812	DO i = iouter, iei
	813	ir = i - iouter + 1
	814	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
	815	ENDDO
	816	m = m+1
	817	ENDDO
	818	ENDDO
	819	ENDDO
	820
	821	DO i = iouter, iei
	822	ii = nxl + i
	823	ir = i - iouter + 1
	824	CALL fft_y( ffty_ar(:,ir), 'backward' )
	825
	826	DO j = nys_a, nyn_a
	827	ar(k,j,ii) = ffty_ar(j,ir)
	828	ENDDO
	829	ENDDO
	830
	831	ENDDO
	832	ENDDO
	833
	834	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	835
	836	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
	837
	838	END SUBROUTINE poisfft_hybrid_nodes
	839
	840
	841
	842	SUBROUTINE tridia_hybrid( j, ar, tri )
	843
	844	USE arrays_3d
	845	USE control_parameters
	846	USE grid_variables
	847
	848	IMPLICIT NONE
	849
	850	INTEGER :: i, j, k, nnyh
	851	REAL, DIMENSION(0:nx,nz) :: ar
	852	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	853	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	854
	855	nnyh = (ny+1) / 2
	856
	857	tri = 0.0
	858	!
	859	!-- Define constant elements of the tridiagonal matrix.
	860	DO k = 0, nz-1
	861	DO i = 0,nx
	862	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	863	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	864	ENDDO
	865	ENDDO
	866
	867	IF ( j <= nnyh ) THEN
	868	CALL maketri_hybrid( j )
	869	ELSE
	870	CALL maketri_hybrid( ny+1-j)
	871	ENDIF
	872	CALL zerleg_hybrid
	873	CALL substi_hybrid( ar, tri )
	874
	875	CONTAINS
	876
	877	SUBROUTINE maketri_hybrid( j )
	878
	879	!----------------------------------------------------------------------!
	880	! maketri !
	881	! !
	882	! computes the i- and j-dependent component of the matrix !
	883	!----------------------------------------------------------------------!
	884
	885	USE constants
	886
	887	IMPLICIT NONE
	888
	889	INTEGER :: i, j, k, nnxh
	890	REAL :: a, c
	891
	892	REAL, DIMENSION(0:nx) :: l
	893
	894
	895	nnxh = (nx+1) / 2
	896	!
	897	!-- Provide the tridiagonal matrix for solution of the Poisson equation
	898	!-- in Fourier space. The coefficients are computed following the method
	899	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
	900	!-- Siano's original version.
	901	DO i = 0,nx
	902	IF ( i >= 0 .AND. i < nnxh ) THEN
	903	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	904	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	905	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	906	FLOAT( ny+1 ) ) ) / ( dy * dy )
	907	ELSEIF ( i == nnxh ) THEN
	908	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	909	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	910	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	911	FLOAT(ny+1) ) ) / ( dy * dy )
	912	ELSE
	913	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	914	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	915	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	916	FLOAT( ny+1 ) ) ) / ( dy * dy )
	917	ENDIF
	918	ENDDO
	919
	920	DO k = 0,nz-1
	921	DO i = 0, nx
	922	a = -1.0 * ddzu(k+2) * ddzw(k+1)
	923	c = -1.0 * ddzu(k+1) * ddzw(k+1)
	924	tri(1,i,k) = a + c - l(i)
	925	ENDDO
	926	ENDDO
	927	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	928	DO i = 0,nx
	929	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	930	ENDDO
	931	ENDIF
	932	IF ( ibc_p_t == 1 ) THEN
	933	DO i = 0,nx
	934	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	935	ENDDO
	936	ENDIF
	937
	938	END SUBROUTINE maketri_hybrid
	939
	940
	941	SUBROUTINE zerleg_hybrid
	942
	943	!----------------------------------------------------------------------!
	944	! zerleg !
	945	! !
	946	! Splitting of the tridiagonal matrix (Thomas algorithm) !
	947	!----------------------------------------------------------------------!
	948
	949	USE indices
	950
	951	IMPLICIT NONE
	952
	953	INTEGER :: i, k
	954
	955	!
	956	!-- Splitting
	957	DO i = 0, nx
	958	tri(4,i,0) = tri(1,i,0)
	959	ENDDO
	960	DO k = 1, nz-1
	961	DO i = 0,nx
	962	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	963	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	964	ENDDO
	965	ENDDO
	966
	967	END SUBROUTINE zerleg_hybrid
	968
	969	SUBROUTINE substi_hybrid( ar, tri )
	970
	971	!----------------------------------------------------------------------!
	972	! substi !
	973	! !
	974	! Substitution (Forward and Backward) (Thomas algorithm) !
	975	!----------------------------------------------------------------------!
	976
	977	IMPLICIT NONE
	978
	979	INTEGER :: i, j, k
	980	REAL, DIMENSION(0:nx,nz) :: ar
	981	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	982	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	983
	984	!
	985	!-- Forward substitution
	986	DO i = 0, nx
	987	ar1(i,0) = ar(i,1)
	988	ENDDO
	989	DO k = 1, nz - 1
	990	DO i = 0,nx
	991	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	992	ENDDO
	993	ENDDO
	994
	995	!
	996	!-- Backward substitution
	997	DO i = 0,nx
	998	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	999	ENDDO
	1000	DO k = nz-2, 0, -1
	1001	DO i = 0,nx
	1002	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1003	/ tri(4,i,k)
	1004	ENDDO
	1005	ENDDO
	1006
	1007	END SUBROUTINE substi_hybrid
	1008
	1009	END SUBROUTINE tridia_hybrid
	1010
	1011
	1012	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
	1013
	1014	USE cpulog
	1015
	1016	IMPLICIT NONE
	1017
	1018	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
	1019	INTEGER, SAVE :: tag = 10
	1020	#if defined( __parallel )
	1021	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
	1022	#endif
	1023
	1024	REAL :: buf, buf1
	1025
	1026
	1027	buf = 1.0
	1028	buf1 = 1.1
	1029	IF ( me_node == 0 ) THEN ! first node only
	1030
	1031	SELECT CASE ( loca )
	1032
	1033	CASE ( 1 ) ! before alltoall
	1034
	1035	IF( me_task > 0 ) THEN ! first task does not wait
	1036	#if defined( __parallel )
	1037	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
	1038	buf1, 1, MPI_REAL, me_task-1, 0, &
	1039	comm_tasks, stat,ierr )
	1040	#endif
	1041	ELSEIF ( j > nys_p ) THEN
	1042	req = 0
	1043	tag = MOD( tag-10, 10 ) + 10
	1044	#if defined( __parallel )
	1045	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
	1046	tag, comm_tasks, req, ierr )
	1047	reqa = req
	1048	CALL MPI_WAITALL( 1, reqa, stat, ierr )
	1049	#endif
	1050	ENDIF
	1051
	1052	CASE ( 2 ) ! after alltoall
	1053
	1054	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
	1055	#if defined( __parallel )
	1056	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
	1057	buf1, 1, MPI_REAL, me_task+1, 0, &
	1058	comm_tasks, stat, ierr)
	1059	#endif
	1060	ELSEIF ( j < nyn_p ) THEN
	1061	req = 0
	1062	tag = MOD( tag-10, 10 ) + 10
	1063	#if defined( __parallel )
	1064	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
	1065	ierr )
	1066	#endif
	1067	ENDIF
	1068
	1069	END SELECT
	1070
	1071	ENDIF
	1072
	1073	END SUBROUTINE cascade
	1074
	1075	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |