Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 415

Last change on this file since 415 was 415, checked in by raasch, 15 years ago
Fortran bugfix in subroutine cascade concerning the dimension of the MPI status variable
Property svn:keywords set to `Id`
File size: 31.4 KB

Rev	Line
[1]	1	MODULE poisfft_hybrid_mod
	2	!------------------------------------------------------------------------------
	3	!
[257]	4	! Current revisions:
[1]	5	! -----------------
[412]	6	! Dimension of array stat in cascade changed to prevent type problems with
	7	! mpi2 libraries
[1]	8	!
	9	! Former revisions:
	10	! -----------------
[3]	11	! $Id: poisfft_hybrid.f90 415 2009-12-15 10:26:23Z raasch $
[392]	12	!
	13	! 274 2009-03-26 15:11:21Z heinze
	14	! Output of messages replaced by message handling routine.
	15	!
	16	! Feb. 2007
[3]	17	! RCS Log replaced by Id keyword, revision history cleaned up
	18	!
[1]	19	! Revision 1.11 2004/04/30 12:43:14 raasch
	20	! Renaming of fft routines, additional argument in calls of fft_y_m
	21	!
	22	! Revision 1.2 2002/12/19 16:08:31 raasch
	23	! Preprocessor directive KKMP introduced (OMP does NOT work),
	24	! array tri will be a shared array in OpenMP loop, to get better cache
	25	! utilization, the i index (x-direction) will be executed in stride
	26	! "istride" as outer loop and in a shorter inner loop,
	27	! overlapping of computation and communication realized by new routine
	28	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
	29	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
	30	!
	31	!
	32	! Description:
	33	! ------------
	34	! Solution of the Poisson equation with a 2D spectral method.
	35	! Hybrid version for parallel computers using a 1D domain decomposition,
	36	! realized with MPI, along x and parallelization with OPEN-MP along y
	37	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
	38	! optimization is realized by overlapping of computation and communication
	39	! and by simultaneously executing as many communication calls as switches
	40	! per logical partition (LPAR) are available. This version comes into
	41	! effect if more than one node is used and if the environment variable
	42	! tasks_per_node is set in a way that it can be divided by switch_per_lpar
	43	! without remainder.
	44	!
	45	! WARNING: In case of OpenMP, there are problems with allocating large
	46	! arrays in parallel regions.
	47	!
	48	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
	49	!------------------------------------------------------------------------------!
	50
	51	USE fft_xy
	52	USE indices
	53	USE pegrid
	54
	IMPLICIT NONE

	PRIVATE
	PUBLIC ::  poisfft_hybrid, poisfft_hybrid_ini

	!
	!-- Number of switches per logical partition (LPAR). The multi node
	!-- solver is only selected if tasks_per_node is a multiple of this value.
	INTEGER, PARAMETER ::  switch_per_lpar = 2

	!
	!-- Internal index bounds and sizes of the hybrid solver
	!-- (set in poisfft_hybrid_ini)
	INTEGER, SAVE ::  nxl_a, nxr_a      ! total x dimension
	INTEGER, SAVE ::  nxl_p, nxr_p      ! partial x dimension
	INTEGER, SAVE ::  nys_a, nyn_a      ! total y dimension
	INTEGER, SAVE ::  nys_p, nyn_p      ! partial y dimension
	INTEGER, SAVE ::  npe_s             ! total number of PEs for solver
	INTEGER, SAVE ::  nwords            ! number of points to be exchanged
	                                    ! with MPI_ALLTOALL
	INTEGER, SAVE ::  n_omp_threads     ! number of OpenMP threads

	!
	!-- Variables for multi node version (cluster version) using routine
	!-- poisfft_hybrid_nodes
	INTEGER, SAVE ::  comm_nodes        ! communicator nodes
	INTEGER, SAVE ::  comm_node_all     ! communicator all PEs node version
	INTEGER, SAVE ::  comm_tasks        ! communicator tasks
	INTEGER, SAVE ::  me, me_node, me_task   ! identity of this PE
	INTEGER, SAVE ::  nodes             ! number of nodes
	INTEGER, SAVE ::  tasks_per_logical_node = -1   ! default no cluster

	!
	!-- Public interfaces
	INTERFACE poisfft_hybrid_ini
	   MODULE PROCEDURE poisfft_hybrid_ini
	END INTERFACE poisfft_hybrid_ini

	INTERFACE poisfft_hybrid
	   MODULE PROCEDURE poisfft_hybrid
	END INTERFACE poisfft_hybrid

	!
	!-- Private interfaces
	INTERFACE poisfft_hybrid_omp
	   MODULE PROCEDURE poisfft_hybrid_omp
	END INTERFACE poisfft_hybrid_omp

	INTERFACE poisfft_hybrid_omp_vec
	   MODULE PROCEDURE poisfft_hybrid_omp_vec
	END INTERFACE poisfft_hybrid_omp_vec

	INTERFACE poisfft_hybrid_nodes
	   MODULE PROCEDURE poisfft_hybrid_nodes
	END INTERFACE poisfft_hybrid_nodes

	INTERFACE tridia_hybrid
	   MODULE PROCEDURE tridia_hybrid
	END INTERFACE tridia_hybrid

	INTERFACE cascade
	   MODULE PROCEDURE cascade
	END INTERFACE cascade
	114
	115	CONTAINS
	116
	117
	SUBROUTINE poisfft_hybrid_ini

	!
	!-- Initialization of the hybrid Poisson solver.
	!-- Sets the module-level index bounds (nxl_a ... nyn_p), the number of PEs
	!-- used by the solver (npe_s), the MPI_ALLTOALL message length (nwords) and
	!-- the number of OpenMP threads, initializes the 1d-FFT routines and, if
	!-- the conditions below are met, creates the communicators required by
	!-- the multi node version (poisfft_hybrid_nodes).

	   USE control_parameters
	   USE pegrid

	   IMPLICIT NONE

	   CHARACTER(LEN=8) ::  cdummy
	   INTEGER ::  idummy, istat
	   INTEGER, DIMENSION(2) ::  dims

	   LOGICAL, DIMENSION(2) ::  period = .false., re_dims


	!
	!-- Set the internal index values for the hybrid solver
#if defined( __parallel )
	   npe_s = pdims(1)
#else
	   npe_s = 1
#endif
	   nxl_a = 0
	   nxr_a = nx
	   nxl_p = 0
	   nxr_p = ( ( nx+1 ) / npe_s ) - 1
	   nys_a = nys
	   nyn_a = nyn
	   nys_p = 0
	   nyn_p = ( ( ny+1 ) / npe_s ) - 1

	   nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )

#if defined( __KKMP )
	   CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
	!
	!-- Parse the thread count defensively: an unreadable, blank, zero or
	!-- negative value falls back to one thread, because the solver routines
	!-- size their tri array with n_omp_threads and index it starting at 1.
	   READ ( cdummy, '(I8)', IOSTAT=istat ) n_omp_threads
	   IF ( istat /= 0  .OR.  n_omp_threads < 1 )  n_omp_threads = 1

	   IF ( n_omp_threads > 1 ) THEN
	      WRITE( message_string, * ) 'Number of OpenMP threads = ', &
	                                 n_omp_threads
	      CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
	   ENDIF
#else
	   n_omp_threads = 1
#endif
	!
	!-- Initialize the one-dimensional FFT routines
	   CALL fft_init

	!
	!-- Setup for multi node version (poisfft_hybrid_nodes)
	   IF ( n_omp_threads == 1 .AND. &
	        ( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN

	      IF ( tasks_per_node /= -9999 ) THEN
	!
	!--      Multi node version requires that the available number of
	!--      switches per logical partition must be an integral divisor
	!--      of the chosen number of tasks per node
	         IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
	!
	!--         Set the switch which decides about usage of the multi node
	!--         version
	            IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
	                 numprocs > tasks_per_node ) THEN
	               tasks_per_logical_node = tasks_per_node / switch_per_lpar
	            ENDIF

	            IF ( tasks_per_logical_node > -1 ) THEN

	               WRITE( message_string, * ) 'running optimized ', &
	                                          'multinode version', &
	                                          '&switch_per_lpar = ', &
	                                          switch_per_lpar, &
	                                          '&tasks_per_lpar = ', &
	                                          tasks_per_node, &
	                                          'tasks_per_logical_node = ', &
	                                          tasks_per_logical_node
	               CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )

	            ENDIF

	         ENDIF
	      ENDIF
	   ENDIF

	!
	!-- Determine sub-topologies for multi node version
	   IF ( tasks_per_logical_node >= 2 ) THEN

#if defined( __parallel )
	!
	!--   Number of logical nodes, rounded up
	      nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
	              tasks_per_logical_node
	      dims(1) = nodes
	      dims(2) = tasks_per_logical_node

	!
	!--   Two-dimensional (nodes x tasks) cartesian communicator
	      CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
	                            comm_node_all, istat )
	      CALL MPI_COMM_RANK( comm_node_all, me, istat )

	!
	!--   Sub-communicator along the first (node) dimension
	      re_dims(1) = .TRUE.
	      re_dims(2) = .FALSE.
	      CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
	      CALL MPI_COMM_RANK( comm_nodes, me_node, istat )

	!
	!--   Sub-communicator along the second (task) dimension
	      re_dims(1) = .FALSE.
	      re_dims(2) = .TRUE.
	      CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
	      CALL MPI_COMM_RANK( comm_tasks, me_task, istat )

	!      write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
	!                 tasks_per_logical_node
#else
	      message_string = 'parallel environment (MPI) required'
	      CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
#endif
	   ENDIF

	END SUBROUTINE poisfft_hybrid_ini
	235
	236
	SUBROUTINE poisfft_hybrid( ar )

	!
	!-- Public entry point of the hybrid solver. Selects one of the three
	!-- implementations depending on the host identifier and on whether the
	!-- multi node setup has been activated (tasks_per_logical_node /= -1).

	   USE control_parameters
	   USE interfaces

	   IMPLICIT NONE

	   REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

	   IF ( host(1:3) == 'nec' ) THEN
	!
	!--   Hosts whose name starts with 'nec' use the "_vec" variant
	      CALL poisfft_hybrid_omp_vec( ar )

	   ELSEIF ( tasks_per_logical_node == -1 ) THEN
	!
	!--   Default: no multi node setup has been established
	      CALL poisfft_hybrid_omp( ar )

	   ELSE
	!
	!--   Multi node version
	      CALL poisfft_hybrid_nodes( ar )

	   ENDIF

	END SUBROUTINE poisfft_hybrid
	257
	258
	SUBROUTINE poisfft_hybrid_omp ( ar )

	!
	!-- Hybrid solver, standard version. Sequence of operations on ar:
	!--   forward fft_y  ->  MPI_ALLTOALL (comm2d)  ->  forward fft_x
	!--   ->  tridia_hybrid  ->  backward fft_x  ->  MPI_ALLTOALL (comm2d)
	!--   ->  backward fft_y
	!-- The result is stored back to ar. Without __parallel the transpositions
	!-- reduce to plain array copies.

	   USE cpulog
	   USE interfaces

	   IMPLICIT NONE

	   INTEGER, PARAMETER ::  istride = 4   ! block length of the i loop
	   INTEGER ::  i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread

	   REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

	   REAL, DIMENSION(0:nx) ::  fftx_ar
	   REAL, DIMENSION(0:ny,istride) ::  ffty_ar

	   REAL, DIMENSION(0:nx,nz) ::  tri_ar

	   REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) ::  work1, work2
#if defined( __KKMP )
	   INTEGER ::  omp_get_thread_num
	   REAL, DIMENSION(:,:,:,:), ALLOCATABLE ::  tri
	   ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
#else
	   REAL, DIMENSION(5,0:nx,0:nz-1,1) ::  tri
#endif


	   CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )

	   CALL cpu_log( log_point_s(7), 'fft_y', 'start' )

	!
	!-- Forward FFT along y. The i range is processed in blocks of at most
	!-- istride columns; each transformed column is then distributed to work1,
	!-- where the last index n counts the y-sections of length nyn_p-nys_p+1.
	   !$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	   !$OMP DO
	   DO  iouter = nxl_p, nxr_p, istride
	      iei = MIN( iouter+istride-1, nxr_p )
	      DO  k = 1, nz

	         DO  i = iouter, iei
	            ii = nxl + i
	            ir = i - iouter + 1

	            ffty_ar(nys_a:nyn_a,ir) = ar(k,nys_a:nyn_a,ii)

	            CALL fft_y( ffty_ar(:,ir), 'forward' )
	         ENDDO

	         m = nys_a
	         DO  n = 1, npe_s
	            DO  j = nys_p, nyn_p
	               DO  i = iouter, iei
	                  ir = i - iouter + 1
	                  work1(i,k,j,n) = ffty_ar(m,ir)
	               ENDDO
	               m = m+1
	            ENDDO
	         ENDDO

	      ENDDO
	   ENDDO
	   !$OMP END PARALLEL

	   CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

	!
	!-- First transposition: work1 -> work2
#if defined( __parallel )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )

	   CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      comm2d, istat )

	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
	   work2 = work1
#endif

	   CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )

	!
	!-- For every local j: forward FFT along x, solution of the tridiagonal
	!-- system, backward FFT along x. With __KKMP each thread uses its own
	!-- slice tri(:,:,:,jthread).
#if defined( __KKMP )
	   !$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
	   !$OMP DO
	   DO  j = nys_p, nyn_p
	      jthread = omp_get_thread_num() + 1
#else
	   DO  j = nys_p, nyn_p
	      jthread = 1
#endif
	      DO  k = 1, nz

	!
	!--      Collect the complete x-line from the npe_s sections of work2
	         m = nxl_a
	         DO  n = 1, npe_s
	            DO  i = nxl_p, nxr_p
	               fftx_ar(m) = work2(i,k,j,n)
	               m = m+1
	            ENDDO
	         ENDDO

	         CALL fft_x( fftx_ar, 'forward' )

	         tri_ar(nxl_a:nxr_a,k) = fftx_ar(nxl_a:nxr_a)

	      ENDDO

	!
	!--   Index of this j within the total y range (offset by the PE id)
	      jj = myid * (nyn_p-nys_p+1) + j
	      CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))

	      DO  k = 1, nz

	         fftx_ar(nxl_a:nxr_a) = tri_ar(nxl_a:nxr_a,k)

	         CALL fft_x( fftx_ar, 'backward' )

	!
	!--      Distribute the x-line back to the sections of work2
	         m = nxl_a
	         DO  n = 1, npe_s
	            DO  i = nxl_p, nxr_p
	               work2(i,k,j,n) = fftx_ar(m)
	               m = m+1
	            ENDDO
	         ENDDO

	      ENDDO
	   ENDDO
#if defined( __KKMP )
	   !$OMP END PARALLEL
#endif

	   CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )

	!
	!-- Second transposition: work2 -> work1
#if defined( __parallel )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)

	   CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      comm2d, istat )

	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
	   work1 = work2
#endif

	   CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )

	!
	!-- Backward FFT along y and storage of the result on ar
	   !$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	   !$OMP DO
	   DO  iouter = nxl_p, nxr_p, istride
	      iei = MIN( iouter+istride-1, nxr_p )
	      DO  k = 1, nz

	         m = nys_a
	         DO  n = 1, npe_s
	            DO  j = nys_p, nyn_p
	               DO  i = iouter, iei
	                  ir = i - iouter + 1
	                  ffty_ar(m,ir) = work1 (i,k,j,n)
	               ENDDO
	               m = m+1
	            ENDDO
	         ENDDO

	         DO  i = iouter, iei
	            ii = nxl + i
	            ir = i - iouter + 1
	            CALL fft_y( ffty_ar(:,ir), 'backward' )

	            ar(k,nys_a:nyn_a,ii) = ffty_ar(nys_a:nyn_a,ir)
	         ENDDO

	      ENDDO
	   ENDDO
	   !$OMP END PARALLEL

	   CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

	   CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )

#if defined( __KKMP )
	   DEALLOCATE( tri )
#endif

	END SUBROUTINE poisfft_hybrid_omp
	449
	450
	SUBROUTINE poisfft_hybrid_omp_vec ( ar )

	!
	!-- Variant of poisfft_hybrid_omp that uses the multiple-line FFT routines
	!-- fft_y_m / fft_x_m, which transform a whole (line,k) slab per call.
	!-- The sequence of operations is the same: forward fft in y,
	!-- transposition, forward fft in x, tridia_hybrid, backward fft in x,
	!-- transposition, backward fft in y.

	   USE cpulog
	   USE interfaces

	   IMPLICIT NONE

	   INTEGER, PARAMETER ::  istride = 4   ! stride of i loop
	   INTEGER ::  i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread

	   REAL, DIMENSION(0:nx,nz) ::  tri_ar

	   REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

	   REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) ::  ffty_ar3
	   REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) ::  fftx_ar3

	   REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) ::  work1, work2
#if defined( __KKMP )
	   INTEGER ::  omp_get_thread_num
	   REAL, DIMENSION(:,:,:,:), ALLOCATABLE ::  tri
	   ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
#else
	   REAL, DIMENSION(5,0:nx,0:nz-1,1) ::  tri
#endif


	   CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )

	   CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )

	!
	!-- Copy ar to ffty_ar3 with (k,j) exchanged and transform all k-levels
	!-- of one i at once
	   !$OMP PARALLEL PRIVATE (i,j,k,m,n)
	   !$OMP DO
	   DO  i = nxl_p, nxr_p

	      DO  j = nys_a, nyn_a
	         DO  k = 1, nz
	            ffty_ar3(j,k,i) = ar(k,j,i+nxl)
	         ENDDO
	      ENDDO

	      CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
	   ENDDO

	!
	!-- Distribute the transformed data to the sections of work1
	   !$OMP DO
	   DO  k = 1, nz
	      m = nys_a
	      DO  n = 1, npe_s
	         DO  j = nys_p, nyn_p
	            DO  i = nxl_p, nxr_p
	               work1(i,k,j,n) = ffty_ar3(m,k,i)
	            ENDDO
	            m = m+1
	         ENDDO
	      ENDDO
	   ENDDO
	   !$OMP END PARALLEL

	   CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )

	!
	!-- First transposition: work1 -> work2
#if defined( __parallel )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      comm2d, istat )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
	   work2 = work1
#endif

	   CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )

	!
	!-- For every local j: forward FFT along x for all k, solution of the
	!-- tridiagonal system, backward FFT along x
#if defined( __KKMP )
	   !$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
	   !$OMP DO
	   DO  j = nys_p, nyn_p
	      jthread = omp_get_thread_num() + 1
#else
	   DO  j = nys_p, nyn_p
	      jthread = 1
#endif
	      DO  k = 1, nz

	         m = nxl_a
	         DO  n = 1, npe_s
	            DO  i = nxl_p, nxr_p
	               fftx_ar3(m,k,j) = work2(i,k,j,n)
	               m = m+1
	            ENDDO
	         ENDDO
	      ENDDO

	      CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )

	      tri_ar(nxl_a:nxr_a,1:nz) = fftx_ar3(nxl_a:nxr_a,1:nz,j)

	!
	!--   Index of this j within the total y range (offset by the PE id)
	      jj = myid * (nyn_p-nys_p+1) + j
	      CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))

	      fftx_ar3(nxl_a:nxr_a,1:nz,j) = tri_ar(nxl_a:nxr_a,1:nz)

	      CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )

	      DO  k = 1, nz
	         m = nxl_a
	         DO  n = 1, npe_s
	            DO  i = nxl_p, nxr_p
	               work2(i,k,j,n) = fftx_ar3(m,k,j)
	               m = m+1
	            ENDDO
	         ENDDO
	      ENDDO

	   ENDDO
#if defined( __KKMP )
	   !$OMP END PARALLEL
#endif

	   CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )

	!
	!-- Second transposition: work2 -> work1
#if defined( __parallel )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	   CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      comm2d, istat )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
	   work1 = work2
#endif

	   CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )

	!
	!-- Collect the y-lines from work1, transform backward and store on ar
	   !$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
	   !$OMP DO
	   DO  k = 1, nz
	      m = nys_a
	      DO  n = 1, npe_s
	         DO  j = nys_p, nyn_p
	            DO  i = nxl_p, nxr_p
	               ffty_ar3(m,k,i) = work1(i,k,j,n)
	            ENDDO
	            m = m+1
	         ENDDO
	      ENDDO
	   ENDDO

	   !$OMP DO
	   DO  i = nxl_p, nxr_p
	      CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
	      DO  j = nys_a, nyn_a
	         DO  k = 1, nz
	            ar(k,j,i+nxl) = ffty_ar3(j,k,i)
	         ENDDO
	      ENDDO
	   ENDDO
	   !$OMP END PARALLEL

	   CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )

	   CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )

#if defined( __KKMP )
	   DEALLOCATE( tri )
#endif

	END SUBROUTINE poisfft_hybrid_omp_vec
	629
	630
	SUBROUTINE poisfft_hybrid_nodes ( ar )

	!
	!-- Multi node version of the hybrid solver. The transposition is split
	!-- into an MPI_ALLTOALL over comm_tasks followed by one over comm_nodes.
	!-- The exchange over comm_nodes is done separately for every j and is
	!-- bracketed by calls of cascade; the x-transforms and the tridiagonal
	!-- solver for that j follow directly afterwards.

	   USE cpulog
	   USE interfaces

	   IMPLICIT NONE

	   INTEGER, PARAMETER ::  istride = 4   ! block length of the i loop
	   INTEGER ::  i, iei, ii, iouter, ir, istat, j, jj, k, m, &
	               n, nn, nt, nw1, nw2

	   REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

	   REAL, DIMENSION(0:nx) ::  fftx_ar
	   REAL, DIMENSION(0:ny,istride) ::  ffty_ar

	   REAL, DIMENSION(0:nx,nz) ::  tri_ar

	   REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
	                   nodes,nys_p:nyn_p) ::  work1,work2
	   REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri


	   CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )

	   CALL cpu_log( log_point_s(7), 'fft_y', 'start' )

	!
	!-- Forward FFT along y, processed in blocks of at most istride columns.
	!-- The transformed data are distributed to work1, whose 3rd and 4th
	!-- index count the tasks within a node and the nodes, respectively.
	   DO  iouter = nxl_p, nxr_p, istride
	      iei = MIN( iouter+istride-1, nxr_p )
	      DO  k = 1, nz

	         DO  i = iouter, iei
	            ii = nxl + i
	            ir = i - iouter + 1

	            ffty_ar(nys_a:nyn_a,ir) = ar(k,nys_a:nyn_a,ii)

	            CALL fft_y( ffty_ar(:,ir), 'forward' )
	         ENDDO

	         m = nys_a
	         DO  nn = 1, nodes
	            DO  nt = 1, tasks_per_logical_node
	               DO  j = nys_p, nyn_p
	                  DO  i = iouter, iei
	                     ir = i - iouter + 1
	                     work1(i,k,nt,nn,j) = ffty_ar(m,ir)
	                  ENDDO
	                  m = m+1
	               ENDDO
	            ENDDO
	         ENDDO

	      ENDDO
	   ENDDO

	   CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

	!
	!-- Message lengths: nw1 = one (i,k) slab, nw2 = nw1 times the number of
	!-- tasks per logical node
	   nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
	   nw2 = nw1 * SIZE( work1, 3 )

	!
	!-- Exchange among the tasks (comm_tasks): work1 -> work2
	   CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	   DO  nn = 1, nodes
	      DO  j = nys_p, nyn_p
#if defined( __parallel )
	         CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	                            work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	                            comm_tasks, istat )
#endif
	      ENDDO
	   ENDDO
	   CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )


	   DO  j = nys_p, nyn_p

	!
	!--   Exchange among the nodes (comm_nodes): work2 -> work1
	      CALL cascade( 1, j, nys_p, nyn_p )
	      CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
#if defined( __parallel )
	      CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	                         work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	                         comm_nodes, istat )
#endif
	      CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
	      CALL cascade( 2, j, nys_p, nyn_p )

	      CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	      DO  k = 1, nz

	!
	!--      Collect the complete x-line from the sections of work1
	         m = nxl_a
	         DO  nn = 1, nodes
	            DO  nt = 1, tasks_per_logical_node
	               DO  i = nxl_p, nxr_p
	                  fftx_ar(m) = work1(i,k,nt,nn,j)
	                  m = m+1
	               ENDDO
	            ENDDO
	         ENDDO

	         CALL fft_x( fftx_ar, 'forward' )

	         tri_ar(nxl_a:nxr_a,k) = fftx_ar(nxl_a:nxr_a)

	      ENDDO

	!
	!--   Index of this j within the total y range (offset by the PE id)
	      jj = myid * (nyn_p-nys_p+1) + j
	      CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )

	      DO  k = 1, nz

	         fftx_ar(nxl_a:nxr_a) = tri_ar(nxl_a:nxr_a,k)

	         CALL fft_x( fftx_ar, 'backward' )

	!
	!--      Distribute the x-line back to the sections of work1
	         m = nxl_a
	         DO  nn = 1, nodes
	            DO  nt = 1, tasks_per_logical_node
	               DO  i = nxl_p, nxr_p
	                  work1(i,k,nt,nn,j) = fftx_ar(m)
	                  m = m+1
	               ENDDO
	            ENDDO
	         ENDDO
	      ENDDO

	      CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )

	!
	!--   Exchange among the nodes (comm_nodes): work1 -> work2
	      CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
#if defined( __parallel )
	      CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	                         work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	                         comm_nodes, istat )
#endif
	      CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )

	   ENDDO

	!
	!-- Exchange among the tasks (comm_tasks): work2 -> work1
	   CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	   DO  nn = 1, nodes
	      DO  j = nys_p, nyn_p
#if defined( __parallel )
	         CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	                            work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	                            comm_tasks, istat )
#endif
	      ENDDO
	   ENDDO
	   CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )

	   CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )

	!
	!-- Backward FFT along y and storage of the result on ar
	   DO  iouter = nxl_p, nxr_p, istride
	      iei = MIN( iouter+istride-1, nxr_p )
	      DO  k = 1, nz

	         m = nys_a
	         DO  nn = 1, nodes
	            DO  nt = 1, tasks_per_logical_node
	               DO  j = nys_p, nyn_p
	                  DO  i = iouter, iei
	                     ir = i - iouter + 1
	                     ffty_ar(m,ir) = work1(i,k,nt,nn,j)
	                  ENDDO
	                  m = m+1
	               ENDDO
	            ENDDO
	         ENDDO

	         DO  i = iouter, iei
	            ii = nxl + i
	            ir = i - iouter + 1
	            CALL fft_y( ffty_ar(:,ir), 'backward' )

	            ar(k,nys_a:nyn_a,ii) = ffty_ar(nys_a:nyn_a,ir)
	         ENDDO

	      ENDDO
	   ENDDO

	   CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

	   CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )

	END SUBROUTINE poisfft_hybrid_nodes
	824
	825
	826
	SUBROUTINE tridia_hybrid( j, ar, tri )

	!
	!-- Solves, for one index j and all i = 0..nx, the tridiagonal systems in
	!-- the vertical direction.
	!--
	!-- Arguments:
	!--   j    index along y; values above (ny+1)/2 are mapped to ny+1-j
	!--        before the matrix is built
	!--   ar   right hand side on entry, solution on exit, shape (0:nx,nz)
	!--   tri  work array holding the matrix coefficients (components 1-3)
	!--        and the factorization (components 4-5); fully overwritten here
	!--
	!-- Steps: set the constant off-diagonal elements, add the i- and
	!-- j-dependent diagonal (maketri_hybrid), split the matrix
	!-- (zerleg_hybrid) and substitute forward/backward (substi_hybrid).

	   USE arrays_3d
	   USE control_parameters
	   USE grid_variables

	   IMPLICIT NONE

	   INTEGER, INTENT(IN) ::  j
	   REAL, DIMENSION(0:nx,nz), INTENT(INOUT) ::  ar
	   REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) ::  tri

	   INTEGER ::  i, k, nnyh

	   nnyh = (ny+1) / 2

	   tri = 0.0
	!
	!-- Define constant elements of the tridiagonal matrix.
	   DO  k = 0, nz-1
	      DO  i = 0, nx
	         tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
	         tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
	      ENDDO
	   ENDDO

	   IF ( j <= nnyh ) THEN
	      CALL maketri_hybrid( j )
	   ELSE
	      CALL maketri_hybrid( ny+1-j )
	   ENDIF
	   CALL zerleg_hybrid
	   CALL substi_hybrid( ar, tri )

	CONTAINS

	   SUBROUTINE maketri_hybrid( j )

	   !----------------------------------------------------------------------!
	   !                         maketri                                      !
	   !                                                                      !
	   !       computes the i- and j-dependent component of the matrix        !
	   !----------------------------------------------------------------------!

	      USE constants

	      IMPLICIT NONE

	      INTEGER, INTENT(IN) ::  j

	      INTEGER ::  i, k, nnxh
	      REAL    ::  a, c, l_y

	      REAL, DIMENSION(0:nx) ::  l


	      nnxh = (nx+1) / 2
	!
	!--   Provide the tridiagonal matrix for solution of the Poisson equation
	!--   in Fourier space. The coefficients are computed following the method
	!--   of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
	!--   Siano's original version.
	!
	!--   The y-contribution does not depend on i and is computed only once
	      l_y = 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	                               REAL( ny+1 ) ) ) / ( dy * dy )

	      DO  i = 0, nx
	         IF ( i < nnxh ) THEN
	            l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	                                      REAL( nx+1 ) ) ) / ( dx * dx ) + l_y
	         ELSE
	!
	!--         Upper half of the index range (including i = nnxh) uses the
	!--         mirrored index nx+1-i
	            l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	                                      REAL( nx+1 ) ) ) / ( dx * dx ) + l_y
	         ENDIF
	      ENDDO

	      DO  k = 0, nz-1
	!
	!--      a and c depend on k only
	         a = -1.0 * ddzu(k+2) * ddzw(k+1)
	         c = -1.0 * ddzu(k+1) * ddzw(k+1)
	         DO  i = 0, nx
	            tri(1,i,k) = a + c - l(i)
	         ENDDO
	      ENDDO
	!
	!--   Modification of the diagonal at the bottom and top boundary,
	!--   depending on the boundary condition switches ibc_p_b / ibc_p_t
	      IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 ) THEN
	         DO  i = 0, nx
	            tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	         ENDDO
	      ENDIF
	      IF ( ibc_p_t == 1 ) THEN
	         DO  i = 0, nx
	            tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	         ENDDO
	      ENDIF

	   END SUBROUTINE maketri_hybrid


	   SUBROUTINE zerleg_hybrid

	   !----------------------------------------------------------------------!
	   !                          zerleg                                      !
	   !                                                                      !
	   !       Splitting of the tridiagonal matrix (Thomas algorithm)         !
	   !----------------------------------------------------------------------!

	      USE indices

	      IMPLICIT NONE

	      INTEGER ::  i, k

	!
	!--   Splitting: component 4 holds the modified diagonal, component 5 the
	!--   elimination factors
	      DO  i = 0, nx
	         tri(4,i,0) = tri(1,i,0)
	      ENDDO
	      DO  k = 1, nz-1
	         DO  i = 0, nx
	            tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	            tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	         ENDDO
	      ENDDO

	   END SUBROUTINE zerleg_hybrid

	   SUBROUTINE substi_hybrid( ar, tri )

	   !----------------------------------------------------------------------!
	   !                          substi                                      !
	   !                                                                      !
	   !       Substitution (Forward and Backward) (Thomas algorithm)         !
	   !----------------------------------------------------------------------!

	      IMPLICIT NONE

	      REAL, DIMENSION(0:nx,nz), INTENT(INOUT) ::  ar
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(IN) ::  tri

	      INTEGER ::  i, k
	      REAL, DIMENSION(0:nx,0:nz-1) ::  ar1

	!
	!--   Forward substitution (ar1 is indexed from 0, ar from 1)
	      DO  i = 0, nx
	         ar1(i,0) = ar(i,1)
	      ENDDO
	      DO  k = 1, nz - 1
	         DO  i = 0, nx
	            ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	         ENDDO
	      ENDDO

	!
	!--   Backward substitution
	      DO  i = 0, nx
	         ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	      ENDDO
	      DO  k = nz-2, 0, -1
	         DO  i = 0, nx
	            ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	                        / tri(4,i,k)
	         ENDDO
	      ENDDO

	   END SUBROUTINE substi_hybrid

	END SUBROUTINE tridia_hybrid
	995
	996
	SUBROUTINE cascade( loca, j, nys_p, nyn_p )

	!
	!-- Serializes the tasks of the first node (me_node == 0) around the
	!-- MPI_ALLTOALL over comm_nodes in poisfft_hybrid_nodes by passing a
	!-- one-word token from task to task within comm_tasks.
	!--
	!-- Arguments:
	!--   loca          1: called before the alltoall, 2: called after it
	!--   j             current index of the j loop of the caller
	!--   nys_p, nyn_p  first and last index of that j loop
	!--
	!-- loca = 1: every task except the first waits for its predecessor; the
	!--           first task waits (for j > nys_p) for the message sent by
	!--           the last task in the previous j step.
	!-- loca = 2: every task except the last releases its successor; the last
	!--           task (for j < nyn_p) notifies the first task.
	!--
	!-- NOTE(review): ierr is not declared in this routine; it is presumably
	!-- host-associated from a module used by poisfft_hybrid_mod - confirm.
	!-- The local variable ier is kept from the original but is not used.

	   USE cpulog

	   IMPLICIT NONE

	   INTEGER ::  ier, j, loca, nyn_p, nys_p, req, reqa(1)
	   INTEGER, SAVE ::  tag = 10
#if defined( __parallel )
	   INTEGER, DIMENSION(MPI_STATUS_SIZE)   ::  stat
	   INTEGER, DIMENSION(MPI_STATUS_SIZE,1) ::  stata
	!
	!-- Send buffer and request of the non-blocking notification. Both are
	!-- saved because the send may still be pending when this routine
	!-- returns; the buffer must stay valid and the request must be completed
	!-- later.
	   INTEGER, SAVE ::  cascade_req = MPI_REQUEST_NULL
	   REAL, SAVE    ::  cascade_buf = 1.0
#endif

	   REAL ::  buf, buf1


	   buf  = 1.0
	   buf1 = 1.1
	   IF ( me_node == 0 ) THEN   ! first node only

	      SELECT CASE ( loca )

	         CASE ( 1 )   ! before alltoall

	            IF ( me_task > 0 ) THEN   ! first task does not wait
#if defined( __parallel )
	               CALL MPI_SENDRECV( buf,  1, MPI_REAL, me_task-1, 0, &
	                                  buf1, 1, MPI_REAL, me_task-1, 0, &
	                                  comm_tasks, stat, ierr )
#endif
	            ELSEIF ( j > nys_p ) THEN
	               req = 0
	!
	!--            NOTE(review): with tag starting at 10 this expression
	!--            always yields 10 again
	               tag = MOD( tag-10, 10 ) + 10
#if defined( __parallel )
	               CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
	                               tag, comm_tasks, req, ierr )
	               reqa = req
	               CALL MPI_WAITALL( 1, reqa, stata, ierr )
#endif
	            ENDIF

	         CASE ( 2 )   ! after alltoall

	            IF ( me_task < tasks_per_logical_node-1 ) THEN   ! all but last task
#if defined( __parallel )
	               CALL MPI_SENDRECV( buf,  1, MPI_REAL, me_task+1, 0, &
	                                  buf1, 1, MPI_REAL, me_task+1, 0, &
	                                  comm_tasks, stat, ierr)
#endif
	            ELSEIF ( j < nyn_p ) THEN
	               tag = MOD( tag-10, 10 ) + 10
#if defined( __parallel )
	!
	!--            Complete the notification of the previous j step before the
	!--            request handle and the buffer are reused (returns
	!--            immediately if cascade_req is MPI_REQUEST_NULL). The
	!--            matching receive is posted by the first task in its
	!--            loca = 1 call of the following j step.
	               CALL MPI_WAIT( cascade_req, stat, ierr )
	               CALL MPI_ISEND( cascade_buf, 1, MPI_REAL, 0, tag, &
	                               comm_tasks, cascade_req, ierr )
#endif
	            ENDIF

	      END SELECT

	   ENDIF

	END SUBROUTINE cascade
	1060
	1061	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |