Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 807

Last change on this file since 807 was 807, checked in by maronga, 12 years ago
new utility check_namelist_files implemented
Property svn:keywords set to `Id`
File size: 32.0 KB

Rev	Line
[1]	1	MODULE poisfft_hybrid_mod
	2	!------------------------------------------------------------------------------
	3	!
[257]	4	! Current revisions:
[1]	5	! -----------------
[807]	6	! New cpp directive "__check" implemented which is used by check_namelist_files
	7	! (most of the code is unneeded by check_namelist_files).
[1]	8	!
	9	! Former revisions:
	10	! -----------------
[3]	11	! $Id: poisfft_hybrid.f90 807 2012-01-25 11:53:51Z maronga $
[392]	12	!
[668]	13	! 667 2010-12-23 12:06:00Z suehring/gryschka
	14	! ddzu replaced by ddzu_pres due to changes in zu(0)
	15	!
[482]	16	! 415 2009-12-15 10:26:23Z raasch
[807]	17	! Dimension of array stat in cascade change to prevent type problems with___
[482]	18	! mpi2 libraries
	19	!
[392]	20	! 274 2009-03-26 15:11:21Z heinze
	21	! Output of messages replaced by message handling routine.
	22	!
	23	! Feb. 2007
[3]	24	! RCS Log replace by Id keyword, revision history cleaned up
	25	!
[1]	26	! Revision 1.11 2004/04/30 12:43:14 raasch
	27	! Renaming of fft routines, additional argument in calls of fft_y_m
	28	!
	29	! Revision 1.2 2002/12/19 16:08:31 raasch
	30	! Preprocessor directive KKMP introduced (OMP does NOT work),
	31	! array tri will be a shared array in OpenMP loop, to get better cache
	32	! utilization, the i index (x-direction) will be executed in stride
	33	! "istride" as outer loop and in a shorter inner loop,
	34	! overlapping of computation and communication realized by new routine
	35	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
	36	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
	37	!
	38	!
	39	! Description:
	40	! ------------
	41	! Solution of the Poisson equation with a 2D spectral method.
	42	! Hybrid version for parallel computers using a 1D domain decomposition,
	43	! realized with MPI, along x and parallelization with OPEN-MP along y
	44	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
	45	! optimization is realized by overlapping of computation and communication
	46	! and by simultaneously executing as many communication calls as switches
	47	! per logical partition (LPAR) are available. This version comes into
	48	! effect if more than one node is used and if the environment variable
	49	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
	50	! without any rest.
	51	!
	52	! WARNING: In case of OpenMP, there are problems with allocating large
	53	! arrays in parallel regions.
	54	!
	55	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
	56	!------------------------------------------------------------------------------!
	57
	58	USE fft_xy
	59	USE indices
	60	USE pegrid
	61
	62	IMPLICIT NONE
	63
	64	INTEGER, PARAMETER :: switch_per_lpar = 2
	65
	66	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
	67	nxl_p, nxr_p, & ! partial x dimension
	68	nys_a, nyn_a, & ! total y dimension
	69	nys_p, nyn_p, & ! partial y dimension
	70
	71	npe_s, & ! total number of PEs for solver
	72	nwords, & ! number of points to be exchanged
	73	! with MPI_ALLTOALL
	74	n_omp_threads ! number of OpenMP threads
	75
	76	!
	77	!-- Variables for multi node version (cluster version) using routine
	78	!-- poisfft_hybrid_nodes
	79	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
	80	comm_node_all, & ! communicater all PEs node version
	81	comm_tasks, & ! communicater tasks
	82	me, me_node, me_task,& ! identity of this PE
	83	nodes, & ! number of nodes
	84	tasks_per_logical_node = -1 ! default no cluster
	85
[807]	86
	87	PRIVATE
	88
	89
	90	#if .NOT. defined ( __check )
	91	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
	92
	93
[1]	94	!
	95	!-- Public interfaces
	96	INTERFACE poisfft_hybrid_ini
	97	MODULE PROCEDURE poisfft_hybrid_ini
	98	END INTERFACE poisfft_hybrid_ini
	99
	100	INTERFACE poisfft_hybrid
	101	MODULE PROCEDURE poisfft_hybrid
	102	END INTERFACE poisfft_hybrid
	103
	104	!
	105	!-- Private interfaces
	106	INTERFACE poisfft_hybrid_omp
	107	MODULE PROCEDURE poisfft_hybrid_omp
	108	END INTERFACE poisfft_hybrid_omp
	109
	110	INTERFACE poisfft_hybrid_omp_vec
	111	MODULE PROCEDURE poisfft_hybrid_omp_vec
	112	END INTERFACE poisfft_hybrid_omp_vec
	113
	114	INTERFACE poisfft_hybrid_nodes
	115	MODULE PROCEDURE poisfft_hybrid_nodes
	116	END INTERFACE poisfft_hybrid_nodes
	117
	118	INTERFACE tridia_hybrid
	119	MODULE PROCEDURE tridia_hybrid
	120	END INTERFACE tridia_hybrid
	121
	122	INTERFACE cascade
	123	MODULE PROCEDURE cascade
	124	END INTERFACE cascade
[807]	125	#else
	126	PUBLIC poisfft_hybrid_ini
[1]	127
[807]	128	!
	129	!-- Public interfaces
	130	INTERFACE poisfft_hybrid_ini
	131	MODULE PROCEDURE poisfft_hybrid_ini
	132	END INTERFACE poisfft_hybrid_ini
	133	#endif
	134
[1]	135	CONTAINS
	136
[807]	137
[1]	138	SUBROUTINE poisfft_hybrid_ini
	139
	140	USE control_parameters
	141	USE pegrid
	142
	143	IMPLICIT NONE
	144
	145	CHARACTER(LEN=8) :: cdummy
	146	INTEGER :: idummy, istat
	147	INTEGER, DIMENSION(2) :: coords, dims
	148
	149	LOGICAL, DIMENSION(2) :: period = .false., re_dims
	150
	151
	152	!
	153	!-- Set the internal index values for the hybrid solver
	154	#if defined( __parallel )
	155	npe_s = pdims(1)
	156	#else
	157	npe_s = 1
	158	#endif
	159	nxl_a = 0
	160	nxr_a = nx
	161	nxl_p = 0
	162	nxr_p = ( ( nx+1 ) / npe_s ) - 1
	163	nys_a = nys
	164	nyn_a = nyn
	165	nys_p = 0
	166	nyn_p = ( ( ny+1 ) / npe_s ) - 1
	167
	168	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
	169
[807]	170	#if defined( __KKMP ) .AND. .NOT. defined ( __check )
[1]	171	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
	172	READ ( cdummy, '(I8)' ) n_omp_threads
[257]	173	IF ( n_omp_threads > 1 ) THEN
	174	WRITE( message_string, * ) 'Number of OpenMP threads = ', &
	175	n_omp_threads
	176	CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
[1]	177	ENDIF
	178	#else
	179	n_omp_threads = 1
	180	#endif
	181	!
	182	!-- Initialize the one-dimensional FFT routines
	183	CALL fft_init
	184
	185	!
	186	!-- Setup for multi node version (poisfft_hybrid_nodes)
	187	IF ( n_omp_threads == 1 .AND. &
	188	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
	189
	190	IF ( tasks_per_node /= -9999 ) THEN
	191	!
	192	!-- Multi node version requires that the available number of
	193	!-- switches per logical partition must be an integral divisor
	194	!-- of the chosen number of tasks per node
	195	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
	196	!
	197	!-- Set the switch which decides about usage of the multi node
	198	!-- version
	199	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
	200	numprocs > tasks_per_node ) THEN
	201	tasks_per_logical_node = tasks_per_node / switch_per_lpar
	202	ENDIF
	203
[257]	204	IF ( tasks_per_logical_node > -1 ) THEN
	205
	206	WRITE( message_string, * ) 'running optimized ', &
	207	'multinode version', &
	208	'&switch_per_lpar = ', &
	209	switch_per_lpar, &
	210	'&tasks_per_lpar = ', &
	211	tasks_per_node, &
	212	'tasks_per_logical_node = ', &
	213	tasks_per_logical_node
[274]	214	CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )
[257]	215
[1]	216	ENDIF
	217
	218	ENDIF
	219	ENDIF
	220	ENDIF
	221
	222	!
	223	!-- Determine sub-topologies for multi node version
	224	IF ( tasks_per_logical_node >= 2 ) THEN
	225
[807]	226	#if defined( __parallel ) .AND. .NOT. defined ( __check )
[1]	227	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
	228	tasks_per_logical_node
	229	dims(1) = nodes
	230	dims(2) = tasks_per_logical_node
	231
	232	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
	233	comm_node_all, istat )
	234	CALL MPI_COMM_RANK( comm_node_all, me, istat )
	235
	236	re_dims(1) = .TRUE.
	237	re_dims(2) = .FALSE.
	238	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
	239	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
	240
	241	re_dims(1) = .FALSE.
	242	re_dims(2) = .TRUE.
	243	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
	244	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
	245
	246	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
	247	! tasks_per_logical_node
[807]	248	#elif .NOT. defined( __parallel )
[257]	249	message_string = 'parallel environment (MPI) required'
	250	CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
[1]	251	#endif
	252	ENDIF
	253
	254	END SUBROUTINE poisfft_hybrid_ini
	255
[807]	256	#if .NOT. defined ( __check )
[1]	257	SUBROUTINE poisfft_hybrid( ar )
	258
	259	USE control_parameters
	260	USE interfaces
	261
	262	IMPLICIT NONE
	263
	264	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	265
	266	IF ( host(1:3) == 'nec' ) THEN
	267	CALL poisfft_hybrid_omp_vec( ar )
	268	ELSE
	269	IF ( tasks_per_logical_node == -1 ) THEN
	270	CALL poisfft_hybrid_omp( ar )
	271	ELSE
	272	CALL poisfft_hybrid_nodes( ar )
	273	ENDIF
	274	ENDIF
	275
	276	END SUBROUTINE poisfft_hybrid
	277
	278
	279	SUBROUTINE poisfft_hybrid_omp ( ar )
	280
	281	USE cpulog
	282	USE interfaces
	283
	284	IMPLICIT NONE
	285
	286	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	287	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	288
	289	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	290
	291	REAL, DIMENSION(0:nx) :: fftx_ar
	292	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	293
	294	REAL, DIMENSION(0:nx,nz) :: tri_ar
	295
	296	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	297	#if defined( __KKMP )
	298	INTEGER :: omp_get_thread_num
	299	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	300	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	301	#else
	302	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	303	#endif
	304
	305
	306	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
	307
	308	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	309
	310	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	311	!$OMP DO
	312	!
	313	!-- Store grid points to be transformed on a 1d-array, do the fft
	314	!-- and sample the results on a 4d-array
	315	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	316	iei = MIN( iouter+istride-1, nxr_p )
	317	DO k = 1, nz
	318
	319	DO i = iouter, iei
	320	ii = nxl + i
	321	ir = i - iouter + 1
	322
	323	DO j = nys_a, nyn_a
	324	ffty_ar(j,ir) = ar(k,j,ii)
	325	ENDDO
	326
	327	CALL fft_y( ffty_ar(:,ir), 'forward' )
	328	ENDDO
	329
	330	m = nys_a
	331	DO n = 1, npe_s
	332	DO j = nys_p, nyn_p
	333	DO i = iouter, iei
	334	ir = i - iouter + 1
	335	work1(i,k,j,n) = ffty_ar(m,ir)
	336	ENDDO
	337	m = m+1
	338	ENDDO
	339	ENDDO
	340
	341	ENDDO
	342	ENDDO
	343	!$OMP END PARALLEL
	344
	345	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	346
	347	#if defined( __parallel )
	348	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	349
	350	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	351	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	352	comm2d, istat )
	353
	354	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	355	#else
	356	work2 = work1
	357	#endif
	358
	359	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	360
	361	#if defined( __KKMP )
	362	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
	363	!$OMP DO
	364	DO j = nys_p, nyn_p
	365	jthread = omp_get_thread_num() + 1
	366	#else
	367	DO j = nys_p, nyn_p
	368	jthread = 1
	369	#endif
	370	DO k = 1, nz
	371
	372	m = nxl_a
	373	DO n = 1, npe_s
	374	DO i = nxl_p, nxr_p
	375	fftx_ar(m) = work2(i,k,j,n)
	376	m = m+1
	377	ENDDO
	378	ENDDO
	379
	380	CALL fft_x( fftx_ar, 'forward' )
	381
	382	DO i = nxl_a, nxr_a
	383	tri_ar(i,k) = fftx_ar(i)
	384	ENDDO
	385
	386	ENDDO
	387
	388	jj = myid * (nyn_p-nys_p+1) + j
	389	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	390
	391	DO k = 1, nz
	392	DO i = nxl_a, nxr_a
	393	fftx_ar(i) = tri_ar (i,k)
	394	ENDDO
	395
	396	CALL fft_x( fftx_ar, 'backward' )
	397
	398	m = nxl_a
	399	DO n = 1, npe_s
	400	DO i = nxl_p, nxr_p
	401	work2(i,k,j,n) = fftx_ar(m)
	402	m = m+1
	403	ENDDO
	404	ENDDO
	405
	406	ENDDO
	407	ENDDO
	408	#if defined( __KKMP )
	409	!$OMP END PARALLEL
	410	#endif
	411
	412	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	413
	414	#if defined( __parallel )
	415	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	416	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	417
	418	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	419	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	420	comm2d, istat )
	421
	422	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	423	#else
	424	work1 = work2
	425	#endif
	426
	427	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	428
	429	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	430	!$OMP DO
	431	DO iouter = nxl_p, nxr_p, istride
	432	iei = MIN( iouter+istride-1, nxr_p )
	433	DO k = 1, nz
	434
	435	m = nys_a
	436	DO n = 1, npe_s
	437	DO j = nys_p, nyn_p
	438	DO i = iouter, iei
	439	ir = i - iouter + 1
	440	ffty_ar(m,ir) = work1 (i,k,j,n)
	441	ENDDO
	442	m = m+1
	443	ENDDO
	444	ENDDO
	445
	446	DO i = iouter, iei
	447	ii = nxl + i
	448	ir = i - iouter + 1
	449	CALL fft_y( ffty_ar(:,ir), 'backward' )
	450
	451	DO j = nys_a, nyn_a
	452	ar(k,j,ii) = ffty_ar(j,ir)
	453	ENDDO
	454	ENDDO
	455
	456	ENDDO
	457	ENDDO
	458	!$OMP END PARALLEL
	459
	460	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	461
	462	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
	463
	464	#if defined( __KKMP )
	465	DEALLOCATE( tri )
	466	#endif
	467
	468	END SUBROUTINE poisfft_hybrid_omp
	469
	470
	471	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
	472
	473	USE cpulog
	474	USE interfaces
	475
	476	IMPLICIT NONE
	477
	478	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	479	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	480
	481	REAL, DIMENSION(0:nx,nz) :: tri_ar
	482
	483	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	484
	485	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
	486	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
	487
	488	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	489	#if defined( __KKMP )
	490	INTEGER :: omp_get_thread_num
	491	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	492	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	493	#else
	494	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	495	#endif
	496
	497
	498	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
	499
	500	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
	501
	502	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
	503	!$OMP DO
	504	!
	505	!-- Store grid points to be transformed on a 1d-array, do the fft
	506	!-- and sample the results on a 4d-array
	507	DO i = nxl_p, nxr_p
	508
	509	DO j = nys_a, nyn_a
	510	DO k = 1, nz
	511	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
	512	ENDDO
	513	ENDDO
	514
	515	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
	516	ENDDO
	517
	518	!$OMP DO
	519	DO k = 1, nz
	520	m = nys_a
	521	DO n = 1, npe_s
	522	DO j = nys_p, nyn_p
	523	DO i = nxl_p, nxr_p
	524	work1(i,k,j,n) = ffty_ar3(m,k,i)
	525	ENDDO
	526	m = m+1
	527	ENDDO
	528	ENDDO
	529	ENDDO
	530	!$OMP END PARALLEL
	531
	532	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
	533
	534	#if defined( __parallel )
	535	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	536	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	537	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	538	comm2d, istat )
	539	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	540	#else
	541	work2 = work1
	542	#endif
	543
	544	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
	545
	546	#if defined( __KKMP )
	547	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
	548	!$OMP DO
	549	DO j = nys_p, nyn_p
	550	jthread = omp_get_thread_num() + 1
	551	#else
	552	DO j = nys_p, nyn_p
	553	jthread = 1
	554	#endif
	555	DO k = 1, nz
	556
	557	m = nxl_a
	558	DO n = 1, npe_s
	559	DO i = nxl_p, nxr_p
	560	fftx_ar3(m,k,j) = work2(i,k,j,n)
	561	m = m+1
	562	ENDDO
	563	ENDDO
	564	ENDDO
	565
	566	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
	567
	568	DO k = 1, nz
	569	DO i = nxl_a, nxr_a
	570	tri_ar(i,k) = fftx_ar3(i,k,j)
	571	ENDDO
	572	ENDDO
	573
	574	jj = myid * (nyn_p-nys_p+1) + j
	575	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	576
	577	DO k = 1, nz
	578	DO i = nxl_a, nxr_a
	579	fftx_ar3(i,k,j) = tri_ar (i,k)
	580	ENDDO
	581	ENDDO
	582
	583	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
	584
	585	DO k = 1, nz
	586	m = nxl_a
	587	DO n = 1, npe_s
	588	DO i = nxl_p, nxr_p
	589	work2(i,k,j,n) = fftx_ar3(m,k,j)
	590	m = m+1
	591	ENDDO
	592	ENDDO
	593	ENDDO
	594
	595	ENDDO
	596	#if defined( __KKMP )
	597	!$OMP END PARALLEL
	598	#endif
	599
	600	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
	601
	602	#if defined( __parallel )
	603	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	604	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	605	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	606	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	607	comm2d, istat )
	608	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	609	#else
	610	work1 = work2
	611	#endif
	612
	613	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
	614
	615	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
	616	!$OMP DO
	617	DO k = 1, nz
	618	m = nys_a
	619	DO n = 1, npe_s
	620	DO j = nys_p, nyn_p
	621	DO i = nxl_p, nxr_p
	622	ffty_ar3(m,k,i) = work1(i,k,j,n)
	623	ENDDO
	624	m = m+1
	625	ENDDO
	626	ENDDO
	627	ENDDO
	628
	629	!$OMP DO
	630	DO i = nxl_p, nxr_p
	631	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
	632	DO j = nys_a, nyn_a
	633	DO k = 1, nz
	634	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
	635	ENDDO
	636	ENDDO
	637	ENDDO
	638	!$OMP END PARALLEL
	639
	640	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
	641
	642	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
	643
	644	#if defined( __KKMP )
	645	DEALLOCATE( tri )
	646	#endif
	647
	648	END SUBROUTINE poisfft_hybrid_omp_vec
	649
	650
	651	SUBROUTINE poisfft_hybrid_nodes ( ar )
	652
	653	USE cpulog
	654	USE interfaces
	655
	656	IMPLICIT NONE
	657
	658	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	659	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
	660	n, nn, nt, nw1, nw2
	661
	662	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	663
	664	REAL, DIMENSION(0:nx) :: fftx_ar
	665	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	666
	667	REAL, DIMENSION(0:nx,nz) :: tri_ar
	668
	669	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
	670	nodes,nys_p:nyn_p) :: work1,work2
	671	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	672
	673
	674	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
	675
	676	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	677
	678	!
	679	!-- Store grid points to be transformed on a 1d-array, do the fft
	680	!-- and sample the results on a 4d-array
	681	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	682	iei = MIN( iouter+istride-1, nxr_p )
	683	DO k = 1, nz
	684
	685	DO i = iouter, iei
	686	ii = nxl + i
	687	ir = i - iouter + 1
	688
	689	DO j = nys_a, nyn_a
	690	ffty_ar(j,ir) = ar(k,j,ii)
	691	ENDDO
	692
	693	CALL fft_y( ffty_ar(:,ir), 'forward' )
	694	ENDDO
	695
	696	m = nys_a
	697	DO nn = 1, nodes
	698	DO nt = 1, tasks_per_logical_node
	699	DO j = nys_p, nyn_p
	700	DO i = iouter, iei
	701	ir = i - iouter + 1
	702	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
	703	ENDDO
	704	m = m+1
	705	ENDDO
	706	ENDDO
	707	ENDDO
	708
	709	ENDDO
	710	ENDDO
	711
	712	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	713
	714	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	715	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
	716	DO nn = 1, nodes
	717	DO j = nys_p, nyn_p
	718	#if defined( __parallel )
	719	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	720	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	721	comm_tasks, istat )
	722	#endif
	723	ENDDO
	724	ENDDO
	725	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	726
	727
	728	DO j = nys_p, nyn_p
	729
	730	CALL cascade( 1, j, nys_p, nyn_p )
	731	nw2 = nw1 * SIZE( work1, 3 )
	732	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
	733	#if defined( __parallel )
	734	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	735	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	736	comm_nodes, istat )
	737	#endif
	738	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
	739	CALL cascade( 2, j, nys_p, nyn_p )
	740
	741	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	742	DO k = 1, nz
	743
	744	m = nxl_a
	745	DO nn = 1, nodes
	746	DO nt = 1, tasks_per_logical_node
	747	DO i = nxl_p, nxr_p
	748	fftx_ar(m) = work1(i,k,nt,nn,j)
	749	m = m+1
	750	ENDDO
	751	ENDDO
	752	ENDDO
	753
	754	CALL fft_x( fftx_ar, 'forward' )
	755
	756	DO i = nxl_a, nxr_a
	757	tri_ar(i,k) = fftx_ar(i)
	758	ENDDO
	759
	760	ENDDO
	761
	762	jj = myid * (nyn_p-nys_p+1) + j
	763	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
	764
	765	DO k = 1, nz
	766	DO i = nxl_a, nxr_a
	767	fftx_ar(i) = tri_ar(i,k)
	768	ENDDO
	769
	770	CALL fft_x( fftx_ar, 'backward' )
	771
	772	m = nxl_a
	773	DO nn = 1, nodes
	774	DO nt = 1, tasks_per_logical_node
	775	DO i = nxl_p, nxr_p
	776	work1(i,k,nt,nn,j) = fftx_ar(m)
	777	m = m+1
	778	ENDDO
	779	ENDDO
	780	ENDDO
	781	ENDDO
	782
	783	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	784	nw2 = nw1 * SIZE( work1, 3 )
	785	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
	786	#if defined( __parallel )
	787	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	788	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	789	comm_nodes, istat )
	790	#endif
	791	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
	792
	793	ENDDO
	794
	795	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	796	DO nn = 1, nodes
	797	DO j = nys_p, nyn_p
	798	#if defined( __parallel )
	799	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	800	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	801	comm_tasks, istat )
	802	#endif
	803	ENDDO
	804	ENDDO
	805	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	806
	807	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	808
	809	DO iouter = nxl_p, nxr_p, istride
	810	iei = MIN( iouter+istride-1, nxr_p )
	811	DO k = 1, nz
	812
	813	m = nys_a
	814	DO nn = 1, nodes
	815	DO nt = 1, tasks_per_logical_node
	816	DO j = nys_p, nyn_p
	817	DO i = iouter, iei
	818	ir = i - iouter + 1
	819	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
	820	ENDDO
	821	m = m+1
	822	ENDDO
	823	ENDDO
	824	ENDDO
	825
	826	DO i = iouter, iei
	827	ii = nxl + i
	828	ir = i - iouter + 1
	829	CALL fft_y( ffty_ar(:,ir), 'backward' )
	830
	831	DO j = nys_a, nyn_a
	832	ar(k,j,ii) = ffty_ar(j,ir)
	833	ENDDO
	834	ENDDO
	835
	836	ENDDO
	837	ENDDO
	838
	839	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	840
	841	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
	842
	843	END SUBROUTINE poisfft_hybrid_nodes
	844
	845
	846
	847	SUBROUTINE tridia_hybrid( j, ar, tri )
	848
	849	USE arrays_3d
	850	USE control_parameters
	851	USE grid_variables
	852
	853	IMPLICIT NONE
	854
	855	INTEGER :: i, j, k, nnyh
	856	REAL, DIMENSION(0:nx,nz) :: ar
	857	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	858	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	859
	860	nnyh = (ny+1) / 2
	861
	862	tri = 0.0
	863	!
	864	!-- Define constant elements of the tridiagonal matrix.
	865	DO k = 0, nz-1
	866	DO i = 0,nx
[667]	867	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	868	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	869	ENDDO
	870	ENDDO
	871
	872	IF ( j <= nnyh ) THEN
	873	CALL maketri_hybrid( j )
	874	ELSE
	875	CALL maketri_hybrid( ny+1-j)
	876	ENDIF
	877	CALL zerleg_hybrid
	878	CALL substi_hybrid( ar, tri )
	879
	880	CONTAINS
	881
	882	SUBROUTINE maketri_hybrid( j )
	883
	884	!----------------------------------------------------------------------!
	885	! maketri !
	886	! !
	887	! computes the i- and j-dependent component of the matrix !
	888	!----------------------------------------------------------------------!
	889
	890	USE constants
	891
	892	IMPLICIT NONE
	893
	894	INTEGER :: i, j, k, nnxh
	895	REAL :: a, c
	896
	897	REAL, DIMENSION(0:nx) :: l
	898
	899
	900	nnxh = (nx+1) / 2
	901	!
	902	!-- Provide the tridiagonal matrix for solution of the Poisson equation
	903	!-- in Fourier space. The coefficients are computed following the method
	904	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
	905	!-- Siano's original version.
	906	DO i = 0,nx
	907	IF ( i >= 0 .AND. i < nnxh ) THEN
	908	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	909	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	910	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	911	FLOAT( ny+1 ) ) ) / ( dy * dy )
	912	ELSEIF ( i == nnxh ) THEN
	913	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	914	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	915	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	916	FLOAT(ny+1) ) ) / ( dy * dy )
	917	ELSE
	918	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	919	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	920	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	921	FLOAT( ny+1 ) ) ) / ( dy * dy )
	922	ENDIF
	923	ENDDO
	924
	925	DO k = 0,nz-1
	926	DO i = 0, nx
[667]	927	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	928	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	929	tri(1,i,k) = a + c - l(i)
	930	ENDDO
	931	ENDDO
	932	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	933	DO i = 0,nx
	934	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	935	ENDDO
	936	ENDIF
	937	IF ( ibc_p_t == 1 ) THEN
	938	DO i = 0,nx
	939	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	940	ENDDO
	941	ENDIF
	942
	943	END SUBROUTINE maketri_hybrid
	944
	945
	946	SUBROUTINE zerleg_hybrid
	947
	948	!----------------------------------------------------------------------!
	949	! zerleg !
	950	! !
	951	! Splitting of the tridiagonal matrix (Thomas algorithm) !
	952	!----------------------------------------------------------------------!
	953
	954	USE indices
	955
	956	IMPLICIT NONE
	957
	958	INTEGER :: i, k
	959
	960	!
	961	!-- Splitting
	962	DO i = 0, nx
	963	tri(4,i,0) = tri(1,i,0)
	964	ENDDO
	965	DO k = 1, nz-1
	966	DO i = 0,nx
	967	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	968	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	969	ENDDO
	970	ENDDO
	971
	972	END SUBROUTINE zerleg_hybrid
	973
	974	SUBROUTINE substi_hybrid( ar, tri )
	975
	976	!----------------------------------------------------------------------!
	977	! substi !
	978	! !
	979	! Substitution (Forward and Backward) (Thomas algorithm) !
	980	!----------------------------------------------------------------------!
	981
	982	IMPLICIT NONE
	983
	984	INTEGER :: i, j, k
	985	REAL, DIMENSION(0:nx,nz) :: ar
	986	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	987	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	988
	989	!
	990	!-- Forward substitution
	991	DO i = 0, nx
	992	ar1(i,0) = ar(i,1)
	993	ENDDO
	994	DO k = 1, nz - 1
	995	DO i = 0,nx
	996	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	997	ENDDO
	998	ENDDO
	999
	1000	!
	1001	!-- Backward substitution
	1002	DO i = 0,nx
	1003	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	1004	ENDDO
	1005	DO k = nz-2, 0, -1
	1006	DO i = 0,nx
	1007	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1008	/ tri(4,i,k)
	1009	ENDDO
	1010	ENDDO
	1011
	1012	END SUBROUTINE substi_hybrid
	1013
	1014	END SUBROUTINE tridia_hybrid
	1015
	1016
	1017	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
	1018
	1019	USE cpulog
	1020
	1021	IMPLICIT NONE
	1022
	1023	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
	1024	INTEGER, SAVE :: tag = 10
	1025	#if defined( __parallel )
[415]	1026	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
	1027	INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata
[1]	1028	#endif
	1029
	1030	REAL :: buf, buf1
	1031
	1032
	1033	buf = 1.0
	1034	buf1 = 1.1
	1035	IF ( me_node == 0 ) THEN ! first node only
	1036
	1037	SELECT CASE ( loca )
	1038
	1039	CASE ( 1 ) ! before alltoall
	1040
	1041	IF( me_task > 0 ) THEN ! first task does not wait
	1042	#if defined( __parallel )
	1043	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
	1044	buf1, 1, MPI_REAL, me_task-1, 0, &
[415]	1045	comm_tasks, stat, ierr )
[1]	1046	#endif
	1047	ELSEIF ( j > nys_p ) THEN
	1048	req = 0
	1049	tag = MOD( tag-10, 10 ) + 10
	1050	#if defined( __parallel )
	1051	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
	1052	tag, comm_tasks, req, ierr )
	1053	reqa = req
[415]	1054	CALL MPI_WAITALL( 1, reqa, stata, ierr )
[1]	1055	#endif
	1056	ENDIF
	1057
	1058	CASE ( 2 ) ! after alltoall
	1059
	1060	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
	1061	#if defined( __parallel )
	1062	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
	1063	buf1, 1, MPI_REAL, me_task+1, 0, &
	1064	comm_tasks, stat, ierr)
	1065	#endif
	1066	ELSEIF ( j < nyn_p ) THEN
	1067	req = 0
	1068	tag = MOD( tag-10, 10 ) + 10
	1069	#if defined( __parallel )
	1070	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
	1071	ierr )
	1072	#endif
	1073	ENDIF
	1074
	1075	END SELECT
	1076
	1077	ENDIF
	1078
	1079	END SUBROUTINE cascade
[807]	1080	#endif
[1]	1081	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |