Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 858

Last change on this file since 858 was 810, checked in by maronga, 13 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 32.1 KB

Rev	Line
[1]	1	MODULE poisfft_hybrid_mod
[808]	2
[1]	3	!------------------------------------------------------------------------------
	4	!
[257]	5	! Current revisions:
[1]	6	! -----------------
	7	!
[810]	8	!
[1]	9	! Former revisions:
	10	! -----------------
[3]	11	! $Id: poisfft_hybrid.f90 810 2012-01-30 13:40:12Z suehring $
[392]	12	!
[810]	13	! 809 2012-01-30 13:32:58Z maronga
	14	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	15	!
[808]	16	! 807 2012-01-25 11:53:51Z maronga
	17	! New cpp directive "__check" implemented which is used by check_namelist_files
	18	! (most of the code is unneeded by check_namelist_files).
	19	!
[668]	20	! 667 2010-12-23 12:06:00Z suehring/gryschka
	21	! ddzu replaced by ddzu_pres due to changes in zu(0)
	22	!
[482]	23	! 415 2009-12-15 10:26:23Z raasch
[807]	24	! Dimension of array stat in cascade changed to prevent type problems with
[482]	25	! mpi2 libraries
	26	!
[392]	27	! 274 2009-03-26 15:11:21Z heinze
	28	! Output of messages replaced by message handling routine.
	29	!
	30	! Feb. 2007
[3]	31	! RCS Log replaced by Id keyword, revision history cleaned up
	32	!
[1]	33	! Revision 1.11 2004/04/30 12:43:14 raasch
	34	! Renaming of fft routines, additional argument in calls of fft_y_m
	35	!
	36	! Revision 1.2 2002/12/19 16:08:31 raasch
	37	! Preprocessor directive KKMP introduced (OMP does NOT work),
	38	! array tri will be a shared array in OpenMP loop, to get better cache
	39	! utilization, the i index (x-direction) will be executed in stride
	40	! "istride" as outer loop and in a shorter inner loop,
	41	! overlapping of computation and communication realized by new routine
	42	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
	43	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
	44	!
	45	!
	46	! Description:
	47	! ------------
	48	! Solution of the Poisson equation with a 2D spectral method.
	49	! Hybrid version for parallel computers using a 1D domain decomposition,
	50	! realized with MPI, along x and parallelization with OPEN-MP along y
	51	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
	52	! optimization is realized by overlapping of computation and communication
	53	! and by simultaneously executing as many communication calls as switches
	54	! per logical partition (LPAR) are available. This version comes into
	55	! effect if more than one node is used and if the environment variable
	56	! tasks_per_node is set in a way that it can be divided by switch_per_lpar
	57	! without remainder.
	58	!
	59	! WARNING: In case of OpenMP, there are problems with allocating large
	60	! arrays in parallel regions.
	61	!
	62	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
	63	!------------------------------------------------------------------------------!
	64
	65	USE fft_xy
	66	USE indices
	67	USE pegrid
	68
	69	IMPLICIT NONE
	70
	71	INTEGER, PARAMETER :: switch_per_lpar = 2
	72
	73	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
	74	nxl_p, nxr_p, & ! partial x dimension
	75	nys_a, nyn_a, & ! total y dimension
	76	nys_p, nyn_p, & ! partial y dimension
	77
	78	npe_s, & ! total number of PEs for solver
	79	nwords, & ! number of points to be exchanged
	80	! with MPI_ALLTOALL
	81	n_omp_threads ! number of OpenMP threads
	82
	83	!
	84	!-- Variables for multi node version (cluster version) using routine
	85	!-- poisfft_hybrid_nodes
	86	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
	87	comm_node_all, & ! communicater all PEs node version
	88	comm_tasks, & ! communicater tasks
	89	me, me_node, me_task,& ! identity of this PE
	90	nodes, & ! number of nodes
	91	tasks_per_logical_node = -1 ! default no cluster
	92
[807]	93
	94	PRIVATE
	95
	96
[809]	97	#if ! defined ( __check )
[807]	98	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
	99
	100
[1]	101	!
	102	!-- Public interfaces
	103	INTERFACE poisfft_hybrid_ini
	104	MODULE PROCEDURE poisfft_hybrid_ini
	105	END INTERFACE poisfft_hybrid_ini
	106
	107	INTERFACE poisfft_hybrid
	108	MODULE PROCEDURE poisfft_hybrid
	109	END INTERFACE poisfft_hybrid
	110
	111	!
	112	!-- Private interfaces
	113	INTERFACE poisfft_hybrid_omp
	114	MODULE PROCEDURE poisfft_hybrid_omp
	115	END INTERFACE poisfft_hybrid_omp
	116
	117	INTERFACE poisfft_hybrid_omp_vec
	118	MODULE PROCEDURE poisfft_hybrid_omp_vec
	119	END INTERFACE poisfft_hybrid_omp_vec
	120
	121	INTERFACE poisfft_hybrid_nodes
	122	MODULE PROCEDURE poisfft_hybrid_nodes
	123	END INTERFACE poisfft_hybrid_nodes
	124
	125	INTERFACE tridia_hybrid
	126	MODULE PROCEDURE tridia_hybrid
	127	END INTERFACE tridia_hybrid
	128
	129	INTERFACE cascade
	130	MODULE PROCEDURE cascade
	131	END INTERFACE cascade
[807]	132	#else
	133	PUBLIC poisfft_hybrid_ini
[1]	134
[807]	135	!
	136	!-- Public interfaces
	137	INTERFACE poisfft_hybrid_ini
	138	MODULE PROCEDURE poisfft_hybrid_ini
	139	END INTERFACE poisfft_hybrid_ini
	140	#endif
	141
[1]	142	CONTAINS
	143
[807]	144
SUBROUTINE poisfft_hybrid_ini
!
!-- Initialization of the hybrid Poisson solver:
!--   * sets the module-level index bounds (nxl_a ... nyn_p), npe_s and nwords,
!--   * determines n_omp_threads (from OMP_NUM_THREADS if __KKMP is set,
!--     otherwise 1),
!--   * initializes the 1d-FFT routines,
!--   * decides whether the multi node version (poisfft_hybrid_nodes) is used
!--     and, if so, creates the communicators comm_node_all, comm_nodes and
!--     comm_tasks.

    USE control_parameters
    USE pegrid

    IMPLICIT NONE

    CHARACTER(LEN=8) ::  cdummy
    INTEGER ::  idummy, istat
    INTEGER, DIMENSION(2) ::  dims

    LOGICAL, DIMENSION(2) ::  period, re_dims


!
!-- Set the internal index values for the hybrid solver
#if defined( __parallel )
    npe_s = pdims(1)
#else
    npe_s = 1
#endif
    nxl_a = 0
    nxr_a = nx
    nxl_p = 0
    nxr_p = ( ( nx+1 ) / npe_s ) - 1
    nys_a = nys
    nyn_a = nyn
    nys_p = 0
    nyn_p = ( ( ny+1 ) / npe_s ) - 1

    nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )

#if defined( __KKMP ) && ! defined ( __check )
    CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
    READ ( cdummy, '(I8)', IOSTAT=istat )  n_omp_threads
!
!-- n_omp_threads is the extent of the per-thread dimension of array tri in
!-- the solver routines. A blank or unreadable OMP_NUM_THREADS must therefore
!-- not result in a value smaller than one.
    IF ( istat /= 0  .OR.  n_omp_threads < 1 )  n_omp_threads = 1

    IF ( n_omp_threads > 1 )  THEN
       WRITE( message_string, * ) 'Number of OpenMP threads = ', &
                                  n_omp_threads
       CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
    ENDIF
#else
    n_omp_threads = 1
#endif
!
!-- Initialize the one-dimensional FFT routines
    CALL fft_init

!
!-- Setup for multi node version (poisfft_hybrid_nodes)
    IF ( n_omp_threads == 1  .AND.                                    &
         ( host(1:4) == 'ibmh'  .OR.  host(1:4) == 'ibmb' ) )  THEN

       IF ( tasks_per_node /= -9999 )  THEN
!
!--       Multi node version requires that the available number of
!--       switches per logical partition must be an integral divisor
!--       of the chosen number of tasks per node
          IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 )  THEN
!
!--          Set the switch which decides about usage of the multi node
!--          version
             IF ( tasks_per_node / switch_per_lpar > 1  .AND.         &
                  numprocs > tasks_per_node )  THEN
                tasks_per_logical_node = tasks_per_node / switch_per_lpar
             ENDIF

             IF ( tasks_per_logical_node > -1 )  THEN

                WRITE( message_string, * ) 'running optimized ',      &
                                           'multinode version',       &
                                           '&switch_per_lpar = ',     &
                                           switch_per_lpar,           &
                                           '&tasks_per_lpar = ',      &
                                           tasks_per_node,            &
                                           'tasks_per_logical_node = ', &
                                           tasks_per_logical_node
                CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )

             ENDIF

          ENDIF
       ENDIF
    ENDIF

!
!-- Determine sub-topologies for multi node version
    IF ( tasks_per_logical_node >= 2 )  THEN

#if defined( __parallel ) && ! defined ( __check )
!
!--    NOTE(review): nodes is rounded up, so dims(1)*dims(2) exceeds numprocs
!--    if numprocs is not a multiple of tasks_per_logical_node -- confirm
!--    that callers guarantee divisibility.
       nodes   = ( numprocs + tasks_per_logical_node - 1 ) / &
                 tasks_per_logical_node
       dims(1) = nodes
       dims(2) = tasks_per_logical_node
       period  = .FALSE.

       CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
                             comm_node_all, istat )
       CALL MPI_COMM_RANK( comm_node_all, me, istat )

!
!--    Sub-communicator which keeps the first (node) dimension
       re_dims(1) = .TRUE.
       re_dims(2) = .FALSE.
       CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
       CALL MPI_COMM_RANK( comm_nodes, me_node, istat )

!
!--    Sub-communicator which keeps the second (task) dimension
       re_dims(1) = .FALSE.
       re_dims(2) = .TRUE.
       CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
       CALL MPI_COMM_RANK( comm_tasks, me_task, istat )

!       write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
!                  tasks_per_logical_node
#elif ! defined( __parallel )
       message_string = 'parallel environment (MPI) required'
       CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
#endif
    ENDIF

END SUBROUTINE poisfft_hybrid_ini
	262
[809]	263	#if ! defined ( __check )
SUBROUTINE poisfft_hybrid( ar )
!
!-- Entry point of the hybrid solver. Selects one of the three
!-- implementations:
!--   host name starting with 'nec'   --> poisfft_hybrid_omp_vec
!--   tasks_per_logical_node == -1    --> poisfft_hybrid_omp
!--   otherwise                       --> poisfft_hybrid_nodes
!-- ar is handed over unchanged to the selected routine.

    USE control_parameters
    USE interfaces

    IMPLICIT NONE

    REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

    IF ( host(1:3) == 'nec' )  THEN
       CALL poisfft_hybrid_omp_vec( ar )
    ELSEIF ( tasks_per_logical_node == -1 )  THEN
       CALL poisfft_hybrid_omp( ar )
    ELSE
       CALL poisfft_hybrid_nodes( ar )
    ENDIF

END SUBROUTINE poisfft_hybrid
	284
	285
	286	SUBROUTINE poisfft_hybrid_omp ( ar )
	287
	288	USE cpulog
	289	USE interfaces
	290
	291	IMPLICIT NONE
	292
	293	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	294	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	295
	296	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	297
	298	REAL, DIMENSION(0:nx) :: fftx_ar
	299	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	300
	301	REAL, DIMENSION(0:nx,nz) :: tri_ar
	302
	303	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	304	#if defined( __KKMP )
	305	INTEGER :: omp_get_thread_num
	306	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	307	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	308	#else
	309	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	310	#endif
	311
	312
	313	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
	314
	315	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	316
	317	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	318	!$OMP DO
	319	!
	320	!-- Store grid points to be transformed on a 1d-array, do the fft
	321	!-- and sample the results on a 4d-array
	322	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	323	iei = MIN( iouter+istride-1, nxr_p )
	324	DO k = 1, nz
	325
	326	DO i = iouter, iei
	327	ii = nxl + i
	328	ir = i - iouter + 1
	329
	330	DO j = nys_a, nyn_a
	331	ffty_ar(j,ir) = ar(k,j,ii)
	332	ENDDO
	333
	334	CALL fft_y( ffty_ar(:,ir), 'forward' )
	335	ENDDO
	336
	337	m = nys_a
	338	DO n = 1, npe_s
	339	DO j = nys_p, nyn_p
	340	DO i = iouter, iei
	341	ir = i - iouter + 1
	342	work1(i,k,j,n) = ffty_ar(m,ir)
	343	ENDDO
	344	m = m+1
	345	ENDDO
	346	ENDDO
	347
	348	ENDDO
	349	ENDDO
	350	!$OMP END PARALLEL
	351
	352	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	353
	354	#if defined( __parallel )
	355	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	356
	357	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	358	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	359	comm2d, istat )
	360
	361	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	362	#else
	363	work2 = work1
	364	#endif
	365
	366	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	367
	368	#if defined( __KKMP )
	369	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
	370	!$OMP DO
	371	DO j = nys_p, nyn_p
	372	jthread = omp_get_thread_num() + 1
	373	#else
	374	DO j = nys_p, nyn_p
	375	jthread = 1
	376	#endif
	377	DO k = 1, nz
	378
	379	m = nxl_a
	380	DO n = 1, npe_s
	381	DO i = nxl_p, nxr_p
	382	fftx_ar(m) = work2(i,k,j,n)
	383	m = m+1
	384	ENDDO
	385	ENDDO
	386
	387	CALL fft_x( fftx_ar, 'forward' )
	388
	389	DO i = nxl_a, nxr_a
	390	tri_ar(i,k) = fftx_ar(i)
	391	ENDDO
	392
	393	ENDDO
	394
	395	jj = myid * (nyn_p-nys_p+1) + j
	396	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	397
	398	DO k = 1, nz
	399	DO i = nxl_a, nxr_a
	400	fftx_ar(i) = tri_ar (i,k)
	401	ENDDO
	402
	403	CALL fft_x( fftx_ar, 'backward' )
	404
	405	m = nxl_a
	406	DO n = 1, npe_s
	407	DO i = nxl_p, nxr_p
	408	work2(i,k,j,n) = fftx_ar(m)
	409	m = m+1
	410	ENDDO
	411	ENDDO
	412
	413	ENDDO
	414	ENDDO
	415	#if defined( __KKMP )
	416	!$OMP END PARALLEL
	417	#endif
	418
	419	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	420
	421	#if defined( __parallel )
	422	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	423	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	424
	425	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	426	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	427	comm2d, istat )
	428
	429	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	430	#else
	431	work1 = work2
	432	#endif
	433
	434	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	435
	436	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	437	!$OMP DO
	438	DO iouter = nxl_p, nxr_p, istride
	439	iei = MIN( iouter+istride-1, nxr_p )
	440	DO k = 1, nz
	441
	442	m = nys_a
	443	DO n = 1, npe_s
	444	DO j = nys_p, nyn_p
	445	DO i = iouter, iei
	446	ir = i - iouter + 1
	447	ffty_ar(m,ir) = work1 (i,k,j,n)
	448	ENDDO
	449	m = m+1
	450	ENDDO
	451	ENDDO
	452
	453	DO i = iouter, iei
	454	ii = nxl + i
	455	ir = i - iouter + 1
	456	CALL fft_y( ffty_ar(:,ir), 'backward' )
	457
	458	DO j = nys_a, nyn_a
	459	ar(k,j,ii) = ffty_ar(j,ir)
	460	ENDDO
	461	ENDDO
	462
	463	ENDDO
	464	ENDDO
	465	!$OMP END PARALLEL
	466
	467	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	468
	469	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
	470
	471	#if defined( __KKMP )
	472	DEALLOCATE( tri )
	473	#endif
	474
	475	END SUBROUTINE poisfft_hybrid_omp
	476
	477
	478	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
	479
	480	USE cpulog
	481	USE interfaces
	482
	483	IMPLICIT NONE
	484
	485	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	486	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	487
	488	REAL, DIMENSION(0:nx,nz) :: tri_ar
	489
	490	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	491
	492	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
	493	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
	494
	495	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	496	#if defined( __KKMP )
	497	INTEGER :: omp_get_thread_num
	498	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	499	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	500	#else
	501	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	502	#endif
	503
	504
	505	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
	506
	507	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
	508
	509	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
	510	!$OMP DO
	511	!
	512	!-- Store grid points to be transformed on a 1d-array, do the fft
	513	!-- and sample the results on a 4d-array
	514	DO i = nxl_p, nxr_p
	515
	516	DO j = nys_a, nyn_a
	517	DO k = 1, nz
	518	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
	519	ENDDO
	520	ENDDO
	521
	522	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
	523	ENDDO
	524
	525	!$OMP DO
	526	DO k = 1, nz
	527	m = nys_a
	528	DO n = 1, npe_s
	529	DO j = nys_p, nyn_p
	530	DO i = nxl_p, nxr_p
	531	work1(i,k,j,n) = ffty_ar3(m,k,i)
	532	ENDDO
	533	m = m+1
	534	ENDDO
	535	ENDDO
	536	ENDDO
	537	!$OMP END PARALLEL
	538
	539	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
	540
	541	#if defined( __parallel )
	542	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	543	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	544	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	545	comm2d, istat )
	546	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	547	#else
	548	work2 = work1
	549	#endif
	550
	551	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
	552
	553	#if defined( __KKMP )
	554	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
	555	!$OMP DO
	556	DO j = nys_p, nyn_p
	557	jthread = omp_get_thread_num() + 1
	558	#else
	559	DO j = nys_p, nyn_p
	560	jthread = 1
	561	#endif
	562	DO k = 1, nz
	563
	564	m = nxl_a
	565	DO n = 1, npe_s
	566	DO i = nxl_p, nxr_p
	567	fftx_ar3(m,k,j) = work2(i,k,j,n)
	568	m = m+1
	569	ENDDO
	570	ENDDO
	571	ENDDO
	572
	573	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
	574
	575	DO k = 1, nz
	576	DO i = nxl_a, nxr_a
	577	tri_ar(i,k) = fftx_ar3(i,k,j)
	578	ENDDO
	579	ENDDO
	580
	581	jj = myid * (nyn_p-nys_p+1) + j
	582	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	583
	584	DO k = 1, nz
	585	DO i = nxl_a, nxr_a
	586	fftx_ar3(i,k,j) = tri_ar (i,k)
	587	ENDDO
	588	ENDDO
	589
	590	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
	591
	592	DO k = 1, nz
	593	m = nxl_a
	594	DO n = 1, npe_s
	595	DO i = nxl_p, nxr_p
	596	work2(i,k,j,n) = fftx_ar3(m,k,j)
	597	m = m+1
	598	ENDDO
	599	ENDDO
	600	ENDDO
	601
	602	ENDDO
	603	#if defined( __KKMP )
	604	!$OMP END PARALLEL
	605	#endif
	606
	607	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
	608
	609	#if defined( __parallel )
	610	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	611	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	612	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	613	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	614	comm2d, istat )
	615	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	616	#else
	617	work1 = work2
	618	#endif
	619
	620	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
	621
	622	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
	623	!$OMP DO
	624	DO k = 1, nz
	625	m = nys_a
	626	DO n = 1, npe_s
	627	DO j = nys_p, nyn_p
	628	DO i = nxl_p, nxr_p
	629	ffty_ar3(m,k,i) = work1(i,k,j,n)
	630	ENDDO
	631	m = m+1
	632	ENDDO
	633	ENDDO
	634	ENDDO
	635
	636	!$OMP DO
	637	DO i = nxl_p, nxr_p
	638	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
	639	DO j = nys_a, nyn_a
	640	DO k = 1, nz
	641	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
	642	ENDDO
	643	ENDDO
	644	ENDDO
	645	!$OMP END PARALLEL
	646
	647	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
	648
	649	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
	650
	651	#if defined( __KKMP )
	652	DEALLOCATE( tri )
	653	#endif
	654
	655	END SUBROUTINE poisfft_hybrid_omp_vec
	656
	657
	658	SUBROUTINE poisfft_hybrid_nodes ( ar )
	659
	660	USE cpulog
	661	USE interfaces
	662
	663	IMPLICIT NONE
	664
	665	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	666	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
	667	n, nn, nt, nw1, nw2
	668
	669	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	670
	671	REAL, DIMENSION(0:nx) :: fftx_ar
	672	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	673
	674	REAL, DIMENSION(0:nx,nz) :: tri_ar
	675
	676	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
	677	nodes,nys_p:nyn_p) :: work1,work2
	678	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	679
	680
	681	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
	682
	683	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	684
	685	!
	686	!-- Store grid points to be transformed on a 1d-array, do the fft
	687	!-- and sample the results on a 4d-array
	688	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	689	iei = MIN( iouter+istride-1, nxr_p )
	690	DO k = 1, nz
	691
	692	DO i = iouter, iei
	693	ii = nxl + i
	694	ir = i - iouter + 1
	695
	696	DO j = nys_a, nyn_a
	697	ffty_ar(j,ir) = ar(k,j,ii)
	698	ENDDO
	699
	700	CALL fft_y( ffty_ar(:,ir), 'forward' )
	701	ENDDO
	702
	703	m = nys_a
	704	DO nn = 1, nodes
	705	DO nt = 1, tasks_per_logical_node
	706	DO j = nys_p, nyn_p
	707	DO i = iouter, iei
	708	ir = i - iouter + 1
	709	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
	710	ENDDO
	711	m = m+1
	712	ENDDO
	713	ENDDO
	714	ENDDO
	715
	716	ENDDO
	717	ENDDO
	718
	719	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	720
	721	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	722	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
	723	DO nn = 1, nodes
	724	DO j = nys_p, nyn_p
	725	#if defined( __parallel )
	726	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	727	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	728	comm_tasks, istat )
	729	#endif
	730	ENDDO
	731	ENDDO
	732	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	733
	734
	735	DO j = nys_p, nyn_p
	736
	737	CALL cascade( 1, j, nys_p, nyn_p )
	738	nw2 = nw1 * SIZE( work1, 3 )
	739	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
	740	#if defined( __parallel )
	741	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	742	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	743	comm_nodes, istat )
	744	#endif
	745	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
	746	CALL cascade( 2, j, nys_p, nyn_p )
	747
	748	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	749	DO k = 1, nz
	750
	751	m = nxl_a
	752	DO nn = 1, nodes
	753	DO nt = 1, tasks_per_logical_node
	754	DO i = nxl_p, nxr_p
	755	fftx_ar(m) = work1(i,k,nt,nn,j)
	756	m = m+1
	757	ENDDO
	758	ENDDO
	759	ENDDO
	760
	761	CALL fft_x( fftx_ar, 'forward' )
	762
	763	DO i = nxl_a, nxr_a
	764	tri_ar(i,k) = fftx_ar(i)
	765	ENDDO
	766
	767	ENDDO
	768
	769	jj = myid * (nyn_p-nys_p+1) + j
	770	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
	771
	772	DO k = 1, nz
	773	DO i = nxl_a, nxr_a
	774	fftx_ar(i) = tri_ar(i,k)
	775	ENDDO
	776
	777	CALL fft_x( fftx_ar, 'backward' )
	778
	779	m = nxl_a
	780	DO nn = 1, nodes
	781	DO nt = 1, tasks_per_logical_node
	782	DO i = nxl_p, nxr_p
	783	work1(i,k,nt,nn,j) = fftx_ar(m)
	784	m = m+1
	785	ENDDO
	786	ENDDO
	787	ENDDO
	788	ENDDO
	789
	790	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	791	nw2 = nw1 * SIZE( work1, 3 )
	792	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
	793	#if defined( __parallel )
	794	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	795	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	796	comm_nodes, istat )
	797	#endif
	798	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
	799
	800	ENDDO
	801
	802	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	803	DO nn = 1, nodes
	804	DO j = nys_p, nyn_p
	805	#if defined( __parallel )
	806	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	807	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	808	comm_tasks, istat )
	809	#endif
	810	ENDDO
	811	ENDDO
	812	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	813
	814	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	815
	816	DO iouter = nxl_p, nxr_p, istride
	817	iei = MIN( iouter+istride-1, nxr_p )
	818	DO k = 1, nz
	819
	820	m = nys_a
	821	DO nn = 1, nodes
	822	DO nt = 1, tasks_per_logical_node
	823	DO j = nys_p, nyn_p
	824	DO i = iouter, iei
	825	ir = i - iouter + 1
	826	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
	827	ENDDO
	828	m = m+1
	829	ENDDO
	830	ENDDO
	831	ENDDO
	832
	833	DO i = iouter, iei
	834	ii = nxl + i
	835	ir = i - iouter + 1
	836	CALL fft_y( ffty_ar(:,ir), 'backward' )
	837
	838	DO j = nys_a, nyn_a
	839	ar(k,j,ii) = ffty_ar(j,ir)
	840	ENDDO
	841	ENDDO
	842
	843	ENDDO
	844	ENDDO
	845
	846	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	847
	848	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
	849
	850	END SUBROUTINE poisfft_hybrid_nodes
	851
	852
	853
	854	SUBROUTINE tridia_hybrid( j, ar, tri )
	855
	856	USE arrays_3d
	857	USE control_parameters
	858	USE grid_variables
	859
	860	IMPLICIT NONE
	861
	862	INTEGER :: i, j, k, nnyh
	863	REAL, DIMENSION(0:nx,nz) :: ar
	864	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	865	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	866
	867	nnyh = (ny+1) / 2
	868
	869	tri = 0.0
	870	!
	871	!-- Define constant elements of the tridiagonal matrix.
	872	DO k = 0, nz-1
	873	DO i = 0,nx
[667]	874	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	875	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	876	ENDDO
	877	ENDDO
	878
	879	IF ( j <= nnyh ) THEN
	880	CALL maketri_hybrid( j )
	881	ELSE
	882	CALL maketri_hybrid( ny+1-j)
	883	ENDIF
	884	CALL zerleg_hybrid
	885	CALL substi_hybrid( ar, tri )
	886
	887	CONTAINS
	888
	889	SUBROUTINE maketri_hybrid( j )
	890
	891	!----------------------------------------------------------------------!
	892	! maketri !
	893	! !
	894	! computes the i- and j-dependent component of the matrix !
	895	!----------------------------------------------------------------------!
	896
	897	USE constants
	898
	899	IMPLICIT NONE
	900
	901	INTEGER :: i, j, k, nnxh
	902	REAL :: a, c
	903
	904	REAL, DIMENSION(0:nx) :: l
	905
	906
	907	nnxh = (nx+1) / 2
	908	!
	909	!-- Provide the tridiagonal matrix for solution of the Poisson equation
	910	!-- in Fourier space. The coefficients are computed following the method
	911	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
	912	!-- Siano's original version.
	913	DO i = 0,nx
	914	IF ( i >= 0 .AND. i < nnxh ) THEN
	915	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	916	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	917	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	918	FLOAT( ny+1 ) ) ) / ( dy * dy )
	919	ELSEIF ( i == nnxh ) THEN
	920	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	921	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	922	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	923	FLOAT(ny+1) ) ) / ( dy * dy )
	924	ELSE
	925	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	926	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
	927	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	928	FLOAT( ny+1 ) ) ) / ( dy * dy )
	929	ENDIF
	930	ENDDO
	931
	932	DO k = 0,nz-1
	933	DO i = 0, nx
[667]	934	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	935	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	936	tri(1,i,k) = a + c - l(i)
	937	ENDDO
	938	ENDDO
	939	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	940	DO i = 0,nx
	941	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	942	ENDDO
	943	ENDIF
	944	IF ( ibc_p_t == 1 ) THEN
	945	DO i = 0,nx
	946	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	947	ENDDO
	948	ENDIF
	949
	950	END SUBROUTINE maketri_hybrid
	951
	952
	953	SUBROUTINE zerleg_hybrid
	954
	955	!----------------------------------------------------------------------!
	956	! zerleg !
	957	! !
	958	! Splitting of the tridiagonal matrix (Thomas algorithm) !
	959	!----------------------------------------------------------------------!
	960
	961	USE indices
	962
	963	IMPLICIT NONE
	964
	965	INTEGER :: i, k
	966
	967	!
	968	!-- Splitting
	969	DO i = 0, nx
	970	tri(4,i,0) = tri(1,i,0)
	971	ENDDO
	972	DO k = 1, nz-1
	973	DO i = 0,nx
	974	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	975	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	976	ENDDO
	977	ENDDO
	978
	979	END SUBROUTINE zerleg_hybrid
	980
	981	SUBROUTINE substi_hybrid( ar, tri )
	982
	983	!----------------------------------------------------------------------!
	984	! substi !
	985	! !
	986	! Substitution (Forward and Backward) (Thomas algorithm) !
	987	!----------------------------------------------------------------------!
	988
	989	IMPLICIT NONE
	990
	991	INTEGER :: i, j, k
	992	REAL, DIMENSION(0:nx,nz) :: ar
	993	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	994	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	995
	996	!
	997	!-- Forward substitution
	998	DO i = 0, nx
	999	ar1(i,0) = ar(i,1)
	1000	ENDDO
	1001	DO k = 1, nz - 1
	1002	DO i = 0,nx
	1003	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1004	ENDDO
	1005	ENDDO
	1006
	1007	!
	1008	!-- Backward substitution
	1009	DO i = 0,nx
	1010	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	1011	ENDDO
	1012	DO k = nz-2, 0, -1
	1013	DO i = 0,nx
	1014	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1015	/ tri(4,i,k)
	1016	ENDDO
	1017	ENDDO
	1018
	1019	END SUBROUTINE substi_hybrid
	1020
	1021	END SUBROUTINE tridia_hybrid
	1022
	1023
	1024	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
	1025
	1026	USE cpulog
	1027
	1028	IMPLICIT NONE
	1029
	1030	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
	1031	INTEGER, SAVE :: tag = 10
	1032	#if defined( __parallel )
[415]	1033	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
	1034	INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata
[1]	1035	#endif
	1036
	1037	REAL :: buf, buf1
	1038
	1039
	1040	buf = 1.0
	1041	buf1 = 1.1
	1042	IF ( me_node == 0 ) THEN ! first node only
	1043
	1044	SELECT CASE ( loca )
	1045
	1046	CASE ( 1 ) ! before alltoall
	1047
	1048	IF( me_task > 0 ) THEN ! first task does not wait
	1049	#if defined( __parallel )
	1050	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
	1051	buf1, 1, MPI_REAL, me_task-1, 0, &
[415]	1052	comm_tasks, stat, ierr )
[1]	1053	#endif
	1054	ELSEIF ( j > nys_p ) THEN
	1055	req = 0
	1056	tag = MOD( tag-10, 10 ) + 10
	1057	#if defined( __parallel )
	1058	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
	1059	tag, comm_tasks, req, ierr )
	1060	reqa = req
[415]	1061	CALL MPI_WAITALL( 1, reqa, stata, ierr )
[1]	1062	#endif
	1063	ENDIF
	1064
	1065	CASE ( 2 ) ! after alltoall
	1066
	1067	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
	1068	#if defined( __parallel )
	1069	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
	1070	buf1, 1, MPI_REAL, me_task+1, 0, &
	1071	comm_tasks, stat, ierr)
	1072	#endif
	1073	ELSEIF ( j < nyn_p ) THEN
	1074	req = 0
	1075	tag = MOD( tag-10, 10 ) + 10
	1076	#if defined( __parallel )
	1077	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
	1078	ierr )
	1079	#endif
	1080	ENDIF
	1081
	1082	END SELECT
	1083
	1084	ENDIF
	1085
	1086	END SUBROUTINE cascade
[807]	1087	#endif
[1]	1088	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |