Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 1019

Last change on this file since 1019 was 1014, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 32.1 KB

Rev	Line
[1]	1	MODULE poisfft_hybrid_mod
[808]	2
[1]	3	!------------------------------------------------------------------------------
[257]	4	! Current revisions:
[1]	5	! -----------------
	6	!
[1014]	7	!
[1]	8	! Former revisions:
	9	! -----------------
[3]	10	! $Id: poisfft_hybrid.f90 1014 2012-09-21 07:09:03Z raasch $
[392]	11	!
[1014]	12	! 1013 2012-09-21 07:03:55Z raasch
	13	! FLOAT type conversion replaced by REAL
	14	!
[810]	15	! 809 2012-01-30 13:32:58Z maronga
	16	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	17	!
[808]	18	! 807 2012-01-25 11:53:51Z maronga
	19	! New cpp directive "__check" implemented which is used by check_namelist_files
	20	! (most of the code is unneeded by check_namelist_files).
	21	!
[668]	22	! 667 2010-12-23 12:06:00Z suehring/gryschka
	23	! ddzu replaced by ddzu_pres due to changes in zu(0)
	24	!
[482]	25	! 415 2009-12-15 10:26:23Z raasch
[807]	26	! Dimension of array stat in cascade changed to prevent type problems with
[482]	27	! mpi2 libraries
	28	!
[392]	29	! 274 2009-03-26 15:11:21Z heinze
	30	! Output of messages replaced by message handling routine.
	31	!
	32	! Feb. 2007
[3]	33	! RCS Log replace by Id keyword, revision history cleaned up
	34	!
[1]	35	! Revision 1.11 2004/04/30 12:43:14 raasch
	36	! Renaming of fft routines, additional argument in calls of fft_y_m
	37	!
	38	! Revision 1.2 2002/12/19 16:08:31 raasch
	39	! Preprocessor directive KKMP introduced (OMP does NOT work),
	40	! array tri will be a shared array in OpenMP loop, to get better cache
	41	! utilization, the i index (x-direction) will be executed in stride
	42	! "istride" as outer loop and in a shorter inner loop,
	43	! overlapping of computation and communication realized by new routine
	44	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
	45	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
	46	!
	47	!
	48	! Description:
	49	! ------------
	50	! Solution of the Poisson equation with a 2D spectral method.
	51	! Hybrid version for parallel computers using a 1D domain decomposition,
	52	! realized with MPI, along x and parallelization with OPEN-MP along y
	53	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
	54	! optimization is realized by overlapping of computation and communication
	55	! and by simultaneously executing as many communication calls as switches
	56	! per logical partition (LPAR) are available. This version comes into
	57	! effect if more than one node is used and if the environment variable
	58	! tasks_per_node is set in a way that it can be divided by switch_per_lpar
	59	! without remainder.
	60	!
	61	! WARNING: In case of OpenMP, there are problems with allocating large
	62	! arrays in parallel regions.
	63	!
	64	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
	65	!------------------------------------------------------------------------------!
	66
	67	USE fft_xy
	68	USE indices
	69	USE pegrid
	70
	71	IMPLICIT NONE
	72
	INTEGER, PARAMETER ::  switch_per_lpar = 2   ! switches per logical partition (LPAR)

	!
	!-- Index bounds and sizes used internally by the hybrid solver
	INTEGER, SAVE ::  nxl_a, nxr_a    ! total x dimension
	INTEGER, SAVE ::  nxl_p, nxr_p    ! partial x dimension
	INTEGER, SAVE ::  nys_a, nyn_a    ! total y dimension
	INTEGER, SAVE ::  nys_p, nyn_p    ! partial y dimension
	INTEGER, SAVE ::  npe_s           ! total number of PEs for solver
	INTEGER, SAVE ::  nwords          ! number of points to be exchanged with MPI_ALLTOALL
	INTEGER, SAVE ::  n_omp_threads   ! number of OpenMP threads

	!
	!-- Variables for multi node version (cluster version) using routine
	!-- poisfft_hybrid_nodes
	INTEGER, SAVE ::  comm_nodes      ! communicator nodes
	INTEGER, SAVE ::  comm_node_all   ! communicator all PEs node version
	INTEGER, SAVE ::  comm_tasks      ! communicator tasks
	INTEGER, SAVE ::  me              ! identity of this PE (rank in comm_node_all)
	INTEGER, SAVE ::  me_node         ! identity of this PE (rank in comm_nodes)
	INTEGER, SAVE ::  me_task         ! identity of this PE (rank in comm_tasks)
	INTEGER, SAVE ::  nodes           ! number of nodes
	INTEGER, SAVE ::  tasks_per_logical_node = -1   ! default: no cluster
	94
!
!-- Everything in this module is private unless explicitly exported below
PRIVATE

!
!-- Public interface that is available in every configuration
PUBLIC poisfft_hybrid_ini

INTERFACE poisfft_hybrid_ini
   MODULE PROCEDURE poisfft_hybrid_ini
END INTERFACE poisfft_hybrid_ini

#if ! defined ( __check )
!
!-- Public interface of the solver (not compiled if __check is defined)
PUBLIC poisfft_hybrid

INTERFACE poisfft_hybrid
   MODULE PROCEDURE poisfft_hybrid
END INTERFACE poisfft_hybrid

!
!-- Private interfaces
INTERFACE poisfft_hybrid_omp
   MODULE PROCEDURE poisfft_hybrid_omp
END INTERFACE poisfft_hybrid_omp

INTERFACE poisfft_hybrid_omp_vec
   MODULE PROCEDURE poisfft_hybrid_omp_vec
END INTERFACE poisfft_hybrid_omp_vec

INTERFACE poisfft_hybrid_nodes
   MODULE PROCEDURE poisfft_hybrid_nodes
END INTERFACE poisfft_hybrid_nodes

INTERFACE tridia_hybrid
   MODULE PROCEDURE tridia_hybrid
END INTERFACE tridia_hybrid

INTERFACE cascade
   MODULE PROCEDURE cascade
END INTERFACE cascade
#endif
	143
[1]	144	CONTAINS
	145
[807]	146
SUBROUTINE poisfft_hybrid_ini

!
!-- Initialization of the hybrid Poisson solver. Sets the internal index
!-- bounds of the module, determines the number of OpenMP threads, calls
!-- fft_init and, if the multi node version is selected, creates the
!-- cartesian communicator comm_node_all and its sub-communicators
!-- comm_nodes and comm_tasks.

   USE control_parameters
   USE pegrid

   IMPLICIT NONE

   CHARACTER(LEN=8) ::  cdummy
   INTEGER ::  idummy, ios, istat
   INTEGER, DIMENSION(2) ::  dims

   LOGICAL, DIMENSION(2) ::  period = .false., re_dims


!
!-- Set the internal index values for the hybrid solver
#if defined( __parallel )
   npe_s = pdims(1)
#else
   npe_s = 1
#endif
   nxl_a = 0
   nxr_a = nx
   nxl_p = 0
   nxr_p = ( ( nx+1 ) / npe_s ) - 1
   nys_a = nys
   nyn_a = nyn
   nys_p = 0
   nyn_p = ( ( ny+1 ) / npe_s ) - 1

   nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )

#if defined( __KKMP ) && ! defined ( __check )
!
!-- Read the number of OpenMP threads from the environment. The buffer is
!-- blanked first so that the READ sees a defined string even if
!-- LOCAL_GETENV leaves it untouched.
   cdummy = ' '
   CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
   READ ( cdummy, '(I8)', IOSTAT=ios )  n_omp_threads
!
!-- An unparsable or non-positive value would abort the run here or produce
!-- a zero-sized array tri in the solver routines (which index it with
!-- jthread >= 1), so fall back to a single thread in that case.
   IF ( ios /= 0  .OR.  n_omp_threads < 1 )  n_omp_threads = 1

   IF ( n_omp_threads > 1 )  THEN
      WRITE( message_string, * ) 'Number of OpenMP threads = ', &
                                 n_omp_threads
      CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
   ENDIF
#else
   n_omp_threads = 1
#endif
!
!-- Initialize the one-dimensional FFT routines
   CALL fft_init

!
!-- Setup for multi node version (poisfft_hybrid_nodes)
   IF ( n_omp_threads == 1  .AND.                                  &
        ( host(1:4) == 'ibmh'  .OR.  host(1:4) == 'ibmb' ) )  THEN

      IF ( tasks_per_node /= -9999 )  THEN
!
!--      Multi node version requires that the available number of
!--      switches per logical partition must be an integral divisor
!--      of the chosen number of tasks per node
         IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 )  THEN
!
!--         Set the switch which decides about usage of the multi node
!--         version
            IF ( tasks_per_node / switch_per_lpar > 1  .AND. &
                 numprocs > tasks_per_node )  THEN
               tasks_per_logical_node = tasks_per_node / switch_per_lpar
            ENDIF

            IF ( tasks_per_logical_node > -1 )  THEN

               WRITE( message_string, * ) 'running optimized ',        &
                                          'multinode version',         &
                                          '&switch_per_lpar = ',       &
                                          switch_per_lpar,             &
                                          '&tasks_per_lpar = ',        &
                                          tasks_per_node,              &
                                          'tasks_per_logical_node = ', &
                                          tasks_per_logical_node
               CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )

            ENDIF

         ENDIF
      ENDIF
   ENDIF

!
!-- Determine sub-topologies for multi node version
   IF ( tasks_per_logical_node >= 2 )  THEN

#if defined( __parallel ) && ! defined ( __check )
      nodes   = ( numprocs + tasks_per_logical_node - 1 ) / &
                tasks_per_logical_node
      dims(1) = nodes
      dims(2) = tasks_per_logical_node

      CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
                            comm_node_all, istat )
      CALL MPI_COMM_RANK( comm_node_all, me, istat )

!
!--   Sub-communicator along the first (node) dimension
      re_dims(1) = .TRUE.
      re_dims(2) = .FALSE.
      CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
      CALL MPI_COMM_RANK( comm_nodes, me_node, istat )

!
!--   Sub-communicator along the second (task) dimension
      re_dims(1) = .FALSE.
      re_dims(2) = .TRUE.
      CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
      CALL MPI_COMM_RANK( comm_tasks, me_task, istat )

!      write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
!                 tasks_per_logical_node
#elif ! defined( __parallel )
      message_string = 'parallel environment (MPI) required'
      CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
#endif
   ENDIF

END SUBROUTINE poisfft_hybrid_ini
	264
[809]	265	#if ! defined ( __check )
SUBROUTINE poisfft_hybrid( ar )

!
!-- Driver of the hybrid Poisson solver. Selects one of the three
!-- implementations depending on the host identifier and on whether the
!-- multi node version has been activated by poisfft_hybrid_ini
!-- (tasks_per_logical_node /= -1).

   USE control_parameters
   USE interfaces

   IMPLICIT NONE

   REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

   IF ( host(1:3) == 'nec' )  THEN
      CALL poisfft_hybrid_omp_vec( ar )
   ELSEIF ( tasks_per_logical_node /= -1 )  THEN
      CALL poisfft_hybrid_nodes( ar )
   ELSE
      CALL poisfft_hybrid_omp( ar )
   ENDIF

END SUBROUTINE poisfft_hybrid
	286
	287
	SUBROUTINE poisfft_hybrid_omp ( ar )

	!
	!-- Hybrid solver, standard version: FFT along y, transposition with
	!-- MPI_ALLTOALL, FFT along x + tridiagonal solver + inverse FFT along x,
	!-- transposition back and inverse FFT along y. The array ar is overwritten
	!-- with the result. Loops are distributed among threads by OpenMP
	!-- directives.

	   USE cpulog
	   USE interfaces

	   IMPLICIT NONE

	   INTEGER, PARAMETER ::  istride = 4  ! stride of i loop
	   INTEGER ::  i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread

	   REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

	   REAL, DIMENSION(0:nx)         ::  fftx_ar
	   REAL, DIMENSION(0:ny,istride) ::  ffty_ar
	   REAL, DIMENSION(0:nx,nz)      ::  tri_ar

	   REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) ::  work1, work2
#if defined( __KKMP )
	   INTEGER ::  omp_get_thread_num
	   REAL, DIMENSION(:,:,:,:), ALLOCATABLE ::  tri
	   ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
#else
	   REAL, DIMENSION(5,0:nx,0:nz-1,1) ::  tri
#endif


	   CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )

	   CALL cpu_log( log_point_s(7), 'fft_y', 'start' )

	!
	!-- Copy the grid points of up to istride x-indices into 1d-arrays, do the
	!-- fft along y and distribute the results to the 4d-array work1
	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	!$OMP DO
	   DO  iouter = nxl_p, nxr_p, istride   ! stride loop, better cache
	      iei = MIN( iouter+istride-1, nxr_p )
	      DO  k = 1, nz

	         DO  i = iouter, iei
	            ii = nxl + i
	            ir = i - iouter + 1
	            ffty_ar(nys_a:nyn_a,ir) = ar(k,nys_a:nyn_a,ii)
	            CALL fft_y( ffty_ar(:,ir), 'forward' )
	         ENDDO

	         DO  n = 1, npe_s
	            DO  j = nys_p, nyn_p
	!
	!--            m is the position within the transformed line that belongs
	!--            to index j of the n-th partition
	               m = nys_a + ( n-1 ) * ( nyn_p-nys_p+1 ) + ( j-nys_p )
	               work1(iouter:iei,k,j,n) = ffty_ar(m,1:iei-iouter+1)
	            ENDDO
	         ENDDO

	      ENDDO
	   ENDDO
	!$OMP END PARALLEL

	   CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

#if defined( __parallel )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )

	   CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      comm2d, istat )

	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
	   work2 = work1
#endif

	   CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )

	!
	!-- For every j: fft along x, tridiagonal solver, inverse fft along x.
	!-- Each thread uses its own slice of tri (index jthread).
#if defined( __KKMP )
	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
	!$OMP DO
	   DO  j = nys_p, nyn_p
	      jthread = omp_get_thread_num() + 1
#else
	   DO  j = nys_p, nyn_p
	      jthread = 1
#endif
	      DO  k = 1, nz

	         DO  n = 1, npe_s
	            m = nxl_a + ( n-1 ) * ( nxr_p-nxl_p+1 )
	            fftx_ar(m:m+nxr_p-nxl_p) = work2(nxl_p:nxr_p,k,j,n)
	         ENDDO

	         CALL fft_x( fftx_ar, 'forward' )

	         tri_ar(nxl_a:nxr_a,k) = fftx_ar(nxl_a:nxr_a)

	      ENDDO

	      jj = myid * (nyn_p-nys_p+1) + j
	      CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))

	      DO  k = 1, nz

	         fftx_ar(nxl_a:nxr_a) = tri_ar(nxl_a:nxr_a,k)

	         CALL fft_x( fftx_ar, 'backward' )

	         DO  n = 1, npe_s
	            m = nxl_a + ( n-1 ) * ( nxr_p-nxl_p+1 )
	            work2(nxl_p:nxr_p,k,j,n) = fftx_ar(m:m+nxr_p-nxl_p)
	         ENDDO

	      ENDDO
	   ENDDO
#if defined( __KKMP )
	!$OMP END PARALLEL
#endif

	   CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )

#if defined( __parallel )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)

	   CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      comm2d, istat )

	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
	   work1 = work2
#endif

	   CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )

	!
	!-- Collect the lines along y from work1, do the inverse fft along y and
	!-- store the result on ar
	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	!$OMP DO
	   DO  iouter = nxl_p, nxr_p, istride
	      iei = MIN( iouter+istride-1, nxr_p )
	      DO  k = 1, nz

	         DO  n = 1, npe_s
	            DO  j = nys_p, nyn_p
	               m = nys_a + ( n-1 ) * ( nyn_p-nys_p+1 ) + ( j-nys_p )
	               ffty_ar(m,1:iei-iouter+1) = work1(iouter:iei,k,j,n)
	            ENDDO
	         ENDDO

	         DO  i = iouter, iei
	            ii = nxl + i
	            ir = i - iouter + 1
	            CALL fft_y( ffty_ar(:,ir), 'backward' )
	            ar(k,nys_a:nyn_a,ii) = ffty_ar(nys_a:nyn_a,ir)
	         ENDDO

	      ENDDO
	   ENDDO
	!$OMP END PARALLEL

	   CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

	   CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )

#if defined( __KKMP )
	   DEALLOCATE( tri )
#endif

	END SUBROUTINE poisfft_hybrid_omp
	478
	479
	SUBROUTINE poisfft_hybrid_omp_vec ( ar )

	!
	!-- Hybrid solver, version selected by poisfft_hybrid for hosts whose
	!-- identifier starts with 'nec'. Same sequence of steps as
	!-- poisfft_hybrid_omp, but the ffts are carried out with the multiple-line
	!-- routines fft_y_m and fft_x_m on 3d-arrays. The array ar is overwritten
	!-- with the result.

	   USE cpulog
	   USE interfaces

	   IMPLICIT NONE

	   INTEGER, PARAMETER ::  istride = 4  ! stride of i loop
	   INTEGER ::  i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread

	   REAL, DIMENSION(0:nx,nz) ::  tri_ar

	   REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

	   REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) ::  ffty_ar3
	   REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) ::  fftx_ar3

	   REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) ::  work1, work2
#if defined( __KKMP )
	   INTEGER ::  omp_get_thread_num
	   REAL, DIMENSION(:,:,:,:), ALLOCATABLE ::  tri
	   ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
#else
	   REAL, DIMENSION(5,0:nx,0:nz-1,1) ::  tri
#endif


	   CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )

	   CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )

	!
	!-- Store the grid points to be transformed on a 3d-array with y as first
	!-- index, do the fft along y and distribute the results to the 4d-array
	!-- work1
	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
	!$OMP DO
	   DO  i = nxl_p, nxr_p

	      DO  j = nys_a, nyn_a
	         ffty_ar3(j,1:nz,i) = ar(1:nz,j,i+nxl)
	      ENDDO

	      CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
	   ENDDO

	!$OMP DO
	   DO  k = 1, nz
	      DO  n = 1, npe_s
	         DO  j = nys_p, nyn_p
	            m = nys_a + ( n-1 ) * ( nyn_p-nys_p+1 ) + ( j-nys_p )
	            work1(nxl_p:nxr_p,k,j,n) = ffty_ar3(m,k,nxl_p:nxr_p)
	         ENDDO
	      ENDDO
	   ENDDO
	!$OMP END PARALLEL

	   CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )

#if defined( __parallel )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      comm2d, istat )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
	   work2 = work1
#endif

	   CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )

	!
	!-- For every j: fft along x for all k at once, tridiagonal solver,
	!-- inverse fft along x. Each thread uses its own slice of tri.
#if defined( __KKMP )
	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
	!$OMP DO
	   DO  j = nys_p, nyn_p
	      jthread = omp_get_thread_num() + 1
#else
	   DO  j = nys_p, nyn_p
	      jthread = 1
#endif
	      DO  k = 1, nz
	         DO  n = 1, npe_s
	            m = nxl_a + ( n-1 ) * ( nxr_p-nxl_p+1 )
	            fftx_ar3(m:m+nxr_p-nxl_p,k,j) = work2(nxl_p:nxr_p,k,j,n)
	         ENDDO
	      ENDDO

	      CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )

	      tri_ar(nxl_a:nxr_a,1:nz) = fftx_ar3(nxl_a:nxr_a,1:nz,j)

	      jj = myid * (nyn_p-nys_p+1) + j
	      CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))

	      fftx_ar3(nxl_a:nxr_a,1:nz,j) = tri_ar(nxl_a:nxr_a,1:nz)

	      CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )

	      DO  k = 1, nz
	         DO  n = 1, npe_s
	            m = nxl_a + ( n-1 ) * ( nxr_p-nxl_p+1 )
	            work2(nxl_p:nxr_p,k,j,n) = fftx_ar3(m:m+nxr_p-nxl_p,k,j)
	         ENDDO
	      ENDDO

	   ENDDO
#if defined( __KKMP )
	!$OMP END PARALLEL
#endif

	   CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )

#if defined( __parallel )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	   nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	   CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	                      comm2d, istat )
	   CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
#else
	   work1 = work2
#endif

	   CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )

	!
	!-- Collect the lines along y from work1, do the inverse fft along y and
	!-- store the result on ar
	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
	!$OMP DO
	   DO  k = 1, nz
	      DO  n = 1, npe_s
	         DO  j = nys_p, nyn_p
	            m = nys_a + ( n-1 ) * ( nyn_p-nys_p+1 ) + ( j-nys_p )
	            ffty_ar3(m,k,nxl_p:nxr_p) = work1(nxl_p:nxr_p,k,j,n)
	         ENDDO
	      ENDDO
	   ENDDO

	!$OMP DO
	   DO  i = nxl_p, nxr_p
	      CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
	      DO  j = nys_a, nyn_a
	         ar(1:nz,j,i+nxl) = ffty_ar3(j,1:nz,i)
	      ENDDO
	   ENDDO
	!$OMP END PARALLEL

	   CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )

	   CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )

#if defined( __KKMP )
	   DEALLOCATE( tri )
#endif

	END SUBROUTINE poisfft_hybrid_omp_vec
	658
	659
	SUBROUTINE poisfft_hybrid_nodes ( ar )

	!
	!-- Hybrid solver, multi node version. The transposition is split into an
	!-- exchange within comm_tasks and an exchange within comm_nodes. The
	!-- exchange between nodes is done separately for every j and enclosed by
	!-- calls of cascade, followed by the fft along x and the tridiagonal solver
	!-- for this j. The array ar is overwritten with the result.

	   USE cpulog
	   USE interfaces

	   IMPLICIT NONE

	   INTEGER, PARAMETER ::  istride = 4  ! stride of i loop
	   INTEGER ::  i, iei, ii, iouter, ir, istat, j, jj, k, m, n, nn, nt, &
	               nw1, nw2, nx_part, ny_part

	   REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

	   REAL, DIMENSION(0:nx)         ::  fftx_ar
	   REAL, DIMENSION(0:ny,istride) ::  ffty_ar
	   REAL, DIMENSION(0:nx,nz)      ::  tri_ar

	   REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
	                   nodes,nys_p:nyn_p) ::  work1, work2
	   REAL, DIMENSION(5,0:nx,0:nz-1) ::  tri


	   CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )

	   CALL cpu_log( log_point_s(7), 'fft_y', 'start' )

	!
	!-- Extents of the partial x and y index ranges
	   nx_part = nxr_p - nxl_p + 1
	   ny_part = nyn_p - nys_p + 1

	!
	!-- Copy the grid points of up to istride x-indices into 1d-arrays, do the
	!-- fft along y and distribute the results to the 5d-array work1
	   DO  iouter = nxl_p, nxr_p, istride   ! stride loop, better cache
	      iei = MIN( iouter+istride-1, nxr_p )
	      DO  k = 1, nz

	         DO  i = iouter, iei
	            ii = nxl + i
	            ir = i - iouter + 1
	            ffty_ar(nys_a:nyn_a,ir) = ar(k,nys_a:nyn_a,ii)
	            CALL fft_y( ffty_ar(:,ir), 'forward' )
	         ENDDO

	         DO  nn = 1, nodes
	            DO  nt = 1, tasks_per_logical_node
	               DO  j = nys_p, nyn_p
	                  m = nys_a + ( ( nn-1 ) * tasks_per_logical_node + nt-1 ) &
	                              * ny_part + ( j-nys_p )
	                  work1(iouter:iei,k,nt,nn,j) = ffty_ar(m,1:iei-iouter+1)
	               ENDDO
	            ENDDO
	         ENDDO

	      ENDDO
	   ENDDO

	   CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )

	!
	!-- Message lengths: nw1 for the exchange among tasks, nw2 for the exchange
	!-- among nodes
	   nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
	   nw2 = nw1 * SIZE( work1, 3 )

	   CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	   DO  nn = 1, nodes
	      DO  j = nys_p, nyn_p
#if defined( __parallel )
	         CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	                            work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	                            comm_tasks, istat )
#endif
	      ENDDO
	   ENDDO
	   CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )


	   DO  j = nys_p, nyn_p

	      CALL cascade( 1, j, nys_p, nyn_p )
	      CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
#if defined( __parallel )
	      CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	                         work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	                         comm_nodes, istat )
#endif
	      CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
	      CALL cascade( 2, j, nys_p, nyn_p )

	      CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	      DO  k = 1, nz

	         DO  nn = 1, nodes
	            DO  nt = 1, tasks_per_logical_node
	               m = nxl_a + ( ( nn-1 ) * tasks_per_logical_node + nt-1 ) &
	                           * nx_part
	               fftx_ar(m:m+nx_part-1) = work1(nxl_p:nxr_p,k,nt,nn,j)
	            ENDDO
	         ENDDO

	         CALL fft_x( fftx_ar, 'forward' )

	         tri_ar(nxl_a:nxr_a,k) = fftx_ar(nxl_a:nxr_a)

	      ENDDO

	      jj = myid * ny_part + j
	      CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )

	      DO  k = 1, nz

	         fftx_ar(nxl_a:nxr_a) = tri_ar(nxl_a:nxr_a,k)

	         CALL fft_x( fftx_ar, 'backward' )

	         DO  nn = 1, nodes
	            DO  nt = 1, tasks_per_logical_node
	               m = nxl_a + ( ( nn-1 ) * tasks_per_logical_node + nt-1 ) &
	                           * nx_part
	               work1(nxl_p:nxr_p,k,nt,nn,j) = fftx_ar(m:m+nx_part-1)
	            ENDDO
	         ENDDO

	      ENDDO

	      CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	      CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
#if defined( __parallel )
	      CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	                         work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	                         comm_nodes, istat )
#endif
	      CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )

	   ENDDO

	   CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	   DO  nn = 1, nodes
	      DO  j = nys_p, nyn_p
#if defined( __parallel )
	         CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	                            work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	                            comm_tasks, istat )
#endif
	      ENDDO
	   ENDDO
	   CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )

	   CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )

	!
	!-- Collect the lines along y from work1, do the inverse fft along y and
	!-- store the result on ar
	   DO  iouter = nxl_p, nxr_p, istride
	      iei = MIN( iouter+istride-1, nxr_p )
	      DO  k = 1, nz

	         DO  nn = 1, nodes
	            DO  nt = 1, tasks_per_logical_node
	               DO  j = nys_p, nyn_p
	                  m = nys_a + ( ( nn-1 ) * tasks_per_logical_node + nt-1 ) &
	                              * ny_part + ( j-nys_p )
	                  ffty_ar(m,1:iei-iouter+1) = work1(iouter:iei,k,nt,nn,j)
	               ENDDO
	            ENDDO
	         ENDDO

	         DO  i = iouter, iei
	            ii = nxl + i
	            ir = i - iouter + 1
	            CALL fft_y( ffty_ar(:,ir), 'backward' )
	            ar(k,nys_a:nyn_a,ii) = ffty_ar(nys_a:nyn_a,ir)
	         ENDDO

	      ENDDO
	   ENDDO

	   CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )

	   CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )

	END SUBROUTINE poisfft_hybrid_nodes
	853
	854
	855
	SUBROUTINE tridia_hybrid( j, ar, tri )

	!
	!-- Sets up the tridiagonal matrix for index j (all indices i along x at
	!-- once) and solves the resulting systems along k with the Thomas
	!-- algorithm.
	!--
	!-- j   : index along y that selects the j-dependent part of the matrix
	!-- ar  : on input the right hand side, on output the solution
	!-- tri : work array; (1,:,:) main diagonal, (2,:,:) and (3,:,:) the
	!--       constant off-diagonal elements, (4,:,:) and (5,:,:) the factors
	!--       of the splitting. Completely overwritten by this routine.

	   USE arrays_3d
	   USE control_parameters
	   USE grid_variables

	   IMPLICIT NONE

	   INTEGER, INTENT(IN) ::  j
	   REAL, DIMENSION(0:nx,nz), INTENT(INOUT)       ::  ar
	   REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(INOUT) ::  tri

	   INTEGER ::  k, nnyh

	   nnyh = (ny+1) / 2

	   tri = 0.0
	!
	!-- Define constant elements of the tridiagonal matrix (they do not depend
	!-- on i).
	   DO  k = 0, nz-1
	      tri(2,:,k) = ddzu_pres(k+1) * ddzw(k+1)
	      tri(3,:,k) = ddzu_pres(k+2) * ddzw(k+1)
	   ENDDO

	!
	!-- Indices above nnyh are mapped to ny+1-j
	   IF ( j <= nnyh )  THEN
	      CALL maketri_hybrid( j )
	   ELSE
	      CALL maketri_hybrid( ny+1-j)
	   ENDIF
	   CALL zerleg_hybrid
	   CALL substi_hybrid( ar, tri )

	CONTAINS

	   SUBROUTINE maketri_hybrid( j )

	!----------------------------------------------------------------------!
	!                         maketri                                      !
	!                                                                      !
	!       computes the i- and j-dependent component of the matrix        !
	!----------------------------------------------------------------------!

	      USE constants

	      IMPLICIT NONE

	      INTEGER, INTENT(IN) ::  j

	      INTEGER ::  i, k, nnxh
	      REAL    ::  a, c, l_y

	      REAL, DIMENSION(0:nx) ::  l


	      nnxh = (nx+1) / 2
	!
	!--   Provide the tridiagonal matrix for solution of the Poisson equation
	!--   in Fourier space. The coefficients are computed following the method
	!--   of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
	!--   Siano's original version.
	!--   The contribution of the y-direction does not depend on i and is
	!--   therefore calculated only once.
	      l_y = 2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
	                               REAL( ny+1 ) ) ) / ( dy * dy )

	      DO  i = 0, nx
	         IF ( i < nnxh )  THEN
	            l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
	                                      REAL( nx+1 ) ) ) / ( dx * dx ) + l_y
	         ELSE
	!
	!--         Indices i >= nnxh use the mirrored index nx+1-i
	            l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
	                                      REAL( nx+1 ) ) ) / ( dx * dx ) + l_y
	         ENDIF
	      ENDDO

	!
	!--   Main diagonal; a and c depend on k only
	      DO  k = 0, nz-1
	         a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	         c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
	         DO  i = 0, nx
	            tri(1,i,k) = a + c - l(i)
	         ENDDO
	      ENDDO
	!
	!--   Modification of the first / last row depending on the boundary
	!--   condition switches ibc_p_b / ibc_p_t
	      IF ( ibc_p_b == 1  .OR.  ibc_p_b == 2 )  THEN
	         tri(1,:,0) = tri(1,:,0) + tri(2,:,0)
	      ENDIF
	      IF ( ibc_p_t == 1 )  THEN
	         tri(1,:,nz-1) = tri(1,:,nz-1) + tri(3,:,nz-1)
	      ENDIF

	   END SUBROUTINE maketri_hybrid


	   SUBROUTINE zerleg_hybrid

	!----------------------------------------------------------------------!
	!                          zerleg                                      !
	!                                                                      !
	!       Splitting of the tridiagonal matrix (Thomas algorithm)         !
	!----------------------------------------------------------------------!

	      IMPLICIT NONE

	      INTEGER ::  k

	!
	!--   Splitting
	      tri(4,:,0) = tri(1,:,0)
	      DO  k = 1, nz-1
	         tri(5,:,k) = tri(2,:,k) / tri(4,:,k-1)
	         tri(4,:,k) = tri(1,:,k) - tri(3,:,k-1) * tri(5,:,k)
	      ENDDO

	   END SUBROUTINE zerleg_hybrid

	   SUBROUTINE substi_hybrid( ar, tri )

	!----------------------------------------------------------------------!
	!                          substi                                      !
	!                                                                      !
	!       Substitution (Forward and Backward) (Thomas algorithm)         !
	!----------------------------------------------------------------------!

	      IMPLICIT NONE

	      REAL, DIMENSION(0:nx,nz), INTENT(INOUT)    ::  ar
	      REAL, DIMENSION(5,0:nx,0:nz-1), INTENT(IN) ::  tri

	      INTEGER ::  k
	      REAL, DIMENSION(0:nx,0:nz-1) ::  ar1

	!
	!--   Forward substitution
	      ar1(:,0) = ar(:,1)
	      DO  k = 1, nz-1
	         ar1(:,k) = ar(:,k+1) - tri(5,:,k) * ar1(:,k-1)
	      ENDDO

	!
	!--   Backward substitution
	      ar(:,nz) = ar1(:,nz-1) / tri(4,:,nz-1)
	      DO  k = nz-2, 0, -1
	         ar(:,k+1) = ( ar1(:,k) - tri(3,:,k) * ar(:,k+2) ) &
	                     / tri(4,:,k)
	      ENDDO

	   END SUBROUTINE substi_hybrid

	END SUBROUTINE tridia_hybrid
	1024
	1025
	SUBROUTINE cascade( loca, j, nys_p, nyn_p )

	!
	!-- Exchanges one-word messages between the tasks in comm_tasks. Only
	!-- active on PEs with me_node == 0.
	!--
	!-- loca  : 1 = call before the alltoall among nodes,
	!--         2 = call after the alltoall among nodes
	!-- j     : current index of the calling loop
	!-- nys_p : first index of the calling loop
	!-- nyn_p : last index of the calling loop
	!--
	!-- loca = 1: every task except the first one exchanges a message with its
	!--           predecessor; the first task receives (for j > nys_p) the
	!--           message sent by the last task.
	!-- loca = 2: every task except the last one exchanges a message with its
	!--           successor; the last task sends (for j < nyn_p) a message to
	!--           the first task.
	!-- NOTE(review): ierr is not declared locally and is presumably provided
	!-- by module pegrid via host association - confirm.

	   USE cpulog

	   IMPLICIT NONE

	   INTEGER, INTENT(IN) ::  j, loca, nyn_p, nys_p

	   INTEGER ::  req, reqa(1)
	   INTEGER, SAVE ::  tag = 10
#if defined( __parallel )
	   INTEGER, DIMENSION(MPI_STATUS_SIZE)   ::  stat
	   INTEGER, DIMENSION(MPI_STATUS_SIZE,1) ::  stata
#endif

	   REAL ::  buf, buf1
	!
	!-- Send buffer of the non-blocking send. It is SAVEd and never modified,
	!-- so that it stays valid after this routine has returned while the send
	!-- may still be pending.
	   REAL, SAVE ::  sbuf = 1.0


	   buf  = 1.0
	   buf1 = 1.1
	   IF ( me_node == 0 )  THEN   ! first node only

	      SELECT CASE ( loca )

	         CASE ( 1 )   ! before alltoall

	            IF( me_task > 0 )  THEN   ! first task does not wait
#if defined( __parallel )
	               CALL MPI_SENDRECV( buf,  1, MPI_REAL, me_task-1, 0, &
	                                  buf1, 1, MPI_REAL, me_task-1, 0, &
	                                  comm_tasks, stat, ierr )
#endif
	            ELSEIF ( j > nys_p )  THEN
	               req = 0
	               tag = MOD( tag-10, 10 ) + 10
#if defined( __parallel )
	               CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
	                               tag, comm_tasks, req, ierr )
	               reqa = req
	               CALL MPI_WAITALL( 1, reqa, stata, ierr )
#endif
	            ENDIF

	         CASE ( 2 )   ! after alltoall

	            IF ( me_task < tasks_per_logical_node-1 )  THEN   ! last task
#if defined( __parallel )
	               CALL MPI_SENDRECV( buf,  1, MPI_REAL, me_task+1, 0, &
	                                  buf1, 1, MPI_REAL, me_task+1, 0, &
	                                  comm_tasks, stat, ierr)
#endif
	            ELSEIF ( j < nyn_p )  THEN
	               req = 0
	               tag = MOD( tag-10, 10 ) + 10
#if defined( __parallel )
	!
	!--            The send must not block here. The request is freed at once
	!--            (the transfer is still completed by MPI), because otherwise
	!--            one request handle would be lost with every call.
	               CALL MPI_ISEND( sbuf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
	                               ierr )
	               CALL MPI_REQUEST_FREE( req, ierr )
#endif
	            ENDIF

	      END SELECT

	   ENDIF

	END SUBROUTINE cascade
[807]	1089	#endif
[1]	1090	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |