Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 1139

Last change on this file since 1139 was 1112, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 33.4 KB

Rev	Line
[1]	1	MODULE poisfft_hybrid_mod
[808]	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[257]	20	! Current revisions:
[1]	21	! -----------------
[1112]	22	!
[1]	23	!
	24	! Former revisions:
	25	! -----------------
[3]	26	! $Id: poisfft_hybrid.f90 1112 2013-03-09 00:34:37Z raasch $
[392]	27	!
[1112]	28	! 1111 2013-03-08 23:54:10Z raasch
	29	! poisfft_hybrid_ini is now called internally from poisfft_hybrid,
	30	! ibc_p_b = 2 removed
	31	!
[1107]	32	! 1106 2013-03-04 05:31:38Z raasch
	33	! calls of fft_x, fft_y replaced by fft_x_1d, fft_y_1d
	34	!
[1037]	35	! 1036 2012-10-22 13:43:42Z raasch
	36	! code put under GPL (PALM 3.9)
	37	!
[1014]	38	! 1013 2012-09-21 07:03:55Z raasch
	39	! FLOAT type conversion replaced by REAL
	40	!
[810]	41	! 809 2012-01-30 13:32:58Z maronga
	42	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	43	!
[808]	44	! 807 2012-01-25 11:53:51Z maronga
	45	! New cpp directive "__check" implemented which is used by check_namelist_files
	46	! (most of the code is unneeded by check_namelist_files).
	47	!
[668]	48	! 667 2010-12-23 12:06:00Z suehring/gryschka
	49	! ddzu replaced by ddzu_pres due to changes in zu(0)
	50	!
[482]	51	! 415 2009-12-15 10:26:23Z raasch
[807]	52	! Dimension of array stat in cascade changed to prevent type problems with
[482]	53	! mpi2 libraries
	54	!
[392]	55	! 274 2009-03-26 15:11:21Z heinze
	56	! Output of messages replaced by message handling routine.
	57	!
	58	! Feb. 2007
[3]	59	! RCS Log replaced by Id keyword, revision history cleaned up
	60	!
[1]	61	! Revision 1.11 2004/04/30 12:43:14 raasch
	62	! Renaming of fft routines, additional argument in calls of fft_y_m
	63	!
	64	! Revision 1.2 2002/12/19 16:08:31 raasch
	65	! Preprocessor directive KKMP introduced (OMP does NOT work),
	66	! array tri will be a shared array in OpenMP loop, to get better cache
	67	! utilization, the i index (x-direction) will be executed in stride
	68	! "istride" as outer loop and in a shorter inner loop,
	69	! overlapping of computation and communication realized by new routine
	70	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
	71	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
	72	!
	73	!
	74	! Description:
	75	! ------------
	76	! Solution of the Poisson equation with a 2D spectral method.
	77	! Hybrid version for parallel computers using a 1D domain decomposition,
	78	! realized with MPI, along x and parallelization with OPEN-MP along y
	79	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
	80	! optimization is realized by overlapping of computation and communication
	81	! and by simultaneously executing as many communication calls as switches
	82	! per logical partition (LPAR) are available. This version comes into
	83	! effect if more than one node is used and if the environment variable
	84	! tasks_per_node is set in a way that it can be divided by switch_per_lpar
	85	! without remainder.
	86	!
	87	! WARNING: In case of OpenMP, there are problems with allocating large
	88	! arrays in parallel regions.
	89	!
	90	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
	91	!------------------------------------------------------------------------------!
	92
	93	USE fft_xy
	94	USE indices
	95	USE pegrid
	96
	97	IMPLICIT NONE
	98
	99	INTEGER, PARAMETER :: switch_per_lpar = 2
	100
	101	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
	102	nxl_p, nxr_p, & ! partial x dimension
	103	nys_a, nyn_a, & ! total y dimension
	104	nys_p, nyn_p, & ! partial y dimension
	105
	106	npe_s, & ! total number of PEs for solver
	107	nwords, & ! number of points to be exchanged
	108	! with MPI_ALLTOALL
	109	n_omp_threads ! number of OpenMP threads
	110
	111	!
	112	!-- Variables for multi node version (cluster version) using routine
	113	!-- poisfft_hybrid_nodes
	114	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
	115	comm_node_all, & ! communicater all PEs node version
	116	comm_tasks, & ! communicater tasks
	117	me, me_node, me_task,& ! identity of this PE
	118	nodes, & ! number of nodes
	119	tasks_per_logical_node = -1 ! default no cluster
	120
[1111]	121	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
[807]	122
[1111]	123
[807]	124	PRIVATE
	125
	126
[809]	127	#if ! defined ( __check )
[807]	128	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
	129
	130
[1]	131	!
	132	!-- Public interfaces
	133	INTERFACE poisfft_hybrid_ini
	134	MODULE PROCEDURE poisfft_hybrid_ini
	135	END INTERFACE poisfft_hybrid_ini
	136
	137	INTERFACE poisfft_hybrid
	138	MODULE PROCEDURE poisfft_hybrid
	139	END INTERFACE poisfft_hybrid
	140
	141	!
	142	!-- Private interfaces
	143	INTERFACE poisfft_hybrid_omp
	144	MODULE PROCEDURE poisfft_hybrid_omp
	145	END INTERFACE poisfft_hybrid_omp
	146
	147	INTERFACE poisfft_hybrid_omp_vec
	148	MODULE PROCEDURE poisfft_hybrid_omp_vec
	149	END INTERFACE poisfft_hybrid_omp_vec
	150
	151	INTERFACE poisfft_hybrid_nodes
	152	MODULE PROCEDURE poisfft_hybrid_nodes
	153	END INTERFACE poisfft_hybrid_nodes
	154
	155	INTERFACE tridia_hybrid
	156	MODULE PROCEDURE tridia_hybrid
	157	END INTERFACE tridia_hybrid
	158
	159	INTERFACE cascade
	160	MODULE PROCEDURE cascade
	161	END INTERFACE cascade
[807]	162	#else
	163	PUBLIC poisfft_hybrid_ini
[1]	164
[807]	165	!
	166	!-- Public interfaces
	167	INTERFACE poisfft_hybrid_ini
	168	MODULE PROCEDURE poisfft_hybrid_ini
	169	END INTERFACE poisfft_hybrid_ini
	170	#endif
	171
[1]	172	CONTAINS
	173
[807]	174
SUBROUTINE poisfft_hybrid_ini
!
!-- Initialization of the hybrid Poisson solver.
!-- Sets the module-internal index bounds (nxl_a ... nyn_p), the message
!-- length nwords for MPI_ALLTOALL and the number of OpenMP threads, calls
!-- fft_init, decides whether the multi node version is used
!-- (tasks_per_logical_node) and, if so, creates the communicators
!-- comm_node_all, comm_nodes and comm_tasks. Finally the flag
!-- poisfft_initialized is set.

   USE control_parameters
   USE pegrid

   IMPLICIT NONE

#if defined( __KKMP ) && ! defined ( __check )
   CHARACTER(LEN=8) ::  cdummy   ! text value of OMP_NUM_THREADS
   INTEGER ::  idummy            ! 4th argument of LOCAL_GETENV (not evaluated here)
   INTEGER ::  ios               ! status of the internal READ
#endif
   INTEGER ::  istat             ! MPI error status (not evaluated)
   INTEGER, DIMENSION(2) ::  dims

   LOGICAL, DIMENSION(2) ::  period, re_dims


!
!-- Set the internal index values for the hybrid solver
#if defined( __parallel )
   npe_s = pdims(1)
#else
   npe_s = 1
#endif
   nxl_a = 0
   nxr_a = nx
   nxl_p = 0
   nxr_p = ( ( nx+1 ) / npe_s ) - 1
   nys_a = nys
   nyn_a = nyn
   nys_p = 0
   nyn_p = ( ( ny+1 ) / npe_s ) - 1

   nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )

#if defined( __KKMP ) && ! defined ( __check )
!
!-- Read the number of OpenMP threads from the environment. The buffer is
!-- blanked first and the READ is protected by IOSTAT. A blank field would
!-- be read as 0, so an unset, unreadable or non-positive value falls back
!-- to one thread. The per-thread array tri of the solver routines is
!-- dimensioned with n_omp_threads and would otherwise have no slot.
   cdummy = ' '
   CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
   READ ( cdummy, '(I8)', IOSTAT=ios )  n_omp_threads
   IF ( ios /= 0  .OR.  n_omp_threads < 1 )  n_omp_threads = 1

   IF ( n_omp_threads > 1 )  THEN
      WRITE( message_string, * ) 'Number of OpenMP threads = ', &
                                 n_omp_threads
      CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
   ENDIF
#else
   n_omp_threads = 1
#endif
!
!-- Initialize the one-dimensional FFT routines
   CALL fft_init

!
!-- Setup for multi node version (poisfft_hybrid_nodes)
   IF ( n_omp_threads == 1  .AND.                                           &
        ( host(1:4) == 'ibmh'  .OR.  host(1:4) == 'ibmb' ) )  THEN

      IF ( tasks_per_node /= -9999 )  THEN
!
!--      Multi node version requires that the available number of
!--      switches per logical partition must be an integral divisor
!--      of the chosen number of tasks per node
         IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 )  THEN
!
!--         Set the switch which decides about usage of the multi node
!--         version
            IF ( tasks_per_node / switch_per_lpar > 1  .AND.                &
                 numprocs > tasks_per_node )  THEN
               tasks_per_logical_node = tasks_per_node / switch_per_lpar
            ENDIF

            IF ( tasks_per_logical_node > -1 )  THEN

               WRITE( message_string, * ) 'running optimized ',             &
                                          'multinode version',              &
                                          '&switch_per_lpar = ',            &
                                          switch_per_lpar,                  &
                                          '&tasks_per_lpar = ',             &
                                          tasks_per_node,                   &
                                          'tasks_per_logical_node = ',      &
                                          tasks_per_logical_node
               CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )

            ENDIF

         ENDIF
      ENDIF
   ENDIF

!
!-- Determine sub-topologies for multi node version
   IF ( tasks_per_logical_node >= 2 )  THEN

#if defined( __parallel ) && ! defined ( __check )
!
!--   Two-dimensional, non-periodic process grid: nodes x tasks per node
      nodes   = ( numprocs + tasks_per_logical_node - 1 ) /                 &
                tasks_per_logical_node
      dims(1) = nodes
      dims(2) = tasks_per_logical_node
      period  = .FALSE.

      CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE.,               &
                            comm_node_all, istat )
      CALL MPI_COMM_RANK( comm_node_all, me, istat )

!
!--   Sub-communicator which keeps the first grid dimension (nodes)
      re_dims(1) = .TRUE.
      re_dims(2) = .FALSE.
      CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
      CALL MPI_COMM_RANK( comm_nodes, me_node, istat )

!
!--   Sub-communicator which keeps the second grid dimension (tasks)
      re_dims(1) = .FALSE.
      re_dims(2) = .TRUE.
      CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
      CALL MPI_COMM_RANK( comm_tasks, me_task, istat )

!     write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
!                tasks_per_logical_node
#elif ! defined( __parallel )
      message_string = 'parallel environment (MPI) required'
      CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
#endif
   ENDIF

   poisfft_initialized = .TRUE.

END SUBROUTINE poisfft_hybrid_ini
	294
[809]	295	#if ! defined ( __check )
SUBROUTINE poisfft_hybrid( ar )
!
!-- Driver of the hybrid Poisson solver.
!-- Runs poisfft_hybrid_ini on the first call and then hands the array ar
!-- over to one of the three solver variants:
!--    host(1:3) == 'nec'            --> poisfft_hybrid_omp_vec
!--    tasks_per_logical_node == -1  --> poisfft_hybrid_omp
!--    otherwise                     --> poisfft_hybrid_nodes

   USE control_parameters
   USE interfaces

   IMPLICIT NONE

   REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar


!
!-- Initialization is done only once
   IF ( .NOT. poisfft_initialized )  CALL poisfft_hybrid_ini

   IF ( host(1:3) == 'nec' )  THEN
      CALL poisfft_hybrid_omp_vec( ar )
   ELSEIF ( tasks_per_logical_node == -1 )  THEN
      CALL poisfft_hybrid_omp( ar )
   ELSE
      CALL poisfft_hybrid_nodes( ar )
   ENDIF

END SUBROUTINE poisfft_hybrid
	318
	319
	320	SUBROUTINE poisfft_hybrid_omp ( ar )
	321
	322	USE cpulog
	323	USE interfaces
	324
	325	IMPLICIT NONE
	326
	327	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	328	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	329
	330	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	331
	332	REAL, DIMENSION(0:nx) :: fftx_ar
	333	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	334
	335	REAL, DIMENSION(0:nx,nz) :: tri_ar
	336
	337	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	338	#if defined( __KKMP )
	339	INTEGER :: omp_get_thread_num
	340	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	341	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	342	#else
	343	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	344	#endif
	345
	346
	347	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
	348
[1106]	349	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	350
	351	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	352	!$OMP DO
	353	!
	354	!-- Store grid points to be transformed on a 1d-array, do the fft
	355	!-- and sample the results on a 4d-array
	356	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	357	iei = MIN( iouter+istride-1, nxr_p )
	358	DO k = 1, nz
	359
	360	DO i = iouter, iei
	361	ii = nxl + i
	362	ir = i - iouter + 1
	363
	364	DO j = nys_a, nyn_a
	365	ffty_ar(j,ir) = ar(k,j,ii)
	366	ENDDO
	367
[1106]	368	CALL fft_y_1d( ffty_ar(:,ir), 'forward' )
[1]	369	ENDDO
	370
	371	m = nys_a
	372	DO n = 1, npe_s
	373	DO j = nys_p, nyn_p
	374	DO i = iouter, iei
	375	ir = i - iouter + 1
	376	work1(i,k,j,n) = ffty_ar(m,ir)
	377	ENDDO
	378	m = m+1
	379	ENDDO
	380	ENDDO
	381
	382	ENDDO
	383	ENDDO
	384	!$OMP END PARALLEL
	385
[1106]	386	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	387
	388	#if defined( __parallel )
	389	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	390
	391	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	392	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	393	comm2d, istat )
	394
	395	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	396	#else
	397	work2 = work1
	398	#endif
	399
[1106]	400	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	401
	402	#if defined( __KKMP )
	403	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
	404	!$OMP DO
	405	DO j = nys_p, nyn_p
	406	jthread = omp_get_thread_num() + 1
	407	#else
	408	DO j = nys_p, nyn_p
	409	jthread = 1
	410	#endif
	411	DO k = 1, nz
	412
	413	m = nxl_a
	414	DO n = 1, npe_s
	415	DO i = nxl_p, nxr_p
	416	fftx_ar(m) = work2(i,k,j,n)
	417	m = m+1
	418	ENDDO
	419	ENDDO
	420
[1106]	421	CALL fft_x_1d( fftx_ar, 'forward' )
[1]	422
	423	DO i = nxl_a, nxr_a
	424	tri_ar(i,k) = fftx_ar(i)
	425	ENDDO
	426
	427	ENDDO
	428
	429	jj = myid * (nyn_p-nys_p+1) + j
	430	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	431
	432	DO k = 1, nz
	433	DO i = nxl_a, nxr_a
	434	fftx_ar(i) = tri_ar (i,k)
	435	ENDDO
	436
[1106]	437	CALL fft_x_1d( fftx_ar, 'backward' )
[1]	438
	439	m = nxl_a
	440	DO n = 1, npe_s
	441	DO i = nxl_p, nxr_p
	442	work2(i,k,j,n) = fftx_ar(m)
	443	m = m+1
	444	ENDDO
	445	ENDDO
	446
	447	ENDDO
	448	ENDDO
	449	#if defined( __KKMP )
	450	!$OMP END PARALLEL
	451	#endif
	452
[1106]	453	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	454
	455	#if defined( __parallel )
	456	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	457	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	458
	459	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	460	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	461	comm2d, istat )
	462
	463	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	464	#else
	465	work1 = work2
	466	#endif
	467
[1106]	468	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	469
	470	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	471	!$OMP DO
	472	DO iouter = nxl_p, nxr_p, istride
	473	iei = MIN( iouter+istride-1, nxr_p )
	474	DO k = 1, nz
	475
	476	m = nys_a
	477	DO n = 1, npe_s
	478	DO j = nys_p, nyn_p
	479	DO i = iouter, iei
	480	ir = i - iouter + 1
	481	ffty_ar(m,ir) = work1 (i,k,j,n)
	482	ENDDO
	483	m = m+1
	484	ENDDO
	485	ENDDO
	486
	487	DO i = iouter, iei
	488	ii = nxl + i
	489	ir = i - iouter + 1
[1106]	490	CALL fft_y_1d( ffty_ar(:,ir), 'backward' )
[1]	491
	492	DO j = nys_a, nyn_a
	493	ar(k,j,ii) = ffty_ar(j,ir)
	494	ENDDO
	495	ENDDO
	496
	497	ENDDO
	498	ENDDO
	499	!$OMP END PARALLEL
	500
[1106]	501	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	502
	503	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
	504
	505	#if defined( __KKMP )
	506	DEALLOCATE( tri )
	507	#endif
	508
	509	END SUBROUTINE poisfft_hybrid_omp
	510
	511
	512	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
	513
	514	USE cpulog
	515	USE interfaces
	516
	517	IMPLICIT NONE
	518
	519	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	520	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	521
	522	REAL, DIMENSION(0:nx,nz) :: tri_ar
	523
	524	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	525
	526	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
	527	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
	528
	529	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	530	#if defined( __KKMP )
	531	INTEGER :: omp_get_thread_num
	532	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	533	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	534	#else
	535	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	536	#endif
	537
	538
	539	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
	540
	541	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
	542
	543	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
	544	!$OMP DO
	545	!
	546	!-- Store grid points to be transformed on a 1d-array, do the fft
	547	!-- and sample the results on a 4d-array
	548	DO i = nxl_p, nxr_p
	549
	550	DO j = nys_a, nyn_a
	551	DO k = 1, nz
	552	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
	553	ENDDO
	554	ENDDO
	555
	556	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
	557	ENDDO
	558
	559	!$OMP DO
	560	DO k = 1, nz
	561	m = nys_a
	562	DO n = 1, npe_s
	563	DO j = nys_p, nyn_p
	564	DO i = nxl_p, nxr_p
	565	work1(i,k,j,n) = ffty_ar3(m,k,i)
	566	ENDDO
	567	m = m+1
	568	ENDDO
	569	ENDDO
	570	ENDDO
	571	!$OMP END PARALLEL
	572
	573	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
	574
	575	#if defined( __parallel )
	576	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	577	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	578	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	579	comm2d, istat )
	580	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	581	#else
	582	work2 = work1
	583	#endif
	584
	585	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
	586
	587	#if defined( __KKMP )
	588	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
	589	!$OMP DO
	590	DO j = nys_p, nyn_p
	591	jthread = omp_get_thread_num() + 1
	592	#else
	593	DO j = nys_p, nyn_p
	594	jthread = 1
	595	#endif
	596	DO k = 1, nz
	597
	598	m = nxl_a
	599	DO n = 1, npe_s
	600	DO i = nxl_p, nxr_p
	601	fftx_ar3(m,k,j) = work2(i,k,j,n)
	602	m = m+1
	603	ENDDO
	604	ENDDO
	605	ENDDO
	606
	607	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
	608
	609	DO k = 1, nz
	610	DO i = nxl_a, nxr_a
	611	tri_ar(i,k) = fftx_ar3(i,k,j)
	612	ENDDO
	613	ENDDO
	614
	615	jj = myid * (nyn_p-nys_p+1) + j
	616	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	617
	618	DO k = 1, nz
	619	DO i = nxl_a, nxr_a
	620	fftx_ar3(i,k,j) = tri_ar (i,k)
	621	ENDDO
	622	ENDDO
	623
	624	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
	625
	626	DO k = 1, nz
	627	m = nxl_a
	628	DO n = 1, npe_s
	629	DO i = nxl_p, nxr_p
	630	work2(i,k,j,n) = fftx_ar3(m,k,j)
	631	m = m+1
	632	ENDDO
	633	ENDDO
	634	ENDDO
	635
	636	ENDDO
	637	#if defined( __KKMP )
	638	!$OMP END PARALLEL
	639	#endif
	640
	641	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
	642
	643	#if defined( __parallel )
	644	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	645	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	646	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	647	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	648	comm2d, istat )
	649	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	650	#else
	651	work1 = work2
	652	#endif
	653
	654	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
	655
	656	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
	657	!$OMP DO
	658	DO k = 1, nz
	659	m = nys_a
	660	DO n = 1, npe_s
	661	DO j = nys_p, nyn_p
	662	DO i = nxl_p, nxr_p
	663	ffty_ar3(m,k,i) = work1(i,k,j,n)
	664	ENDDO
	665	m = m+1
	666	ENDDO
	667	ENDDO
	668	ENDDO
	669
	670	!$OMP DO
	671	DO i = nxl_p, nxr_p
	672	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
	673	DO j = nys_a, nyn_a
	674	DO k = 1, nz
	675	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
	676	ENDDO
	677	ENDDO
	678	ENDDO
	679	!$OMP END PARALLEL
	680
	681	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
	682
	683	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
	684
	685	#if defined( __KKMP )
	686	DEALLOCATE( tri )
	687	#endif
	688
	689	END SUBROUTINE poisfft_hybrid_omp_vec
	690
	691
	692	SUBROUTINE poisfft_hybrid_nodes ( ar )
	693
	694	USE cpulog
	695	USE interfaces
	696
	697	IMPLICIT NONE
	698
	699	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	700	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
	701	n, nn, nt, nw1, nw2
	702
	703	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	704
	705	REAL, DIMENSION(0:nx) :: fftx_ar
	706	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	707
	708	REAL, DIMENSION(0:nx,nz) :: tri_ar
	709
	710	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
	711	nodes,nys_p:nyn_p) :: work1,work2
	712	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	713
	714
	715	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
	716
[1106]	717	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
[1]	718
	719	!
	720	!-- Store grid points to be transformed on a 1d-array, do the fft
	721	!-- and sample the results on a 4d-array
	722	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	723	iei = MIN( iouter+istride-1, nxr_p )
	724	DO k = 1, nz
	725
	726	DO i = iouter, iei
	727	ii = nxl + i
	728	ir = i - iouter + 1
	729
	730	DO j = nys_a, nyn_a
	731	ffty_ar(j,ir) = ar(k,j,ii)
	732	ENDDO
	733
[1106]	734	CALL fft_y_1d( ffty_ar(:,ir), 'forward' )
[1]	735	ENDDO
	736
	737	m = nys_a
	738	DO nn = 1, nodes
	739	DO nt = 1, tasks_per_logical_node
	740	DO j = nys_p, nyn_p
	741	DO i = iouter, iei
	742	ir = i - iouter + 1
	743	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
	744	ENDDO
	745	m = m+1
	746	ENDDO
	747	ENDDO
	748	ENDDO
	749
	750	ENDDO
	751	ENDDO
	752
[1106]	753	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
[1]	754
	755	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	756	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
	757	DO nn = 1, nodes
	758	DO j = nys_p, nyn_p
	759	#if defined( __parallel )
	760	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	761	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	762	comm_tasks, istat )
	763	#endif
	764	ENDDO
	765	ENDDO
	766	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	767
	768
	769	DO j = nys_p, nyn_p
	770
	771	CALL cascade( 1, j, nys_p, nyn_p )
	772	nw2 = nw1 * SIZE( work1, 3 )
	773	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
	774	#if defined( __parallel )
	775	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	776	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	777	comm_nodes, istat )
	778	#endif
	779	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
	780	CALL cascade( 2, j, nys_p, nyn_p )
	781
[1106]	782	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
[1]	783	DO k = 1, nz
	784
	785	m = nxl_a
	786	DO nn = 1, nodes
	787	DO nt = 1, tasks_per_logical_node
	788	DO i = nxl_p, nxr_p
	789	fftx_ar(m) = work1(i,k,nt,nn,j)
	790	m = m+1
	791	ENDDO
	792	ENDDO
	793	ENDDO
	794
[1106]	795	CALL fft_x_1d( fftx_ar, 'forward' )
[1]	796
	797	DO i = nxl_a, nxr_a
	798	tri_ar(i,k) = fftx_ar(i)
	799	ENDDO
	800
	801	ENDDO
	802
	803	jj = myid * (nyn_p-nys_p+1) + j
	804	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
	805
	806	DO k = 1, nz
	807	DO i = nxl_a, nxr_a
	808	fftx_ar(i) = tri_ar(i,k)
	809	ENDDO
	810
[1106]	811	CALL fft_x_1d( fftx_ar, 'backward' )
[1]	812
	813	m = nxl_a
	814	DO nn = 1, nodes
	815	DO nt = 1, tasks_per_logical_node
	816	DO i = nxl_p, nxr_p
	817	work1(i,k,nt,nn,j) = fftx_ar(m)
	818	m = m+1
	819	ENDDO
	820	ENDDO
	821	ENDDO
	822	ENDDO
	823
[1106]	824	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
[1]	825	nw2 = nw1 * SIZE( work1, 3 )
	826	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
	827	#if defined( __parallel )
	828	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	829	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	830	comm_nodes, istat )
	831	#endif
	832	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
	833
	834	ENDDO
	835
	836	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	837	DO nn = 1, nodes
	838	DO j = nys_p, nyn_p
	839	#if defined( __parallel )
	840	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	841	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	842	comm_tasks, istat )
	843	#endif
	844	ENDDO
	845	ENDDO
	846	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	847
[1106]	848	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
[1]	849
	850	DO iouter = nxl_p, nxr_p, istride
	851	iei = MIN( iouter+istride-1, nxr_p )
	852	DO k = 1, nz
	853
	854	m = nys_a
	855	DO nn = 1, nodes
	856	DO nt = 1, tasks_per_logical_node
	857	DO j = nys_p, nyn_p
	858	DO i = iouter, iei
	859	ir = i - iouter + 1
	860	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
	861	ENDDO
	862	m = m+1
	863	ENDDO
	864	ENDDO
	865	ENDDO
	866
	867	DO i = iouter, iei
	868	ii = nxl + i
	869	ir = i - iouter + 1
[1106]	870	CALL fft_y_1d( ffty_ar(:,ir), 'backward' )
[1]	871
	872	DO j = nys_a, nyn_a
	873	ar(k,j,ii) = ffty_ar(j,ir)
	874	ENDDO
	875	ENDDO
	876
	877	ENDDO
	878	ENDDO
	879
[1106]	880	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
[1]	881
	882	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
	883
	884	END SUBROUTINE poisfft_hybrid_nodes
	885
	886
	887
	888	SUBROUTINE tridia_hybrid( j, ar, tri )
	889
	890	USE arrays_3d
	891	USE control_parameters
	892	USE grid_variables
	893
	894	IMPLICIT NONE
	895
	896	INTEGER :: i, j, k, nnyh
	897	REAL, DIMENSION(0:nx,nz) :: ar
	898	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	899	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	900
	901	nnyh = (ny+1) / 2
	902
	903	tri = 0.0
	904	!
	905	!-- Define constant elements of the tridiagonal matrix.
	906	DO k = 0, nz-1
	907	DO i = 0,nx
[667]	908	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	909	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	910	ENDDO
	911	ENDDO
	912
	913	IF ( j <= nnyh ) THEN
	914	CALL maketri_hybrid( j )
	915	ELSE
	916	CALL maketri_hybrid( ny+1-j)
	917	ENDIF
	918	CALL zerleg_hybrid
	919	CALL substi_hybrid( ar, tri )
	920
	921	CONTAINS
	922
	923	SUBROUTINE maketri_hybrid( j )
	924
	925	!----------------------------------------------------------------------!
	926	! maketri !
	927	! !
	928	! computes the i- and j-dependent component of the matrix !
	929	!----------------------------------------------------------------------!
	930
	931	USE constants
	932
	933	IMPLICIT NONE
	934
	935	INTEGER :: i, j, k, nnxh
	936	REAL :: a, c
	937
	938	REAL, DIMENSION(0:nx) :: l
	939
	940
	941	nnxh = (nx+1) / 2
	942	!
	943	!-- Provide the tridiagonal matrix for solution of the Poisson equation
	944	!-- in Fourier space. The coefficients are computed following the method
	945	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
	946	!-- Siano's original version.
	947	DO i = 0,nx
	948	IF ( i >= 0 .AND. i < nnxh ) THEN
	949	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	950	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	951	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	952	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	953	ELSEIF ( i == nnxh ) THEN
	954	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	955	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	956	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	957	REAL(ny+1) ) ) / ( dy * dy )
[1]	958	ELSE
	959	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	960	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	961	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	962	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	963	ENDIF
	964	ENDDO
	965
	966	DO k = 0,nz-1
	967	DO i = 0, nx
[667]	968	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	969	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	970	tri(1,i,k) = a + c - l(i)
	971	ENDDO
	972	ENDDO
[1111]	973	IF ( ibc_p_b == 1 ) THEN
[1]	974	DO i = 0,nx
	975	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	976	ENDDO
	977	ENDIF
	978	IF ( ibc_p_t == 1 ) THEN
	979	DO i = 0,nx
	980	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	981	ENDDO
	982	ENDIF
	983
	984	END SUBROUTINE maketri_hybrid
	985
	986
	987	SUBROUTINE zerleg_hybrid
	988
	989	!----------------------------------------------------------------------!
	990	! zerleg !
	991	! !
	992	! Splitting of the tridiagonal matrix (Thomas algorithm) !
	993	!----------------------------------------------------------------------!
	994
	995	USE indices
	996
	997	IMPLICIT NONE
	998
	999	INTEGER :: i, k
	1000
	1001	!
	1002	!-- Splitting
	1003	DO i = 0, nx
	1004	tri(4,i,0) = tri(1,i,0)
	1005	ENDDO
	1006	DO k = 1, nz-1
	1007	DO i = 0,nx
	1008	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	1009	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	1010	ENDDO
	1011	ENDDO
	1012
	1013	END SUBROUTINE zerleg_hybrid
	1014
	1015	SUBROUTINE substi_hybrid( ar, tri )
	1016
	1017	!----------------------------------------------------------------------!
	1018	! substi !
	1019	! !
	1020	! Substitution (Forward and Backward) (Thomas algorithm) !
	1021	!----------------------------------------------------------------------!
	1022
	1023	IMPLICIT NONE
	1024
	1025	INTEGER :: i, j, k
	1026	REAL, DIMENSION(0:nx,nz) :: ar
	1027	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1028	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1029
	1030	!
	1031	!-- Forward substitution
	1032	DO i = 0, nx
	1033	ar1(i,0) = ar(i,1)
	1034	ENDDO
	1035	DO k = 1, nz - 1
	1036	DO i = 0,nx
	1037	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1038	ENDDO
	1039	ENDDO
	1040
	1041	!
	1042	!-- Backward substitution
	1043	DO i = 0,nx
	1044	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	1045	ENDDO
	1046	DO k = nz-2, 0, -1
	1047	DO i = 0,nx
	1048	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1049	/ tri(4,i,k)
	1050	ENDDO
	1051	ENDDO
	1052
	1053	END SUBROUTINE substi_hybrid
	1054
	1055	END SUBROUTINE tridia_hybrid
	1056
	1057
	1058	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
	1059
	1060	USE cpulog
	1061
	1062	IMPLICIT NONE
	1063
	1064	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
	1065	INTEGER, SAVE :: tag = 10
	1066	#if defined( __parallel )
[415]	1067	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
	1068	INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata
[1]	1069	#endif
	1070
	1071	REAL :: buf, buf1
	1072
	1073
	1074	buf = 1.0
	1075	buf1 = 1.1
	1076	IF ( me_node == 0 ) THEN ! first node only
	1077
	1078	SELECT CASE ( loca )
	1079
	1080	CASE ( 1 ) ! before alltoall
	1081
	1082	IF( me_task > 0 ) THEN ! first task does not wait
	1083	#if defined( __parallel )
	1084	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
	1085	buf1, 1, MPI_REAL, me_task-1, 0, &
[415]	1086	comm_tasks, stat, ierr )
[1]	1087	#endif
	1088	ELSEIF ( j > nys_p ) THEN
	1089	req = 0
	1090	tag = MOD( tag-10, 10 ) + 10
	1091	#if defined( __parallel )
	1092	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
	1093	tag, comm_tasks, req, ierr )
	1094	reqa = req
[415]	1095	CALL MPI_WAITALL( 1, reqa, stata, ierr )
[1]	1096	#endif
	1097	ENDIF
	1098
	1099	CASE ( 2 ) ! after alltoall
	1100
	1101	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
	1102	#if defined( __parallel )
	1103	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
	1104	buf1, 1, MPI_REAL, me_task+1, 0, &
	1105	comm_tasks, stat, ierr)
	1106	#endif
	1107	ELSEIF ( j < nyn_p ) THEN
	1108	req = 0
	1109	tag = MOD( tag-10, 10 ) + 10
	1110	#if defined( __parallel )
	1111	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
	1112	ierr )
	1113	#endif
	1114	ENDIF
	1115
	1116	END SELECT
	1117
	1118	ENDIF
	1119
	1120	END SUBROUTINE cascade
[807]	1121	#endif
[1]	1122	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |