Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 1065

Last change on this file since 1065 was 1037, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 33.0 KB

Rev	Line
[1]	1	MODULE poisfft_hybrid_mod
[808]	2
[1036]	3	!--------------------------------------------------------------------------------!
	4	! This file is part of PALM.
	5	!
	6	! PALM is free software: you can redistribute it and/or modify it under the terms
	7	! of the GNU General Public License as published by the Free Software Foundation,
	8	! either version 3 of the License, or (at your option) any later version.
	9	!
	10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
	11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
	12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
	13	!
	14	! You should have received a copy of the GNU General Public License along with
	15	! PALM. If not, see <http://www.gnu.org/licenses/>.
	16	!
	17	! Copyright 1997-2012 Leibniz University Hannover
	18	!--------------------------------------------------------------------------------!
	19	!
[257]	20	! Current revisions:
[1]	21	! -----------------
	22	!
[1014]	23	!
[1]	24	! Former revisions:
	25	! -----------------
[3]	26	! $Id: poisfft_hybrid.f90 1037 2012-10-22 14:10:22Z hoffmann $
[392]	27	!
[1037]	28	! 1036 2012-10-22 13:43:42Z raasch
	29	! code put under GPL (PALM 3.9)
	30	!
[1014]	31	! 1013 2012-09-21 07:03:55Z raasch
	32	! FLOAT type conversion replaced by REAL
	33	!
[810]	34	! 809 2012-01-30 13:32:58Z maronga
	35	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
	36	!
[808]	37	! 807 2012-01-25 11:53:51Z maronga
	38	! New cpp directive "__check" implemented which is used by check_namelist_files
	39	! (most of the code is unneeded by check_namelist_files).
	40	!
[668]	41	! 667 2010-12-23 12:06:00Z suehring/gryschka
	42	! ddzu replaced by ddzu_pres due to changes in zu(0)
	43	!
[482]	44	! 415 2009-12-15 10:26:23Z raasch
[807]	45	! Dimension of array stat in cascade changed to prevent type problems with
[482]	46	! mpi2 libraries
	47	!
[392]	48	! 274 2009-03-26 15:11:21Z heinze
	49	! Output of messages replaced by message handling routine.
	50	!
	51	! Feb. 2007
[3]	52	! RCS Log replaced by Id keyword, revision history cleaned up
	53	!
[1]	54	! Revision 1.11 2004/04/30 12:43:14 raasch
	55	! Renaming of fft routines, additional argument in calls of fft_y_m
	56	!
	57	! Revision 1.2 2002/12/19 16:08:31 raasch
	58	! Preprocessor directive KKMP introduced (OMP does NOT work),
	59	! array tri will be a shared array in OpenMP loop, to get better cache
	60	! utilization, the i index (x-direction) will be executed in stride
	61	! "istride" as outer loop and in a shorter inner loop,
	62	! overlapping of computation and communication realized by new routine
	63	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
	64	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
	65	!
	66	!
	67	! Description:
	68	! ------------
	69	! Solution of the Poisson equation with a 2D spectral method.
	70	! Hybrid version for parallel computers using a 1D domain decomposition,
	71	! realized with MPI, along x and parallelization with OPEN-MP along y
	72	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
	73	! optimization is realized by overlapping of computation and communication
	74	! and by simultaneously executing as many communication calls as switches
	75	! per logical partition (LPAR) are available. This version comes into
	76	! effect if more than one node is used and if the environment variable
	77	! tasks_per_node is set in a way that it can be divided by switch_per_lpar
	78	! without remainder.
	79	!
	80	! WARNING: In case of OpenMP, there are problems with allocating large
	81	! arrays in parallel regions.
	82	!
	83	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
	84	!------------------------------------------------------------------------------!
	85
	86	USE fft_xy
	87	USE indices
	88	USE pegrid
	89
	90	IMPLICIT NONE
	91
	92	INTEGER, PARAMETER :: switch_per_lpar = 2
	93
	94	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
	95	nxl_p, nxr_p, & ! partial x dimension
	96	nys_a, nyn_a, & ! total y dimension
	97	nys_p, nyn_p, & ! partial y dimension
	98
	99	npe_s, & ! total number of PEs for solver
	100	nwords, & ! number of points to be exchanged
	101	! with MPI_ALLTOALL
	102	n_omp_threads ! number of OpenMP threads
	103
	104	!
	105	!-- Variables for multi node version (cluster version) using routine
	106	!-- poisfft_hybrid_nodes
	107	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
	108	comm_node_all, & ! communicater all PEs node version
	109	comm_tasks, & ! communicater tasks
	110	me, me_node, me_task,& ! identity of this PE
	111	nodes, & ! number of nodes
	112	tasks_per_logical_node = -1 ! default no cluster
	113
[807]	114
	115	PRIVATE
	116
	117
[809]	118	#if ! defined ( __check )
[807]	119	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
	120
	121
[1]	122	!
	123	!-- Public interfaces
	124	INTERFACE poisfft_hybrid_ini
	125	MODULE PROCEDURE poisfft_hybrid_ini
	126	END INTERFACE poisfft_hybrid_ini
	127
	128	INTERFACE poisfft_hybrid
	129	MODULE PROCEDURE poisfft_hybrid
	130	END INTERFACE poisfft_hybrid
	131
	132	!
	133	!-- Private interfaces
	134	INTERFACE poisfft_hybrid_omp
	135	MODULE PROCEDURE poisfft_hybrid_omp
	136	END INTERFACE poisfft_hybrid_omp
	137
	138	INTERFACE poisfft_hybrid_omp_vec
	139	MODULE PROCEDURE poisfft_hybrid_omp_vec
	140	END INTERFACE poisfft_hybrid_omp_vec
	141
	142	INTERFACE poisfft_hybrid_nodes
	143	MODULE PROCEDURE poisfft_hybrid_nodes
	144	END INTERFACE poisfft_hybrid_nodes
	145
	146	INTERFACE tridia_hybrid
	147	MODULE PROCEDURE tridia_hybrid
	148	END INTERFACE tridia_hybrid
	149
	150	INTERFACE cascade
	151	MODULE PROCEDURE cascade
	152	END INTERFACE cascade
[807]	153	#else
	154	PUBLIC poisfft_hybrid_ini
[1]	155
[807]	156	!
	157	!-- Public interfaces
	158	INTERFACE poisfft_hybrid_ini
	159	MODULE PROCEDURE poisfft_hybrid_ini
	160	END INTERFACE poisfft_hybrid_ini
	161	#endif
	162
[1]	163	CONTAINS
	164
[807]	165
SUBROUTINE poisfft_hybrid_ini

!------------------------------------------------------------------------------!
! Initialization of the hybrid Poisson solver:
! - sets the module-internal index bounds (nxl_a ... nyn_p) and the number of
!   words exchanged per PE with MPI_ALLTOALL (nwords),
! - determines the number of OpenMP threads (only if __KKMP is defined),
! - initializes the 1d-FFT routines (fft_init),
! - decides whether the multi node version (poisfft_hybrid_nodes) is used and,
!   if so, creates the required MPI sub-communicators.
!------------------------------------------------------------------------------!

   USE control_parameters
   USE pegrid

   IMPLICIT NONE

   CHARACTER(LEN=8) ::  cdummy
   INTEGER ::  idummy, istat
   INTEGER, DIMENSION(2) ::  dims

   LOGICAL, DIMENSION(2) ::  period, re_dims
#if defined( __KKMP ) && ! defined ( __check )
   INTEGER ::  ios                   ! status of the internal READ
   INTEGER ::  omp_get_max_threads   ! OpenMP runtime library function
#endif

!
!-- Both directions of the cartesian node/task topology are non-periodic
   period = .FALSE.

!
!-- Set the internal index values for the hybrid solver
#if defined( __parallel )
   npe_s = pdims(1)
#else
   npe_s = 1
#endif
   nxl_a = 0
   nxr_a = nx
   nxl_p = 0
   nxr_p = ( ( nx+1 ) / npe_s ) - 1
   nys_a = nys
   nyn_a = nyn
   nys_p = 0
   nyn_p = ( ( ny+1 ) / npe_s ) - 1

   nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )

#if defined( __KKMP ) && ! defined ( __check )
   CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
!
!-- A blank or non-numeric value must neither abort the run nor yield a
!-- thread count below one, because the solver routines dimension their
!-- work array tri with n_omp_threads. In that case ask the OpenMP runtime.
   READ ( cdummy, '(I8)', IOSTAT=ios )  n_omp_threads
   IF ( ios /= 0  .OR.  n_omp_threads < 1 )  THEN
      n_omp_threads = omp_get_max_threads()
   ENDIF
   IF ( n_omp_threads > 1 )  THEN
      WRITE( message_string, * ) 'Number of OpenMP threads = ', &
                                 n_omp_threads
      CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
   ENDIF
#else
   n_omp_threads = 1
#endif
!
!-- Initialize the one-dimensional FFT routines
   CALL fft_init

!
!-- Setup for multi node version (poisfft_hybrid_nodes)
   IF ( n_omp_threads == 1  .AND.                                    &
        ( host(1:4) == 'ibmh'  .OR.  host(1:4) == 'ibmb' ) )  THEN

      IF ( tasks_per_node /= -9999 )  THEN
!
!--      Multi node version requires that the available number of
!--      switches per logical partition must be an integral divisor
!--      of the chosen number of tasks per node
         IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 )  THEN
!
!--         Set the switch which decides about usage of the multi node
!--         version
            IF ( tasks_per_node / switch_per_lpar > 1  .AND.         &
                 numprocs > tasks_per_node )  THEN
               tasks_per_logical_node = tasks_per_node / switch_per_lpar
            ENDIF

            IF ( tasks_per_logical_node > -1 )  THEN
!
!--            The leading '&' of the last label was missing; it is added
!--            for consistency with the other two labels (presumably '&'
!--            is the line separator of the message routine).
               WRITE( message_string, * ) 'running optimized ',          &
                                          'multinode version',           &
                                          '&switch_per_lpar = ',         &
                                          switch_per_lpar,               &
                                          '&tasks_per_lpar = ',          &
                                          tasks_per_node,                &
                                          '&tasks_per_logical_node = ',  &
                                          tasks_per_logical_node
               CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )

            ENDIF

         ENDIF
      ENDIF
   ENDIF

!
!-- Determine sub-topologies for multi node version
   IF ( tasks_per_logical_node >= 2 )  THEN

#if defined( __parallel ) && ! defined ( __check )
      nodes   = ( numprocs + tasks_per_logical_node - 1 ) / &
                tasks_per_logical_node
      dims(1) = nodes
      dims(2) = tasks_per_logical_node

!
!--   2d cartesian topology (nodes x tasks per logical node)
      CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
                            comm_node_all, istat )
      CALL MPI_COMM_RANK( comm_node_all, me, istat )

!
!--   Sub-communicator which keeps the first (node) dimension
      re_dims(1) = .TRUE.
      re_dims(2) = .FALSE.
      CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
      CALL MPI_COMM_RANK( comm_nodes, me_node, istat )

!
!--   Sub-communicator which keeps the second (task) dimension
      re_dims(1) = .FALSE.
      re_dims(2) = .TRUE.
      CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
      CALL MPI_COMM_RANK( comm_tasks, me_task, istat )

#elif ! defined( __parallel )
      message_string = 'parallel environment (MPI) required'
      CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
#endif
   ENDIF

END SUBROUTINE poisfft_hybrid_ini
	283
[809]	284	#if ! defined ( __check )
SUBROUTINE poisfft_hybrid( ar )

!------------------------------------------------------------------------------!
! Driver of the hybrid Poisson solver. Selects one of the three
! implementations depending on the host name and on the switch
! tasks_per_logical_node (which is set in poisfft_hybrid_ini).
!------------------------------------------------------------------------------!

   USE control_parameters
   USE interfaces

   IMPLICIT NONE

   REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) ::  ar

   IF ( host(1:3) == 'nec' )  THEN
!
!--   Version based on the multi-line FFT routines fft_x_m / fft_y_m
      CALL poisfft_hybrid_omp_vec( ar )

   ELSEIF ( tasks_per_logical_node == -1 )  THEN
!
!--   Default: no cluster (multi node) setup has been activated
      CALL poisfft_hybrid_omp( ar )

   ELSE
!
!--   Multi node version
      CALL poisfft_hybrid_nodes( ar )

   ENDIF

END SUBROUTINE poisfft_hybrid
	305
	306
	307	SUBROUTINE poisfft_hybrid_omp ( ar )
	308
	309	USE cpulog
	310	USE interfaces
	311
	312	IMPLICIT NONE
	313
	314	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	315	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	316
	317	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	318
	319	REAL, DIMENSION(0:nx) :: fftx_ar
	320	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	321
	322	REAL, DIMENSION(0:nx,nz) :: tri_ar
	323
	324	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	325	#if defined( __KKMP )
	326	INTEGER :: omp_get_thread_num
	327	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	328	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	329	#else
	330	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	331	#endif
	332
	333
	334	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
	335
	336	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	337
	338	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	339	!$OMP DO
	340	!
	341	!-- Store grid points to be transformed on a 1d-array, do the fft
	342	!-- and sample the results on a 4d-array
	343	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	344	iei = MIN( iouter+istride-1, nxr_p )
	345	DO k = 1, nz
	346
	347	DO i = iouter, iei
	348	ii = nxl + i
	349	ir = i - iouter + 1
	350
	351	DO j = nys_a, nyn_a
	352	ffty_ar(j,ir) = ar(k,j,ii)
	353	ENDDO
	354
	355	CALL fft_y( ffty_ar(:,ir), 'forward' )
	356	ENDDO
	357
	358	m = nys_a
	359	DO n = 1, npe_s
	360	DO j = nys_p, nyn_p
	361	DO i = iouter, iei
	362	ir = i - iouter + 1
	363	work1(i,k,j,n) = ffty_ar(m,ir)
	364	ENDDO
	365	m = m+1
	366	ENDDO
	367	ENDDO
	368
	369	ENDDO
	370	ENDDO
	371	!$OMP END PARALLEL
	372
	373	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	374
	375	#if defined( __parallel )
	376	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	377
	378	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	379	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	380	comm2d, istat )
	381
	382	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	383	#else
	384	work2 = work1
	385	#endif
	386
	387	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	388
	389	#if defined( __KKMP )
	390	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
	391	!$OMP DO
	392	DO j = nys_p, nyn_p
	393	jthread = omp_get_thread_num() + 1
	394	#else
	395	DO j = nys_p, nyn_p
	396	jthread = 1
	397	#endif
	398	DO k = 1, nz
	399
	400	m = nxl_a
	401	DO n = 1, npe_s
	402	DO i = nxl_p, nxr_p
	403	fftx_ar(m) = work2(i,k,j,n)
	404	m = m+1
	405	ENDDO
	406	ENDDO
	407
	408	CALL fft_x( fftx_ar, 'forward' )
	409
	410	DO i = nxl_a, nxr_a
	411	tri_ar(i,k) = fftx_ar(i)
	412	ENDDO
	413
	414	ENDDO
	415
	416	jj = myid * (nyn_p-nys_p+1) + j
	417	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	418
	419	DO k = 1, nz
	420	DO i = nxl_a, nxr_a
	421	fftx_ar(i) = tri_ar (i,k)
	422	ENDDO
	423
	424	CALL fft_x( fftx_ar, 'backward' )
	425
	426	m = nxl_a
	427	DO n = 1, npe_s
	428	DO i = nxl_p, nxr_p
	429	work2(i,k,j,n) = fftx_ar(m)
	430	m = m+1
	431	ENDDO
	432	ENDDO
	433
	434	ENDDO
	435	ENDDO
	436	#if defined( __KKMP )
	437	!$OMP END PARALLEL
	438	#endif
	439
	440	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	441
	442	#if defined( __parallel )
	443	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	444	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	445
	446	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	447	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	448	comm2d, istat )
	449
	450	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	451	#else
	452	work1 = work2
	453	#endif
	454
	455	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	456
	457	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
	458	!$OMP DO
	459	DO iouter = nxl_p, nxr_p, istride
	460	iei = MIN( iouter+istride-1, nxr_p )
	461	DO k = 1, nz
	462
	463	m = nys_a
	464	DO n = 1, npe_s
	465	DO j = nys_p, nyn_p
	466	DO i = iouter, iei
	467	ir = i - iouter + 1
	468	ffty_ar(m,ir) = work1 (i,k,j,n)
	469	ENDDO
	470	m = m+1
	471	ENDDO
	472	ENDDO
	473
	474	DO i = iouter, iei
	475	ii = nxl + i
	476	ir = i - iouter + 1
	477	CALL fft_y( ffty_ar(:,ir), 'backward' )
	478
	479	DO j = nys_a, nyn_a
	480	ar(k,j,ii) = ffty_ar(j,ir)
	481	ENDDO
	482	ENDDO
	483
	484	ENDDO
	485	ENDDO
	486	!$OMP END PARALLEL
	487
	488	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	489
	490	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
	491
	492	#if defined( __KKMP )
	493	DEALLOCATE( tri )
	494	#endif
	495
	496	END SUBROUTINE poisfft_hybrid_omp
	497
	498
	499	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
	500
	501	USE cpulog
	502	USE interfaces
	503
	504	IMPLICIT NONE
	505
	506	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	507	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
	508
	509	REAL, DIMENSION(0:nx,nz) :: tri_ar
	510
	511	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	512
	513	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
	514	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
	515
	516	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
	517	#if defined( __KKMP )
	518	INTEGER :: omp_get_thread_num
	519	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
	520	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
	521	#else
	522	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
	523	#endif
	524
	525
	526	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
	527
	528	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
	529
	530	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
	531	!$OMP DO
	532	!
	533	!-- Store grid points to be transformed on a 1d-array, do the fft
	534	!-- and sample the results on a 4d-array
	535	DO i = nxl_p, nxr_p
	536
	537	DO j = nys_a, nyn_a
	538	DO k = 1, nz
	539	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
	540	ENDDO
	541	ENDDO
	542
	543	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
	544	ENDDO
	545
	546	!$OMP DO
	547	DO k = 1, nz
	548	m = nys_a
	549	DO n = 1, npe_s
	550	DO j = nys_p, nyn_p
	551	DO i = nxl_p, nxr_p
	552	work1(i,k,j,n) = ffty_ar3(m,k,i)
	553	ENDDO
	554	m = m+1
	555	ENDDO
	556	ENDDO
	557	ENDDO
	558	!$OMP END PARALLEL
	559
	560	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
	561
	562	#if defined( __parallel )
	563	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	564	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	565	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	566	comm2d, istat )
	567	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	568	#else
	569	work2 = work1
	570	#endif
	571
	572	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
	573
	574	#if defined( __KKMP )
	575	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
	576	!$OMP DO
	577	DO j = nys_p, nyn_p
	578	jthread = omp_get_thread_num() + 1
	579	#else
	580	DO j = nys_p, nyn_p
	581	jthread = 1
	582	#endif
	583	DO k = 1, nz
	584
	585	m = nxl_a
	586	DO n = 1, npe_s
	587	DO i = nxl_p, nxr_p
	588	fftx_ar3(m,k,j) = work2(i,k,j,n)
	589	m = m+1
	590	ENDDO
	591	ENDDO
	592	ENDDO
	593
	594	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
	595
	596	DO k = 1, nz
	597	DO i = nxl_a, nxr_a
	598	tri_ar(i,k) = fftx_ar3(i,k,j)
	599	ENDDO
	600	ENDDO
	601
	602	jj = myid * (nyn_p-nys_p+1) + j
	603	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
	604
	605	DO k = 1, nz
	606	DO i = nxl_a, nxr_a
	607	fftx_ar3(i,k,j) = tri_ar (i,k)
	608	ENDDO
	609	ENDDO
	610
	611	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
	612
	613	DO k = 1, nz
	614	m = nxl_a
	615	DO n = 1, npe_s
	616	DO i = nxl_p, nxr_p
	617	work2(i,k,j,n) = fftx_ar3(m,k,j)
	618	m = m+1
	619	ENDDO
	620	ENDDO
	621	ENDDO
	622
	623	ENDDO
	624	#if defined( __KKMP )
	625	!$OMP END PARALLEL
	626	#endif
	627
	628	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
	629
	630	#if defined( __parallel )
	631	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
	632	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
	633	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	634	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
	635	comm2d, istat )
	636	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
	637	#else
	638	work1 = work2
	639	#endif
	640
	641	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
	642
	643	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
	644	!$OMP DO
	645	DO k = 1, nz
	646	m = nys_a
	647	DO n = 1, npe_s
	648	DO j = nys_p, nyn_p
	649	DO i = nxl_p, nxr_p
	650	ffty_ar3(m,k,i) = work1(i,k,j,n)
	651	ENDDO
	652	m = m+1
	653	ENDDO
	654	ENDDO
	655	ENDDO
	656
	657	!$OMP DO
	658	DO i = nxl_p, nxr_p
	659	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
	660	DO j = nys_a, nyn_a
	661	DO k = 1, nz
	662	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
	663	ENDDO
	664	ENDDO
	665	ENDDO
	666	!$OMP END PARALLEL
	667
	668	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
	669
	670	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
	671
	672	#if defined( __KKMP )
	673	DEALLOCATE( tri )
	674	#endif
	675
	676	END SUBROUTINE poisfft_hybrid_omp_vec
	677
	678
	679	SUBROUTINE poisfft_hybrid_nodes ( ar )
	680
	681	USE cpulog
	682	USE interfaces
	683
	684	IMPLICIT NONE
	685
	686	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
	687	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
	688	n, nn, nt, nw1, nw2
	689
	690	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
	691
	692	REAL, DIMENSION(0:nx) :: fftx_ar
	693	REAL, DIMENSION(0:ny,istride) :: ffty_ar
	694
	695	REAL, DIMENSION(0:nx,nz) :: tri_ar
	696
	697	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
	698	nodes,nys_p:nyn_p) :: work1,work2
	699	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	700
	701
	702	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
	703
	704	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
	705
	706	!
	707	!-- Store grid points to be transformed on a 1d-array, do the fft
	708	!-- and sample the results on a 4d-array
	709	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
	710	iei = MIN( iouter+istride-1, nxr_p )
	711	DO k = 1, nz
	712
	713	DO i = iouter, iei
	714	ii = nxl + i
	715	ir = i - iouter + 1
	716
	717	DO j = nys_a, nyn_a
	718	ffty_ar(j,ir) = ar(k,j,ii)
	719	ENDDO
	720
	721	CALL fft_y( ffty_ar(:,ir), 'forward' )
	722	ENDDO
	723
	724	m = nys_a
	725	DO nn = 1, nodes
	726	DO nt = 1, tasks_per_logical_node
	727	DO j = nys_p, nyn_p
	728	DO i = iouter, iei
	729	ir = i - iouter + 1
	730	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
	731	ENDDO
	732	m = m+1
	733	ENDDO
	734	ENDDO
	735	ENDDO
	736
	737	ENDDO
	738	ENDDO
	739
	740	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
	741
	742	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	743	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
	744	DO nn = 1, nodes
	745	DO j = nys_p, nyn_p
	746	#if defined( __parallel )
	747	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	748	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	749	comm_tasks, istat )
	750	#endif
	751	ENDDO
	752	ENDDO
	753	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	754
	755
	756	DO j = nys_p, nyn_p
	757
	758	CALL cascade( 1, j, nys_p, nyn_p )
	759	nw2 = nw1 * SIZE( work1, 3 )
	760	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
	761	#if defined( __parallel )
	762	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	763	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	764	comm_nodes, istat )
	765	#endif
	766	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
	767	CALL cascade( 2, j, nys_p, nyn_p )
	768
	769	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
	770	DO k = 1, nz
	771
	772	m = nxl_a
	773	DO nn = 1, nodes
	774	DO nt = 1, tasks_per_logical_node
	775	DO i = nxl_p, nxr_p
	776	fftx_ar(m) = work1(i,k,nt,nn,j)
	777	m = m+1
	778	ENDDO
	779	ENDDO
	780	ENDDO
	781
	782	CALL fft_x( fftx_ar, 'forward' )
	783
	784	DO i = nxl_a, nxr_a
	785	tri_ar(i,k) = fftx_ar(i)
	786	ENDDO
	787
	788	ENDDO
	789
	790	jj = myid * (nyn_p-nys_p+1) + j
	791	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
	792
	793	DO k = 1, nz
	794	DO i = nxl_a, nxr_a
	795	fftx_ar(i) = tri_ar(i,k)
	796	ENDDO
	797
	798	CALL fft_x( fftx_ar, 'backward' )
	799
	800	m = nxl_a
	801	DO nn = 1, nodes
	802	DO nt = 1, tasks_per_logical_node
	803	DO i = nxl_p, nxr_p
	804	work1(i,k,nt,nn,j) = fftx_ar(m)
	805	m = m+1
	806	ENDDO
	807	ENDDO
	808	ENDDO
	809	ENDDO
	810
	811	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
	812	nw2 = nw1 * SIZE( work1, 3 )
	813	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
	814	#if defined( __parallel )
	815	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	816	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
	817	comm_nodes, istat )
	818	#endif
	819	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
	820
	821	ENDDO
	822
	823	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
	824	DO nn = 1, nodes
	825	DO j = nys_p, nyn_p
	826	#if defined( __parallel )
	827	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	828	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
	829	comm_tasks, istat )
	830	#endif
	831	ENDDO
	832	ENDDO
	833	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
	834
	835	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
	836
	837	DO iouter = nxl_p, nxr_p, istride
	838	iei = MIN( iouter+istride-1, nxr_p )
	839	DO k = 1, nz
	840
	841	m = nys_a
	842	DO nn = 1, nodes
	843	DO nt = 1, tasks_per_logical_node
	844	DO j = nys_p, nyn_p
	845	DO i = iouter, iei
	846	ir = i - iouter + 1
	847	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
	848	ENDDO
	849	m = m+1
	850	ENDDO
	851	ENDDO
	852	ENDDO
	853
	854	DO i = iouter, iei
	855	ii = nxl + i
	856	ir = i - iouter + 1
	857	CALL fft_y( ffty_ar(:,ir), 'backward' )
	858
	859	DO j = nys_a, nyn_a
	860	ar(k,j,ii) = ffty_ar(j,ir)
	861	ENDDO
	862	ENDDO
	863
	864	ENDDO
	865	ENDDO
	866
	867	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
	868
	869	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
	870
	871	END SUBROUTINE poisfft_hybrid_nodes
	872
	873
	874
	875	SUBROUTINE tridia_hybrid( j, ar, tri )
	876
	877	USE arrays_3d
	878	USE control_parameters
	879	USE grid_variables
	880
	881	IMPLICIT NONE
	882
	883	INTEGER :: i, j, k, nnyh
	884	REAL, DIMENSION(0:nx,nz) :: ar
	885	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	886	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	887
	888	nnyh = (ny+1) / 2
	889
	890	tri = 0.0
	891	!
	892	!-- Define constant elements of the tridiagonal matrix.
	893	DO k = 0, nz-1
	894	DO i = 0,nx
[667]	895	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
	896	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
[1]	897	ENDDO
	898	ENDDO
	899
	900	IF ( j <= nnyh ) THEN
	901	CALL maketri_hybrid( j )
	902	ELSE
	903	CALL maketri_hybrid( ny+1-j)
	904	ENDIF
	905	CALL zerleg_hybrid
	906	CALL substi_hybrid( ar, tri )
	907
	908	CONTAINS
	909
	910	SUBROUTINE maketri_hybrid( j )
	911
	912	!----------------------------------------------------------------------!
	913	! maketri !
	914	! !
	915	! computes the i- and j-dependent component of the matrix !
	916	!----------------------------------------------------------------------!
	917
	918	USE constants
	919
	920	IMPLICIT NONE
	921
	922	INTEGER :: i, j, k, nnxh
	923	REAL :: a, c
	924
	925	REAL, DIMENSION(0:nx) :: l
	926
	927
	928	nnxh = (nx+1) / 2
	929	!
	930	!-- Provide the tridiagonal matrix for solution of the Poisson equation
	931	!-- in Fourier space. The coefficients are computed following the method
	932	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
	933	!-- Siano's original version.
	934	DO i = 0,nx
	935	IF ( i >= 0 .AND. i < nnxh ) THEN
	936	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
[1013]	937	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	938	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	939	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	940	ELSEIF ( i == nnxh ) THEN
	941	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	942	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	943	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	944	REAL(ny+1) ) ) / ( dy * dy )
[1]	945	ELSE
	946	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
[1013]	947	REAL( nx+1 ) ) ) / ( dx * dx ) + &
[1]	948	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
[1013]	949	REAL( ny+1 ) ) ) / ( dy * dy )
[1]	950	ENDIF
	951	ENDDO
	952
	953	DO k = 0,nz-1
	954	DO i = 0, nx
[667]	955	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
	956	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
[1]	957	tri(1,i,k) = a + c - l(i)
	958	ENDDO
	959	ENDDO
	960	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
	961	DO i = 0,nx
	962	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
	963	ENDDO
	964	ENDIF
	965	IF ( ibc_p_t == 1 ) THEN
	966	DO i = 0,nx
	967	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
	968	ENDDO
	969	ENDIF
	970
	971	END SUBROUTINE maketri_hybrid
	972
	973
	974	SUBROUTINE zerleg_hybrid
	975
	976	!----------------------------------------------------------------------!
	977	! zerleg !
	978	! !
	979	! Splitting of the tridiagonal matrix (Thomas algorithm) !
	980	!----------------------------------------------------------------------!
	981
	982	USE indices
	983
	984	IMPLICIT NONE
	985
	986	INTEGER :: i, k
	987
	988	!
	989	!-- Splitting
	990	DO i = 0, nx
	991	tri(4,i,0) = tri(1,i,0)
	992	ENDDO
	993	DO k = 1, nz-1
	994	DO i = 0,nx
	995	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
	996	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
	997	ENDDO
	998	ENDDO
	999
	1000	END SUBROUTINE zerleg_hybrid
	1001
	1002	SUBROUTINE substi_hybrid( ar, tri )
	1003
	1004	!----------------------------------------------------------------------!
	1005	! substi !
	1006	! !
	1007	! Substitution (Forward and Backward) (Thomas algorithm) !
	1008	!----------------------------------------------------------------------!
	1009
	1010	IMPLICIT NONE
	1011
	1012	INTEGER :: i, j, k
	1013	REAL, DIMENSION(0:nx,nz) :: ar
	1014	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
	1015	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
	1016
	1017	!
	1018	!-- Forward substitution
	1019	DO i = 0, nx
	1020	ar1(i,0) = ar(i,1)
	1021	ENDDO
	1022	DO k = 1, nz - 1
	1023	DO i = 0,nx
	1024	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
	1025	ENDDO
	1026	ENDDO
	1027
	1028	!
	1029	!-- Backward substitution
	1030	DO i = 0,nx
	1031	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
	1032	ENDDO
	1033	DO k = nz-2, 0, -1
	1034	DO i = 0,nx
	1035	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
	1036	/ tri(4,i,k)
	1037	ENDDO
	1038	ENDDO
	1039
	1040	END SUBROUTINE substi_hybrid
	1041
	1042	END SUBROUTINE tridia_hybrid
	1043
	1044
	1045	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
	1046
	1047	USE cpulog
	1048
	1049	IMPLICIT NONE
	1050
	1051	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
	1052	INTEGER, SAVE :: tag = 10
	1053	#if defined( __parallel )
[415]	1054	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
	1055	INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata
[1]	1056	#endif
	1057
	1058	REAL :: buf, buf1
	1059
	1060
	1061	buf = 1.0
	1062	buf1 = 1.1
	1063	IF ( me_node == 0 ) THEN ! first node only
	1064
	1065	SELECT CASE ( loca )
	1066
	1067	CASE ( 1 ) ! before alltoall
	1068
	1069	IF( me_task > 0 ) THEN ! first task does not wait
	1070	#if defined( __parallel )
	1071	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
	1072	buf1, 1, MPI_REAL, me_task-1, 0, &
[415]	1073	comm_tasks, stat, ierr )
[1]	1074	#endif
	1075	ELSEIF ( j > nys_p ) THEN
	1076	req = 0
	1077	tag = MOD( tag-10, 10 ) + 10
	1078	#if defined( __parallel )
	1079	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
	1080	tag, comm_tasks, req, ierr )
	1081	reqa = req
[415]	1082	CALL MPI_WAITALL( 1, reqa, stata, ierr )
[1]	1083	#endif
	1084	ENDIF
	1085
	1086	CASE ( 2 ) ! after alltoall
	1087
	1088	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
	1089	#if defined( __parallel )
	1090	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
	1091	buf1, 1, MPI_REAL, me_task+1, 0, &
	1092	comm_tasks, stat, ierr)
	1093	#endif
	1094	ELSEIF ( j < nyn_p ) THEN
	1095	req = 0
	1096	tag = MOD( tag-10, 10 ) + 10
	1097	#if defined( __parallel )
	1098	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
	1099	ierr )
	1100	#endif
	1101	ENDIF
	1102
	1103	END SELECT
	1104
	1105	ENDIF
	1106
	1107	END SUBROUTINE cascade
[807]	1108	#endif
[1]	1109	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |