Home

Context Navigation

poisfft_hybrid.f90 @ 558

Last change on this file since 558 was 482, checked in by raasch, 15 years ago

New:
---
compare_palm_logs is additionally compiled with mbuild -u (Makefile in trunk/UTIL)

make options (mopts) to be set by configuration file implemented (mrun, mbuild)

humidity=.T. is now usable for runs with topography. wall_humidityflux and
wall_scalarflux are the corresponding new parin arrays.
(check_parameters, init_3d_model, parin)

Large scale vertical motion (subsidence/ascent) can be applied to the
prognostic equation for the potential temperature. (check_parameters, header,
Makefile, modules, parin, prognostic_equations, read_var_list, subsidence,
write_var_list)

A simple method for installing and running palm (with limited features)
has been added. (Makefile, palm_simple_install, palm_simple_run)

Changed:

2d-decomposition is default for Cray-XT machines (init_pegrid)

var_ts is replaced by dots_max (modules,init_3d_model)

Every cloud droplet has now an own weighting factor and can be deleted due to
collisions. Condensation and collision of cloud droplets are adjusted
accordingly. (advec_particles)

Collision efficiency for large cloud droplets has changed according to table of
Rogers and Yau. (collision_efficiency)

Errors:

Bugfix for generating serial jobs (subjob)

Bugfix: index problem concerning gradient_level indices removed (header)

Dimension of array stat in cascade change to prevent type problems with
mpi2 libraries (poisfft_hybrid)

Loop was split to make runs reproducible when using ifort compiler.
(disturb_field)

Bugfix: exchange of ghost points for prho included (time_integration)

Bugfix: calculation of time-averaged surface heatfluxes (sum_up_3d_data)

Bugfix: calculation of precipitation_rate (calc_precipitation)

Bugfix: initial data assignments to some dvrp arrays changed due to error
messages from gfortran compiler (modules)

Bugfix: calculation of cloud droplet velocity (advec_particles)

Bugfix: transfer of particles at south/left edge (advec_particles)

Bugfix: calculation of collision_efficiency (collision_efficiency)

Bugfix: initialisation of var_mod (subsidence)

Property svn:keywords set to Id

File size: 31.4 KB

Line
1	MODULE poisfft_hybrid_mod
2	!------------------------------------------------------------------------------
3	!
4	! Current revisions:
5	! -----------------
6	!
7	!
8	! Former revisions:
9	! -----------------
10	! $Id: poisfft_hybrid.f90 482 2010-02-05 06:37:03Z weinreis $
11	!
12	! 415 2009-12-15 10:26:23Z raasch
13	! Dimension of array stat in cascade change to prevent type problems with
14	! mpi2 libraries
15	!
16	! 274 2009-03-26 15:11:21Z heinze
17	! Output of messages replaced by message handling routine.
18	!
19	! Feb. 2007
20	! RCS Log replace by Id keyword, revision history cleaned up
21	!
22	! Revision 1.11 2004/04/30 12:43:14 raasch
23	! Renaming of fft routines, additional argument in calls of fft_y_m
24	!
25	! Revision 1.2 2002/12/19 16:08:31 raasch
26	! Preprocessor directive KKMP introduced (OMP does NOT work),
27	! array tri will be a shared array in OpenMP loop, to get better cache
28	! utilization, the i index (x-direction) will be executed in stride
29	! "istride" as outer loop and in a shorter inner loop,
30	! overlapping of computation and communication realized by new routine
31	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
32	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
33	!
34	!
35	! Description:
36	! ------------
37	! Solution of the Poisson equation with a 2D spectral method.
38	! Hybrid version for parallel computers using a 1D domain decomposition,
39	! realized with MPI, along x and parallelization with OPEN-MP along y
40	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
41	! optimization is realized by overlapping of computation and communication
42	! and by simultaneously executing as many communication calls as switches
43	! per logical partition (LPAR) are available. This version comes into
44	! effect if more than one node is used and if the environment variable
45	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
46	! without any rest.
47	!
48	! WARNING: In case of OpenMP, there are problems with allocating large
49	! arrays in parallel regions.
50	!
51	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
52	!------------------------------------------------------------------------------!
53
54	USE fft_xy
55	USE indices
56	USE pegrid
57
58	IMPLICIT NONE
59
60	PRIVATE
61	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
62
63	INTEGER, PARAMETER :: switch_per_lpar = 2
64
65	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
66	nxl_p, nxr_p, & ! partial x dimension
67	nys_a, nyn_a, & ! total y dimension
68	nys_p, nyn_p, & ! partial y dimension
69
70	npe_s, & ! total number of PEs for solver
71	nwords, & ! number of points to be exchanged
72	! with MPI_ALLTOALL
73	n_omp_threads ! number of OpenMP threads
74
75	!
76	!-- Variables for multi node version (cluster version) using routine
77	!-- poisfft_hybrid_nodes
78	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
79	comm_node_all, & ! communicater all PEs node version
80	comm_tasks, & ! communicater tasks
81	me, me_node, me_task,& ! identity of this PE
82	nodes, & ! number of nodes
83	tasks_per_logical_node = -1 ! default no cluster
84
85
86	!
87	!-- Public interfaces
88	INTERFACE poisfft_hybrid_ini
89	MODULE PROCEDURE poisfft_hybrid_ini
90	END INTERFACE poisfft_hybrid_ini
91
92	INTERFACE poisfft_hybrid
93	MODULE PROCEDURE poisfft_hybrid
94	END INTERFACE poisfft_hybrid
95
96	!
97	!-- Private interfaces
98	INTERFACE poisfft_hybrid_omp
99	MODULE PROCEDURE poisfft_hybrid_omp
100	END INTERFACE poisfft_hybrid_omp
101
102	INTERFACE poisfft_hybrid_omp_vec
103	MODULE PROCEDURE poisfft_hybrid_omp_vec
104	END INTERFACE poisfft_hybrid_omp_vec
105
106	INTERFACE poisfft_hybrid_nodes
107	MODULE PROCEDURE poisfft_hybrid_nodes
108	END INTERFACE poisfft_hybrid_nodes
109
110	INTERFACE tridia_hybrid
111	MODULE PROCEDURE tridia_hybrid
112	END INTERFACE tridia_hybrid
113
114	INTERFACE cascade
115	MODULE PROCEDURE cascade
116	END INTERFACE cascade
117
118	CONTAINS
119
120
121	SUBROUTINE poisfft_hybrid_ini
122
123	USE control_parameters
124	USE pegrid
125
126	IMPLICIT NONE
127
128	CHARACTER(LEN=8) :: cdummy
129	INTEGER :: idummy, istat
130	INTEGER, DIMENSION(2) :: coords, dims
131
132	LOGICAL, DIMENSION(2) :: period = .false., re_dims
133
134
135	!
136	!-- Set the internal index values for the hybrid solver
137	#if defined( __parallel )
138	npe_s = pdims(1)
139	#else
140	npe_s = 1
141	#endif
142	nxl_a = 0
143	nxr_a = nx
144	nxl_p = 0
145	nxr_p = ( ( nx+1 ) / npe_s ) - 1
146	nys_a = nys
147	nyn_a = nyn
148	nys_p = 0
149	nyn_p = ( ( ny+1 ) / npe_s ) - 1
150
151	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
152
153	#if defined( __KKMP )
154	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
155	READ ( cdummy, '(I8)' ) n_omp_threads
156	IF ( n_omp_threads > 1 ) THEN
157	WRITE( message_string, * ) 'Number of OpenMP threads = ', &
158	n_omp_threads
159	CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
160	ENDIF
161	#else
162	n_omp_threads = 1
163	#endif
164	!
165	!-- Initialize the one-dimensional FFT routines
166	CALL fft_init
167
168	!
169	!-- Setup for multi node version (poisfft_hybrid_nodes)
170	IF ( n_omp_threads == 1 .AND. &
171	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
172
173	IF ( tasks_per_node /= -9999 ) THEN
174	!
175	!-- Multi node version requires that the available number of
176	!-- switches per logical partition must be an integral divisor
177	!-- of the chosen number of tasks per node
178	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
179	!
180	!-- Set the switch which decides about usage of the multi node
181	!-- version
182	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
183	numprocs > tasks_per_node ) THEN
184	tasks_per_logical_node = tasks_per_node / switch_per_lpar
185	ENDIF
186
187	IF ( tasks_per_logical_node > -1 ) THEN
188
189	WRITE( message_string, * ) 'running optimized ', &
190	'multinode version', &
191	'&switch_per_lpar = ', &
192	switch_per_lpar, &
193	'&tasks_per_lpar = ', &
194	tasks_per_node, &
195	'tasks_per_logical_node = ', &
196	tasks_per_logical_node
197	CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )
198
199	ENDIF
200
201	ENDIF
202	ENDIF
203	ENDIF
204
205	!
206	!-- Determine sub-topologies for multi node version
207	IF ( tasks_per_logical_node >= 2 ) THEN
208
209	#if defined( __parallel )
210	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
211	tasks_per_logical_node
212	dims(1) = nodes
213	dims(2) = tasks_per_logical_node
214
215	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
216	comm_node_all, istat )
217	CALL MPI_COMM_RANK( comm_node_all, me, istat )
218
219	re_dims(1) = .TRUE.
220	re_dims(2) = .FALSE.
221	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
222	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
223
224	re_dims(1) = .FALSE.
225	re_dims(2) = .TRUE.
226	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
227	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
228
229	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
230	! tasks_per_logical_node
231	#else
232	message_string = 'parallel environment (MPI) required'
233	CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
234	#endif
235	ENDIF
236
237	END SUBROUTINE poisfft_hybrid_ini
238
239
240	SUBROUTINE poisfft_hybrid( ar )
241
242	USE control_parameters
243	USE interfaces
244
245	IMPLICIT NONE
246
247	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
248
249	IF ( host(1:3) == 'nec' ) THEN
250	CALL poisfft_hybrid_omp_vec( ar )
251	ELSE
252	IF ( tasks_per_logical_node == -1 ) THEN
253	CALL poisfft_hybrid_omp( ar )
254	ELSE
255	CALL poisfft_hybrid_nodes( ar )
256	ENDIF
257	ENDIF
258
259	END SUBROUTINE poisfft_hybrid
260
261
262	SUBROUTINE poisfft_hybrid_omp ( ar )
263
264	USE cpulog
265	USE interfaces
266
267	IMPLICIT NONE
268
269	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
270	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
271
272	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
273
274	REAL, DIMENSION(0:nx) :: fftx_ar
275	REAL, DIMENSION(0:ny,istride) :: ffty_ar
276
277	REAL, DIMENSION(0:nx,nz) :: tri_ar
278
279	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
280	#if defined( __KKMP )
281	INTEGER :: omp_get_thread_num
282	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
283	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
284	#else
285	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
286	#endif
287
288
289	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
290
291	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
292
293	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
294	!$OMP DO
295	!
296	!-- Store grid points to be transformed on a 1d-array, do the fft
297	!-- and sample the results on a 4d-array
298	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
299	iei = MIN( iouter+istride-1, nxr_p )
300	DO k = 1, nz
301
302	DO i = iouter, iei
303	ii = nxl + i
304	ir = i - iouter + 1
305
306	DO j = nys_a, nyn_a
307	ffty_ar(j,ir) = ar(k,j,ii)
308	ENDDO
309
310	CALL fft_y( ffty_ar(:,ir), 'forward' )
311	ENDDO
312
313	m = nys_a
314	DO n = 1, npe_s
315	DO j = nys_p, nyn_p
316	DO i = iouter, iei
317	ir = i - iouter + 1
318	work1(i,k,j,n) = ffty_ar(m,ir)
319	ENDDO
320	m = m+1
321	ENDDO
322	ENDDO
323
324	ENDDO
325	ENDDO
326	!$OMP END PARALLEL
327
328	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
329
330	#if defined( __parallel )
331	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
332
333	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
334	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
335	comm2d, istat )
336
337	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
338	#else
339	work2 = work1
340	#endif
341
342	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
343
344	#if defined( __KKMP )
345	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
346	!$OMP DO
347	DO j = nys_p, nyn_p
348	jthread = omp_get_thread_num() + 1
349	#else
350	DO j = nys_p, nyn_p
351	jthread = 1
352	#endif
353	DO k = 1, nz
354
355	m = nxl_a
356	DO n = 1, npe_s
357	DO i = nxl_p, nxr_p
358	fftx_ar(m) = work2(i,k,j,n)
359	m = m+1
360	ENDDO
361	ENDDO
362
363	CALL fft_x( fftx_ar, 'forward' )
364
365	DO i = nxl_a, nxr_a
366	tri_ar(i,k) = fftx_ar(i)
367	ENDDO
368
369	ENDDO
370
371	jj = myid * (nyn_p-nys_p+1) + j
372	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
373
374	DO k = 1, nz
375	DO i = nxl_a, nxr_a
376	fftx_ar(i) = tri_ar (i,k)
377	ENDDO
378
379	CALL fft_x( fftx_ar, 'backward' )
380
381	m = nxl_a
382	DO n = 1, npe_s
383	DO i = nxl_p, nxr_p
384	work2(i,k,j,n) = fftx_ar(m)
385	m = m+1
386	ENDDO
387	ENDDO
388
389	ENDDO
390	ENDDO
391	#if defined( __KKMP )
392	!$OMP END PARALLEL
393	#endif
394
395	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
396
397	#if defined( __parallel )
398	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
399	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
400
401	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
402	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
403	comm2d, istat )
404
405	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
406	#else
407	work1 = work2
408	#endif
409
410	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
411
412	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
413	!$OMP DO
414	DO iouter = nxl_p, nxr_p, istride
415	iei = MIN( iouter+istride-1, nxr_p )
416	DO k = 1, nz
417
418	m = nys_a
419	DO n = 1, npe_s
420	DO j = nys_p, nyn_p
421	DO i = iouter, iei
422	ir = i - iouter + 1
423	ffty_ar(m,ir) = work1 (i,k,j,n)
424	ENDDO
425	m = m+1
426	ENDDO
427	ENDDO
428
429	DO i = iouter, iei
430	ii = nxl + i
431	ir = i - iouter + 1
432	CALL fft_y( ffty_ar(:,ir), 'backward' )
433
434	DO j = nys_a, nyn_a
435	ar(k,j,ii) = ffty_ar(j,ir)
436	ENDDO
437	ENDDO
438
439	ENDDO
440	ENDDO
441	!$OMP END PARALLEL
442
443	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
444
445	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
446
447	#if defined( __KKMP )
448	DEALLOCATE( tri )
449	#endif
450
451	END SUBROUTINE poisfft_hybrid_omp
452
453
454	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
455
456	USE cpulog
457	USE interfaces
458
459	IMPLICIT NONE
460
461	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
462	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
463
464	REAL, DIMENSION(0:nx,nz) :: tri_ar
465
466	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
467
468	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
469	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
470
471	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
472	#if defined( __KKMP )
473	INTEGER :: omp_get_thread_num
474	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
475	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
476	#else
477	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
478	#endif
479
480
481	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
482
483	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
484
485	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
486	!$OMP DO
487	!
488	!-- Store grid points to be transformed on a 1d-array, do the fft
489	!-- and sample the results on a 4d-array
490	DO i = nxl_p, nxr_p
491
492	DO j = nys_a, nyn_a
493	DO k = 1, nz
494	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
495	ENDDO
496	ENDDO
497
498	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
499	ENDDO
500
501	!$OMP DO
502	DO k = 1, nz
503	m = nys_a
504	DO n = 1, npe_s
505	DO j = nys_p, nyn_p
506	DO i = nxl_p, nxr_p
507	work1(i,k,j,n) = ffty_ar3(m,k,i)
508	ENDDO
509	m = m+1
510	ENDDO
511	ENDDO
512	ENDDO
513	!$OMP END PARALLEL
514
515	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
516
517	#if defined( __parallel )
518	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
519	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
520	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
521	comm2d, istat )
522	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
523	#else
524	work2 = work1
525	#endif
526
527	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
528
529	#if defined( __KKMP )
530	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
531	!$OMP DO
532	DO j = nys_p, nyn_p
533	jthread = omp_get_thread_num() + 1
534	#else
535	DO j = nys_p, nyn_p
536	jthread = 1
537	#endif
538	DO k = 1, nz
539
540	m = nxl_a
541	DO n = 1, npe_s
542	DO i = nxl_p, nxr_p
543	fftx_ar3(m,k,j) = work2(i,k,j,n)
544	m = m+1
545	ENDDO
546	ENDDO
547	ENDDO
548
549	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
550
551	DO k = 1, nz
552	DO i = nxl_a, nxr_a
553	tri_ar(i,k) = fftx_ar3(i,k,j)
554	ENDDO
555	ENDDO
556
557	jj = myid * (nyn_p-nys_p+1) + j
558	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
559
560	DO k = 1, nz
561	DO i = nxl_a, nxr_a
562	fftx_ar3(i,k,j) = tri_ar (i,k)
563	ENDDO
564	ENDDO
565
566	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
567
568	DO k = 1, nz
569	m = nxl_a
570	DO n = 1, npe_s
571	DO i = nxl_p, nxr_p
572	work2(i,k,j,n) = fftx_ar3(m,k,j)
573	m = m+1
574	ENDDO
575	ENDDO
576	ENDDO
577
578	ENDDO
579	#if defined( __KKMP )
580	!$OMP END PARALLEL
581	#endif
582
583	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
584
585	#if defined( __parallel )
586	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
587	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
588	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
589	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
590	comm2d, istat )
591	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
592	#else
593	work1 = work2
594	#endif
595
596	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
597
598	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
599	!$OMP DO
600	DO k = 1, nz
601	m = nys_a
602	DO n = 1, npe_s
603	DO j = nys_p, nyn_p
604	DO i = nxl_p, nxr_p
605	ffty_ar3(m,k,i) = work1(i,k,j,n)
606	ENDDO
607	m = m+1
608	ENDDO
609	ENDDO
610	ENDDO
611
612	!$OMP DO
613	DO i = nxl_p, nxr_p
614	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
615	DO j = nys_a, nyn_a
616	DO k = 1, nz
617	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
618	ENDDO
619	ENDDO
620	ENDDO
621	!$OMP END PARALLEL
622
623	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
624
625	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
626
627	#if defined( __KKMP )
628	DEALLOCATE( tri )
629	#endif
630
631	END SUBROUTINE poisfft_hybrid_omp_vec
632
633
634	SUBROUTINE poisfft_hybrid_nodes ( ar )
635
636	USE cpulog
637	USE interfaces
638
639	IMPLICIT NONE
640
641	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
642	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
643	n, nn, nt, nw1, nw2
644
645	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
646
647	REAL, DIMENSION(0:nx) :: fftx_ar
648	REAL, DIMENSION(0:ny,istride) :: ffty_ar
649
650	REAL, DIMENSION(0:nx,nz) :: tri_ar
651
652	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
653	nodes,nys_p:nyn_p) :: work1,work2
654	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
655
656
657	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
658
659	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
660
661	!
662	!-- Store grid points to be transformed on a 1d-array, do the fft
663	!-- and sample the results on a 4d-array
664	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
665	iei = MIN( iouter+istride-1, nxr_p )
666	DO k = 1, nz
667
668	DO i = iouter, iei
669	ii = nxl + i
670	ir = i - iouter + 1
671
672	DO j = nys_a, nyn_a
673	ffty_ar(j,ir) = ar(k,j,ii)
674	ENDDO
675
676	CALL fft_y( ffty_ar(:,ir), 'forward' )
677	ENDDO
678
679	m = nys_a
680	DO nn = 1, nodes
681	DO nt = 1, tasks_per_logical_node
682	DO j = nys_p, nyn_p
683	DO i = iouter, iei
684	ir = i - iouter + 1
685	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
686	ENDDO
687	m = m+1
688	ENDDO
689	ENDDO
690	ENDDO
691
692	ENDDO
693	ENDDO
694
695	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
696
697	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
698	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
699	DO nn = 1, nodes
700	DO j = nys_p, nyn_p
701	#if defined( __parallel )
702	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
703	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
704	comm_tasks, istat )
705	#endif
706	ENDDO
707	ENDDO
708	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
709
710
711	DO j = nys_p, nyn_p
712
713	CALL cascade( 1, j, nys_p, nyn_p )
714	nw2 = nw1 * SIZE( work1, 3 )
715	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
716	#if defined( __parallel )
717	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
718	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
719	comm_nodes, istat )
720	#endif
721	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
722	CALL cascade( 2, j, nys_p, nyn_p )
723
724	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
725	DO k = 1, nz
726
727	m = nxl_a
728	DO nn = 1, nodes
729	DO nt = 1, tasks_per_logical_node
730	DO i = nxl_p, nxr_p
731	fftx_ar(m) = work1(i,k,nt,nn,j)
732	m = m+1
733	ENDDO
734	ENDDO
735	ENDDO
736
737	CALL fft_x( fftx_ar, 'forward' )
738
739	DO i = nxl_a, nxr_a
740	tri_ar(i,k) = fftx_ar(i)
741	ENDDO
742
743	ENDDO
744
745	jj = myid * (nyn_p-nys_p+1) + j
746	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
747
748	DO k = 1, nz
749	DO i = nxl_a, nxr_a
750	fftx_ar(i) = tri_ar(i,k)
751	ENDDO
752
753	CALL fft_x( fftx_ar, 'backward' )
754
755	m = nxl_a
756	DO nn = 1, nodes
757	DO nt = 1, tasks_per_logical_node
758	DO i = nxl_p, nxr_p
759	work1(i,k,nt,nn,j) = fftx_ar(m)
760	m = m+1
761	ENDDO
762	ENDDO
763	ENDDO
764	ENDDO
765
766	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
767	nw2 = nw1 * SIZE( work1, 3 )
768	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
769	#if defined( __parallel )
770	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
771	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
772	comm_nodes, istat )
773	#endif
774	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
775
776	ENDDO
777
778	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
779	DO nn = 1, nodes
780	DO j = nys_p, nyn_p
781	#if defined( __parallel )
782	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
783	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
784	comm_tasks, istat )
785	#endif
786	ENDDO
787	ENDDO
788	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
789
790	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
791
792	DO iouter = nxl_p, nxr_p, istride
793	iei = MIN( iouter+istride-1, nxr_p )
794	DO k = 1, nz
795
796	m = nys_a
797	DO nn = 1, nodes
798	DO nt = 1, tasks_per_logical_node
799	DO j = nys_p, nyn_p
800	DO i = iouter, iei
801	ir = i - iouter + 1
802	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
803	ENDDO
804	m = m+1
805	ENDDO
806	ENDDO
807	ENDDO
808
809	DO i = iouter, iei
810	ii = nxl + i
811	ir = i - iouter + 1
812	CALL fft_y( ffty_ar(:,ir), 'backward' )
813
814	DO j = nys_a, nyn_a
815	ar(k,j,ii) = ffty_ar(j,ir)
816	ENDDO
817	ENDDO
818
819	ENDDO
820	ENDDO
821
822	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
823
824	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
825
826	END SUBROUTINE poisfft_hybrid_nodes
827
828
829
830	SUBROUTINE tridia_hybrid( j, ar, tri )
831
832	USE arrays_3d
833	USE control_parameters
834	USE grid_variables
835
836	IMPLICIT NONE
837
838	INTEGER :: i, j, k, nnyh
839	REAL, DIMENSION(0:nx,nz) :: ar
840	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
841	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
842
843	nnyh = (ny+1) / 2
844
845	tri = 0.0
846	!
847	!-- Define constant elements of the tridiagonal matrix.
848	DO k = 0, nz-1
849	DO i = 0,nx
850	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
851	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
852	ENDDO
853	ENDDO
854
855	IF ( j <= nnyh ) THEN
856	CALL maketri_hybrid( j )
857	ELSE
858	CALL maketri_hybrid( ny+1-j)
859	ENDIF
860	CALL zerleg_hybrid
861	CALL substi_hybrid( ar, tri )
862
863	CONTAINS
864
865	SUBROUTINE maketri_hybrid( j )
866
867	!----------------------------------------------------------------------!
868	! maketri !
869	! !
870	! computes the i- and j-dependent component of the matrix !
871	!----------------------------------------------------------------------!
872
873	USE constants
874
875	IMPLICIT NONE
876
877	INTEGER :: i, j, k, nnxh
878	REAL :: a, c
879
880	REAL, DIMENSION(0:nx) :: l
881
882
883	nnxh = (nx+1) / 2
884	!
885	!-- Provide the tridiagonal matrix for solution of the Poisson equation
886	!-- in Fourier space. The coefficients are computed following the method
887	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
888	!-- Siano's original version.
889	DO i = 0,nx
890	IF ( i >= 0 .AND. i < nnxh ) THEN
891	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
892	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
893	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
894	FLOAT( ny+1 ) ) ) / ( dy * dy )
895	ELSEIF ( i == nnxh ) THEN
896	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
897	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
898	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
899	FLOAT(ny+1) ) ) / ( dy * dy )
900	ELSE
901	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
902	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
903	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
904	FLOAT( ny+1 ) ) ) / ( dy * dy )
905	ENDIF
906	ENDDO
907
908	DO k = 0,nz-1
909	DO i = 0, nx
910	a = -1.0 * ddzu(k+2) * ddzw(k+1)
911	c = -1.0 * ddzu(k+1) * ddzw(k+1)
912	tri(1,i,k) = a + c - l(i)
913	ENDDO
914	ENDDO
915	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
916	DO i = 0,nx
917	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
918	ENDDO
919	ENDIF
920	IF ( ibc_p_t == 1 ) THEN
921	DO i = 0,nx
922	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
923	ENDDO
924	ENDIF
925
926	END SUBROUTINE maketri_hybrid
927
928
929	SUBROUTINE zerleg_hybrid
930
931	!----------------------------------------------------------------------!
932	! zerleg !
933	! !
934	! Splitting of the tridiagonal matrix (Thomas algorithm) !
935	!----------------------------------------------------------------------!
936
937	USE indices
938
939	IMPLICIT NONE
940
941	INTEGER :: i, k
942
943	!
944	!-- Splitting
945	DO i = 0, nx
946	tri(4,i,0) = tri(1,i,0)
947	ENDDO
948	DO k = 1, nz-1
949	DO i = 0,nx
950	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
951	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
952	ENDDO
953	ENDDO
954
955	END SUBROUTINE zerleg_hybrid
956
957	SUBROUTINE substi_hybrid( ar, tri )
958
959	!----------------------------------------------------------------------!
960	! substi !
961	! !
962	! Substitution (Forward and Backward) (Thomas algorithm) !
963	!----------------------------------------------------------------------!
964
965	IMPLICIT NONE
966
967	INTEGER :: i, j, k
968	REAL, DIMENSION(0:nx,nz) :: ar
969	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
970	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
971
972	!
973	!-- Forward substitution
974	DO i = 0, nx
975	ar1(i,0) = ar(i,1)
976	ENDDO
977	DO k = 1, nz - 1
978	DO i = 0,nx
979	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
980	ENDDO
981	ENDDO
982
983	!
984	!-- Backward substitution
985	DO i = 0,nx
986	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
987	ENDDO
988	DO k = nz-2, 0, -1
989	DO i = 0,nx
990	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
991	/ tri(4,i,k)
992	ENDDO
993	ENDDO
994
995	END SUBROUTINE substi_hybrid
996
997	END SUBROUTINE tridia_hybrid
998
999
1000	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
1001
1002	USE cpulog
1003
1004	IMPLICIT NONE
1005
1006	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
1007	INTEGER, SAVE :: tag = 10
1008	#if defined( __parallel )
1009	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
1010	INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata
1011	#endif
1012
1013	REAL :: buf, buf1
1014
1015
1016	buf = 1.0
1017	buf1 = 1.1
1018	IF ( me_node == 0 ) THEN ! first node only
1019
1020	SELECT CASE ( loca )
1021
1022	CASE ( 1 ) ! before alltoall
1023
1024	IF( me_task > 0 ) THEN ! first task does not wait
1025	#if defined( __parallel )
1026	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
1027	buf1, 1, MPI_REAL, me_task-1, 0, &
1028	comm_tasks, stat, ierr )
1029	#endif
1030	ELSEIF ( j > nys_p ) THEN
1031	req = 0
1032	tag = MOD( tag-10, 10 ) + 10
1033	#if defined( __parallel )
1034	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
1035	tag, comm_tasks, req, ierr )
1036	reqa = req
1037	CALL MPI_WAITALL( 1, reqa, stata, ierr )
1038	#endif
1039	ENDIF
1040
1041	CASE ( 2 ) ! after alltoall
1042
1043	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
1044	#if defined( __parallel )
1045	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
1046	buf1, 1, MPI_REAL, me_task+1, 0, &
1047	comm_tasks, stat, ierr)
1048	#endif
1049	ELSEIF ( j < nyn_p ) THEN
1050	req = 0
1051	tag = MOD( tag-10, 10 ) + 10
1052	#if defined( __parallel )
1053	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
1054	ierr )
1055	#endif
1056	ENDIF
1057
1058	END SELECT
1059
1060	ENDIF
1061
1062	END SUBROUTINE cascade
1063
1064	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 558

Download in other formats: