Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 692

Last change on this file since 692 was 668, checked in by suehring, 14 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 31.5 KB

Line
1	MODULE poisfft_hybrid_mod
2	!------------------------------------------------------------------------------
3	!
4	! Current revisions:
5	! -----------------
6	!
7	! Former revisions:
8	! -----------------
9	! $Id: poisfft_hybrid.f90 668 2010-12-23 13:22:58Z maronga $
10	!
11	! 667 2010-12-23 12:06:00Z suehring/gryschka
12	! ddzu replaced by ddzu_pres due to changes in zu(0)
13	!
14	! 415 2009-12-15 10:26:23Z raasch
15	! Dimension of array stat in cascade change to prevent type problems with
16	! mpi2 libraries
17	!
18	! 274 2009-03-26 15:11:21Z heinze
19	! Output of messages replaced by message handling routine.
20	!
21	! Feb. 2007
22	! RCS Log replace by Id keyword, revision history cleaned up
23	!
24	! Revision 1.11 2004/04/30 12:43:14 raasch
25	! Renaming of fft routines, additional argument in calls of fft_y_m
26	!
27	! Revision 1.2 2002/12/19 16:08:31 raasch
28	! Preprocessor directive KKMP introduced (OMP does NOT work),
29	! array tri will be a shared array in OpenMP loop, to get better cache
30	! utilization, the i index (x-direction) will be executed in stride
31	! "istride" as outer loop and in a shorter inner loop,
32	! overlapping of computation and communication realized by new routine
33	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
34	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
35	!
36	!
37	! Description:
38	! ------------
39	! Solution of the Poisson equation with a 2D spectral method.
40	! Hybrid version for parallel computers using a 1D domain decomposition,
41	! realized with MPI, along x and parallelization with OPEN-MP along y
42	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
43	! optimization is realized by overlapping of computation and communication
44	! and by simultaneously executing as many communication calls as switches
45	! per logical partition (LPAR) are available. This version comes into
46	! effect if more than one node is used and if the environment variable
47	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
48	! without any rest.
49	!
50	! WARNING: In case of OpenMP, there are problems with allocating large
51	! arrays in parallel regions.
52	!
53	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
54	!------------------------------------------------------------------------------!
55
56	USE fft_xy
57	USE indices
58	USE pegrid
59
60	IMPLICIT NONE
61
62	PRIVATE
63	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
64
65	INTEGER, PARAMETER :: switch_per_lpar = 2
66
67	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
68	nxl_p, nxr_p, & ! partial x dimension
69	nys_a, nyn_a, & ! total y dimension
70	nys_p, nyn_p, & ! partial y dimension
71
72	npe_s, & ! total number of PEs for solver
73	nwords, & ! number of points to be exchanged
74	! with MPI_ALLTOALL
75	n_omp_threads ! number of OpenMP threads
76
77	!
78	!-- Variables for multi node version (cluster version) using routine
79	!-- poisfft_hybrid_nodes
80	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
81	comm_node_all, & ! communicater all PEs node version
82	comm_tasks, & ! communicater tasks
83	me, me_node, me_task,& ! identity of this PE
84	nodes, & ! number of nodes
85	tasks_per_logical_node = -1 ! default no cluster
86
87
88	!
89	!-- Public interfaces
90	INTERFACE poisfft_hybrid_ini
91	MODULE PROCEDURE poisfft_hybrid_ini
92	END INTERFACE poisfft_hybrid_ini
93
94	INTERFACE poisfft_hybrid
95	MODULE PROCEDURE poisfft_hybrid
96	END INTERFACE poisfft_hybrid
97
98	!
99	!-- Private interfaces
100	INTERFACE poisfft_hybrid_omp
101	MODULE PROCEDURE poisfft_hybrid_omp
102	END INTERFACE poisfft_hybrid_omp
103
104	INTERFACE poisfft_hybrid_omp_vec
105	MODULE PROCEDURE poisfft_hybrid_omp_vec
106	END INTERFACE poisfft_hybrid_omp_vec
107
108	INTERFACE poisfft_hybrid_nodes
109	MODULE PROCEDURE poisfft_hybrid_nodes
110	END INTERFACE poisfft_hybrid_nodes
111
112	INTERFACE tridia_hybrid
113	MODULE PROCEDURE tridia_hybrid
114	END INTERFACE tridia_hybrid
115
116	INTERFACE cascade
117	MODULE PROCEDURE cascade
118	END INTERFACE cascade
119
120	CONTAINS
121
122
123	SUBROUTINE poisfft_hybrid_ini
124
125	USE control_parameters
126	USE pegrid
127
128	IMPLICIT NONE
129
130	CHARACTER(LEN=8) :: cdummy
131	INTEGER :: idummy, istat
132	INTEGER, DIMENSION(2) :: coords, dims
133
134	LOGICAL, DIMENSION(2) :: period = .false., re_dims
135
136
137	!
138	!-- Set the internal index values for the hybrid solver
139	#if defined( __parallel )
140	npe_s = pdims(1)
141	#else
142	npe_s = 1
143	#endif
144	nxl_a = 0
145	nxr_a = nx
146	nxl_p = 0
147	nxr_p = ( ( nx+1 ) / npe_s ) - 1
148	nys_a = nys
149	nyn_a = nyn
150	nys_p = 0
151	nyn_p = ( ( ny+1 ) / npe_s ) - 1
152
153	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
154
155	#if defined( __KKMP )
156	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
157	READ ( cdummy, '(I8)' ) n_omp_threads
158	IF ( n_omp_threads > 1 ) THEN
159	WRITE( message_string, * ) 'Number of OpenMP threads = ', &
160	n_omp_threads
161	CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
162	ENDIF
163	#else
164	n_omp_threads = 1
165	#endif
166	!
167	!-- Initialize the one-dimensional FFT routines
168	CALL fft_init
169
170	!
171	!-- Setup for multi node version (poisfft_hybrid_nodes)
172	IF ( n_omp_threads == 1 .AND. &
173	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
174
175	IF ( tasks_per_node /= -9999 ) THEN
176	!
177	!-- Multi node version requires that the available number of
178	!-- switches per logical partition must be an integral divisor
179	!-- of the chosen number of tasks per node
180	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
181	!
182	!-- Set the switch which decides about usage of the multi node
183	!-- version
184	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
185	numprocs > tasks_per_node ) THEN
186	tasks_per_logical_node = tasks_per_node / switch_per_lpar
187	ENDIF
188
189	IF ( tasks_per_logical_node > -1 ) THEN
190
191	WRITE( message_string, * ) 'running optimized ', &
192	'multinode version', &
193	'&switch_per_lpar = ', &
194	switch_per_lpar, &
195	'&tasks_per_lpar = ', &
196	tasks_per_node, &
197	'tasks_per_logical_node = ', &
198	tasks_per_logical_node
199	CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )
200
201	ENDIF
202
203	ENDIF
204	ENDIF
205	ENDIF
206
207	!
208	!-- Determine sub-topologies for multi node version
209	IF ( tasks_per_logical_node >= 2 ) THEN
210
211	#if defined( __parallel )
212	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
213	tasks_per_logical_node
214	dims(1) = nodes
215	dims(2) = tasks_per_logical_node
216
217	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
218	comm_node_all, istat )
219	CALL MPI_COMM_RANK( comm_node_all, me, istat )
220
221	re_dims(1) = .TRUE.
222	re_dims(2) = .FALSE.
223	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
224	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
225
226	re_dims(1) = .FALSE.
227	re_dims(2) = .TRUE.
228	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
229	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
230
231	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
232	! tasks_per_logical_node
233	#else
234	message_string = 'parallel environment (MPI) required'
235	CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
236	#endif
237	ENDIF
238
239	END SUBROUTINE poisfft_hybrid_ini
240
241
242	SUBROUTINE poisfft_hybrid( ar )
243
244	USE control_parameters
245	USE interfaces
246
247	IMPLICIT NONE
248
249	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
250
251	IF ( host(1:3) == 'nec' ) THEN
252	CALL poisfft_hybrid_omp_vec( ar )
253	ELSE
254	IF ( tasks_per_logical_node == -1 ) THEN
255	CALL poisfft_hybrid_omp( ar )
256	ELSE
257	CALL poisfft_hybrid_nodes( ar )
258	ENDIF
259	ENDIF
260
261	END SUBROUTINE poisfft_hybrid
262
263
264	SUBROUTINE poisfft_hybrid_omp ( ar )
265
266	USE cpulog
267	USE interfaces
268
269	IMPLICIT NONE
270
271	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
272	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
273
274	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
275
276	REAL, DIMENSION(0:nx) :: fftx_ar
277	REAL, DIMENSION(0:ny,istride) :: ffty_ar
278
279	REAL, DIMENSION(0:nx,nz) :: tri_ar
280
281	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
282	#if defined( __KKMP )
283	INTEGER :: omp_get_thread_num
284	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
285	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
286	#else
287	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
288	#endif
289
290
291	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
292
293	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
294
295	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
296	!$OMP DO
297	!
298	!-- Store grid points to be transformed on a 1d-array, do the fft
299	!-- and sample the results on a 4d-array
300	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
301	iei = MIN( iouter+istride-1, nxr_p )
302	DO k = 1, nz
303
304	DO i = iouter, iei
305	ii = nxl + i
306	ir = i - iouter + 1
307
308	DO j = nys_a, nyn_a
309	ffty_ar(j,ir) = ar(k,j,ii)
310	ENDDO
311
312	CALL fft_y( ffty_ar(:,ir), 'forward' )
313	ENDDO
314
315	m = nys_a
316	DO n = 1, npe_s
317	DO j = nys_p, nyn_p
318	DO i = iouter, iei
319	ir = i - iouter + 1
320	work1(i,k,j,n) = ffty_ar(m,ir)
321	ENDDO
322	m = m+1
323	ENDDO
324	ENDDO
325
326	ENDDO
327	ENDDO
328	!$OMP END PARALLEL
329
330	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
331
332	#if defined( __parallel )
333	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
334
335	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
336	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
337	comm2d, istat )
338
339	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
340	#else
341	work2 = work1
342	#endif
343
344	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
345
346	#if defined( __KKMP )
347	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
348	!$OMP DO
349	DO j = nys_p, nyn_p
350	jthread = omp_get_thread_num() + 1
351	#else
352	DO j = nys_p, nyn_p
353	jthread = 1
354	#endif
355	DO k = 1, nz
356
357	m = nxl_a
358	DO n = 1, npe_s
359	DO i = nxl_p, nxr_p
360	fftx_ar(m) = work2(i,k,j,n)
361	m = m+1
362	ENDDO
363	ENDDO
364
365	CALL fft_x( fftx_ar, 'forward' )
366
367	DO i = nxl_a, nxr_a
368	tri_ar(i,k) = fftx_ar(i)
369	ENDDO
370
371	ENDDO
372
373	jj = myid * (nyn_p-nys_p+1) + j
374	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
375
376	DO k = 1, nz
377	DO i = nxl_a, nxr_a
378	fftx_ar(i) = tri_ar (i,k)
379	ENDDO
380
381	CALL fft_x( fftx_ar, 'backward' )
382
383	m = nxl_a
384	DO n = 1, npe_s
385	DO i = nxl_p, nxr_p
386	work2(i,k,j,n) = fftx_ar(m)
387	m = m+1
388	ENDDO
389	ENDDO
390
391	ENDDO
392	ENDDO
393	#if defined( __KKMP )
394	!$OMP END PARALLEL
395	#endif
396
397	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
398
399	#if defined( __parallel )
400	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
401	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
402
403	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
404	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
405	comm2d, istat )
406
407	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
408	#else
409	work1 = work2
410	#endif
411
412	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
413
414	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
415	!$OMP DO
416	DO iouter = nxl_p, nxr_p, istride
417	iei = MIN( iouter+istride-1, nxr_p )
418	DO k = 1, nz
419
420	m = nys_a
421	DO n = 1, npe_s
422	DO j = nys_p, nyn_p
423	DO i = iouter, iei
424	ir = i - iouter + 1
425	ffty_ar(m,ir) = work1 (i,k,j,n)
426	ENDDO
427	m = m+1
428	ENDDO
429	ENDDO
430
431	DO i = iouter, iei
432	ii = nxl + i
433	ir = i - iouter + 1
434	CALL fft_y( ffty_ar(:,ir), 'backward' )
435
436	DO j = nys_a, nyn_a
437	ar(k,j,ii) = ffty_ar(j,ir)
438	ENDDO
439	ENDDO
440
441	ENDDO
442	ENDDO
443	!$OMP END PARALLEL
444
445	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
446
447	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
448
449	#if defined( __KKMP )
450	DEALLOCATE( tri )
451	#endif
452
453	END SUBROUTINE poisfft_hybrid_omp
454
455
456	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
457
458	USE cpulog
459	USE interfaces
460
461	IMPLICIT NONE
462
463	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
464	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
465
466	REAL, DIMENSION(0:nx,nz) :: tri_ar
467
468	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
469
470	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
471	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
472
473	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
474	#if defined( __KKMP )
475	INTEGER :: omp_get_thread_num
476	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
477	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
478	#else
479	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
480	#endif
481
482
483	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
484
485	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
486
487	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
488	!$OMP DO
489	!
490	!-- Store grid points to be transformed on a 1d-array, do the fft
491	!-- and sample the results on a 4d-array
492	DO i = nxl_p, nxr_p
493
494	DO j = nys_a, nyn_a
495	DO k = 1, nz
496	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
497	ENDDO
498	ENDDO
499
500	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
501	ENDDO
502
503	!$OMP DO
504	DO k = 1, nz
505	m = nys_a
506	DO n = 1, npe_s
507	DO j = nys_p, nyn_p
508	DO i = nxl_p, nxr_p
509	work1(i,k,j,n) = ffty_ar3(m,k,i)
510	ENDDO
511	m = m+1
512	ENDDO
513	ENDDO
514	ENDDO
515	!$OMP END PARALLEL
516
517	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
518
519	#if defined( __parallel )
520	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
521	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
522	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
523	comm2d, istat )
524	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
525	#else
526	work2 = work1
527	#endif
528
529	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
530
531	#if defined( __KKMP )
532	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
533	!$OMP DO
534	DO j = nys_p, nyn_p
535	jthread = omp_get_thread_num() + 1
536	#else
537	DO j = nys_p, nyn_p
538	jthread = 1
539	#endif
540	DO k = 1, nz
541
542	m = nxl_a
543	DO n = 1, npe_s
544	DO i = nxl_p, nxr_p
545	fftx_ar3(m,k,j) = work2(i,k,j,n)
546	m = m+1
547	ENDDO
548	ENDDO
549	ENDDO
550
551	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
552
553	DO k = 1, nz
554	DO i = nxl_a, nxr_a
555	tri_ar(i,k) = fftx_ar3(i,k,j)
556	ENDDO
557	ENDDO
558
559	jj = myid * (nyn_p-nys_p+1) + j
560	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
561
562	DO k = 1, nz
563	DO i = nxl_a, nxr_a
564	fftx_ar3(i,k,j) = tri_ar (i,k)
565	ENDDO
566	ENDDO
567
568	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
569
570	DO k = 1, nz
571	m = nxl_a
572	DO n = 1, npe_s
573	DO i = nxl_p, nxr_p
574	work2(i,k,j,n) = fftx_ar3(m,k,j)
575	m = m+1
576	ENDDO
577	ENDDO
578	ENDDO
579
580	ENDDO
581	#if defined( __KKMP )
582	!$OMP END PARALLEL
583	#endif
584
585	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
586
587	#if defined( __parallel )
588	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
589	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
590	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
591	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
592	comm2d, istat )
593	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
594	#else
595	work1 = work2
596	#endif
597
598	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
599
600	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
601	!$OMP DO
602	DO k = 1, nz
603	m = nys_a
604	DO n = 1, npe_s
605	DO j = nys_p, nyn_p
606	DO i = nxl_p, nxr_p
607	ffty_ar3(m,k,i) = work1(i,k,j,n)
608	ENDDO
609	m = m+1
610	ENDDO
611	ENDDO
612	ENDDO
613
614	!$OMP DO
615	DO i = nxl_p, nxr_p
616	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
617	DO j = nys_a, nyn_a
618	DO k = 1, nz
619	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
620	ENDDO
621	ENDDO
622	ENDDO
623	!$OMP END PARALLEL
624
625	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
626
627	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
628
629	#if defined( __KKMP )
630	DEALLOCATE( tri )
631	#endif
632
633	END SUBROUTINE poisfft_hybrid_omp_vec
634
635
636	SUBROUTINE poisfft_hybrid_nodes ( ar )
637
638	USE cpulog
639	USE interfaces
640
641	IMPLICIT NONE
642
643	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
644	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
645	n, nn, nt, nw1, nw2
646
647	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
648
649	REAL, DIMENSION(0:nx) :: fftx_ar
650	REAL, DIMENSION(0:ny,istride) :: ffty_ar
651
652	REAL, DIMENSION(0:nx,nz) :: tri_ar
653
654	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
655	nodes,nys_p:nyn_p) :: work1,work2
656	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
657
658
659	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
660
661	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
662
663	!
664	!-- Store grid points to be transformed on a 1d-array, do the fft
665	!-- and sample the results on a 4d-array
666	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
667	iei = MIN( iouter+istride-1, nxr_p )
668	DO k = 1, nz
669
670	DO i = iouter, iei
671	ii = nxl + i
672	ir = i - iouter + 1
673
674	DO j = nys_a, nyn_a
675	ffty_ar(j,ir) = ar(k,j,ii)
676	ENDDO
677
678	CALL fft_y( ffty_ar(:,ir), 'forward' )
679	ENDDO
680
681	m = nys_a
682	DO nn = 1, nodes
683	DO nt = 1, tasks_per_logical_node
684	DO j = nys_p, nyn_p
685	DO i = iouter, iei
686	ir = i - iouter + 1
687	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
688	ENDDO
689	m = m+1
690	ENDDO
691	ENDDO
692	ENDDO
693
694	ENDDO
695	ENDDO
696
697	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
698
699	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
700	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
701	DO nn = 1, nodes
702	DO j = nys_p, nyn_p
703	#if defined( __parallel )
704	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
705	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
706	comm_tasks, istat )
707	#endif
708	ENDDO
709	ENDDO
710	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
711
712
713	DO j = nys_p, nyn_p
714
715	CALL cascade( 1, j, nys_p, nyn_p )
716	nw2 = nw1 * SIZE( work1, 3 )
717	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
718	#if defined( __parallel )
719	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
720	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
721	comm_nodes, istat )
722	#endif
723	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
724	CALL cascade( 2, j, nys_p, nyn_p )
725
726	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
727	DO k = 1, nz
728
729	m = nxl_a
730	DO nn = 1, nodes
731	DO nt = 1, tasks_per_logical_node
732	DO i = nxl_p, nxr_p
733	fftx_ar(m) = work1(i,k,nt,nn,j)
734	m = m+1
735	ENDDO
736	ENDDO
737	ENDDO
738
739	CALL fft_x( fftx_ar, 'forward' )
740
741	DO i = nxl_a, nxr_a
742	tri_ar(i,k) = fftx_ar(i)
743	ENDDO
744
745	ENDDO
746
747	jj = myid * (nyn_p-nys_p+1) + j
748	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
749
750	DO k = 1, nz
751	DO i = nxl_a, nxr_a
752	fftx_ar(i) = tri_ar(i,k)
753	ENDDO
754
755	CALL fft_x( fftx_ar, 'backward' )
756
757	m = nxl_a
758	DO nn = 1, nodes
759	DO nt = 1, tasks_per_logical_node
760	DO i = nxl_p, nxr_p
761	work1(i,k,nt,nn,j) = fftx_ar(m)
762	m = m+1
763	ENDDO
764	ENDDO
765	ENDDO
766	ENDDO
767
768	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
769	nw2 = nw1 * SIZE( work1, 3 )
770	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
771	#if defined( __parallel )
772	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
773	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
774	comm_nodes, istat )
775	#endif
776	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
777
778	ENDDO
779
780	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
781	DO nn = 1, nodes
782	DO j = nys_p, nyn_p
783	#if defined( __parallel )
784	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
785	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
786	comm_tasks, istat )
787	#endif
788	ENDDO
789	ENDDO
790	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
791
792	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
793
794	DO iouter = nxl_p, nxr_p, istride
795	iei = MIN( iouter+istride-1, nxr_p )
796	DO k = 1, nz
797
798	m = nys_a
799	DO nn = 1, nodes
800	DO nt = 1, tasks_per_logical_node
801	DO j = nys_p, nyn_p
802	DO i = iouter, iei
803	ir = i - iouter + 1
804	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
805	ENDDO
806	m = m+1
807	ENDDO
808	ENDDO
809	ENDDO
810
811	DO i = iouter, iei
812	ii = nxl + i
813	ir = i - iouter + 1
814	CALL fft_y( ffty_ar(:,ir), 'backward' )
815
816	DO j = nys_a, nyn_a
817	ar(k,j,ii) = ffty_ar(j,ir)
818	ENDDO
819	ENDDO
820
821	ENDDO
822	ENDDO
823
824	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
825
826	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
827
828	END SUBROUTINE poisfft_hybrid_nodes
829
830
831
832	SUBROUTINE tridia_hybrid( j, ar, tri )
833
834	USE arrays_3d
835	USE control_parameters
836	USE grid_variables
837
838	IMPLICIT NONE
839
840	INTEGER :: i, j, k, nnyh
841	REAL, DIMENSION(0:nx,nz) :: ar
842	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
843	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
844
845	nnyh = (ny+1) / 2
846
847	tri = 0.0
848	!
849	!-- Define constant elements of the tridiagonal matrix.
850	DO k = 0, nz-1
851	DO i = 0,nx
852	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
853	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
854	ENDDO
855	ENDDO
856
857	IF ( j <= nnyh ) THEN
858	CALL maketri_hybrid( j )
859	ELSE
860	CALL maketri_hybrid( ny+1-j)
861	ENDIF
862	CALL zerleg_hybrid
863	CALL substi_hybrid( ar, tri )
864
865	CONTAINS
866
867	SUBROUTINE maketri_hybrid( j )
868
869	!----------------------------------------------------------------------!
870	! maketri !
871	! !
872	! computes the i- and j-dependent component of the matrix !
873	!----------------------------------------------------------------------!
874
875	USE constants
876
877	IMPLICIT NONE
878
879	INTEGER :: i, j, k, nnxh
880	REAL :: a, c
881
882	REAL, DIMENSION(0:nx) :: l
883
884
885	nnxh = (nx+1) / 2
886	!
887	!-- Provide the tridiagonal matrix for solution of the Poisson equation
888	!-- in Fourier space. The coefficients are computed following the method
889	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
890	!-- Siano's original version.
891	DO i = 0,nx
892	IF ( i >= 0 .AND. i < nnxh ) THEN
893	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
894	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
895	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
896	FLOAT( ny+1 ) ) ) / ( dy * dy )
897	ELSEIF ( i == nnxh ) THEN
898	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
899	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
900	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
901	FLOAT(ny+1) ) ) / ( dy * dy )
902	ELSE
903	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
904	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
905	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
906	FLOAT( ny+1 ) ) ) / ( dy * dy )
907	ENDIF
908	ENDDO
909
910	DO k = 0,nz-1
911	DO i = 0, nx
912	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
913	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
914	tri(1,i,k) = a + c - l(i)
915	ENDDO
916	ENDDO
917	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
918	DO i = 0,nx
919	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
920	ENDDO
921	ENDIF
922	IF ( ibc_p_t == 1 ) THEN
923	DO i = 0,nx
924	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
925	ENDDO
926	ENDIF
927
928	END SUBROUTINE maketri_hybrid
929
930
931	SUBROUTINE zerleg_hybrid
932
933	!----------------------------------------------------------------------!
934	! zerleg !
935	! !
936	! Splitting of the tridiagonal matrix (Thomas algorithm) !
937	!----------------------------------------------------------------------!
938
939	USE indices
940
941	IMPLICIT NONE
942
943	INTEGER :: i, k
944
945	!
946	!-- Splitting
947	DO i = 0, nx
948	tri(4,i,0) = tri(1,i,0)
949	ENDDO
950	DO k = 1, nz-1
951	DO i = 0,nx
952	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
953	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
954	ENDDO
955	ENDDO
956
957	END SUBROUTINE zerleg_hybrid
958
959	SUBROUTINE substi_hybrid( ar, tri )
960
961	!----------------------------------------------------------------------!
962	! substi !
963	! !
964	! Substitution (Forward and Backward) (Thomas algorithm) !
965	!----------------------------------------------------------------------!
966
967	IMPLICIT NONE
968
969	INTEGER :: i, j, k
970	REAL, DIMENSION(0:nx,nz) :: ar
971	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
972	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
973
974	!
975	!-- Forward substitution
976	DO i = 0, nx
977	ar1(i,0) = ar(i,1)
978	ENDDO
979	DO k = 1, nz - 1
980	DO i = 0,nx
981	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
982	ENDDO
983	ENDDO
984
985	!
986	!-- Backward substitution
987	DO i = 0,nx
988	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
989	ENDDO
990	DO k = nz-2, 0, -1
991	DO i = 0,nx
992	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
993	/ tri(4,i,k)
994	ENDDO
995	ENDDO
996
997	END SUBROUTINE substi_hybrid
998
999	END SUBROUTINE tridia_hybrid
1000
1001
1002	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
1003
1004	USE cpulog
1005
1006	IMPLICIT NONE
1007
1008	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
1009	INTEGER, SAVE :: tag = 10
1010	#if defined( __parallel )
1011	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
1012	INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata
1013	#endif
1014
1015	REAL :: buf, buf1
1016
1017
1018	buf = 1.0
1019	buf1 = 1.1
1020	IF ( me_node == 0 ) THEN ! first node only
1021
1022	SELECT CASE ( loca )
1023
1024	CASE ( 1 ) ! before alltoall
1025
1026	IF( me_task > 0 ) THEN ! first task does not wait
1027	#if defined( __parallel )
1028	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
1029	buf1, 1, MPI_REAL, me_task-1, 0, &
1030	comm_tasks, stat, ierr )
1031	#endif
1032	ELSEIF ( j > nys_p ) THEN
1033	req = 0
1034	tag = MOD( tag-10, 10 ) + 10
1035	#if defined( __parallel )
1036	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
1037	tag, comm_tasks, req, ierr )
1038	reqa = req
1039	CALL MPI_WAITALL( 1, reqa, stata, ierr )
1040	#endif
1041	ENDIF
1042
1043	CASE ( 2 ) ! after alltoall
1044
1045	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
1046	#if defined( __parallel )
1047	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
1048	buf1, 1, MPI_REAL, me_task+1, 0, &
1049	comm_tasks, stat, ierr)
1050	#endif
1051	ELSEIF ( j < nyn_p ) THEN
1052	req = 0
1053	tag = MOD( tag-10, 10 ) + 10
1054	#if defined( __parallel )
1055	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
1056	ierr )
1057	#endif
1058	ENDIF
1059
1060	END SELECT
1061
1062	ENDIF
1063
1064	END SUBROUTINE cascade
1065
1066	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |