Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 251

Last change on this file since 251 was 4, checked in by raasch, 18 years ago
Id keyword set as property for all *.f90 files
Property svn:keywords set to `Id`
File size: 30.7 KB

Line
1	MODULE poisfft_hybrid_mod
2	!------------------------------------------------------------------------------
3	!
4	! Actual revisions:
5	! -----------------
6	!
7	!
8	! Former revisions:
9	! -----------------
10	! $Id: poisfft_hybrid.f90 4 2007-02-13 11:33:16Z raasch $
11	! RCS Log replace by Id keyword, revision history cleaned up
12	!
13	! Revision 1.11 2004/04/30 12:43:14 raasch
14	! Renaming of fft routines, additional argument in calls of fft_y_m
15	!
16	! Revision 1.2 2002/12/19 16:08:31 raasch
17	! Preprocessor directive KKMP introduced (OMP does NOT work),
18	! array tri will be a shared array in OpenMP loop, to get better cache
19	! utilization, the i index (x-direction) will be executed in stride
20	! "istride" as outer loop and in a shorter inner loop,
21	! overlapping of computation and communication realized by new routine
22	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
23	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
24	!
25	!
26	! Description:
27	! ------------
28	! Solution of the Poisson equation with a 2D spectral method.
29	! Hybrid version for parallel computers using a 1D domain decomposition,
30	! realized with MPI, along x and parallelization with OPEN-MP along y
31	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
32	! optimization is realized by overlapping of computation and communication
33	! and by simultaneously executing as many communication calls as switches
34	! per logical partition (LPAR) are available. This version comes into
35	! effect if more than one node is used and if the environment variable
36	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
37	! without any rest.
38	!
39	! WARNING: In case of OpenMP, there are problems with allocating large
40	! arrays in parallel regions.
41	!
42	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
43	!------------------------------------------------------------------------------!
44
45	USE fft_xy
46	USE indices
47	USE pegrid
48
49	IMPLICIT NONE
50
51	PRIVATE
52	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
53
54	INTEGER, PARAMETER :: switch_per_lpar = 2
55
56	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
57	nxl_p, nxr_p, & ! partial x dimension
58	nys_a, nyn_a, & ! total y dimension
59	nys_p, nyn_p, & ! partial y dimension
60
61	npe_s, & ! total number of PEs for solver
62	nwords, & ! number of points to be exchanged
63	! with MPI_ALLTOALL
64	n_omp_threads ! number of OpenMP threads
65
66	!
67	!-- Variables for multi node version (cluster version) using routine
68	!-- poisfft_hybrid_nodes
69	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
70	comm_node_all, & ! communicater all PEs node version
71	comm_tasks, & ! communicater tasks
72	me, me_node, me_task,& ! identity of this PE
73	nodes, & ! number of nodes
74	tasks_per_logical_node = -1 ! default no cluster
75
76
77	!
78	!-- Public interfaces
79	INTERFACE poisfft_hybrid_ini
80	MODULE PROCEDURE poisfft_hybrid_ini
81	END INTERFACE poisfft_hybrid_ini
82
83	INTERFACE poisfft_hybrid
84	MODULE PROCEDURE poisfft_hybrid
85	END INTERFACE poisfft_hybrid
86
87	!
88	!-- Private interfaces
89	INTERFACE poisfft_hybrid_omp
90	MODULE PROCEDURE poisfft_hybrid_omp
91	END INTERFACE poisfft_hybrid_omp
92
93	INTERFACE poisfft_hybrid_omp_vec
94	MODULE PROCEDURE poisfft_hybrid_omp_vec
95	END INTERFACE poisfft_hybrid_omp_vec
96
97	INTERFACE poisfft_hybrid_nodes
98	MODULE PROCEDURE poisfft_hybrid_nodes
99	END INTERFACE poisfft_hybrid_nodes
100
101	INTERFACE tridia_hybrid
102	MODULE PROCEDURE tridia_hybrid
103	END INTERFACE tridia_hybrid
104
105	INTERFACE cascade
106	MODULE PROCEDURE cascade
107	END INTERFACE cascade
108
109	CONTAINS
110
111
112	SUBROUTINE poisfft_hybrid_ini
113
114	USE control_parameters
115	USE pegrid
116
117	IMPLICIT NONE
118
119	CHARACTER(LEN=8) :: cdummy
120	INTEGER :: idummy, istat
121	INTEGER, DIMENSION(2) :: coords, dims
122
123	LOGICAL, DIMENSION(2) :: period = .false., re_dims
124
125
126	!
127	!-- Set the internal index values for the hybrid solver
128	#if defined( __parallel )
129	npe_s = pdims(1)
130	#else
131	npe_s = 1
132	#endif
133	nxl_a = 0
134	nxr_a = nx
135	nxl_p = 0
136	nxr_p = ( ( nx+1 ) / npe_s ) - 1
137	nys_a = nys
138	nyn_a = nyn
139	nys_p = 0
140	nyn_p = ( ( ny+1 ) / npe_s ) - 1
141
142	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
143
144	#if defined( __KKMP )
145	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
146	READ ( cdummy, '(I8)' ) n_omp_threads
147	IF ( myid == 0 .AND. n_omp_threads > 1 ) THEN
148	PRINT, '** poisfft_hybrid_ini: Number of OpenMP threads = ', &
149	n_omp_threads
150	ENDIF
151	#else
152	n_omp_threads = 1
153	#endif
154	!
155	!-- Initialize the one-dimensional FFT routines
156	CALL fft_init
157
158	!
159	!-- Setup for multi node version (poisfft_hybrid_nodes)
160	IF ( n_omp_threads == 1 .AND. &
161	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
162
163	IF ( tasks_per_node /= -9999 ) THEN
164	!
165	!-- Multi node version requires that the available number of
166	!-- switches per logical partition must be an integral divisor
167	!-- of the chosen number of tasks per node
168	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
169	!
170	!-- Set the switch which decides about usage of the multi node
171	!-- version
172	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
173	numprocs > tasks_per_node ) THEN
174	tasks_per_logical_node = tasks_per_node / switch_per_lpar
175	ENDIF
176
177	IF ( myid == 0 .AND. tasks_per_logical_node > -1 ) THEN
178	PRINT, '** poisfft_hybrid_ini: running optimized ', &
179	'multinode version'
180	PRINT*, ' switch_per_lpar = ', switch_per_lpar
181	PRINT*, ' tasks_per_lpar = ', tasks_per_node
182	PRINT*, ' tasks_per_logical_node = ', &
183	tasks_per_logical_node
184	ENDIF
185
186	ENDIF
187	ENDIF
188	ENDIF
189
190	!
191	!-- Determine sub-topologies for multi node version
192	IF ( tasks_per_logical_node >= 2 ) THEN
193
194	#if defined( __parallel )
195	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
196	tasks_per_logical_node
197	dims(1) = nodes
198	dims(2) = tasks_per_logical_node
199
200	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
201	comm_node_all, istat )
202	CALL MPI_COMM_RANK( comm_node_all, me, istat )
203
204	re_dims(1) = .TRUE.
205	re_dims(2) = .FALSE.
206	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
207	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
208
209	re_dims(1) = .FALSE.
210	re_dims(2) = .TRUE.
211	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
212	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
213
214	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
215	! tasks_per_logical_node
216	#else
217	PRINT*, '+++ poisfft_hybrid_ini: parallel environment (MPI) required'
218	CALL local_stop
219	#endif
220	ENDIF
221
222	END SUBROUTINE poisfft_hybrid_ini
223
224
225	SUBROUTINE poisfft_hybrid( ar )
226
227	USE control_parameters
228	USE interfaces
229
230	IMPLICIT NONE
231
232	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
233
234	IF ( host(1:3) == 'nec' ) THEN
235	CALL poisfft_hybrid_omp_vec( ar )
236	ELSE
237	IF ( tasks_per_logical_node == -1 ) THEN
238	CALL poisfft_hybrid_omp( ar )
239	ELSE
240	CALL poisfft_hybrid_nodes( ar )
241	ENDIF
242	ENDIF
243
244	END SUBROUTINE poisfft_hybrid
245
246
247	SUBROUTINE poisfft_hybrid_omp ( ar )
248
249	USE cpulog
250	USE interfaces
251
252	IMPLICIT NONE
253
254	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
255	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
256
257	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
258
259	REAL, DIMENSION(0:nx) :: fftx_ar
260	REAL, DIMENSION(0:ny,istride) :: ffty_ar
261
262	REAL, DIMENSION(0:nx,nz) :: tri_ar
263
264	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
265	#if defined( __KKMP )
266	INTEGER :: omp_get_thread_num
267	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
268	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
269	#else
270	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
271	#endif
272
273
274	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
275
276	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
277
278	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
279	!$OMP DO
280	!
281	!-- Store grid points to be transformed on a 1d-array, do the fft
282	!-- and sample the results on a 4d-array
283	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
284	iei = MIN( iouter+istride-1, nxr_p )
285	DO k = 1, nz
286
287	DO i = iouter, iei
288	ii = nxl + i
289	ir = i - iouter + 1
290
291	DO j = nys_a, nyn_a
292	ffty_ar(j,ir) = ar(k,j,ii)
293	ENDDO
294
295	CALL fft_y( ffty_ar(:,ir), 'forward' )
296	ENDDO
297
298	m = nys_a
299	DO n = 1, npe_s
300	DO j = nys_p, nyn_p
301	DO i = iouter, iei
302	ir = i - iouter + 1
303	work1(i,k,j,n) = ffty_ar(m,ir)
304	ENDDO
305	m = m+1
306	ENDDO
307	ENDDO
308
309	ENDDO
310	ENDDO
311	!$OMP END PARALLEL
312
313	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
314
315	#if defined( __parallel )
316	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
317
318	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
319	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
320	comm2d, istat )
321
322	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
323	#else
324	work2 = work1
325	#endif
326
327	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
328
329	#if defined( __KKMP )
330	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
331	!$OMP DO
332	DO j = nys_p, nyn_p
333	jthread = omp_get_thread_num() + 1
334	#else
335	DO j = nys_p, nyn_p
336	jthread = 1
337	#endif
338	DO k = 1, nz
339
340	m = nxl_a
341	DO n = 1, npe_s
342	DO i = nxl_p, nxr_p
343	fftx_ar(m) = work2(i,k,j,n)
344	m = m+1
345	ENDDO
346	ENDDO
347
348	CALL fft_x( fftx_ar, 'forward' )
349
350	DO i = nxl_a, nxr_a
351	tri_ar(i,k) = fftx_ar(i)
352	ENDDO
353
354	ENDDO
355
356	jj = myid * (nyn_p-nys_p+1) + j
357	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
358
359	DO k = 1, nz
360	DO i = nxl_a, nxr_a
361	fftx_ar(i) = tri_ar (i,k)
362	ENDDO
363
364	CALL fft_x( fftx_ar, 'backward' )
365
366	m = nxl_a
367	DO n = 1, npe_s
368	DO i = nxl_p, nxr_p
369	work2(i,k,j,n) = fftx_ar(m)
370	m = m+1
371	ENDDO
372	ENDDO
373
374	ENDDO
375	ENDDO
376	#if defined( __KKMP )
377	!$OMP END PARALLEL
378	#endif
379
380	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
381
382	#if defined( __parallel )
383	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
384	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
385
386	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
387	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
388	comm2d, istat )
389
390	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
391	#else
392	work1 = work2
393	#endif
394
395	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
396
397	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
398	!$OMP DO
399	DO iouter = nxl_p, nxr_p, istride
400	iei = MIN( iouter+istride-1, nxr_p )
401	DO k = 1, nz
402
403	m = nys_a
404	DO n = 1, npe_s
405	DO j = nys_p, nyn_p
406	DO i = iouter, iei
407	ir = i - iouter + 1
408	ffty_ar(m,ir) = work1 (i,k,j,n)
409	ENDDO
410	m = m+1
411	ENDDO
412	ENDDO
413
414	DO i = iouter, iei
415	ii = nxl + i
416	ir = i - iouter + 1
417	CALL fft_y( ffty_ar(:,ir), 'backward' )
418
419	DO j = nys_a, nyn_a
420	ar(k,j,ii) = ffty_ar(j,ir)
421	ENDDO
422	ENDDO
423
424	ENDDO
425	ENDDO
426	!$OMP END PARALLEL
427
428	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
429
430	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
431
432	#if defined( __KKMP )
433	DEALLOCATE( tri )
434	#endif
435
436	END SUBROUTINE poisfft_hybrid_omp
437
438
439	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
440
441	USE cpulog
442	USE interfaces
443
444	IMPLICIT NONE
445
446	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
447	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
448
449	REAL, DIMENSION(0:nx,nz) :: tri_ar
450
451	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
452
453	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
454	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
455
456	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
457	#if defined( __KKMP )
458	INTEGER :: omp_get_thread_num
459	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
460	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
461	#else
462	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
463	#endif
464
465
466	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
467
468	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
469
470	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
471	!$OMP DO
472	!
473	!-- Store grid points to be transformed on a 1d-array, do the fft
474	!-- and sample the results on a 4d-array
475	DO i = nxl_p, nxr_p
476
477	DO j = nys_a, nyn_a
478	DO k = 1, nz
479	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
480	ENDDO
481	ENDDO
482
483	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
484	ENDDO
485
486	!$OMP DO
487	DO k = 1, nz
488	m = nys_a
489	DO n = 1, npe_s
490	DO j = nys_p, nyn_p
491	DO i = nxl_p, nxr_p
492	work1(i,k,j,n) = ffty_ar3(m,k,i)
493	ENDDO
494	m = m+1
495	ENDDO
496	ENDDO
497	ENDDO
498	!$OMP END PARALLEL
499
500	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
501
502	#if defined( __parallel )
503	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
504	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
505	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
506	comm2d, istat )
507	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
508	#else
509	work2 = work1
510	#endif
511
512	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
513
514	#if defined( __KKMP )
515	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
516	!$OMP DO
517	DO j = nys_p, nyn_p
518	jthread = omp_get_thread_num() + 1
519	#else
520	DO j = nys_p, nyn_p
521	jthread = 1
522	#endif
523	DO k = 1, nz
524
525	m = nxl_a
526	DO n = 1, npe_s
527	DO i = nxl_p, nxr_p
528	fftx_ar3(m,k,j) = work2(i,k,j,n)
529	m = m+1
530	ENDDO
531	ENDDO
532	ENDDO
533
534	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
535
536	DO k = 1, nz
537	DO i = nxl_a, nxr_a
538	tri_ar(i,k) = fftx_ar3(i,k,j)
539	ENDDO
540	ENDDO
541
542	jj = myid * (nyn_p-nys_p+1) + j
543	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
544
545	DO k = 1, nz
546	DO i = nxl_a, nxr_a
547	fftx_ar3(i,k,j) = tri_ar (i,k)
548	ENDDO
549	ENDDO
550
551	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
552
553	DO k = 1, nz
554	m = nxl_a
555	DO n = 1, npe_s
556	DO i = nxl_p, nxr_p
557	work2(i,k,j,n) = fftx_ar3(m,k,j)
558	m = m+1
559	ENDDO
560	ENDDO
561	ENDDO
562
563	ENDDO
564	#if defined( __KKMP )
565	!$OMP END PARALLEL
566	#endif
567
568	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
569
570	#if defined( __parallel )
571	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
572	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
573	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
574	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
575	comm2d, istat )
576	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
577	#else
578	work1 = work2
579	#endif
580
581	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
582
583	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
584	!$OMP DO
585	DO k = 1, nz
586	m = nys_a
587	DO n = 1, npe_s
588	DO j = nys_p, nyn_p
589	DO i = nxl_p, nxr_p
590	ffty_ar3(m,k,i) = work1(i,k,j,n)
591	ENDDO
592	m = m+1
593	ENDDO
594	ENDDO
595	ENDDO
596
597	!$OMP DO
598	DO i = nxl_p, nxr_p
599	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
600	DO j = nys_a, nyn_a
601	DO k = 1, nz
602	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
603	ENDDO
604	ENDDO
605	ENDDO
606	!$OMP END PARALLEL
607
608	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
609
610	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
611
612	#if defined( __KKMP )
613	DEALLOCATE( tri )
614	#endif
615
616	END SUBROUTINE poisfft_hybrid_omp_vec
617
618
619	SUBROUTINE poisfft_hybrid_nodes ( ar )
620
621	USE cpulog
622	USE interfaces
623
624	IMPLICIT NONE
625
626	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
627	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
628	n, nn, nt, nw1, nw2
629
630	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
631
632	REAL, DIMENSION(0:nx) :: fftx_ar
633	REAL, DIMENSION(0:ny,istride) :: ffty_ar
634
635	REAL, DIMENSION(0:nx,nz) :: tri_ar
636
637	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
638	nodes,nys_p:nyn_p) :: work1,work2
639	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
640
641
642	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
643
644	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
645
646	!
647	!-- Store grid points to be transformed on a 1d-array, do the fft
648	!-- and sample the results on a 4d-array
649	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
650	iei = MIN( iouter+istride-1, nxr_p )
651	DO k = 1, nz
652
653	DO i = iouter, iei
654	ii = nxl + i
655	ir = i - iouter + 1
656
657	DO j = nys_a, nyn_a
658	ffty_ar(j,ir) = ar(k,j,ii)
659	ENDDO
660
661	CALL fft_y( ffty_ar(:,ir), 'forward' )
662	ENDDO
663
664	m = nys_a
665	DO nn = 1, nodes
666	DO nt = 1, tasks_per_logical_node
667	DO j = nys_p, nyn_p
668	DO i = iouter, iei
669	ir = i - iouter + 1
670	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
671	ENDDO
672	m = m+1
673	ENDDO
674	ENDDO
675	ENDDO
676
677	ENDDO
678	ENDDO
679
680	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
681
682	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
683	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
684	DO nn = 1, nodes
685	DO j = nys_p, nyn_p
686	#if defined( __parallel )
687	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
688	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
689	comm_tasks, istat )
690	#endif
691	ENDDO
692	ENDDO
693	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
694
695
696	DO j = nys_p, nyn_p
697
698	CALL cascade( 1, j, nys_p, nyn_p )
699	nw2 = nw1 * SIZE( work1, 3 )
700	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
701	#if defined( __parallel )
702	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
703	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
704	comm_nodes, istat )
705	#endif
706	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
707	CALL cascade( 2, j, nys_p, nyn_p )
708
709	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
710	DO k = 1, nz
711
712	m = nxl_a
713	DO nn = 1, nodes
714	DO nt = 1, tasks_per_logical_node
715	DO i = nxl_p, nxr_p
716	fftx_ar(m) = work1(i,k,nt,nn,j)
717	m = m+1
718	ENDDO
719	ENDDO
720	ENDDO
721
722	CALL fft_x( fftx_ar, 'forward' )
723
724	DO i = nxl_a, nxr_a
725	tri_ar(i,k) = fftx_ar(i)
726	ENDDO
727
728	ENDDO
729
730	jj = myid * (nyn_p-nys_p+1) + j
731	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
732
733	DO k = 1, nz
734	DO i = nxl_a, nxr_a
735	fftx_ar(i) = tri_ar(i,k)
736	ENDDO
737
738	CALL fft_x( fftx_ar, 'backward' )
739
740	m = nxl_a
741	DO nn = 1, nodes
742	DO nt = 1, tasks_per_logical_node
743	DO i = nxl_p, nxr_p
744	work1(i,k,nt,nn,j) = fftx_ar(m)
745	m = m+1
746	ENDDO
747	ENDDO
748	ENDDO
749	ENDDO
750
751	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
752	nw2 = nw1 * SIZE( work1, 3 )
753	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
754	#if defined( __parallel )
755	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
756	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
757	comm_nodes, istat )
758	#endif
759	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
760
761	ENDDO
762
763	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
764	DO nn = 1, nodes
765	DO j = nys_p, nyn_p
766	#if defined( __parallel )
767	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
768	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
769	comm_tasks, istat )
770	#endif
771	ENDDO
772	ENDDO
773	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
774
775	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
776
777	DO iouter = nxl_p, nxr_p, istride
778	iei = MIN( iouter+istride-1, nxr_p )
779	DO k = 1, nz
780
781	m = nys_a
782	DO nn = 1, nodes
783	DO nt = 1, tasks_per_logical_node
784	DO j = nys_p, nyn_p
785	DO i = iouter, iei
786	ir = i - iouter + 1
787	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
788	ENDDO
789	m = m+1
790	ENDDO
791	ENDDO
792	ENDDO
793
794	DO i = iouter, iei
795	ii = nxl + i
796	ir = i - iouter + 1
797	CALL fft_y( ffty_ar(:,ir), 'backward' )
798
799	DO j = nys_a, nyn_a
800	ar(k,j,ii) = ffty_ar(j,ir)
801	ENDDO
802	ENDDO
803
804	ENDDO
805	ENDDO
806
807	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
808
809	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
810
811	END SUBROUTINE poisfft_hybrid_nodes
812
813
814
815	SUBROUTINE tridia_hybrid( j, ar, tri )
816
817	USE arrays_3d
818	USE control_parameters
819	USE grid_variables
820
821	IMPLICIT NONE
822
823	INTEGER :: i, j, k, nnyh
824	REAL, DIMENSION(0:nx,nz) :: ar
825	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
826	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
827
828	nnyh = (ny+1) / 2
829
830	tri = 0.0
831	!
832	!-- Define constant elements of the tridiagonal matrix.
833	DO k = 0, nz-1
834	DO i = 0,nx
835	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
836	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
837	ENDDO
838	ENDDO
839
840	IF ( j <= nnyh ) THEN
841	CALL maketri_hybrid( j )
842	ELSE
843	CALL maketri_hybrid( ny+1-j)
844	ENDIF
845	CALL zerleg_hybrid
846	CALL substi_hybrid( ar, tri )
847
848	CONTAINS
849
850	SUBROUTINE maketri_hybrid( j )
851
852	!----------------------------------------------------------------------!
853	! maketri !
854	! !
855	! computes the i- and j-dependent component of the matrix !
856	!----------------------------------------------------------------------!
857
858	USE constants
859
860	IMPLICIT NONE
861
862	INTEGER :: i, j, k, nnxh
863	REAL :: a, c
864
865	REAL, DIMENSION(0:nx) :: l
866
867
868	nnxh = (nx+1) / 2
869	!
870	!-- Provide the tridiagonal matrix for solution of the Poisson equation
871	!-- in Fourier space. The coefficients are computed following the method
872	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
873	!-- Siano's original version.
874	DO i = 0,nx
875	IF ( i >= 0 .AND. i < nnxh ) THEN
876	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
877	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
878	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
879	FLOAT( ny+1 ) ) ) / ( dy * dy )
880	ELSEIF ( i == nnxh ) THEN
881	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
882	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
883	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
884	FLOAT(ny+1) ) ) / ( dy * dy )
885	ELSE
886	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
887	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
888	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
889	FLOAT( ny+1 ) ) ) / ( dy * dy )
890	ENDIF
891	ENDDO
892
893	DO k = 0,nz-1
894	DO i = 0, nx
895	a = -1.0 * ddzu(k+2) * ddzw(k+1)
896	c = -1.0 * ddzu(k+1) * ddzw(k+1)
897	tri(1,i,k) = a + c - l(i)
898	ENDDO
899	ENDDO
900	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
901	DO i = 0,nx
902	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
903	ENDDO
904	ENDIF
905	IF ( ibc_p_t == 1 ) THEN
906	DO i = 0,nx
907	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
908	ENDDO
909	ENDIF
910
911	END SUBROUTINE maketri_hybrid
912
913
914	SUBROUTINE zerleg_hybrid
915
916	!----------------------------------------------------------------------!
917	! zerleg !
918	! !
919	! Splitting of the tridiagonal matrix (Thomas algorithm) !
920	!----------------------------------------------------------------------!
921
922	USE indices
923
924	IMPLICIT NONE
925
926	INTEGER :: i, k
927
928	!
929	!-- Splitting
930	DO i = 0, nx
931	tri(4,i,0) = tri(1,i,0)
932	ENDDO
933	DO k = 1, nz-1
934	DO i = 0,nx
935	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
936	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
937	ENDDO
938	ENDDO
939
940	END SUBROUTINE zerleg_hybrid
941
942	SUBROUTINE substi_hybrid( ar, tri )
943
944	!----------------------------------------------------------------------!
945	! substi !
946	! !
947	! Substitution (Forward and Backward) (Thomas algorithm) !
948	!----------------------------------------------------------------------!
949
950	IMPLICIT NONE
951
952	INTEGER :: i, j, k
953	REAL, DIMENSION(0:nx,nz) :: ar
954	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
955	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
956
957	!
958	!-- Forward substitution
959	DO i = 0, nx
960	ar1(i,0) = ar(i,1)
961	ENDDO
962	DO k = 1, nz - 1
963	DO i = 0,nx
964	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
965	ENDDO
966	ENDDO
967
968	!
969	!-- Backward substitution
970	DO i = 0,nx
971	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
972	ENDDO
973	DO k = nz-2, 0, -1
974	DO i = 0,nx
975	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
976	/ tri(4,i,k)
977	ENDDO
978	ENDDO
979
980	END SUBROUTINE substi_hybrid
981
982	END SUBROUTINE tridia_hybrid
983
984
985	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
986
987	USE cpulog
988
989	IMPLICIT NONE
990
991	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
992	INTEGER, SAVE :: tag = 10
993	#if defined( __parallel )
994	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
995	#endif
996
997	REAL :: buf, buf1
998
999
1000	buf = 1.0
1001	buf1 = 1.1
1002	IF ( me_node == 0 ) THEN ! first node only
1003
1004	SELECT CASE ( loca )
1005
1006	CASE ( 1 ) ! before alltoall
1007
1008	IF( me_task > 0 ) THEN ! first task does not wait
1009	#if defined( __parallel )
1010	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
1011	buf1, 1, MPI_REAL, me_task-1, 0, &
1012	comm_tasks, stat,ierr )
1013	#endif
1014	ELSEIF ( j > nys_p ) THEN
1015	req = 0
1016	tag = MOD( tag-10, 10 ) + 10
1017	#if defined( __parallel )
1018	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
1019	tag, comm_tasks, req, ierr )
1020	reqa = req
1021	CALL MPI_WAITALL( 1, reqa, stat, ierr )
1022	#endif
1023	ENDIF
1024
1025	CASE ( 2 ) ! after alltoall
1026
1027	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
1028	#if defined( __parallel )
1029	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
1030	buf1, 1, MPI_REAL, me_task+1, 0, &
1031	comm_tasks, stat, ierr)
1032	#endif
1033	ELSEIF ( j < nyn_p ) THEN
1034	req = 0
1035	tag = MOD( tag-10, 10 ) + 10
1036	#if defined( __parallel )
1037	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
1038	ierr )
1039	#endif
1040	ENDIF
1041
1042	END SELECT
1043
1044	ENDIF
1045
1046	END SUBROUTINE cascade
1047
1048	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |