Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 1

Last change on this file since 1 was 1, checked in by raasch, 18 years ago
Initial repository layout and content
File size: 31.7 KB

Line
1	MODULE poisfft_hybrid_mod
2	!------------------------------------------------------------------------------
3	!
4	! Actual revisions:
5	! -----------------
6	!
7	!
8	! Former revisions:
9	! -----------------
10	! $Log: poisfft_hybrid.f90,v $
11	! Revision 1.11 2004/04/30 12:43:14 raasch
12	! Renaming of fft routines, additional argument in calls of fft_y_m
13	!
14	! Revision 1.10 2003/08/01 08:49:30 raasch
15	! Error removed: reqa=req after calling routine MPI_IRECV (cascade)
16	!
17	! Revision 1.9 2003/05/09 14:50:47 raasch
18	! Parameters in calls of MPI_ALLTOALL, MPI_WAITALL changed in order to fulfill
19	! f95 type and rank requirements
20	!
21	! Revision 1.8 2003/03/16 11:17:04 raasch
22	! +module pegrid in poisfft_hybrid_ini
23	!
24	! Revision 1.7 2003/03/16 09:45:52 raasch
25	! Two underscores (_) are placed in front of all define-strings
26	!
27	! Revision 1.6 2003/03/16 09:11:42 raasch
28	! Fortran error removed
29	!
30	! Revision 1.5 2003/03/14 13:45:53 raasch
31	! Loop optimization in maketri_hybrid
32	!
33	! Revision 1.4 2003/03/12 16:37:22 raasch
34	! Vectorized version (poisfft_hybrid_omp_vec) added, arguments ar and tri
35	! passed to routine substi_hybrid (otherwise no vectorization and very
36	! bad performance on NEC), reading of environment variable tasks_per_node
37	! moved to routine parin
38	!
39	! Revision 1.3 2002/12/20 15:18:02 raasch
40	! Variable statement restricted for parallel environment in subroutine
41	! cascade
42	!
43	! Revision 1.2 2002/12/19 16:08:31 raasch
44	! Preprocessor directive KKMP introduced (OMP does NOT work),
45	! array tri will be a shared array in OpenMP loop, to get better cache
46	! utilization, the i index (x-direction) will be executed in stride
47	! "istride" as outer loop and in a shorter inner loop,
48	! overlapping of computation and communication realized by new routine
49	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
50	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
51	!
52	!
53	! Description:
54	! ------------
55	! Solution of the Poisson equation with a 2D spectral method.
56	! Hybrid version for parallel computers using a 1D domain decomposition,
57	! realized with MPI, along x and parallelization with OPEN-MP along y
58	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
59	! optimization is realized by overlapping of computation and communication
60	! and by simultaneously executing as many communication calls as switches
61	! per logical partition (LPAR) are available. This version comes into
62	! effect if more than one node is used and if the environment variable
63	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
64	! without any rest.
65	!
66	! WARNING: In case of OpenMP, there are problems with allocating large
67	! arrays in parallel regions.
68	!
69	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
70	!------------------------------------------------------------------------------!
71
72	USE fft_xy
73	USE indices
74	USE pegrid
75
76	IMPLICIT NONE
77
78	PRIVATE
79	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
80
81	INTEGER, PARAMETER :: switch_per_lpar = 2
82
83	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
84	nxl_p, nxr_p, & ! partial x dimension
85	nys_a, nyn_a, & ! total y dimension
86	nys_p, nyn_p, & ! partial y dimension
87
88	npe_s, & ! total number of PEs for solver
89	nwords, & ! number of points to be exchanged
90	! with MPI_ALLTOALL
91	n_omp_threads ! number of OpenMP threads
92
93	!
94	!-- Variables for multi node version (cluster version) using routine
95	!-- poisfft_hybrid_nodes
96	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
97	comm_node_all, & ! communicater all PEs node version
98	comm_tasks, & ! communicater tasks
99	me, me_node, me_task,& ! identity of this PE
100	nodes, & ! number of nodes
101	tasks_per_logical_node = -1 ! default no cluster
102
103
104	!
105	!-- Public interfaces
106	INTERFACE poisfft_hybrid_ini
107	MODULE PROCEDURE poisfft_hybrid_ini
108	END INTERFACE poisfft_hybrid_ini
109
110	INTERFACE poisfft_hybrid
111	MODULE PROCEDURE poisfft_hybrid
112	END INTERFACE poisfft_hybrid
113
114	!
115	!-- Private interfaces
116	INTERFACE poisfft_hybrid_omp
117	MODULE PROCEDURE poisfft_hybrid_omp
118	END INTERFACE poisfft_hybrid_omp
119
120	INTERFACE poisfft_hybrid_omp_vec
121	MODULE PROCEDURE poisfft_hybrid_omp_vec
122	END INTERFACE poisfft_hybrid_omp_vec
123
124	INTERFACE poisfft_hybrid_nodes
125	MODULE PROCEDURE poisfft_hybrid_nodes
126	END INTERFACE poisfft_hybrid_nodes
127
128	INTERFACE tridia_hybrid
129	MODULE PROCEDURE tridia_hybrid
130	END INTERFACE tridia_hybrid
131
132	INTERFACE cascade
133	MODULE PROCEDURE cascade
134	END INTERFACE cascade
135
136	CONTAINS
137
138
139	SUBROUTINE poisfft_hybrid_ini
140
141	USE control_parameters
142	USE pegrid
143
144	IMPLICIT NONE
145
146	CHARACTER(LEN=8) :: cdummy
147	INTEGER :: idummy, istat
148	INTEGER, DIMENSION(2) :: coords, dims
149
150	LOGICAL, DIMENSION(2) :: period = .false., re_dims
151
152
153	!
154	!-- Set the internal index values for the hybrid solver
155	#if defined( __parallel )
156	npe_s = pdims(1)
157	#else
158	npe_s = 1
159	#endif
160	nxl_a = 0
161	nxr_a = nx
162	nxl_p = 0
163	nxr_p = ( ( nx+1 ) / npe_s ) - 1
164	nys_a = nys
165	nyn_a = nyn
166	nys_p = 0
167	nyn_p = ( ( ny+1 ) / npe_s ) - 1
168
169	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
170
171	#if defined( __KKMP )
172	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
173	READ ( cdummy, '(I8)' ) n_omp_threads
174	IF ( myid == 0 .AND. n_omp_threads > 1 ) THEN
175	PRINT, '** poisfft_hybrid_ini: Number of OpenMP threads = ', &
176	n_omp_threads
177	ENDIF
178	#else
179	n_omp_threads = 1
180	#endif
181	!
182	!-- Initialize the one-dimensional FFT routines
183	CALL fft_init
184
185	!
186	!-- Setup for multi node version (poisfft_hybrid_nodes)
187	IF ( n_omp_threads == 1 .AND. &
188	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
189
190	IF ( tasks_per_node /= -9999 ) THEN
191	!
192	!-- Multi node version requires that the available number of
193	!-- switches per logical partition must be an integral divisor
194	!-- of the chosen number of tasks per node
195	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
196	!
197	!-- Set the switch which decides about usage of the multi node
198	!-- version
199	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
200	numprocs > tasks_per_node ) THEN
201	tasks_per_logical_node = tasks_per_node / switch_per_lpar
202	ENDIF
203
204	IF ( myid == 0 .AND. tasks_per_logical_node > -1 ) THEN
205	PRINT, '** poisfft_hybrid_ini: running optimized ', &
206	'multinode version'
207	PRINT*, ' switch_per_lpar = ', switch_per_lpar
208	PRINT*, ' tasks_per_lpar = ', tasks_per_node
209	PRINT*, ' tasks_per_logical_node = ', &
210	tasks_per_logical_node
211	ENDIF
212
213	ENDIF
214	ENDIF
215	ENDIF
216
217	!
218	!-- Determine sub-topologies for multi node version
219	IF ( tasks_per_logical_node >= 2 ) THEN
220
221	#if defined( __parallel )
222	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
223	tasks_per_logical_node
224	dims(1) = nodes
225	dims(2) = tasks_per_logical_node
226
227	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
228	comm_node_all, istat )
229	CALL MPI_COMM_RANK( comm_node_all, me, istat )
230
231	re_dims(1) = .TRUE.
232	re_dims(2) = .FALSE.
233	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
234	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
235
236	re_dims(1) = .FALSE.
237	re_dims(2) = .TRUE.
238	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
239	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
240
241	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
242	! tasks_per_logical_node
243	#else
244	PRINT*, '+++ poisfft_hybrid_ini: parallel environment (MPI) required'
245	CALL local_stop
246	#endif
247	ENDIF
248
249	END SUBROUTINE poisfft_hybrid_ini
250
251
252	SUBROUTINE poisfft_hybrid( ar )
253
254	USE control_parameters
255	USE interfaces
256
257	IMPLICIT NONE
258
259	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
260
261	IF ( host(1:3) == 'nec' ) THEN
262	CALL poisfft_hybrid_omp_vec( ar )
263	ELSE
264	IF ( tasks_per_logical_node == -1 ) THEN
265	CALL poisfft_hybrid_omp( ar )
266	ELSE
267	CALL poisfft_hybrid_nodes( ar )
268	ENDIF
269	ENDIF
270
271	END SUBROUTINE poisfft_hybrid
272
273
274	SUBROUTINE poisfft_hybrid_omp ( ar )
275
276	USE cpulog
277	USE interfaces
278
279	IMPLICIT NONE
280
281	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
282	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
283
284	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
285
286	REAL, DIMENSION(0:nx) :: fftx_ar
287	REAL, DIMENSION(0:ny,istride) :: ffty_ar
288
289	REAL, DIMENSION(0:nx,nz) :: tri_ar
290
291	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
292	#if defined( __KKMP )
293	INTEGER :: omp_get_thread_num
294	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
295	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
296	#else
297	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
298	#endif
299
300
301	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
302
303	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
304
305	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
306	!$OMP DO
307	!
308	!-- Store grid points to be transformed on a 1d-array, do the fft
309	!-- and sample the results on a 4d-array
310	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
311	iei = MIN( iouter+istride-1, nxr_p )
312	DO k = 1, nz
313
314	DO i = iouter, iei
315	ii = nxl + i
316	ir = i - iouter + 1
317
318	DO j = nys_a, nyn_a
319	ffty_ar(j,ir) = ar(k,j,ii)
320	ENDDO
321
322	CALL fft_y( ffty_ar(:,ir), 'forward' )
323	ENDDO
324
325	m = nys_a
326	DO n = 1, npe_s
327	DO j = nys_p, nyn_p
328	DO i = iouter, iei
329	ir = i - iouter + 1
330	work1(i,k,j,n) = ffty_ar(m,ir)
331	ENDDO
332	m = m+1
333	ENDDO
334	ENDDO
335
336	ENDDO
337	ENDDO
338	!$OMP END PARALLEL
339
340	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
341
342	#if defined( __parallel )
343	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
344
345	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
346	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
347	comm2d, istat )
348
349	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
350	#else
351	work2 = work1
352	#endif
353
354	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
355
356	#if defined( __KKMP )
357	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
358	!$OMP DO
359	DO j = nys_p, nyn_p
360	jthread = omp_get_thread_num() + 1
361	#else
362	DO j = nys_p, nyn_p
363	jthread = 1
364	#endif
365	DO k = 1, nz
366
367	m = nxl_a
368	DO n = 1, npe_s
369	DO i = nxl_p, nxr_p
370	fftx_ar(m) = work2(i,k,j,n)
371	m = m+1
372	ENDDO
373	ENDDO
374
375	CALL fft_x( fftx_ar, 'forward' )
376
377	DO i = nxl_a, nxr_a
378	tri_ar(i,k) = fftx_ar(i)
379	ENDDO
380
381	ENDDO
382
383	jj = myid * (nyn_p-nys_p+1) + j
384	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
385
386	DO k = 1, nz
387	DO i = nxl_a, nxr_a
388	fftx_ar(i) = tri_ar (i,k)
389	ENDDO
390
391	CALL fft_x( fftx_ar, 'backward' )
392
393	m = nxl_a
394	DO n = 1, npe_s
395	DO i = nxl_p, nxr_p
396	work2(i,k,j,n) = fftx_ar(m)
397	m = m+1
398	ENDDO
399	ENDDO
400
401	ENDDO
402	ENDDO
403	#if defined( __KKMP )
404	!$OMP END PARALLEL
405	#endif
406
407	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
408
409	#if defined( __parallel )
410	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
411	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
412
413	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
414	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
415	comm2d, istat )
416
417	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
418	#else
419	work1 = work2
420	#endif
421
422	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
423
424	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
425	!$OMP DO
426	DO iouter = nxl_p, nxr_p, istride
427	iei = MIN( iouter+istride-1, nxr_p )
428	DO k = 1, nz
429
430	m = nys_a
431	DO n = 1, npe_s
432	DO j = nys_p, nyn_p
433	DO i = iouter, iei
434	ir = i - iouter + 1
435	ffty_ar(m,ir) = work1 (i,k,j,n)
436	ENDDO
437	m = m+1
438	ENDDO
439	ENDDO
440
441	DO i = iouter, iei
442	ii = nxl + i
443	ir = i - iouter + 1
444	CALL fft_y( ffty_ar(:,ir), 'backward' )
445
446	DO j = nys_a, nyn_a
447	ar(k,j,ii) = ffty_ar(j,ir)
448	ENDDO
449	ENDDO
450
451	ENDDO
452	ENDDO
453	!$OMP END PARALLEL
454
455	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
456
457	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
458
459	#if defined( __KKMP )
460	DEALLOCATE( tri )
461	#endif
462
463	END SUBROUTINE poisfft_hybrid_omp
464
465
466	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
467
468	USE cpulog
469	USE interfaces
470
471	IMPLICIT NONE
472
473	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
474	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
475
476	REAL, DIMENSION(0:nx,nz) :: tri_ar
477
478	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
479
480	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
481	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
482
483	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
484	#if defined( __KKMP )
485	INTEGER :: omp_get_thread_num
486	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
487	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
488	#else
489	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
490	#endif
491
492
493	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
494
495	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
496
497	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
498	!$OMP DO
499	!
500	!-- Store grid points to be transformed on a 1d-array, do the fft
501	!-- and sample the results on a 4d-array
502	DO i = nxl_p, nxr_p
503
504	DO j = nys_a, nyn_a
505	DO k = 1, nz
506	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
507	ENDDO
508	ENDDO
509
510	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
511	ENDDO
512
513	!$OMP DO
514	DO k = 1, nz
515	m = nys_a
516	DO n = 1, npe_s
517	DO j = nys_p, nyn_p
518	DO i = nxl_p, nxr_p
519	work1(i,k,j,n) = ffty_ar3(m,k,i)
520	ENDDO
521	m = m+1
522	ENDDO
523	ENDDO
524	ENDDO
525	!$OMP END PARALLEL
526
527	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
528
529	#if defined( __parallel )
530	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
531	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
532	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
533	comm2d, istat )
534	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
535	#else
536	work2 = work1
537	#endif
538
539	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
540
541	#if defined( __KKMP )
542	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
543	!$OMP DO
544	DO j = nys_p, nyn_p
545	jthread = omp_get_thread_num() + 1
546	#else
547	DO j = nys_p, nyn_p
548	jthread = 1
549	#endif
550	DO k = 1, nz
551
552	m = nxl_a
553	DO n = 1, npe_s
554	DO i = nxl_p, nxr_p
555	fftx_ar3(m,k,j) = work2(i,k,j,n)
556	m = m+1
557	ENDDO
558	ENDDO
559	ENDDO
560
561	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
562
563	DO k = 1, nz
564	DO i = nxl_a, nxr_a
565	tri_ar(i,k) = fftx_ar3(i,k,j)
566	ENDDO
567	ENDDO
568
569	jj = myid * (nyn_p-nys_p+1) + j
570	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
571
572	DO k = 1, nz
573	DO i = nxl_a, nxr_a
574	fftx_ar3(i,k,j) = tri_ar (i,k)
575	ENDDO
576	ENDDO
577
578	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
579
580	DO k = 1, nz
581	m = nxl_a
582	DO n = 1, npe_s
583	DO i = nxl_p, nxr_p
584	work2(i,k,j,n) = fftx_ar3(m,k,j)
585	m = m+1
586	ENDDO
587	ENDDO
588	ENDDO
589
590	ENDDO
591	#if defined( __KKMP )
592	!$OMP END PARALLEL
593	#endif
594
595	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
596
597	#if defined( __parallel )
598	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
599	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
600	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
601	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
602	comm2d, istat )
603	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
604	#else
605	work1 = work2
606	#endif
607
608	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
609
610	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
611	!$OMP DO
612	DO k = 1, nz
613	m = nys_a
614	DO n = 1, npe_s
615	DO j = nys_p, nyn_p
616	DO i = nxl_p, nxr_p
617	ffty_ar3(m,k,i) = work1(i,k,j,n)
618	ENDDO
619	m = m+1
620	ENDDO
621	ENDDO
622	ENDDO
623
624	!$OMP DO
625	DO i = nxl_p, nxr_p
626	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
627	DO j = nys_a, nyn_a
628	DO k = 1, nz
629	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
630	ENDDO
631	ENDDO
632	ENDDO
633	!$OMP END PARALLEL
634
635	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
636
637	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
638
639	#if defined( __KKMP )
640	DEALLOCATE( tri )
641	#endif
642
643	END SUBROUTINE poisfft_hybrid_omp_vec
644
645
646	SUBROUTINE poisfft_hybrid_nodes ( ar )
647
648	USE cpulog
649	USE interfaces
650
651	IMPLICIT NONE
652
653	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
654	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
655	n, nn, nt, nw1, nw2
656
657	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
658
659	REAL, DIMENSION(0:nx) :: fftx_ar
660	REAL, DIMENSION(0:ny,istride) :: ffty_ar
661
662	REAL, DIMENSION(0:nx,nz) :: tri_ar
663
664	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
665	nodes,nys_p:nyn_p) :: work1,work2
666	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
667
668
669	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
670
671	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
672
673	!
674	!-- Store grid points to be transformed on a 1d-array, do the fft
675	!-- and sample the results on a 4d-array
676	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
677	iei = MIN( iouter+istride-1, nxr_p )
678	DO k = 1, nz
679
680	DO i = iouter, iei
681	ii = nxl + i
682	ir = i - iouter + 1
683
684	DO j = nys_a, nyn_a
685	ffty_ar(j,ir) = ar(k,j,ii)
686	ENDDO
687
688	CALL fft_y( ffty_ar(:,ir), 'forward' )
689	ENDDO
690
691	m = nys_a
692	DO nn = 1, nodes
693	DO nt = 1, tasks_per_logical_node
694	DO j = nys_p, nyn_p
695	DO i = iouter, iei
696	ir = i - iouter + 1
697	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
698	ENDDO
699	m = m+1
700	ENDDO
701	ENDDO
702	ENDDO
703
704	ENDDO
705	ENDDO
706
707	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
708
709	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
710	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
711	DO nn = 1, nodes
712	DO j = nys_p, nyn_p
713	#if defined( __parallel )
714	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
715	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
716	comm_tasks, istat )
717	#endif
718	ENDDO
719	ENDDO
720	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
721
722
723	DO j = nys_p, nyn_p
724
725	CALL cascade( 1, j, nys_p, nyn_p )
726	nw2 = nw1 * SIZE( work1, 3 )
727	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
728	#if defined( __parallel )
729	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
730	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
731	comm_nodes, istat )
732	#endif
733	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
734	CALL cascade( 2, j, nys_p, nyn_p )
735
736	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
737	DO k = 1, nz
738
739	m = nxl_a
740	DO nn = 1, nodes
741	DO nt = 1, tasks_per_logical_node
742	DO i = nxl_p, nxr_p
743	fftx_ar(m) = work1(i,k,nt,nn,j)
744	m = m+1
745	ENDDO
746	ENDDO
747	ENDDO
748
749	CALL fft_x( fftx_ar, 'forward' )
750
751	DO i = nxl_a, nxr_a
752	tri_ar(i,k) = fftx_ar(i)
753	ENDDO
754
755	ENDDO
756
757	jj = myid * (nyn_p-nys_p+1) + j
758	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
759
760	DO k = 1, nz
761	DO i = nxl_a, nxr_a
762	fftx_ar(i) = tri_ar(i,k)
763	ENDDO
764
765	CALL fft_x( fftx_ar, 'backward' )
766
767	m = nxl_a
768	DO nn = 1, nodes
769	DO nt = 1, tasks_per_logical_node
770	DO i = nxl_p, nxr_p
771	work1(i,k,nt,nn,j) = fftx_ar(m)
772	m = m+1
773	ENDDO
774	ENDDO
775	ENDDO
776	ENDDO
777
778	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
779	nw2 = nw1 * SIZE( work1, 3 )
780	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
781	#if defined( __parallel )
782	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
783	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
784	comm_nodes, istat )
785	#endif
786	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
787
788	ENDDO
789
790	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
791	DO nn = 1, nodes
792	DO j = nys_p, nyn_p
793	#if defined( __parallel )
794	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
795	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
796	comm_tasks, istat )
797	#endif
798	ENDDO
799	ENDDO
800	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
801
802	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
803
804	DO iouter = nxl_p, nxr_p, istride
805	iei = MIN( iouter+istride-1, nxr_p )
806	DO k = 1, nz
807
808	m = nys_a
809	DO nn = 1, nodes
810	DO nt = 1, tasks_per_logical_node
811	DO j = nys_p, nyn_p
812	DO i = iouter, iei
813	ir = i - iouter + 1
814	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
815	ENDDO
816	m = m+1
817	ENDDO
818	ENDDO
819	ENDDO
820
821	DO i = iouter, iei
822	ii = nxl + i
823	ir = i - iouter + 1
824	CALL fft_y( ffty_ar(:,ir), 'backward' )
825
826	DO j = nys_a, nyn_a
827	ar(k,j,ii) = ffty_ar(j,ir)
828	ENDDO
829	ENDDO
830
831	ENDDO
832	ENDDO
833
834	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
835
836	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
837
838	END SUBROUTINE poisfft_hybrid_nodes
839
840
841
842	SUBROUTINE tridia_hybrid( j, ar, tri )
843
844	USE arrays_3d
845	USE control_parameters
846	USE grid_variables
847
848	IMPLICIT NONE
849
850	INTEGER :: i, j, k, nnyh
851	REAL, DIMENSION(0:nx,nz) :: ar
852	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
853	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
854
855	nnyh = (ny+1) / 2
856
857	tri = 0.0
858	!
859	!-- Define constant elements of the tridiagonal matrix.
860	DO k = 0, nz-1
861	DO i = 0,nx
862	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
863	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
864	ENDDO
865	ENDDO
866
867	IF ( j <= nnyh ) THEN
868	CALL maketri_hybrid( j )
869	ELSE
870	CALL maketri_hybrid( ny+1-j)
871	ENDIF
872	CALL zerleg_hybrid
873	CALL substi_hybrid( ar, tri )
874
875	CONTAINS
876
877	SUBROUTINE maketri_hybrid( j )
878
879	!----------------------------------------------------------------------!
880	! maketri !
881	! !
882	! computes the i- and j-dependent component of the matrix !
883	!----------------------------------------------------------------------!
884
885	USE constants
886
887	IMPLICIT NONE
888
889	INTEGER :: i, j, k, nnxh
890	REAL :: a, c
891
892	REAL, DIMENSION(0:nx) :: l
893
894
895	nnxh = (nx+1) / 2
896	!
897	!-- Provide the tridiagonal matrix for solution of the Poisson equation
898	!-- in Fourier space. The coefficients are computed following the method
899	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
900	!-- Siano's original version.
901	DO i = 0,nx
902	IF ( i >= 0 .AND. i < nnxh ) THEN
903	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
904	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
905	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
906	FLOAT( ny+1 ) ) ) / ( dy * dy )
907	ELSEIF ( i == nnxh ) THEN
908	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
909	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
910	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
911	FLOAT(ny+1) ) ) / ( dy * dy )
912	ELSE
913	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
914	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
915	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
916	FLOAT( ny+1 ) ) ) / ( dy * dy )
917	ENDIF
918	ENDDO
919
920	DO k = 0,nz-1
921	DO i = 0, nx
922	a = -1.0 * ddzu(k+2) * ddzw(k+1)
923	c = -1.0 * ddzu(k+1) * ddzw(k+1)
924	tri(1,i,k) = a + c - l(i)
925	ENDDO
926	ENDDO
927	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
928	DO i = 0,nx
929	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
930	ENDDO
931	ENDIF
932	IF ( ibc_p_t == 1 ) THEN
933	DO i = 0,nx
934	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
935	ENDDO
936	ENDIF
937
938	END SUBROUTINE maketri_hybrid
939
940
941	SUBROUTINE zerleg_hybrid
942
943	!----------------------------------------------------------------------!
944	! zerleg !
945	! !
946	! Splitting of the tridiagonal matrix (Thomas algorithm) !
947	!----------------------------------------------------------------------!
948
949	USE indices
950
951	IMPLICIT NONE
952
953	INTEGER :: i, k
954
955	!
956	!-- Splitting
957	DO i = 0, nx
958	tri(4,i,0) = tri(1,i,0)
959	ENDDO
960	DO k = 1, nz-1
961	DO i = 0,nx
962	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
963	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
964	ENDDO
965	ENDDO
966
967	END SUBROUTINE zerleg_hybrid
968
969	SUBROUTINE substi_hybrid( ar, tri )
970
971	!----------------------------------------------------------------------!
972	! substi !
973	! !
974	! Substitution (Forward and Backward) (Thomas algorithm) !
975	!----------------------------------------------------------------------!
976
977	IMPLICIT NONE
978
979	INTEGER :: i, j, k
980	REAL, DIMENSION(0:nx,nz) :: ar
981	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
982	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
983
984	!
985	!-- Forward substitution
986	DO i = 0, nx
987	ar1(i,0) = ar(i,1)
988	ENDDO
989	DO k = 1, nz - 1
990	DO i = 0,nx
991	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
992	ENDDO
993	ENDDO
994
995	!
996	!-- Backward substitution
997	DO i = 0,nx
998	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
999	ENDDO
1000	DO k = nz-2, 0, -1
1001	DO i = 0,nx
1002	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1003	/ tri(4,i,k)
1004	ENDDO
1005	ENDDO
1006
1007	END SUBROUTINE substi_hybrid
1008
1009	END SUBROUTINE tridia_hybrid
1010
1011
1012	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
1013
1014	USE cpulog
1015
1016	IMPLICIT NONE
1017
1018	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
1019	INTEGER, SAVE :: tag = 10
1020	#if defined( __parallel )
1021	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
1022	#endif
1023
1024	REAL :: buf, buf1
1025
1026
1027	buf = 1.0
1028	buf1 = 1.1
1029	IF ( me_node == 0 ) THEN ! first node only
1030
1031	SELECT CASE ( loca )
1032
1033	CASE ( 1 ) ! before alltoall
1034
1035	IF( me_task > 0 ) THEN ! first task does not wait
1036	#if defined( __parallel )
1037	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
1038	buf1, 1, MPI_REAL, me_task-1, 0, &
1039	comm_tasks, stat,ierr )
1040	#endif
1041	ELSEIF ( j > nys_p ) THEN
1042	req = 0
1043	tag = MOD( tag-10, 10 ) + 10
1044	#if defined( __parallel )
1045	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
1046	tag, comm_tasks, req, ierr )
1047	reqa = req
1048	CALL MPI_WAITALL( 1, reqa, stat, ierr )
1049	#endif
1050	ENDIF
1051
1052	CASE ( 2 ) ! after alltoall
1053
1054	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
1055	#if defined( __parallel )
1056	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
1057	buf1, 1, MPI_REAL, me_task+1, 0, &
1058	comm_tasks, stat, ierr)
1059	#endif
1060	ELSEIF ( j < nyn_p ) THEN
1061	req = 0
1062	tag = MOD( tag-10, 10 ) + 10
1063	#if defined( __parallel )
1064	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
1065	ierr )
1066	#endif
1067	ENDIF
1068
1069	END SELECT
1070
1071	ENDIF
1072
1073	END SUBROUTINE cascade
1074
1075	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |