Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 342

Last change on this file since 342 was 274, checked in by heinze, 16 years ago
Indentation of the message calls corrected
Property svn:keywords set to `Id`
File size: 31.2 KB

Line
1	MODULE poisfft_hybrid_mod
2	!------------------------------------------------------------------------------
3	!
4	! Current revisions:
5	! -----------------
6	! Output of messages replaced by message handling routine.
7	!
8	!
9	! Former revisions:
10	! -----------------
11	! $Id: poisfft_hybrid.f90 274 2009-03-26 15:11:21Z heinze $
12	! RCS Log replace by Id keyword, revision history cleaned up
13	!
14	! Revision 1.11 2004/04/30 12:43:14 raasch
15	! Renaming of fft routines, additional argument in calls of fft_y_m
16	!
17	! Revision 1.2 2002/12/19 16:08:31 raasch
18	! Preprocessor directive KKMP introduced (OMP does NOT work),
19	! array tri will be a shared array in OpenMP loop, to get better cache
20	! utilization, the i index (x-direction) will be executed in stride
21	! "istride" as outer loop and in a shorter inner loop,
22	! overlapping of computation and communication realized by new routine
23	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
24	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
25	!
26	!
27	! Description:
28	! ------------
29	! Solution of the Poisson equation with a 2D spectral method.
30	! Hybrid version for parallel computers using a 1D domain decomposition,
31	! realized with MPI, along x and parallelization with OPEN-MP along y
32	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
33	! optimization is realized by overlapping of computation and communication
34	! and by simultaneously executing as many communication calls as switches
35	! per logical partition (LPAR) are available. This version comes into
36	! effect if more than one node is used and if the environment variable
37	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
38	! without any rest.
39	!
40	! WARNING: In case of OpenMP, there are problems with allocating large
41	! arrays in parallel regions.
42	!
43	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
44	!------------------------------------------------------------------------------!
45
46	USE fft_xy
47	USE indices
48	USE pegrid
49
50	IMPLICIT NONE
51
52	PRIVATE
53	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
54
55	INTEGER, PARAMETER :: switch_per_lpar = 2
56
57	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
58	nxl_p, nxr_p, & ! partial x dimension
59	nys_a, nyn_a, & ! total y dimension
60	nys_p, nyn_p, & ! partial y dimension
61
62	npe_s, & ! total number of PEs for solver
63	nwords, & ! number of points to be exchanged
64	! with MPI_ALLTOALL
65	n_omp_threads ! number of OpenMP threads
66
67	!
68	!-- Variables for multi node version (cluster version) using routine
69	!-- poisfft_hybrid_nodes
70	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
71	comm_node_all, & ! communicater all PEs node version
72	comm_tasks, & ! communicater tasks
73	me, me_node, me_task,& ! identity of this PE
74	nodes, & ! number of nodes
75	tasks_per_logical_node = -1 ! default no cluster
76
77
78	!
79	!-- Public interfaces
80	INTERFACE poisfft_hybrid_ini
81	MODULE PROCEDURE poisfft_hybrid_ini
82	END INTERFACE poisfft_hybrid_ini
83
84	INTERFACE poisfft_hybrid
85	MODULE PROCEDURE poisfft_hybrid
86	END INTERFACE poisfft_hybrid
87
88	!
89	!-- Private interfaces
90	INTERFACE poisfft_hybrid_omp
91	MODULE PROCEDURE poisfft_hybrid_omp
92	END INTERFACE poisfft_hybrid_omp
93
94	INTERFACE poisfft_hybrid_omp_vec
95	MODULE PROCEDURE poisfft_hybrid_omp_vec
96	END INTERFACE poisfft_hybrid_omp_vec
97
98	INTERFACE poisfft_hybrid_nodes
99	MODULE PROCEDURE poisfft_hybrid_nodes
100	END INTERFACE poisfft_hybrid_nodes
101
102	INTERFACE tridia_hybrid
103	MODULE PROCEDURE tridia_hybrid
104	END INTERFACE tridia_hybrid
105
106	INTERFACE cascade
107	MODULE PROCEDURE cascade
108	END INTERFACE cascade
109
110	CONTAINS
111
112
113	SUBROUTINE poisfft_hybrid_ini
114
115	USE control_parameters
116	USE pegrid
117
118	IMPLICIT NONE
119
120	CHARACTER(LEN=8) :: cdummy
121	INTEGER :: idummy, istat
122	INTEGER, DIMENSION(2) :: coords, dims
123
124	LOGICAL, DIMENSION(2) :: period = .false., re_dims
125
126
127	!
128	!-- Set the internal index values for the hybrid solver
129	#if defined( __parallel )
130	npe_s = pdims(1)
131	#else
132	npe_s = 1
133	#endif
134	nxl_a = 0
135	nxr_a = nx
136	nxl_p = 0
137	nxr_p = ( ( nx+1 ) / npe_s ) - 1
138	nys_a = nys
139	nyn_a = nyn
140	nys_p = 0
141	nyn_p = ( ( ny+1 ) / npe_s ) - 1
142
143	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
144
145	#if defined( __KKMP )
146	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
147	READ ( cdummy, '(I8)' ) n_omp_threads
148	IF ( n_omp_threads > 1 ) THEN
149	WRITE( message_string, * ) 'Number of OpenMP threads = ', &
150	n_omp_threads
151	CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
152	ENDIF
153	#else
154	n_omp_threads = 1
155	#endif
156	!
157	!-- Initialize the one-dimensional FFT routines
158	CALL fft_init
159
160	!
161	!-- Setup for multi node version (poisfft_hybrid_nodes)
162	IF ( n_omp_threads == 1 .AND. &
163	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
164
165	IF ( tasks_per_node /= -9999 ) THEN
166	!
167	!-- Multi node version requires that the available number of
168	!-- switches per logical partition must be an integral divisor
169	!-- of the chosen number of tasks per node
170	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
171	!
172	!-- Set the switch which decides about usage of the multi node
173	!-- version
174	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
175	numprocs > tasks_per_node ) THEN
176	tasks_per_logical_node = tasks_per_node / switch_per_lpar
177	ENDIF
178
179	IF ( tasks_per_logical_node > -1 ) THEN
180
181	WRITE( message_string, * ) 'running optimized ', &
182	'multinode version', &
183	'&switch_per_lpar = ', &
184	switch_per_lpar, &
185	'&tasks_per_lpar = ', &
186	tasks_per_node, &
187	'tasks_per_logical_node = ', &
188	tasks_per_logical_node
189	CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )
190
191	ENDIF
192
193	ENDIF
194	ENDIF
195	ENDIF
196
197	!
198	!-- Determine sub-topologies for multi node version
199	IF ( tasks_per_logical_node >= 2 ) THEN
200
201	#if defined( __parallel )
202	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
203	tasks_per_logical_node
204	dims(1) = nodes
205	dims(2) = tasks_per_logical_node
206
207	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
208	comm_node_all, istat )
209	CALL MPI_COMM_RANK( comm_node_all, me, istat )
210
211	re_dims(1) = .TRUE.
212	re_dims(2) = .FALSE.
213	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
214	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
215
216	re_dims(1) = .FALSE.
217	re_dims(2) = .TRUE.
218	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
219	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
220
221	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
222	! tasks_per_logical_node
223	#else
224	message_string = 'parallel environment (MPI) required'
225	CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
226	#endif
227	ENDIF
228
229	END SUBROUTINE poisfft_hybrid_ini
230
231
232	SUBROUTINE poisfft_hybrid( ar )
233
234	USE control_parameters
235	USE interfaces
236
237	IMPLICIT NONE
238
239	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
240
241	IF ( host(1:3) == 'nec' ) THEN
242	CALL poisfft_hybrid_omp_vec( ar )
243	ELSE
244	IF ( tasks_per_logical_node == -1 ) THEN
245	CALL poisfft_hybrid_omp( ar )
246	ELSE
247	CALL poisfft_hybrid_nodes( ar )
248	ENDIF
249	ENDIF
250
251	END SUBROUTINE poisfft_hybrid
252
253
254	SUBROUTINE poisfft_hybrid_omp ( ar )
255
256	USE cpulog
257	USE interfaces
258
259	IMPLICIT NONE
260
261	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
262	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
263
264	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
265
266	REAL, DIMENSION(0:nx) :: fftx_ar
267	REAL, DIMENSION(0:ny,istride) :: ffty_ar
268
269	REAL, DIMENSION(0:nx,nz) :: tri_ar
270
271	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
272	#if defined( __KKMP )
273	INTEGER :: omp_get_thread_num
274	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
275	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
276	#else
277	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
278	#endif
279
280
281	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
282
283	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
284
285	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
286	!$OMP DO
287	!
288	!-- Store grid points to be transformed on a 1d-array, do the fft
289	!-- and sample the results on a 4d-array
290	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
291	iei = MIN( iouter+istride-1, nxr_p )
292	DO k = 1, nz
293
294	DO i = iouter, iei
295	ii = nxl + i
296	ir = i - iouter + 1
297
298	DO j = nys_a, nyn_a
299	ffty_ar(j,ir) = ar(k,j,ii)
300	ENDDO
301
302	CALL fft_y( ffty_ar(:,ir), 'forward' )
303	ENDDO
304
305	m = nys_a
306	DO n = 1, npe_s
307	DO j = nys_p, nyn_p
308	DO i = iouter, iei
309	ir = i - iouter + 1
310	work1(i,k,j,n) = ffty_ar(m,ir)
311	ENDDO
312	m = m+1
313	ENDDO
314	ENDDO
315
316	ENDDO
317	ENDDO
318	!$OMP END PARALLEL
319
320	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
321
322	#if defined( __parallel )
323	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
324
325	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
326	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
327	comm2d, istat )
328
329	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
330	#else
331	work2 = work1
332	#endif
333
334	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
335
336	#if defined( __KKMP )
337	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
338	!$OMP DO
339	DO j = nys_p, nyn_p
340	jthread = omp_get_thread_num() + 1
341	#else
342	DO j = nys_p, nyn_p
343	jthread = 1
344	#endif
345	DO k = 1, nz
346
347	m = nxl_a
348	DO n = 1, npe_s
349	DO i = nxl_p, nxr_p
350	fftx_ar(m) = work2(i,k,j,n)
351	m = m+1
352	ENDDO
353	ENDDO
354
355	CALL fft_x( fftx_ar, 'forward' )
356
357	DO i = nxl_a, nxr_a
358	tri_ar(i,k) = fftx_ar(i)
359	ENDDO
360
361	ENDDO
362
363	jj = myid * (nyn_p-nys_p+1) + j
364	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
365
366	DO k = 1, nz
367	DO i = nxl_a, nxr_a
368	fftx_ar(i) = tri_ar (i,k)
369	ENDDO
370
371	CALL fft_x( fftx_ar, 'backward' )
372
373	m = nxl_a
374	DO n = 1, npe_s
375	DO i = nxl_p, nxr_p
376	work2(i,k,j,n) = fftx_ar(m)
377	m = m+1
378	ENDDO
379	ENDDO
380
381	ENDDO
382	ENDDO
383	#if defined( __KKMP )
384	!$OMP END PARALLEL
385	#endif
386
387	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
388
389	#if defined( __parallel )
390	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
391	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
392
393	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
394	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
395	comm2d, istat )
396
397	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
398	#else
399	work1 = work2
400	#endif
401
402	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
403
404	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
405	!$OMP DO
406	DO iouter = nxl_p, nxr_p, istride
407	iei = MIN( iouter+istride-1, nxr_p )
408	DO k = 1, nz
409
410	m = nys_a
411	DO n = 1, npe_s
412	DO j = nys_p, nyn_p
413	DO i = iouter, iei
414	ir = i - iouter + 1
415	ffty_ar(m,ir) = work1 (i,k,j,n)
416	ENDDO
417	m = m+1
418	ENDDO
419	ENDDO
420
421	DO i = iouter, iei
422	ii = nxl + i
423	ir = i - iouter + 1
424	CALL fft_y( ffty_ar(:,ir), 'backward' )
425
426	DO j = nys_a, nyn_a
427	ar(k,j,ii) = ffty_ar(j,ir)
428	ENDDO
429	ENDDO
430
431	ENDDO
432	ENDDO
433	!$OMP END PARALLEL
434
435	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
436
437	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
438
439	#if defined( __KKMP )
440	DEALLOCATE( tri )
441	#endif
442
443	END SUBROUTINE poisfft_hybrid_omp
444
445
446	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
447
448	USE cpulog
449	USE interfaces
450
451	IMPLICIT NONE
452
453	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
454	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
455
456	REAL, DIMENSION(0:nx,nz) :: tri_ar
457
458	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
459
460	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
461	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
462
463	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
464	#if defined( __KKMP )
465	INTEGER :: omp_get_thread_num
466	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
467	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
468	#else
469	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
470	#endif
471
472
473	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
474
475	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
476
477	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
478	!$OMP DO
479	!
480	!-- Store grid points to be transformed on a 1d-array, do the fft
481	!-- and sample the results on a 4d-array
482	DO i = nxl_p, nxr_p
483
484	DO j = nys_a, nyn_a
485	DO k = 1, nz
486	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
487	ENDDO
488	ENDDO
489
490	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
491	ENDDO
492
493	!$OMP DO
494	DO k = 1, nz
495	m = nys_a
496	DO n = 1, npe_s
497	DO j = nys_p, nyn_p
498	DO i = nxl_p, nxr_p
499	work1(i,k,j,n) = ffty_ar3(m,k,i)
500	ENDDO
501	m = m+1
502	ENDDO
503	ENDDO
504	ENDDO
505	!$OMP END PARALLEL
506
507	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
508
509	#if defined( __parallel )
510	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
511	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
512	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
513	comm2d, istat )
514	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
515	#else
516	work2 = work1
517	#endif
518
519	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
520
521	#if defined( __KKMP )
522	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
523	!$OMP DO
524	DO j = nys_p, nyn_p
525	jthread = omp_get_thread_num() + 1
526	#else
527	DO j = nys_p, nyn_p
528	jthread = 1
529	#endif
530	DO k = 1, nz
531
532	m = nxl_a
533	DO n = 1, npe_s
534	DO i = nxl_p, nxr_p
535	fftx_ar3(m,k,j) = work2(i,k,j,n)
536	m = m+1
537	ENDDO
538	ENDDO
539	ENDDO
540
541	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
542
543	DO k = 1, nz
544	DO i = nxl_a, nxr_a
545	tri_ar(i,k) = fftx_ar3(i,k,j)
546	ENDDO
547	ENDDO
548
549	jj = myid * (nyn_p-nys_p+1) + j
550	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
551
552	DO k = 1, nz
553	DO i = nxl_a, nxr_a
554	fftx_ar3(i,k,j) = tri_ar (i,k)
555	ENDDO
556	ENDDO
557
558	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
559
560	DO k = 1, nz
561	m = nxl_a
562	DO n = 1, npe_s
563	DO i = nxl_p, nxr_p
564	work2(i,k,j,n) = fftx_ar3(m,k,j)
565	m = m+1
566	ENDDO
567	ENDDO
568	ENDDO
569
570	ENDDO
571	#if defined( __KKMP )
572	!$OMP END PARALLEL
573	#endif
574
575	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
576
577	#if defined( __parallel )
578	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
579	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
580	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
581	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
582	comm2d, istat )
583	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
584	#else
585	work1 = work2
586	#endif
587
588	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
589
590	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
591	!$OMP DO
592	DO k = 1, nz
593	m = nys_a
594	DO n = 1, npe_s
595	DO j = nys_p, nyn_p
596	DO i = nxl_p, nxr_p
597	ffty_ar3(m,k,i) = work1(i,k,j,n)
598	ENDDO
599	m = m+1
600	ENDDO
601	ENDDO
602	ENDDO
603
604	!$OMP DO
605	DO i = nxl_p, nxr_p
606	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
607	DO j = nys_a, nyn_a
608	DO k = 1, nz
609	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
610	ENDDO
611	ENDDO
612	ENDDO
613	!$OMP END PARALLEL
614
615	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
616
617	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
618
619	#if defined( __KKMP )
620	DEALLOCATE( tri )
621	#endif
622
623	END SUBROUTINE poisfft_hybrid_omp_vec
624
625
626	SUBROUTINE poisfft_hybrid_nodes ( ar )
627
628	USE cpulog
629	USE interfaces
630
631	IMPLICIT NONE
632
633	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
634	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
635	n, nn, nt, nw1, nw2
636
637	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
638
639	REAL, DIMENSION(0:nx) :: fftx_ar
640	REAL, DIMENSION(0:ny,istride) :: ffty_ar
641
642	REAL, DIMENSION(0:nx,nz) :: tri_ar
643
644	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
645	nodes,nys_p:nyn_p) :: work1,work2
646	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
647
648
649	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
650
651	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
652
653	!
654	!-- Store grid points to be transformed on a 1d-array, do the fft
655	!-- and sample the results on a 4d-array
656	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
657	iei = MIN( iouter+istride-1, nxr_p )
658	DO k = 1, nz
659
660	DO i = iouter, iei
661	ii = nxl + i
662	ir = i - iouter + 1
663
664	DO j = nys_a, nyn_a
665	ffty_ar(j,ir) = ar(k,j,ii)
666	ENDDO
667
668	CALL fft_y( ffty_ar(:,ir), 'forward' )
669	ENDDO
670
671	m = nys_a
672	DO nn = 1, nodes
673	DO nt = 1, tasks_per_logical_node
674	DO j = nys_p, nyn_p
675	DO i = iouter, iei
676	ir = i - iouter + 1
677	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
678	ENDDO
679	m = m+1
680	ENDDO
681	ENDDO
682	ENDDO
683
684	ENDDO
685	ENDDO
686
687	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
688
689	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
690	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
691	DO nn = 1, nodes
692	DO j = nys_p, nyn_p
693	#if defined( __parallel )
694	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
695	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
696	comm_tasks, istat )
697	#endif
698	ENDDO
699	ENDDO
700	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
701
702
703	DO j = nys_p, nyn_p
704
705	CALL cascade( 1, j, nys_p, nyn_p )
706	nw2 = nw1 * SIZE( work1, 3 )
707	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
708	#if defined( __parallel )
709	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
710	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
711	comm_nodes, istat )
712	#endif
713	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
714	CALL cascade( 2, j, nys_p, nyn_p )
715
716	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
717	DO k = 1, nz
718
719	m = nxl_a
720	DO nn = 1, nodes
721	DO nt = 1, tasks_per_logical_node
722	DO i = nxl_p, nxr_p
723	fftx_ar(m) = work1(i,k,nt,nn,j)
724	m = m+1
725	ENDDO
726	ENDDO
727	ENDDO
728
729	CALL fft_x( fftx_ar, 'forward' )
730
731	DO i = nxl_a, nxr_a
732	tri_ar(i,k) = fftx_ar(i)
733	ENDDO
734
735	ENDDO
736
737	jj = myid * (nyn_p-nys_p+1) + j
738	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
739
740	DO k = 1, nz
741	DO i = nxl_a, nxr_a
742	fftx_ar(i) = tri_ar(i,k)
743	ENDDO
744
745	CALL fft_x( fftx_ar, 'backward' )
746
747	m = nxl_a
748	DO nn = 1, nodes
749	DO nt = 1, tasks_per_logical_node
750	DO i = nxl_p, nxr_p
751	work1(i,k,nt,nn,j) = fftx_ar(m)
752	m = m+1
753	ENDDO
754	ENDDO
755	ENDDO
756	ENDDO
757
758	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
759	nw2 = nw1 * SIZE( work1, 3 )
760	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
761	#if defined( __parallel )
762	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
763	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
764	comm_nodes, istat )
765	#endif
766	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
767
768	ENDDO
769
770	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
771	DO nn = 1, nodes
772	DO j = nys_p, nyn_p
773	#if defined( __parallel )
774	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
775	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
776	comm_tasks, istat )
777	#endif
778	ENDDO
779	ENDDO
780	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
781
782	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
783
784	DO iouter = nxl_p, nxr_p, istride
785	iei = MIN( iouter+istride-1, nxr_p )
786	DO k = 1, nz
787
788	m = nys_a
789	DO nn = 1, nodes
790	DO nt = 1, tasks_per_logical_node
791	DO j = nys_p, nyn_p
792	DO i = iouter, iei
793	ir = i - iouter + 1
794	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
795	ENDDO
796	m = m+1
797	ENDDO
798	ENDDO
799	ENDDO
800
801	DO i = iouter, iei
802	ii = nxl + i
803	ir = i - iouter + 1
804	CALL fft_y( ffty_ar(:,ir), 'backward' )
805
806	DO j = nys_a, nyn_a
807	ar(k,j,ii) = ffty_ar(j,ir)
808	ENDDO
809	ENDDO
810
811	ENDDO
812	ENDDO
813
814	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
815
816	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
817
818	END SUBROUTINE poisfft_hybrid_nodes
819
820
821
822	SUBROUTINE tridia_hybrid( j, ar, tri )
823
824	USE arrays_3d
825	USE control_parameters
826	USE grid_variables
827
828	IMPLICIT NONE
829
830	INTEGER :: i, j, k, nnyh
831	REAL, DIMENSION(0:nx,nz) :: ar
832	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
833	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
834
835	nnyh = (ny+1) / 2
836
837	tri = 0.0
838	!
839	!-- Define constant elements of the tridiagonal matrix.
840	DO k = 0, nz-1
841	DO i = 0,nx
842	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
843	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
844	ENDDO
845	ENDDO
846
847	IF ( j <= nnyh ) THEN
848	CALL maketri_hybrid( j )
849	ELSE
850	CALL maketri_hybrid( ny+1-j)
851	ENDIF
852	CALL zerleg_hybrid
853	CALL substi_hybrid( ar, tri )
854
855	CONTAINS
856
857	SUBROUTINE maketri_hybrid( j )
858
859	!----------------------------------------------------------------------!
860	! maketri !
861	! !
862	! computes the i- and j-dependent component of the matrix !
863	!----------------------------------------------------------------------!
864
865	USE constants
866
867	IMPLICIT NONE
868
869	INTEGER :: i, j, k, nnxh
870	REAL :: a, c
871
872	REAL, DIMENSION(0:nx) :: l
873
874
875	nnxh = (nx+1) / 2
876	!
877	!-- Provide the tridiagonal matrix for solution of the Poisson equation
878	!-- in Fourier space. The coefficients are computed following the method
879	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
880	!-- Siano's original version.
881	DO i = 0,nx
882	IF ( i >= 0 .AND. i < nnxh ) THEN
883	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
884	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
885	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
886	FLOAT( ny+1 ) ) ) / ( dy * dy )
887	ELSEIF ( i == nnxh ) THEN
888	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
889	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
890	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
891	FLOAT(ny+1) ) ) / ( dy * dy )
892	ELSE
893	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
894	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
895	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
896	FLOAT( ny+1 ) ) ) / ( dy * dy )
897	ENDIF
898	ENDDO
899
900	DO k = 0,nz-1
901	DO i = 0, nx
902	a = -1.0 * ddzu(k+2) * ddzw(k+1)
903	c = -1.0 * ddzu(k+1) * ddzw(k+1)
904	tri(1,i,k) = a + c - l(i)
905	ENDDO
906	ENDDO
907	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
908	DO i = 0,nx
909	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
910	ENDDO
911	ENDIF
912	IF ( ibc_p_t == 1 ) THEN
913	DO i = 0,nx
914	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
915	ENDDO
916	ENDIF
917
918	END SUBROUTINE maketri_hybrid
919
920
921	SUBROUTINE zerleg_hybrid
922
923	!----------------------------------------------------------------------!
924	! zerleg !
925	! !
926	! Splitting of the tridiagonal matrix (Thomas algorithm) !
927	!----------------------------------------------------------------------!
928
929	USE indices
930
931	IMPLICIT NONE
932
933	INTEGER :: i, k
934
935	!
936	!-- Splitting
937	DO i = 0, nx
938	tri(4,i,0) = tri(1,i,0)
939	ENDDO
940	DO k = 1, nz-1
941	DO i = 0,nx
942	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
943	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
944	ENDDO
945	ENDDO
946
947	END SUBROUTINE zerleg_hybrid
948
949	SUBROUTINE substi_hybrid( ar, tri )
950
951	!----------------------------------------------------------------------!
952	! substi !
953	! !
954	! Substitution (Forward and Backward) (Thomas algorithm) !
955	!----------------------------------------------------------------------!
956
957	IMPLICIT NONE
958
959	INTEGER :: i, j, k
960	REAL, DIMENSION(0:nx,nz) :: ar
961	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
962	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
963
964	!
965	!-- Forward substitution
966	DO i = 0, nx
967	ar1(i,0) = ar(i,1)
968	ENDDO
969	DO k = 1, nz - 1
970	DO i = 0,nx
971	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
972	ENDDO
973	ENDDO
974
975	!
976	!-- Backward substitution
977	DO i = 0,nx
978	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
979	ENDDO
980	DO k = nz-2, 0, -1
981	DO i = 0,nx
982	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
983	/ tri(4,i,k)
984	ENDDO
985	ENDDO
986
987	END SUBROUTINE substi_hybrid
988
989	END SUBROUTINE tridia_hybrid
990
991
992	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
993
994	USE cpulog
995
996	IMPLICIT NONE
997
998	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
999	INTEGER, SAVE :: tag = 10
1000	#if defined( __parallel )
1001	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
1002	#endif
1003
1004	REAL :: buf, buf1
1005
1006
1007	buf = 1.0
1008	buf1 = 1.1
1009	IF ( me_node == 0 ) THEN ! first node only
1010
1011	SELECT CASE ( loca )
1012
1013	CASE ( 1 ) ! before alltoall
1014
1015	IF( me_task > 0 ) THEN ! first task does not wait
1016	#if defined( __parallel )
1017	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
1018	buf1, 1, MPI_REAL, me_task-1, 0, &
1019	comm_tasks, stat,ierr )
1020	#endif
1021	ELSEIF ( j > nys_p ) THEN
1022	req = 0
1023	tag = MOD( tag-10, 10 ) + 10
1024	#if defined( __parallel )
1025	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
1026	tag, comm_tasks, req, ierr )
1027	reqa = req
1028	CALL MPI_WAITALL( 1, reqa, stat, ierr )
1029	#endif
1030	ENDIF
1031
1032	CASE ( 2 ) ! after alltoall
1033
1034	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
1035	#if defined( __parallel )
1036	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
1037	buf1, 1, MPI_REAL, me_task+1, 0, &
1038	comm_tasks, stat, ierr)
1039	#endif
1040	ELSEIF ( j < nyn_p ) THEN
1041	req = 0
1042	tag = MOD( tag-10, 10 ) + 10
1043	#if defined( __parallel )
1044	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
1045	ierr )
1046	#endif
1047	ENDIF
1048
1049	END SELECT
1050
1051	ENDIF
1052
1053	END SUBROUTINE cascade
1054
1055	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |