Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 420

Last change on this file since 420 was 415, checked in by raasch, 15 years ago
fortran bugfix in subroutine cascade concerning dimension os MPI status variable
Property svn:keywords set to `Id`
File size: 31.4 KB

Line
1	MODULE poisfft_hybrid_mod
2	!------------------------------------------------------------------------------
3	!
4	! Current revisions:
5	! -----------------
6	! Dimension of array stat in cascade change to prevent type problems with
7	! mpi2 libraries
8	!
9	! Former revisions:
10	! -----------------
11	! $Id: poisfft_hybrid.f90 415 2009-12-15 10:26:23Z franke $
12	!
13	! 274 2009-03-26 15:11:21Z heinze
14	! Output of messages replaced by message handling routine.
15	!
16	! Feb. 2007
17	! RCS Log replace by Id keyword, revision history cleaned up
18	!
19	! Revision 1.11 2004/04/30 12:43:14 raasch
20	! Renaming of fft routines, additional argument in calls of fft_y_m
21	!
22	! Revision 1.2 2002/12/19 16:08:31 raasch
23	! Preprocessor directive KKMP introduced (OMP does NOT work),
24	! array tri will be a shared array in OpenMP loop, to get better cache
25	! utilization, the i index (x-direction) will be executed in stride
26	! "istride" as outer loop and in a shorter inner loop,
27	! overlapping of computation and communication realized by new routine
28	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
29	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
30	!
31	!
32	! Description:
33	! ------------
34	! Solution of the Poisson equation with a 2D spectral method.
35	! Hybrid version for parallel computers using a 1D domain decomposition,
36	! realized with MPI, along x and parallelization with OPEN-MP along y
37	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
38	! optimization is realized by overlapping of computation and communication
39	! and by simultaneously executing as many communication calls as switches
40	! per logical partition (LPAR) are available. This version comes into
41	! effect if more than one node is used and if the environment variable
42	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
43	! without any rest.
44	!
45	! WARNING: In case of OpenMP, there are problems with allocating large
46	! arrays in parallel regions.
47	!
48	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
49	!------------------------------------------------------------------------------!
50
51	USE fft_xy
52	USE indices
53	USE pegrid
54
55	IMPLICIT NONE
56
57	PRIVATE
58	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
59
60	INTEGER, PARAMETER :: switch_per_lpar = 2
61
62	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
63	nxl_p, nxr_p, & ! partial x dimension
64	nys_a, nyn_a, & ! total y dimension
65	nys_p, nyn_p, & ! partial y dimension
66
67	npe_s, & ! total number of PEs for solver
68	nwords, & ! number of points to be exchanged
69	! with MPI_ALLTOALL
70	n_omp_threads ! number of OpenMP threads
71
72	!
73	!-- Variables for multi node version (cluster version) using routine
74	!-- poisfft_hybrid_nodes
75	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
76	comm_node_all, & ! communicater all PEs node version
77	comm_tasks, & ! communicater tasks
78	me, me_node, me_task,& ! identity of this PE
79	nodes, & ! number of nodes
80	tasks_per_logical_node = -1 ! default no cluster
81
82
83	!
84	!-- Public interfaces
85	INTERFACE poisfft_hybrid_ini
86	MODULE PROCEDURE poisfft_hybrid_ini
87	END INTERFACE poisfft_hybrid_ini
88
89	INTERFACE poisfft_hybrid
90	MODULE PROCEDURE poisfft_hybrid
91	END INTERFACE poisfft_hybrid
92
93	!
94	!-- Private interfaces
95	INTERFACE poisfft_hybrid_omp
96	MODULE PROCEDURE poisfft_hybrid_omp
97	END INTERFACE poisfft_hybrid_omp
98
99	INTERFACE poisfft_hybrid_omp_vec
100	MODULE PROCEDURE poisfft_hybrid_omp_vec
101	END INTERFACE poisfft_hybrid_omp_vec
102
103	INTERFACE poisfft_hybrid_nodes
104	MODULE PROCEDURE poisfft_hybrid_nodes
105	END INTERFACE poisfft_hybrid_nodes
106
107	INTERFACE tridia_hybrid
108	MODULE PROCEDURE tridia_hybrid
109	END INTERFACE tridia_hybrid
110
111	INTERFACE cascade
112	MODULE PROCEDURE cascade
113	END INTERFACE cascade
114
115	CONTAINS
116
117
118	SUBROUTINE poisfft_hybrid_ini
119
120	USE control_parameters
121	USE pegrid
122
123	IMPLICIT NONE
124
125	CHARACTER(LEN=8) :: cdummy
126	INTEGER :: idummy, istat
127	INTEGER, DIMENSION(2) :: coords, dims
128
129	LOGICAL, DIMENSION(2) :: period = .false., re_dims
130
131
132	!
133	!-- Set the internal index values for the hybrid solver
134	#if defined( __parallel )
135	npe_s = pdims(1)
136	#else
137	npe_s = 1
138	#endif
139	nxl_a = 0
140	nxr_a = nx
141	nxl_p = 0
142	nxr_p = ( ( nx+1 ) / npe_s ) - 1
143	nys_a = nys
144	nyn_a = nyn
145	nys_p = 0
146	nyn_p = ( ( ny+1 ) / npe_s ) - 1
147
148	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
149
150	#if defined( __KKMP )
151	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
152	READ ( cdummy, '(I8)' ) n_omp_threads
153	IF ( n_omp_threads > 1 ) THEN
154	WRITE( message_string, * ) 'Number of OpenMP threads = ', &
155	n_omp_threads
156	CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
157	ENDIF
158	#else
159	n_omp_threads = 1
160	#endif
161	!
162	!-- Initialize the one-dimensional FFT routines
163	CALL fft_init
164
165	!
166	!-- Setup for multi node version (poisfft_hybrid_nodes)
167	IF ( n_omp_threads == 1 .AND. &
168	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
169
170	IF ( tasks_per_node /= -9999 ) THEN
171	!
172	!-- Multi node version requires that the available number of
173	!-- switches per logical partition must be an integral divisor
174	!-- of the chosen number of tasks per node
175	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
176	!
177	!-- Set the switch which decides about usage of the multi node
178	!-- version
179	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
180	numprocs > tasks_per_node ) THEN
181	tasks_per_logical_node = tasks_per_node / switch_per_lpar
182	ENDIF
183
184	IF ( tasks_per_logical_node > -1 ) THEN
185
186	WRITE( message_string, * ) 'running optimized ', &
187	'multinode version', &
188	'&switch_per_lpar = ', &
189	switch_per_lpar, &
190	'&tasks_per_lpar = ', &
191	tasks_per_node, &
192	'tasks_per_logical_node = ', &
193	tasks_per_logical_node
194	CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )
195
196	ENDIF
197
198	ENDIF
199	ENDIF
200	ENDIF
201
202	!
203	!-- Determine sub-topologies for multi node version
204	IF ( tasks_per_logical_node >= 2 ) THEN
205
206	#if defined( __parallel )
207	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
208	tasks_per_logical_node
209	dims(1) = nodes
210	dims(2) = tasks_per_logical_node
211
212	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
213	comm_node_all, istat )
214	CALL MPI_COMM_RANK( comm_node_all, me, istat )
215
216	re_dims(1) = .TRUE.
217	re_dims(2) = .FALSE.
218	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
219	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
220
221	re_dims(1) = .FALSE.
222	re_dims(2) = .TRUE.
223	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
224	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
225
226	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
227	! tasks_per_logical_node
228	#else
229	message_string = 'parallel environment (MPI) required'
230	CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
231	#endif
232	ENDIF
233
234	END SUBROUTINE poisfft_hybrid_ini
235
236
237	SUBROUTINE poisfft_hybrid( ar )
238
239	USE control_parameters
240	USE interfaces
241
242	IMPLICIT NONE
243
244	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
245
246	IF ( host(1:3) == 'nec' ) THEN
247	CALL poisfft_hybrid_omp_vec( ar )
248	ELSE
249	IF ( tasks_per_logical_node == -1 ) THEN
250	CALL poisfft_hybrid_omp( ar )
251	ELSE
252	CALL poisfft_hybrid_nodes( ar )
253	ENDIF
254	ENDIF
255
256	END SUBROUTINE poisfft_hybrid
257
258
259	SUBROUTINE poisfft_hybrid_omp ( ar )
260
261	USE cpulog
262	USE interfaces
263
264	IMPLICIT NONE
265
266	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
267	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
268
269	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
270
271	REAL, DIMENSION(0:nx) :: fftx_ar
272	REAL, DIMENSION(0:ny,istride) :: ffty_ar
273
274	REAL, DIMENSION(0:nx,nz) :: tri_ar
275
276	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
277	#if defined( __KKMP )
278	INTEGER :: omp_get_thread_num
279	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
280	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
281	#else
282	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
283	#endif
284
285
286	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
287
288	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
289
290	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
291	!$OMP DO
292	!
293	!-- Store grid points to be transformed on a 1d-array, do the fft
294	!-- and sample the results on a 4d-array
295	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
296	iei = MIN( iouter+istride-1, nxr_p )
297	DO k = 1, nz
298
299	DO i = iouter, iei
300	ii = nxl + i
301	ir = i - iouter + 1
302
303	DO j = nys_a, nyn_a
304	ffty_ar(j,ir) = ar(k,j,ii)
305	ENDDO
306
307	CALL fft_y( ffty_ar(:,ir), 'forward' )
308	ENDDO
309
310	m = nys_a
311	DO n = 1, npe_s
312	DO j = nys_p, nyn_p
313	DO i = iouter, iei
314	ir = i - iouter + 1
315	work1(i,k,j,n) = ffty_ar(m,ir)
316	ENDDO
317	m = m+1
318	ENDDO
319	ENDDO
320
321	ENDDO
322	ENDDO
323	!$OMP END PARALLEL
324
325	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
326
327	#if defined( __parallel )
328	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
329
330	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
331	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
332	comm2d, istat )
333
334	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
335	#else
336	work2 = work1
337	#endif
338
339	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
340
341	#if defined( __KKMP )
342	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
343	!$OMP DO
344	DO j = nys_p, nyn_p
345	jthread = omp_get_thread_num() + 1
346	#else
347	DO j = nys_p, nyn_p
348	jthread = 1
349	#endif
350	DO k = 1, nz
351
352	m = nxl_a
353	DO n = 1, npe_s
354	DO i = nxl_p, nxr_p
355	fftx_ar(m) = work2(i,k,j,n)
356	m = m+1
357	ENDDO
358	ENDDO
359
360	CALL fft_x( fftx_ar, 'forward' )
361
362	DO i = nxl_a, nxr_a
363	tri_ar(i,k) = fftx_ar(i)
364	ENDDO
365
366	ENDDO
367
368	jj = myid * (nyn_p-nys_p+1) + j
369	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
370
371	DO k = 1, nz
372	DO i = nxl_a, nxr_a
373	fftx_ar(i) = tri_ar (i,k)
374	ENDDO
375
376	CALL fft_x( fftx_ar, 'backward' )
377
378	m = nxl_a
379	DO n = 1, npe_s
380	DO i = nxl_p, nxr_p
381	work2(i,k,j,n) = fftx_ar(m)
382	m = m+1
383	ENDDO
384	ENDDO
385
386	ENDDO
387	ENDDO
388	#if defined( __KKMP )
389	!$OMP END PARALLEL
390	#endif
391
392	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
393
394	#if defined( __parallel )
395	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
396	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
397
398	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
399	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
400	comm2d, istat )
401
402	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
403	#else
404	work1 = work2
405	#endif
406
407	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
408
409	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
410	!$OMP DO
411	DO iouter = nxl_p, nxr_p, istride
412	iei = MIN( iouter+istride-1, nxr_p )
413	DO k = 1, nz
414
415	m = nys_a
416	DO n = 1, npe_s
417	DO j = nys_p, nyn_p
418	DO i = iouter, iei
419	ir = i - iouter + 1
420	ffty_ar(m,ir) = work1 (i,k,j,n)
421	ENDDO
422	m = m+1
423	ENDDO
424	ENDDO
425
426	DO i = iouter, iei
427	ii = nxl + i
428	ir = i - iouter + 1
429	CALL fft_y( ffty_ar(:,ir), 'backward' )
430
431	DO j = nys_a, nyn_a
432	ar(k,j,ii) = ffty_ar(j,ir)
433	ENDDO
434	ENDDO
435
436	ENDDO
437	ENDDO
438	!$OMP END PARALLEL
439
440	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
441
442	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
443
444	#if defined( __KKMP )
445	DEALLOCATE( tri )
446	#endif
447
448	END SUBROUTINE poisfft_hybrid_omp
449
450
451	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
452
453	USE cpulog
454	USE interfaces
455
456	IMPLICIT NONE
457
458	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
459	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
460
461	REAL, DIMENSION(0:nx,nz) :: tri_ar
462
463	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
464
465	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
466	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
467
468	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
469	#if defined( __KKMP )
470	INTEGER :: omp_get_thread_num
471	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
472	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
473	#else
474	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
475	#endif
476
477
478	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
479
480	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
481
482	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
483	!$OMP DO
484	!
485	!-- Store grid points to be transformed on a 1d-array, do the fft
486	!-- and sample the results on a 4d-array
487	DO i = nxl_p, nxr_p
488
489	DO j = nys_a, nyn_a
490	DO k = 1, nz
491	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
492	ENDDO
493	ENDDO
494
495	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
496	ENDDO
497
498	!$OMP DO
499	DO k = 1, nz
500	m = nys_a
501	DO n = 1, npe_s
502	DO j = nys_p, nyn_p
503	DO i = nxl_p, nxr_p
504	work1(i,k,j,n) = ffty_ar3(m,k,i)
505	ENDDO
506	m = m+1
507	ENDDO
508	ENDDO
509	ENDDO
510	!$OMP END PARALLEL
511
512	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
513
514	#if defined( __parallel )
515	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
516	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
517	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
518	comm2d, istat )
519	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
520	#else
521	work2 = work1
522	#endif
523
524	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
525
526	#if defined( __KKMP )
527	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
528	!$OMP DO
529	DO j = nys_p, nyn_p
530	jthread = omp_get_thread_num() + 1
531	#else
532	DO j = nys_p, nyn_p
533	jthread = 1
534	#endif
535	DO k = 1, nz
536
537	m = nxl_a
538	DO n = 1, npe_s
539	DO i = nxl_p, nxr_p
540	fftx_ar3(m,k,j) = work2(i,k,j,n)
541	m = m+1
542	ENDDO
543	ENDDO
544	ENDDO
545
546	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
547
548	DO k = 1, nz
549	DO i = nxl_a, nxr_a
550	tri_ar(i,k) = fftx_ar3(i,k,j)
551	ENDDO
552	ENDDO
553
554	jj = myid * (nyn_p-nys_p+1) + j
555	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
556
557	DO k = 1, nz
558	DO i = nxl_a, nxr_a
559	fftx_ar3(i,k,j) = tri_ar (i,k)
560	ENDDO
561	ENDDO
562
563	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
564
565	DO k = 1, nz
566	m = nxl_a
567	DO n = 1, npe_s
568	DO i = nxl_p, nxr_p
569	work2(i,k,j,n) = fftx_ar3(m,k,j)
570	m = m+1
571	ENDDO
572	ENDDO
573	ENDDO
574
575	ENDDO
576	#if defined( __KKMP )
577	!$OMP END PARALLEL
578	#endif
579
580	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
581
582	#if defined( __parallel )
583	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
584	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
585	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
586	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
587	comm2d, istat )
588	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
589	#else
590	work1 = work2
591	#endif
592
593	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
594
595	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
596	!$OMP DO
597	DO k = 1, nz
598	m = nys_a
599	DO n = 1, npe_s
600	DO j = nys_p, nyn_p
601	DO i = nxl_p, nxr_p
602	ffty_ar3(m,k,i) = work1(i,k,j,n)
603	ENDDO
604	m = m+1
605	ENDDO
606	ENDDO
607	ENDDO
608
609	!$OMP DO
610	DO i = nxl_p, nxr_p
611	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
612	DO j = nys_a, nyn_a
613	DO k = 1, nz
614	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
615	ENDDO
616	ENDDO
617	ENDDO
618	!$OMP END PARALLEL
619
620	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
621
622	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
623
624	#if defined( __KKMP )
625	DEALLOCATE( tri )
626	#endif
627
628	END SUBROUTINE poisfft_hybrid_omp_vec
629
630
631	SUBROUTINE poisfft_hybrid_nodes ( ar )
632
633	USE cpulog
634	USE interfaces
635
636	IMPLICIT NONE
637
638	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
639	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
640	n, nn, nt, nw1, nw2
641
642	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
643
644	REAL, DIMENSION(0:nx) :: fftx_ar
645	REAL, DIMENSION(0:ny,istride) :: ffty_ar
646
647	REAL, DIMENSION(0:nx,nz) :: tri_ar
648
649	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
650	nodes,nys_p:nyn_p) :: work1,work2
651	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
652
653
654	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
655
656	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
657
658	!
659	!-- Store grid points to be transformed on a 1d-array, do the fft
660	!-- and sample the results on a 4d-array
661	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
662	iei = MIN( iouter+istride-1, nxr_p )
663	DO k = 1, nz
664
665	DO i = iouter, iei
666	ii = nxl + i
667	ir = i - iouter + 1
668
669	DO j = nys_a, nyn_a
670	ffty_ar(j,ir) = ar(k,j,ii)
671	ENDDO
672
673	CALL fft_y( ffty_ar(:,ir), 'forward' )
674	ENDDO
675
676	m = nys_a
677	DO nn = 1, nodes
678	DO nt = 1, tasks_per_logical_node
679	DO j = nys_p, nyn_p
680	DO i = iouter, iei
681	ir = i - iouter + 1
682	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
683	ENDDO
684	m = m+1
685	ENDDO
686	ENDDO
687	ENDDO
688
689	ENDDO
690	ENDDO
691
692	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
693
694	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
695	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
696	DO nn = 1, nodes
697	DO j = nys_p, nyn_p
698	#if defined( __parallel )
699	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
700	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
701	comm_tasks, istat )
702	#endif
703	ENDDO
704	ENDDO
705	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
706
707
708	DO j = nys_p, nyn_p
709
710	CALL cascade( 1, j, nys_p, nyn_p )
711	nw2 = nw1 * SIZE( work1, 3 )
712	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
713	#if defined( __parallel )
714	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
715	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
716	comm_nodes, istat )
717	#endif
718	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
719	CALL cascade( 2, j, nys_p, nyn_p )
720
721	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
722	DO k = 1, nz
723
724	m = nxl_a
725	DO nn = 1, nodes
726	DO nt = 1, tasks_per_logical_node
727	DO i = nxl_p, nxr_p
728	fftx_ar(m) = work1(i,k,nt,nn,j)
729	m = m+1
730	ENDDO
731	ENDDO
732	ENDDO
733
734	CALL fft_x( fftx_ar, 'forward' )
735
736	DO i = nxl_a, nxr_a
737	tri_ar(i,k) = fftx_ar(i)
738	ENDDO
739
740	ENDDO
741
742	jj = myid * (nyn_p-nys_p+1) + j
743	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
744
745	DO k = 1, nz
746	DO i = nxl_a, nxr_a
747	fftx_ar(i) = tri_ar(i,k)
748	ENDDO
749
750	CALL fft_x( fftx_ar, 'backward' )
751
752	m = nxl_a
753	DO nn = 1, nodes
754	DO nt = 1, tasks_per_logical_node
755	DO i = nxl_p, nxr_p
756	work1(i,k,nt,nn,j) = fftx_ar(m)
757	m = m+1
758	ENDDO
759	ENDDO
760	ENDDO
761	ENDDO
762
763	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
764	nw2 = nw1 * SIZE( work1, 3 )
765	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
766	#if defined( __parallel )
767	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
768	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
769	comm_nodes, istat )
770	#endif
771	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
772
773	ENDDO
774
775	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
776	DO nn = 1, nodes
777	DO j = nys_p, nyn_p
778	#if defined( __parallel )
779	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
780	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
781	comm_tasks, istat )
782	#endif
783	ENDDO
784	ENDDO
785	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
786
787	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
788
789	DO iouter = nxl_p, nxr_p, istride
790	iei = MIN( iouter+istride-1, nxr_p )
791	DO k = 1, nz
792
793	m = nys_a
794	DO nn = 1, nodes
795	DO nt = 1, tasks_per_logical_node
796	DO j = nys_p, nyn_p
797	DO i = iouter, iei
798	ir = i - iouter + 1
799	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
800	ENDDO
801	m = m+1
802	ENDDO
803	ENDDO
804	ENDDO
805
806	DO i = iouter, iei
807	ii = nxl + i
808	ir = i - iouter + 1
809	CALL fft_y( ffty_ar(:,ir), 'backward' )
810
811	DO j = nys_a, nyn_a
812	ar(k,j,ii) = ffty_ar(j,ir)
813	ENDDO
814	ENDDO
815
816	ENDDO
817	ENDDO
818
819	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
820
821	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
822
823	END SUBROUTINE poisfft_hybrid_nodes
824
825
826
827	SUBROUTINE tridia_hybrid( j, ar, tri )
828
829	USE arrays_3d
830	USE control_parameters
831	USE grid_variables
832
833	IMPLICIT NONE
834
835	INTEGER :: i, j, k, nnyh
836	REAL, DIMENSION(0:nx,nz) :: ar
837	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
838	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
839
840	nnyh = (ny+1) / 2
841
842	tri = 0.0
843	!
844	!-- Define constant elements of the tridiagonal matrix.
845	DO k = 0, nz-1
846	DO i = 0,nx
847	tri(2,i,k) = ddzu(k+1) * ddzw(k+1)
848	tri(3,i,k) = ddzu(k+2) * ddzw(k+1)
849	ENDDO
850	ENDDO
851
852	IF ( j <= nnyh ) THEN
853	CALL maketri_hybrid( j )
854	ELSE
855	CALL maketri_hybrid( ny+1-j)
856	ENDIF
857	CALL zerleg_hybrid
858	CALL substi_hybrid( ar, tri )
859
860	CONTAINS
861
862	SUBROUTINE maketri_hybrid( j )
863
864	!----------------------------------------------------------------------!
865	! maketri !
866	! !
867	! computes the i- and j-dependent component of the matrix !
868	!----------------------------------------------------------------------!
869
870	USE constants
871
872	IMPLICIT NONE
873
874	INTEGER :: i, j, k, nnxh
875	REAL :: a, c
876
877	REAL, DIMENSION(0:nx) :: l
878
879
880	nnxh = (nx+1) / 2
881	!
882	!-- Provide the tridiagonal matrix for solution of the Poisson equation
883	!-- in Fourier space. The coefficients are computed following the method
884	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
885	!-- Siano's original version.
886	DO i = 0,nx
887	IF ( i >= 0 .AND. i < nnxh ) THEN
888	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
889	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
890	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
891	FLOAT( ny+1 ) ) ) / ( dy * dy )
892	ELSEIF ( i == nnxh ) THEN
893	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
894	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
895	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
896	FLOAT(ny+1) ) ) / ( dy * dy )
897	ELSE
898	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
899	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
900	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
901	FLOAT( ny+1 ) ) ) / ( dy * dy )
902	ENDIF
903	ENDDO
904
905	DO k = 0,nz-1
906	DO i = 0, nx
907	a = -1.0 * ddzu(k+2) * ddzw(k+1)
908	c = -1.0 * ddzu(k+1) * ddzw(k+1)
909	tri(1,i,k) = a + c - l(i)
910	ENDDO
911	ENDDO
912	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
913	DO i = 0,nx
914	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
915	ENDDO
916	ENDIF
917	IF ( ibc_p_t == 1 ) THEN
918	DO i = 0,nx
919	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
920	ENDDO
921	ENDIF
922
923	END SUBROUTINE maketri_hybrid
924
925
926	SUBROUTINE zerleg_hybrid
927
928	!----------------------------------------------------------------------!
929	! zerleg !
930	! !
931	! Splitting of the tridiagonal matrix (Thomas algorithm) !
932	!----------------------------------------------------------------------!
933
934	USE indices
935
936	IMPLICIT NONE
937
938	INTEGER :: i, k
939
940	!
941	!-- Splitting
942	DO i = 0, nx
943	tri(4,i,0) = tri(1,i,0)
944	ENDDO
945	DO k = 1, nz-1
946	DO i = 0,nx
947	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
948	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
949	ENDDO
950	ENDDO
951
952	END SUBROUTINE zerleg_hybrid
953
954	SUBROUTINE substi_hybrid( ar, tri )
955
956	!----------------------------------------------------------------------!
957	! substi !
958	! !
959	! Substitution (Forward and Backward) (Thomas algorithm) !
960	!----------------------------------------------------------------------!
961
962	IMPLICIT NONE
963
964	INTEGER :: i, j, k
965	REAL, DIMENSION(0:nx,nz) :: ar
966	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
967	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
968
969	!
970	!-- Forward substitution
971	DO i = 0, nx
972	ar1(i,0) = ar(i,1)
973	ENDDO
974	DO k = 1, nz - 1
975	DO i = 0,nx
976	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
977	ENDDO
978	ENDDO
979
980	!
981	!-- Backward substitution
982	DO i = 0,nx
983	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
984	ENDDO
985	DO k = nz-2, 0, -1
986	DO i = 0,nx
987	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
988	/ tri(4,i,k)
989	ENDDO
990	ENDDO
991
992	END SUBROUTINE substi_hybrid
993
994	END SUBROUTINE tridia_hybrid
995
996
997	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
998
999	USE cpulog
1000
1001	IMPLICIT NONE
1002
1003	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
1004	INTEGER, SAVE :: tag = 10
1005	#if defined( __parallel )
1006	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
1007	INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata
1008	#endif
1009
1010	REAL :: buf, buf1
1011
1012
1013	buf = 1.0
1014	buf1 = 1.1
1015	IF ( me_node == 0 ) THEN ! first node only
1016
1017	SELECT CASE ( loca )
1018
1019	CASE ( 1 ) ! before alltoall
1020
1021	IF( me_task > 0 ) THEN ! first task does not wait
1022	#if defined( __parallel )
1023	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
1024	buf1, 1, MPI_REAL, me_task-1, 0, &
1025	comm_tasks, stat, ierr )
1026	#endif
1027	ELSEIF ( j > nys_p ) THEN
1028	req = 0
1029	tag = MOD( tag-10, 10 ) + 10
1030	#if defined( __parallel )
1031	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
1032	tag, comm_tasks, req, ierr )
1033	reqa = req
1034	CALL MPI_WAITALL( 1, reqa, stata, ierr )
1035	#endif
1036	ENDIF
1037
1038	CASE ( 2 ) ! after alltoall
1039
1040	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
1041	#if defined( __parallel )
1042	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
1043	buf1, 1, MPI_REAL, me_task+1, 0, &
1044	comm_tasks, stat, ierr)
1045	#endif
1046	ELSEIF ( j < nyn_p ) THEN
1047	req = 0
1048	tag = MOD( tag-10, 10 ) + 10
1049	#if defined( __parallel )
1050	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
1051	ierr )
1052	#endif
1053	ENDIF
1054
1055	END SELECT
1056
1057	ENDIF
1058
1059	END SUBROUTINE cascade
1060
1061	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |