Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 974

Last change on this file since 974 was 810, checked in by maronga, 13 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 32.1 KB

Line
1	MODULE poisfft_hybrid_mod
2
3	!------------------------------------------------------------------------------
4	!
5	! Current revisions:
6	! -----------------
7	!
8	!
9	! Former revisions:
10	! -----------------
11	! $Id: poisfft_hybrid.f90 810 2012-01-30 13:40:12Z maronga $
12	!
13	! 809 2012-01-30 13:32:58Z maronga
14	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
15	!
16	! 807 2012-01-25 11:53:51Z maronga
17	! New cpp directive "__check" implemented which is used by check_namelist_files
18	! (most of the code is unneeded by check_namelist_files).
19	!
20	! 667 2010-12-23 12:06:00Z suehring/gryschka
21	! ddzu replaced by ddzu_pres due to changes in zu(0)
22	!
23	! 415 2009-12-15 10:26:23Z raasch
24	! Dimension of array stat in cascade change to prevent type problems with___
25	! mpi2 libraries
26	!
27	! 274 2009-03-26 15:11:21Z heinze
28	! Output of messages replaced by message handling routine.
29	!
30	! Feb. 2007
31	! RCS Log replace by Id keyword, revision history cleaned up
32	!
33	! Revision 1.11 2004/04/30 12:43:14 raasch
34	! Renaming of fft routines, additional argument in calls of fft_y_m
35	!
36	! Revision 1.2 2002/12/19 16:08:31 raasch
37	! Preprocessor directive KKMP introduced (OMP does NOT work),
38	! array tri will be a shared array in OpenMP loop, to get better cache
39	! utilization, the i index (x-direction) will be executed in stride
40	! "istride" as outer loop and in a shorter inner loop,
41	! overlapping of computation and communication realized by new routine
42	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
43	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
44	!
45	!
46	! Description:
47	! ------------
48	! Solution of the Poisson equation with a 2D spectral method.
49	! Hybrid version for parallel computers using a 1D domain decomposition,
50	! realized with MPI, along x and parallelization with OPEN-MP along y
51	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
52	! optimization is realized by overlapping of computation and communication
53	! and by simultaneously executing as many communication calls as switches
54	! per logical partition (LPAR) are available. This version comes into
55	! effect if more than one node is used and if the environment variable
56	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
57	! without any rest.
58	!
59	! WARNING: In case of OpenMP, there are problems with allocating large
60	! arrays in parallel regions.
61	!
62	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
63	!------------------------------------------------------------------------------!
64
65	USE fft_xy
66	USE indices
67	USE pegrid
68
69	IMPLICIT NONE
70
71	INTEGER, PARAMETER :: switch_per_lpar = 2
72
73	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
74	nxl_p, nxr_p, & ! partial x dimension
75	nys_a, nyn_a, & ! total y dimension
76	nys_p, nyn_p, & ! partial y dimension
77
78	npe_s, & ! total number of PEs for solver
79	nwords, & ! number of points to be exchanged
80	! with MPI_ALLTOALL
81	n_omp_threads ! number of OpenMP threads
82
83	!
84	!-- Variables for multi node version (cluster version) using routine
85	!-- poisfft_hybrid_nodes
86	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
87	comm_node_all, & ! communicater all PEs node version
88	comm_tasks, & ! communicater tasks
89	me, me_node, me_task,& ! identity of this PE
90	nodes, & ! number of nodes
91	tasks_per_logical_node = -1 ! default no cluster
92
93
94	PRIVATE
95
96
97	#if ! defined ( __check )
98	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
99
100
101	!
102	!-- Public interfaces
103	INTERFACE poisfft_hybrid_ini
104	MODULE PROCEDURE poisfft_hybrid_ini
105	END INTERFACE poisfft_hybrid_ini
106
107	INTERFACE poisfft_hybrid
108	MODULE PROCEDURE poisfft_hybrid
109	END INTERFACE poisfft_hybrid
110
111	!
112	!-- Private interfaces
113	INTERFACE poisfft_hybrid_omp
114	MODULE PROCEDURE poisfft_hybrid_omp
115	END INTERFACE poisfft_hybrid_omp
116
117	INTERFACE poisfft_hybrid_omp_vec
118	MODULE PROCEDURE poisfft_hybrid_omp_vec
119	END INTERFACE poisfft_hybrid_omp_vec
120
121	INTERFACE poisfft_hybrid_nodes
122	MODULE PROCEDURE poisfft_hybrid_nodes
123	END INTERFACE poisfft_hybrid_nodes
124
125	INTERFACE tridia_hybrid
126	MODULE PROCEDURE tridia_hybrid
127	END INTERFACE tridia_hybrid
128
129	INTERFACE cascade
130	MODULE PROCEDURE cascade
131	END INTERFACE cascade
132	#else
133	PUBLIC poisfft_hybrid_ini
134
135	!
136	!-- Public interfaces
137	INTERFACE poisfft_hybrid_ini
138	MODULE PROCEDURE poisfft_hybrid_ini
139	END INTERFACE poisfft_hybrid_ini
140	#endif
141
142	CONTAINS
143
144
145	SUBROUTINE poisfft_hybrid_ini
146
147	USE control_parameters
148	USE pegrid
149
150	IMPLICIT NONE
151
152	CHARACTER(LEN=8) :: cdummy
153	INTEGER :: idummy, istat
154	INTEGER, DIMENSION(2) :: coords, dims
155
156	LOGICAL, DIMENSION(2) :: period = .false., re_dims
157
158
159	!
160	!-- Set the internal index values for the hybrid solver
161	#if defined( __parallel )
162	npe_s = pdims(1)
163	#else
164	npe_s = 1
165	#endif
166	nxl_a = 0
167	nxr_a = nx
168	nxl_p = 0
169	nxr_p = ( ( nx+1 ) / npe_s ) - 1
170	nys_a = nys
171	nyn_a = nyn
172	nys_p = 0
173	nyn_p = ( ( ny+1 ) / npe_s ) - 1
174
175	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
176
177	#if defined( __KKMP ) && ! defined ( __check )
178	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
179	READ ( cdummy, '(I8)' ) n_omp_threads
180	IF ( n_omp_threads > 1 ) THEN
181	WRITE( message_string, * ) 'Number of OpenMP threads = ', &
182	n_omp_threads
183	CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
184	ENDIF
185	#else
186	n_omp_threads = 1
187	#endif
188	!
189	!-- Initialize the one-dimensional FFT routines
190	CALL fft_init
191
192	!
193	!-- Setup for multi node version (poisfft_hybrid_nodes)
194	IF ( n_omp_threads == 1 .AND. &
195	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
196
197	IF ( tasks_per_node /= -9999 ) THEN
198	!
199	!-- Multi node version requires that the available number of
200	!-- switches per logical partition must be an integral divisor
201	!-- of the chosen number of tasks per node
202	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
203	!
204	!-- Set the switch which decides about usage of the multi node
205	!-- version
206	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
207	numprocs > tasks_per_node ) THEN
208	tasks_per_logical_node = tasks_per_node / switch_per_lpar
209	ENDIF
210
211	IF ( tasks_per_logical_node > -1 ) THEN
212
213	WRITE( message_string, * ) 'running optimized ', &
214	'multinode version', &
215	'&switch_per_lpar = ', &
216	switch_per_lpar, &
217	'&tasks_per_lpar = ', &
218	tasks_per_node, &
219	'tasks_per_logical_node = ', &
220	tasks_per_logical_node
221	CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )
222
223	ENDIF
224
225	ENDIF
226	ENDIF
227	ENDIF
228
229	!
230	!-- Determine sub-topologies for multi node version
231	IF ( tasks_per_logical_node >= 2 ) THEN
232
233	#if defined( __parallel ) && ! defined ( __check )
234	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
235	tasks_per_logical_node
236	dims(1) = nodes
237	dims(2) = tasks_per_logical_node
238
239	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
240	comm_node_all, istat )
241	CALL MPI_COMM_RANK( comm_node_all, me, istat )
242
243	re_dims(1) = .TRUE.
244	re_dims(2) = .FALSE.
245	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
246	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
247
248	re_dims(1) = .FALSE.
249	re_dims(2) = .TRUE.
250	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
251	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
252
253	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
254	! tasks_per_logical_node
255	#elif ! defined( __parallel )
256	message_string = 'parallel environment (MPI) required'
257	CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
258	#endif
259	ENDIF
260
261	END SUBROUTINE poisfft_hybrid_ini
262
263	#if ! defined ( __check )
264	SUBROUTINE poisfft_hybrid( ar )
265
266	USE control_parameters
267	USE interfaces
268
269	IMPLICIT NONE
270
271	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
272
273	IF ( host(1:3) == 'nec' ) THEN
274	CALL poisfft_hybrid_omp_vec( ar )
275	ELSE
276	IF ( tasks_per_logical_node == -1 ) THEN
277	CALL poisfft_hybrid_omp( ar )
278	ELSE
279	CALL poisfft_hybrid_nodes( ar )
280	ENDIF
281	ENDIF
282
283	END SUBROUTINE poisfft_hybrid
284
285
286	SUBROUTINE poisfft_hybrid_omp ( ar )
287
288	USE cpulog
289	USE interfaces
290
291	IMPLICIT NONE
292
293	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
294	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
295
296	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
297
298	REAL, DIMENSION(0:nx) :: fftx_ar
299	REAL, DIMENSION(0:ny,istride) :: ffty_ar
300
301	REAL, DIMENSION(0:nx,nz) :: tri_ar
302
303	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
304	#if defined( __KKMP )
305	INTEGER :: omp_get_thread_num
306	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
307	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
308	#else
309	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
310	#endif
311
312
313	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
314
315	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
316
317	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
318	!$OMP DO
319	!
320	!-- Store grid points to be transformed on a 1d-array, do the fft
321	!-- and sample the results on a 4d-array
322	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
323	iei = MIN( iouter+istride-1, nxr_p )
324	DO k = 1, nz
325
326	DO i = iouter, iei
327	ii = nxl + i
328	ir = i - iouter + 1
329
330	DO j = nys_a, nyn_a
331	ffty_ar(j,ir) = ar(k,j,ii)
332	ENDDO
333
334	CALL fft_y( ffty_ar(:,ir), 'forward' )
335	ENDDO
336
337	m = nys_a
338	DO n = 1, npe_s
339	DO j = nys_p, nyn_p
340	DO i = iouter, iei
341	ir = i - iouter + 1
342	work1(i,k,j,n) = ffty_ar(m,ir)
343	ENDDO
344	m = m+1
345	ENDDO
346	ENDDO
347
348	ENDDO
349	ENDDO
350	!$OMP END PARALLEL
351
352	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
353
354	#if defined( __parallel )
355	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
356
357	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
358	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
359	comm2d, istat )
360
361	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
362	#else
363	work2 = work1
364	#endif
365
366	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
367
368	#if defined( __KKMP )
369	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
370	!$OMP DO
371	DO j = nys_p, nyn_p
372	jthread = omp_get_thread_num() + 1
373	#else
374	DO j = nys_p, nyn_p
375	jthread = 1
376	#endif
377	DO k = 1, nz
378
379	m = nxl_a
380	DO n = 1, npe_s
381	DO i = nxl_p, nxr_p
382	fftx_ar(m) = work2(i,k,j,n)
383	m = m+1
384	ENDDO
385	ENDDO
386
387	CALL fft_x( fftx_ar, 'forward' )
388
389	DO i = nxl_a, nxr_a
390	tri_ar(i,k) = fftx_ar(i)
391	ENDDO
392
393	ENDDO
394
395	jj = myid * (nyn_p-nys_p+1) + j
396	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
397
398	DO k = 1, nz
399	DO i = nxl_a, nxr_a
400	fftx_ar(i) = tri_ar (i,k)
401	ENDDO
402
403	CALL fft_x( fftx_ar, 'backward' )
404
405	m = nxl_a
406	DO n = 1, npe_s
407	DO i = nxl_p, nxr_p
408	work2(i,k,j,n) = fftx_ar(m)
409	m = m+1
410	ENDDO
411	ENDDO
412
413	ENDDO
414	ENDDO
415	#if defined( __KKMP )
416	!$OMP END PARALLEL
417	#endif
418
419	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
420
421	#if defined( __parallel )
422	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
423	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
424
425	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
426	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
427	comm2d, istat )
428
429	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
430	#else
431	work1 = work2
432	#endif
433
434	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
435
436	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
437	!$OMP DO
438	DO iouter = nxl_p, nxr_p, istride
439	iei = MIN( iouter+istride-1, nxr_p )
440	DO k = 1, nz
441
442	m = nys_a
443	DO n = 1, npe_s
444	DO j = nys_p, nyn_p
445	DO i = iouter, iei
446	ir = i - iouter + 1
447	ffty_ar(m,ir) = work1 (i,k,j,n)
448	ENDDO
449	m = m+1
450	ENDDO
451	ENDDO
452
453	DO i = iouter, iei
454	ii = nxl + i
455	ir = i - iouter + 1
456	CALL fft_y( ffty_ar(:,ir), 'backward' )
457
458	DO j = nys_a, nyn_a
459	ar(k,j,ii) = ffty_ar(j,ir)
460	ENDDO
461	ENDDO
462
463	ENDDO
464	ENDDO
465	!$OMP END PARALLEL
466
467	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
468
469	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
470
471	#if defined( __KKMP )
472	DEALLOCATE( tri )
473	#endif
474
475	END SUBROUTINE poisfft_hybrid_omp
476
477
478	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
479
480	USE cpulog
481	USE interfaces
482
483	IMPLICIT NONE
484
485	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
486	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
487
488	REAL, DIMENSION(0:nx,nz) :: tri_ar
489
490	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
491
492	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
493	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
494
495	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
496	#if defined( __KKMP )
497	INTEGER :: omp_get_thread_num
498	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
499	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
500	#else
501	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
502	#endif
503
504
505	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
506
507	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
508
509	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
510	!$OMP DO
511	!
512	!-- Store grid points to be transformed on a 1d-array, do the fft
513	!-- and sample the results on a 4d-array
514	DO i = nxl_p, nxr_p
515
516	DO j = nys_a, nyn_a
517	DO k = 1, nz
518	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
519	ENDDO
520	ENDDO
521
522	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
523	ENDDO
524
525	!$OMP DO
526	DO k = 1, nz
527	m = nys_a
528	DO n = 1, npe_s
529	DO j = nys_p, nyn_p
530	DO i = nxl_p, nxr_p
531	work1(i,k,j,n) = ffty_ar3(m,k,i)
532	ENDDO
533	m = m+1
534	ENDDO
535	ENDDO
536	ENDDO
537	!$OMP END PARALLEL
538
539	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
540
541	#if defined( __parallel )
542	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
543	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
544	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
545	comm2d, istat )
546	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
547	#else
548	work2 = work1
549	#endif
550
551	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
552
553	#if defined( __KKMP )
554	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
555	!$OMP DO
556	DO j = nys_p, nyn_p
557	jthread = omp_get_thread_num() + 1
558	#else
559	DO j = nys_p, nyn_p
560	jthread = 1
561	#endif
562	DO k = 1, nz
563
564	m = nxl_a
565	DO n = 1, npe_s
566	DO i = nxl_p, nxr_p
567	fftx_ar3(m,k,j) = work2(i,k,j,n)
568	m = m+1
569	ENDDO
570	ENDDO
571	ENDDO
572
573	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
574
575	DO k = 1, nz
576	DO i = nxl_a, nxr_a
577	tri_ar(i,k) = fftx_ar3(i,k,j)
578	ENDDO
579	ENDDO
580
581	jj = myid * (nyn_p-nys_p+1) + j
582	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
583
584	DO k = 1, nz
585	DO i = nxl_a, nxr_a
586	fftx_ar3(i,k,j) = tri_ar (i,k)
587	ENDDO
588	ENDDO
589
590	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
591
592	DO k = 1, nz
593	m = nxl_a
594	DO n = 1, npe_s
595	DO i = nxl_p, nxr_p
596	work2(i,k,j,n) = fftx_ar3(m,k,j)
597	m = m+1
598	ENDDO
599	ENDDO
600	ENDDO
601
602	ENDDO
603	#if defined( __KKMP )
604	!$OMP END PARALLEL
605	#endif
606
607	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
608
609	#if defined( __parallel )
610	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
611	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
612	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
613	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
614	comm2d, istat )
615	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
616	#else
617	work1 = work2
618	#endif
619
620	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
621
622	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
623	!$OMP DO
624	DO k = 1, nz
625	m = nys_a
626	DO n = 1, npe_s
627	DO j = nys_p, nyn_p
628	DO i = nxl_p, nxr_p
629	ffty_ar3(m,k,i) = work1(i,k,j,n)
630	ENDDO
631	m = m+1
632	ENDDO
633	ENDDO
634	ENDDO
635
636	!$OMP DO
637	DO i = nxl_p, nxr_p
638	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
639	DO j = nys_a, nyn_a
640	DO k = 1, nz
641	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
642	ENDDO
643	ENDDO
644	ENDDO
645	!$OMP END PARALLEL
646
647	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
648
649	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
650
651	#if defined( __KKMP )
652	DEALLOCATE( tri )
653	#endif
654
655	END SUBROUTINE poisfft_hybrid_omp_vec
656
657
658	SUBROUTINE poisfft_hybrid_nodes ( ar )
659
660	USE cpulog
661	USE interfaces
662
663	IMPLICIT NONE
664
665	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
666	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
667	n, nn, nt, nw1, nw2
668
669	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
670
671	REAL, DIMENSION(0:nx) :: fftx_ar
672	REAL, DIMENSION(0:ny,istride) :: ffty_ar
673
674	REAL, DIMENSION(0:nx,nz) :: tri_ar
675
676	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
677	nodes,nys_p:nyn_p) :: work1,work2
678	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
679
680
681	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
682
683	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
684
685	!
686	!-- Store grid points to be transformed on a 1d-array, do the fft
687	!-- and sample the results on a 4d-array
688	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
689	iei = MIN( iouter+istride-1, nxr_p )
690	DO k = 1, nz
691
692	DO i = iouter, iei
693	ii = nxl + i
694	ir = i - iouter + 1
695
696	DO j = nys_a, nyn_a
697	ffty_ar(j,ir) = ar(k,j,ii)
698	ENDDO
699
700	CALL fft_y( ffty_ar(:,ir), 'forward' )
701	ENDDO
702
703	m = nys_a
704	DO nn = 1, nodes
705	DO nt = 1, tasks_per_logical_node
706	DO j = nys_p, nyn_p
707	DO i = iouter, iei
708	ir = i - iouter + 1
709	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
710	ENDDO
711	m = m+1
712	ENDDO
713	ENDDO
714	ENDDO
715
716	ENDDO
717	ENDDO
718
719	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
720
721	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
722	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
723	DO nn = 1, nodes
724	DO j = nys_p, nyn_p
725	#if defined( __parallel )
726	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
727	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
728	comm_tasks, istat )
729	#endif
730	ENDDO
731	ENDDO
732	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
733
734
735	DO j = nys_p, nyn_p
736
737	CALL cascade( 1, j, nys_p, nyn_p )
738	nw2 = nw1 * SIZE( work1, 3 )
739	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
740	#if defined( __parallel )
741	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
742	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
743	comm_nodes, istat )
744	#endif
745	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
746	CALL cascade( 2, j, nys_p, nyn_p )
747
748	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
749	DO k = 1, nz
750
751	m = nxl_a
752	DO nn = 1, nodes
753	DO nt = 1, tasks_per_logical_node
754	DO i = nxl_p, nxr_p
755	fftx_ar(m) = work1(i,k,nt,nn,j)
756	m = m+1
757	ENDDO
758	ENDDO
759	ENDDO
760
761	CALL fft_x( fftx_ar, 'forward' )
762
763	DO i = nxl_a, nxr_a
764	tri_ar(i,k) = fftx_ar(i)
765	ENDDO
766
767	ENDDO
768
769	jj = myid * (nyn_p-nys_p+1) + j
770	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
771
772	DO k = 1, nz
773	DO i = nxl_a, nxr_a
774	fftx_ar(i) = tri_ar(i,k)
775	ENDDO
776
777	CALL fft_x( fftx_ar, 'backward' )
778
779	m = nxl_a
780	DO nn = 1, nodes
781	DO nt = 1, tasks_per_logical_node
782	DO i = nxl_p, nxr_p
783	work1(i,k,nt,nn,j) = fftx_ar(m)
784	m = m+1
785	ENDDO
786	ENDDO
787	ENDDO
788	ENDDO
789
790	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
791	nw2 = nw1 * SIZE( work1, 3 )
792	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
793	#if defined( __parallel )
794	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
795	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
796	comm_nodes, istat )
797	#endif
798	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
799
800	ENDDO
801
802	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
803	DO nn = 1, nodes
804	DO j = nys_p, nyn_p
805	#if defined( __parallel )
806	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
807	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
808	comm_tasks, istat )
809	#endif
810	ENDDO
811	ENDDO
812	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
813
814	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
815
816	DO iouter = nxl_p, nxr_p, istride
817	iei = MIN( iouter+istride-1, nxr_p )
818	DO k = 1, nz
819
820	m = nys_a
821	DO nn = 1, nodes
822	DO nt = 1, tasks_per_logical_node
823	DO j = nys_p, nyn_p
824	DO i = iouter, iei
825	ir = i - iouter + 1
826	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
827	ENDDO
828	m = m+1
829	ENDDO
830	ENDDO
831	ENDDO
832
833	DO i = iouter, iei
834	ii = nxl + i
835	ir = i - iouter + 1
836	CALL fft_y( ffty_ar(:,ir), 'backward' )
837
838	DO j = nys_a, nyn_a
839	ar(k,j,ii) = ffty_ar(j,ir)
840	ENDDO
841	ENDDO
842
843	ENDDO
844	ENDDO
845
846	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
847
848	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
849
850	END SUBROUTINE poisfft_hybrid_nodes
851
852
853
854	SUBROUTINE tridia_hybrid( j, ar, tri )
855
856	USE arrays_3d
857	USE control_parameters
858	USE grid_variables
859
860	IMPLICIT NONE
861
862	INTEGER :: i, j, k, nnyh
863	REAL, DIMENSION(0:nx,nz) :: ar
864	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
865	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
866
867	nnyh = (ny+1) / 2
868
869	tri = 0.0
870	!
871	!-- Define constant elements of the tridiagonal matrix.
872	DO k = 0, nz-1
873	DO i = 0,nx
874	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
875	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
876	ENDDO
877	ENDDO
878
879	IF ( j <= nnyh ) THEN
880	CALL maketri_hybrid( j )
881	ELSE
882	CALL maketri_hybrid( ny+1-j)
883	ENDIF
884	CALL zerleg_hybrid
885	CALL substi_hybrid( ar, tri )
886
887	CONTAINS
888
889	SUBROUTINE maketri_hybrid( j )
890
891	!----------------------------------------------------------------------!
892	! maketri !
893	! !
894	! computes the i- and j-dependent component of the matrix !
895	!----------------------------------------------------------------------!
896
897	USE constants
898
899	IMPLICIT NONE
900
901	INTEGER :: i, j, k, nnxh
902	REAL :: a, c
903
904	REAL, DIMENSION(0:nx) :: l
905
906
907	nnxh = (nx+1) / 2
908	!
909	!-- Provide the tridiagonal matrix for solution of the Poisson equation
910	!-- in Fourier space. The coefficients are computed following the method
911	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
912	!-- Siano's original version.
913	DO i = 0,nx
914	IF ( i >= 0 .AND. i < nnxh ) THEN
915	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
916	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
917	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
918	FLOAT( ny+1 ) ) ) / ( dy * dy )
919	ELSEIF ( i == nnxh ) THEN
920	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
921	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
922	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
923	FLOAT(ny+1) ) ) / ( dy * dy )
924	ELSE
925	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
926	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
927	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
928	FLOAT( ny+1 ) ) ) / ( dy * dy )
929	ENDIF
930	ENDDO
931
932	DO k = 0,nz-1
933	DO i = 0, nx
934	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
935	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
936	tri(1,i,k) = a + c - l(i)
937	ENDDO
938	ENDDO
939	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
940	DO i = 0,nx
941	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
942	ENDDO
943	ENDIF
944	IF ( ibc_p_t == 1 ) THEN
945	DO i = 0,nx
946	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
947	ENDDO
948	ENDIF
949
950	END SUBROUTINE maketri_hybrid
951
952
953	SUBROUTINE zerleg_hybrid
954
955	!----------------------------------------------------------------------!
956	! zerleg !
957	! !
958	! Splitting of the tridiagonal matrix (Thomas algorithm) !
959	!----------------------------------------------------------------------!
960
961	USE indices
962
963	IMPLICIT NONE
964
965	INTEGER :: i, k
966
967	!
968	!-- Splitting
969	DO i = 0, nx
970	tri(4,i,0) = tri(1,i,0)
971	ENDDO
972	DO k = 1, nz-1
973	DO i = 0,nx
974	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
975	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
976	ENDDO
977	ENDDO
978
979	END SUBROUTINE zerleg_hybrid
980
981	SUBROUTINE substi_hybrid( ar, tri )
982
983	!----------------------------------------------------------------------!
984	! substi !
985	! !
986	! Substitution (Forward and Backward) (Thomas algorithm) !
987	!----------------------------------------------------------------------!
988
989	IMPLICIT NONE
990
991	INTEGER :: i, j, k
992	REAL, DIMENSION(0:nx,nz) :: ar
993	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
994	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
995
996	!
997	!-- Forward substitution
998	DO i = 0, nx
999	ar1(i,0) = ar(i,1)
1000	ENDDO
1001	DO k = 1, nz - 1
1002	DO i = 0,nx
1003	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1004	ENDDO
1005	ENDDO
1006
1007	!
1008	!-- Backward substitution
1009	DO i = 0,nx
1010	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
1011	ENDDO
1012	DO k = nz-2, 0, -1
1013	DO i = 0,nx
1014	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1015	/ tri(4,i,k)
1016	ENDDO
1017	ENDDO
1018
1019	END SUBROUTINE substi_hybrid
1020
1021	END SUBROUTINE tridia_hybrid
1022
1023
1024	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
1025
1026	USE cpulog
1027
1028	IMPLICIT NONE
1029
1030	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
1031	INTEGER, SAVE :: tag = 10
1032	#if defined( __parallel )
1033	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
1034	INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata
1035	#endif
1036
1037	REAL :: buf, buf1
1038
1039
1040	buf = 1.0
1041	buf1 = 1.1
1042	IF ( me_node == 0 ) THEN ! first node only
1043
1044	SELECT CASE ( loca )
1045
1046	CASE ( 1 ) ! before alltoall
1047
1048	IF( me_task > 0 ) THEN ! first task does not wait
1049	#if defined( __parallel )
1050	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
1051	buf1, 1, MPI_REAL, me_task-1, 0, &
1052	comm_tasks, stat, ierr )
1053	#endif
1054	ELSEIF ( j > nys_p ) THEN
1055	req = 0
1056	tag = MOD( tag-10, 10 ) + 10
1057	#if defined( __parallel )
1058	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
1059	tag, comm_tasks, req, ierr )
1060	reqa = req
1061	CALL MPI_WAITALL( 1, reqa, stata, ierr )
1062	#endif
1063	ENDIF
1064
1065	CASE ( 2 ) ! after alltoall
1066
1067	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
1068	#if defined( __parallel )
1069	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
1070	buf1, 1, MPI_REAL, me_task+1, 0, &
1071	comm_tasks, stat, ierr)
1072	#endif
1073	ELSEIF ( j < nyn_p ) THEN
1074	req = 0
1075	tag = MOD( tag-10, 10 ) + 10
1076	#if defined( __parallel )
1077	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
1078	ierr )
1079	#endif
1080	ENDIF
1081
1082	END SELECT
1083
1084	ENDIF
1085
1086	END SUBROUTINE cascade
1087	#endif
1088	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |