Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 808

Last change on this file since 808 was 808, checked in by maronga, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 32.0 KB

Line
1	MODULE poisfft_hybrid_mod
2
3	!------------------------------------------------------------------------------
4	!
5	! Current revisions:
6	! -----------------
7	!
8	!
9	! Former revisions:
10	! -----------------
11	! $Id: poisfft_hybrid.f90 808 2012-01-25 12:02:30Z maronga $
12	!
13	! 807 2012-01-25 11:53:51Z maronga
14	! New cpp directive "__check" implemented which is used by check_namelist_files
15	! (most of the code is unneeded by check_namelist_files).
16	!
17	! 667 2010-12-23 12:06:00Z suehring/gryschka
18	! ddzu replaced by ddzu_pres due to changes in zu(0)
19	!
20	! 415 2009-12-15 10:26:23Z raasch
21	! Dimension of array stat in cascade change to prevent type problems with___
22	! mpi2 libraries
23	!
24	! 274 2009-03-26 15:11:21Z heinze
25	! Output of messages replaced by message handling routine.
26	!
27	! Feb. 2007
28	! RCS Log replace by Id keyword, revision history cleaned up
29	!
30	! Revision 1.11 2004/04/30 12:43:14 raasch
31	! Renaming of fft routines, additional argument in calls of fft_y_m
32	!
33	! Revision 1.2 2002/12/19 16:08:31 raasch
34	! Preprocessor directive KKMP introduced (OMP does NOT work),
35	! array tri will be a shared array in OpenMP loop, to get better cache
36	! utilization, the i index (x-direction) will be executed in stride
37	! "istride" as outer loop and in a shorter inner loop,
38	! overlapping of computation and communication realized by new routine
39	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
40	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
41	!
42	!
43	! Description:
44	! ------------
45	! Solution of the Poisson equation with a 2D spectral method.
46	! Hybrid version for parallel computers using a 1D domain decomposition,
47	! realized with MPI, along x and parallelization with OPEN-MP along y
48	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
49	! optimization is realized by overlapping of computation and communication
50	! and by simultaneously executing as many communication calls as switches
51	! per logical partition (LPAR) are available. This version comes into
52	! effect if more than one node is used and if the environment variable
53	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
54	! without any rest.
55	!
56	! WARNING: In case of OpenMP, there are problems with allocating large
57	! arrays in parallel regions.
58	!
59	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
60	!------------------------------------------------------------------------------!
61
62	USE fft_xy
63	USE indices
64	USE pegrid
65
66	IMPLICIT NONE
67
68	INTEGER, PARAMETER :: switch_per_lpar = 2
69
70	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
71	nxl_p, nxr_p, & ! partial x dimension
72	nys_a, nyn_a, & ! total y dimension
73	nys_p, nyn_p, & ! partial y dimension
74
75	npe_s, & ! total number of PEs for solver
76	nwords, & ! number of points to be exchanged
77	! with MPI_ALLTOALL
78	n_omp_threads ! number of OpenMP threads
79
80	!
81	!-- Variables for multi node version (cluster version) using routine
82	!-- poisfft_hybrid_nodes
83	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
84	comm_node_all, & ! communicater all PEs node version
85	comm_tasks, & ! communicater tasks
86	me, me_node, me_task,& ! identity of this PE
87	nodes, & ! number of nodes
88	tasks_per_logical_node = -1 ! default no cluster
89
90
91	PRIVATE
92
93
94	#if .NOT. defined ( __check )
95	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
96
97
98	!
99	!-- Public interfaces
100	INTERFACE poisfft_hybrid_ini
101	MODULE PROCEDURE poisfft_hybrid_ini
102	END INTERFACE poisfft_hybrid_ini
103
104	INTERFACE poisfft_hybrid
105	MODULE PROCEDURE poisfft_hybrid
106	END INTERFACE poisfft_hybrid
107
108	!
109	!-- Private interfaces
110	INTERFACE poisfft_hybrid_omp
111	MODULE PROCEDURE poisfft_hybrid_omp
112	END INTERFACE poisfft_hybrid_omp
113
114	INTERFACE poisfft_hybrid_omp_vec
115	MODULE PROCEDURE poisfft_hybrid_omp_vec
116	END INTERFACE poisfft_hybrid_omp_vec
117
118	INTERFACE poisfft_hybrid_nodes
119	MODULE PROCEDURE poisfft_hybrid_nodes
120	END INTERFACE poisfft_hybrid_nodes
121
122	INTERFACE tridia_hybrid
123	MODULE PROCEDURE tridia_hybrid
124	END INTERFACE tridia_hybrid
125
126	INTERFACE cascade
127	MODULE PROCEDURE cascade
128	END INTERFACE cascade
129	#else
130	PUBLIC poisfft_hybrid_ini
131
132	!
133	!-- Public interfaces
134	INTERFACE poisfft_hybrid_ini
135	MODULE PROCEDURE poisfft_hybrid_ini
136	END INTERFACE poisfft_hybrid_ini
137	#endif
138
139	CONTAINS
140
141
142	SUBROUTINE poisfft_hybrid_ini
143
144	USE control_parameters
145	USE pegrid
146
147	IMPLICIT NONE
148
149	CHARACTER(LEN=8) :: cdummy
150	INTEGER :: idummy, istat
151	INTEGER, DIMENSION(2) :: coords, dims
152
153	LOGICAL, DIMENSION(2) :: period = .false., re_dims
154
155
156	!
157	!-- Set the internal index values for the hybrid solver
158	#if defined( __parallel )
159	npe_s = pdims(1)
160	#else
161	npe_s = 1
162	#endif
163	nxl_a = 0
164	nxr_a = nx
165	nxl_p = 0
166	nxr_p = ( ( nx+1 ) / npe_s ) - 1
167	nys_a = nys
168	nyn_a = nyn
169	nys_p = 0
170	nyn_p = ( ( ny+1 ) / npe_s ) - 1
171
172	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
173
174	#if defined( __KKMP ) .AND. .NOT. defined ( __check )
175	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
176	READ ( cdummy, '(I8)' ) n_omp_threads
177	IF ( n_omp_threads > 1 ) THEN
178	WRITE( message_string, * ) 'Number of OpenMP threads = ', &
179	n_omp_threads
180	CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
181	ENDIF
182	#else
183	n_omp_threads = 1
184	#endif
185	!
186	!-- Initialize the one-dimensional FFT routines
187	CALL fft_init
188
189	!
190	!-- Setup for multi node version (poisfft_hybrid_nodes)
191	IF ( n_omp_threads == 1 .AND. &
192	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
193
194	IF ( tasks_per_node /= -9999 ) THEN
195	!
196	!-- Multi node version requires that the available number of
197	!-- switches per logical partition must be an integral divisor
198	!-- of the chosen number of tasks per node
199	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
200	!
201	!-- Set the switch which decides about usage of the multi node
202	!-- version
203	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
204	numprocs > tasks_per_node ) THEN
205	tasks_per_logical_node = tasks_per_node / switch_per_lpar
206	ENDIF
207
208	IF ( tasks_per_logical_node > -1 ) THEN
209
210	WRITE( message_string, * ) 'running optimized ', &
211	'multinode version', &
212	'&switch_per_lpar = ', &
213	switch_per_lpar, &
214	'&tasks_per_lpar = ', &
215	tasks_per_node, &
216	'tasks_per_logical_node = ', &
217	tasks_per_logical_node
218	CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )
219
220	ENDIF
221
222	ENDIF
223	ENDIF
224	ENDIF
225
226	!
227	!-- Determine sub-topologies for multi node version
228	IF ( tasks_per_logical_node >= 2 ) THEN
229
230	#if defined( __parallel ) .AND. .NOT. defined ( __check )
231	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
232	tasks_per_logical_node
233	dims(1) = nodes
234	dims(2) = tasks_per_logical_node
235
236	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
237	comm_node_all, istat )
238	CALL MPI_COMM_RANK( comm_node_all, me, istat )
239
240	re_dims(1) = .TRUE.
241	re_dims(2) = .FALSE.
242	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
243	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
244
245	re_dims(1) = .FALSE.
246	re_dims(2) = .TRUE.
247	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
248	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
249
250	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
251	! tasks_per_logical_node
252	#elif .NOT. defined( __parallel )
253	message_string = 'parallel environment (MPI) required'
254	CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
255	#endif
256	ENDIF
257
258	END SUBROUTINE poisfft_hybrid_ini
259
260	#if .NOT. defined ( __check )
261	SUBROUTINE poisfft_hybrid( ar )
262
263	USE control_parameters
264	USE interfaces
265
266	IMPLICIT NONE
267
268	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
269
270	IF ( host(1:3) == 'nec' ) THEN
271	CALL poisfft_hybrid_omp_vec( ar )
272	ELSE
273	IF ( tasks_per_logical_node == -1 ) THEN
274	CALL poisfft_hybrid_omp( ar )
275	ELSE
276	CALL poisfft_hybrid_nodes( ar )
277	ENDIF
278	ENDIF
279
280	END SUBROUTINE poisfft_hybrid
281
282
283	SUBROUTINE poisfft_hybrid_omp ( ar )
284
285	USE cpulog
286	USE interfaces
287
288	IMPLICIT NONE
289
290	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
291	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
292
293	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
294
295	REAL, DIMENSION(0:nx) :: fftx_ar
296	REAL, DIMENSION(0:ny,istride) :: ffty_ar
297
298	REAL, DIMENSION(0:nx,nz) :: tri_ar
299
300	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
301	#if defined( __KKMP )
302	INTEGER :: omp_get_thread_num
303	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
304	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
305	#else
306	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
307	#endif
308
309
310	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
311
312	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
313
314	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
315	!$OMP DO
316	!
317	!-- Store grid points to be transformed on a 1d-array, do the fft
318	!-- and sample the results on a 4d-array
319	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
320	iei = MIN( iouter+istride-1, nxr_p )
321	DO k = 1, nz
322
323	DO i = iouter, iei
324	ii = nxl + i
325	ir = i - iouter + 1
326
327	DO j = nys_a, nyn_a
328	ffty_ar(j,ir) = ar(k,j,ii)
329	ENDDO
330
331	CALL fft_y( ffty_ar(:,ir), 'forward' )
332	ENDDO
333
334	m = nys_a
335	DO n = 1, npe_s
336	DO j = nys_p, nyn_p
337	DO i = iouter, iei
338	ir = i - iouter + 1
339	work1(i,k,j,n) = ffty_ar(m,ir)
340	ENDDO
341	m = m+1
342	ENDDO
343	ENDDO
344
345	ENDDO
346	ENDDO
347	!$OMP END PARALLEL
348
349	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
350
351	#if defined( __parallel )
352	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
353
354	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
355	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
356	comm2d, istat )
357
358	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
359	#else
360	work2 = work1
361	#endif
362
363	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
364
365	#if defined( __KKMP )
366	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
367	!$OMP DO
368	DO j = nys_p, nyn_p
369	jthread = omp_get_thread_num() + 1
370	#else
371	DO j = nys_p, nyn_p
372	jthread = 1
373	#endif
374	DO k = 1, nz
375
376	m = nxl_a
377	DO n = 1, npe_s
378	DO i = nxl_p, nxr_p
379	fftx_ar(m) = work2(i,k,j,n)
380	m = m+1
381	ENDDO
382	ENDDO
383
384	CALL fft_x( fftx_ar, 'forward' )
385
386	DO i = nxl_a, nxr_a
387	tri_ar(i,k) = fftx_ar(i)
388	ENDDO
389
390	ENDDO
391
392	jj = myid * (nyn_p-nys_p+1) + j
393	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
394
395	DO k = 1, nz
396	DO i = nxl_a, nxr_a
397	fftx_ar(i) = tri_ar (i,k)
398	ENDDO
399
400	CALL fft_x( fftx_ar, 'backward' )
401
402	m = nxl_a
403	DO n = 1, npe_s
404	DO i = nxl_p, nxr_p
405	work2(i,k,j,n) = fftx_ar(m)
406	m = m+1
407	ENDDO
408	ENDDO
409
410	ENDDO
411	ENDDO
412	#if defined( __KKMP )
413	!$OMP END PARALLEL
414	#endif
415
416	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
417
418	#if defined( __parallel )
419	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
420	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
421
422	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
423	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
424	comm2d, istat )
425
426	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
427	#else
428	work1 = work2
429	#endif
430
431	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
432
433	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
434	!$OMP DO
435	DO iouter = nxl_p, nxr_p, istride
436	iei = MIN( iouter+istride-1, nxr_p )
437	DO k = 1, nz
438
439	m = nys_a
440	DO n = 1, npe_s
441	DO j = nys_p, nyn_p
442	DO i = iouter, iei
443	ir = i - iouter + 1
444	ffty_ar(m,ir) = work1 (i,k,j,n)
445	ENDDO
446	m = m+1
447	ENDDO
448	ENDDO
449
450	DO i = iouter, iei
451	ii = nxl + i
452	ir = i - iouter + 1
453	CALL fft_y( ffty_ar(:,ir), 'backward' )
454
455	DO j = nys_a, nyn_a
456	ar(k,j,ii) = ffty_ar(j,ir)
457	ENDDO
458	ENDDO
459
460	ENDDO
461	ENDDO
462	!$OMP END PARALLEL
463
464	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
465
466	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
467
468	#if defined( __KKMP )
469	DEALLOCATE( tri )
470	#endif
471
472	END SUBROUTINE poisfft_hybrid_omp
473
474
475	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
476
477	USE cpulog
478	USE interfaces
479
480	IMPLICIT NONE
481
482	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
483	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
484
485	REAL, DIMENSION(0:nx,nz) :: tri_ar
486
487	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
488
489	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
490	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
491
492	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
493	#if defined( __KKMP )
494	INTEGER :: omp_get_thread_num
495	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
496	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
497	#else
498	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
499	#endif
500
501
502	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
503
504	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
505
506	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
507	!$OMP DO
508	!
509	!-- Store grid points to be transformed on a 1d-array, do the fft
510	!-- and sample the results on a 4d-array
511	DO i = nxl_p, nxr_p
512
513	DO j = nys_a, nyn_a
514	DO k = 1, nz
515	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
516	ENDDO
517	ENDDO
518
519	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
520	ENDDO
521
522	!$OMP DO
523	DO k = 1, nz
524	m = nys_a
525	DO n = 1, npe_s
526	DO j = nys_p, nyn_p
527	DO i = nxl_p, nxr_p
528	work1(i,k,j,n) = ffty_ar3(m,k,i)
529	ENDDO
530	m = m+1
531	ENDDO
532	ENDDO
533	ENDDO
534	!$OMP END PARALLEL
535
536	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
537
538	#if defined( __parallel )
539	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
540	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
541	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
542	comm2d, istat )
543	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
544	#else
545	work2 = work1
546	#endif
547
548	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
549
550	#if defined( __KKMP )
551	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
552	!$OMP DO
553	DO j = nys_p, nyn_p
554	jthread = omp_get_thread_num() + 1
555	#else
556	DO j = nys_p, nyn_p
557	jthread = 1
558	#endif
559	DO k = 1, nz
560
561	m = nxl_a
562	DO n = 1, npe_s
563	DO i = nxl_p, nxr_p
564	fftx_ar3(m,k,j) = work2(i,k,j,n)
565	m = m+1
566	ENDDO
567	ENDDO
568	ENDDO
569
570	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
571
572	DO k = 1, nz
573	DO i = nxl_a, nxr_a
574	tri_ar(i,k) = fftx_ar3(i,k,j)
575	ENDDO
576	ENDDO
577
578	jj = myid * (nyn_p-nys_p+1) + j
579	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
580
581	DO k = 1, nz
582	DO i = nxl_a, nxr_a
583	fftx_ar3(i,k,j) = tri_ar (i,k)
584	ENDDO
585	ENDDO
586
587	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
588
589	DO k = 1, nz
590	m = nxl_a
591	DO n = 1, npe_s
592	DO i = nxl_p, nxr_p
593	work2(i,k,j,n) = fftx_ar3(m,k,j)
594	m = m+1
595	ENDDO
596	ENDDO
597	ENDDO
598
599	ENDDO
600	#if defined( __KKMP )
601	!$OMP END PARALLEL
602	#endif
603
604	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
605
606	#if defined( __parallel )
607	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
608	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
609	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
610	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
611	comm2d, istat )
612	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
613	#else
614	work1 = work2
615	#endif
616
617	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
618
619	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
620	!$OMP DO
621	DO k = 1, nz
622	m = nys_a
623	DO n = 1, npe_s
624	DO j = nys_p, nyn_p
625	DO i = nxl_p, nxr_p
626	ffty_ar3(m,k,i) = work1(i,k,j,n)
627	ENDDO
628	m = m+1
629	ENDDO
630	ENDDO
631	ENDDO
632
633	!$OMP DO
634	DO i = nxl_p, nxr_p
635	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
636	DO j = nys_a, nyn_a
637	DO k = 1, nz
638	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
639	ENDDO
640	ENDDO
641	ENDDO
642	!$OMP END PARALLEL
643
644	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
645
646	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
647
648	#if defined( __KKMP )
649	DEALLOCATE( tri )
650	#endif
651
652	END SUBROUTINE poisfft_hybrid_omp_vec
653
654
655	SUBROUTINE poisfft_hybrid_nodes ( ar )
656
657	USE cpulog
658	USE interfaces
659
660	IMPLICIT NONE
661
662	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
663	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
664	n, nn, nt, nw1, nw2
665
666	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
667
668	REAL, DIMENSION(0:nx) :: fftx_ar
669	REAL, DIMENSION(0:ny,istride) :: ffty_ar
670
671	REAL, DIMENSION(0:nx,nz) :: tri_ar
672
673	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
674	nodes,nys_p:nyn_p) :: work1,work2
675	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
676
677
678	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
679
680	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
681
682	!
683	!-- Store grid points to be transformed on a 1d-array, do the fft
684	!-- and sample the results on a 4d-array
685	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
686	iei = MIN( iouter+istride-1, nxr_p )
687	DO k = 1, nz
688
689	DO i = iouter, iei
690	ii = nxl + i
691	ir = i - iouter + 1
692
693	DO j = nys_a, nyn_a
694	ffty_ar(j,ir) = ar(k,j,ii)
695	ENDDO
696
697	CALL fft_y( ffty_ar(:,ir), 'forward' )
698	ENDDO
699
700	m = nys_a
701	DO nn = 1, nodes
702	DO nt = 1, tasks_per_logical_node
703	DO j = nys_p, nyn_p
704	DO i = iouter, iei
705	ir = i - iouter + 1
706	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
707	ENDDO
708	m = m+1
709	ENDDO
710	ENDDO
711	ENDDO
712
713	ENDDO
714	ENDDO
715
716	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
717
718	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
719	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
720	DO nn = 1, nodes
721	DO j = nys_p, nyn_p
722	#if defined( __parallel )
723	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
724	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
725	comm_tasks, istat )
726	#endif
727	ENDDO
728	ENDDO
729	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
730
731
732	DO j = nys_p, nyn_p
733
734	CALL cascade( 1, j, nys_p, nyn_p )
735	nw2 = nw1 * SIZE( work1, 3 )
736	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
737	#if defined( __parallel )
738	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
739	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
740	comm_nodes, istat )
741	#endif
742	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
743	CALL cascade( 2, j, nys_p, nyn_p )
744
745	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
746	DO k = 1, nz
747
748	m = nxl_a
749	DO nn = 1, nodes
750	DO nt = 1, tasks_per_logical_node
751	DO i = nxl_p, nxr_p
752	fftx_ar(m) = work1(i,k,nt,nn,j)
753	m = m+1
754	ENDDO
755	ENDDO
756	ENDDO
757
758	CALL fft_x( fftx_ar, 'forward' )
759
760	DO i = nxl_a, nxr_a
761	tri_ar(i,k) = fftx_ar(i)
762	ENDDO
763
764	ENDDO
765
766	jj = myid * (nyn_p-nys_p+1) + j
767	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
768
769	DO k = 1, nz
770	DO i = nxl_a, nxr_a
771	fftx_ar(i) = tri_ar(i,k)
772	ENDDO
773
774	CALL fft_x( fftx_ar, 'backward' )
775
776	m = nxl_a
777	DO nn = 1, nodes
778	DO nt = 1, tasks_per_logical_node
779	DO i = nxl_p, nxr_p
780	work1(i,k,nt,nn,j) = fftx_ar(m)
781	m = m+1
782	ENDDO
783	ENDDO
784	ENDDO
785	ENDDO
786
787	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
788	nw2 = nw1 * SIZE( work1, 3 )
789	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
790	#if defined( __parallel )
791	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
792	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
793	comm_nodes, istat )
794	#endif
795	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
796
797	ENDDO
798
799	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
800	DO nn = 1, nodes
801	DO j = nys_p, nyn_p
802	#if defined( __parallel )
803	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
804	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
805	comm_tasks, istat )
806	#endif
807	ENDDO
808	ENDDO
809	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
810
811	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
812
813	DO iouter = nxl_p, nxr_p, istride
814	iei = MIN( iouter+istride-1, nxr_p )
815	DO k = 1, nz
816
817	m = nys_a
818	DO nn = 1, nodes
819	DO nt = 1, tasks_per_logical_node
820	DO j = nys_p, nyn_p
821	DO i = iouter, iei
822	ir = i - iouter + 1
823	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
824	ENDDO
825	m = m+1
826	ENDDO
827	ENDDO
828	ENDDO
829
830	DO i = iouter, iei
831	ii = nxl + i
832	ir = i - iouter + 1
833	CALL fft_y( ffty_ar(:,ir), 'backward' )
834
835	DO j = nys_a, nyn_a
836	ar(k,j,ii) = ffty_ar(j,ir)
837	ENDDO
838	ENDDO
839
840	ENDDO
841	ENDDO
842
843	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
844
845	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
846
847	END SUBROUTINE poisfft_hybrid_nodes
848
849
850
851	SUBROUTINE tridia_hybrid( j, ar, tri )
852
853	USE arrays_3d
854	USE control_parameters
855	USE grid_variables
856
857	IMPLICIT NONE
858
859	INTEGER :: i, j, k, nnyh
860	REAL, DIMENSION(0:nx,nz) :: ar
861	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
862	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
863
864	nnyh = (ny+1) / 2
865
866	tri = 0.0
867	!
868	!-- Define constant elements of the tridiagonal matrix.
869	DO k = 0, nz-1
870	DO i = 0,nx
871	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
872	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
873	ENDDO
874	ENDDO
875
876	IF ( j <= nnyh ) THEN
877	CALL maketri_hybrid( j )
878	ELSE
879	CALL maketri_hybrid( ny+1-j)
880	ENDIF
881	CALL zerleg_hybrid
882	CALL substi_hybrid( ar, tri )
883
884	CONTAINS
885
886	SUBROUTINE maketri_hybrid( j )
887
888	!----------------------------------------------------------------------!
889	! maketri !
890	! !
891	! computes the i- and j-dependent component of the matrix !
892	!----------------------------------------------------------------------!
893
894	USE constants
895
896	IMPLICIT NONE
897
898	INTEGER :: i, j, k, nnxh
899	REAL :: a, c
900
901	REAL, DIMENSION(0:nx) :: l
902
903
904	nnxh = (nx+1) / 2
905	!
906	!-- Provide the tridiagonal matrix for solution of the Poisson equation
907	!-- in Fourier space. The coefficients are computed following the method
908	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
909	!-- Siano's original version.
910	DO i = 0,nx
911	IF ( i >= 0 .AND. i < nnxh ) THEN
912	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
913	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
914	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
915	FLOAT( ny+1 ) ) ) / ( dy * dy )
916	ELSEIF ( i == nnxh ) THEN
917	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
918	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
919	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
920	FLOAT(ny+1) ) ) / ( dy * dy )
921	ELSE
922	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
923	FLOAT( nx+1 ) ) ) / ( dx * dx ) + &
924	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
925	FLOAT( ny+1 ) ) ) / ( dy * dy )
926	ENDIF
927	ENDDO
928
929	DO k = 0,nz-1
930	DO i = 0, nx
931	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
932	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
933	tri(1,i,k) = a + c - l(i)
934	ENDDO
935	ENDDO
936	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
937	DO i = 0,nx
938	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
939	ENDDO
940	ENDIF
941	IF ( ibc_p_t == 1 ) THEN
942	DO i = 0,nx
943	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
944	ENDDO
945	ENDIF
946
947	END SUBROUTINE maketri_hybrid
948
949
950	SUBROUTINE zerleg_hybrid
951
952	!----------------------------------------------------------------------!
953	! zerleg !
954	! !
955	! Splitting of the tridiagonal matrix (Thomas algorithm) !
956	!----------------------------------------------------------------------!
957
958	USE indices
959
960	IMPLICIT NONE
961
962	INTEGER :: i, k
963
964	!
965	!-- Splitting
966	DO i = 0, nx
967	tri(4,i,0) = tri(1,i,0)
968	ENDDO
969	DO k = 1, nz-1
970	DO i = 0,nx
971	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
972	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
973	ENDDO
974	ENDDO
975
976	END SUBROUTINE zerleg_hybrid
977
978	SUBROUTINE substi_hybrid( ar, tri )
979
980	!----------------------------------------------------------------------!
981	! substi !
982	! !
983	! Substitution (Forward and Backward) (Thomas algorithm) !
984	!----------------------------------------------------------------------!
985
986	IMPLICIT NONE
987
988	INTEGER :: i, j, k
989	REAL, DIMENSION(0:nx,nz) :: ar
990	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
991	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
992
993	!
994	!-- Forward substitution
995	DO i = 0, nx
996	ar1(i,0) = ar(i,1)
997	ENDDO
998	DO k = 1, nz - 1
999	DO i = 0,nx
1000	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1001	ENDDO
1002	ENDDO
1003
1004	!
1005	!-- Backward substitution
1006	DO i = 0,nx
1007	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
1008	ENDDO
1009	DO k = nz-2, 0, -1
1010	DO i = 0,nx
1011	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1012	/ tri(4,i,k)
1013	ENDDO
1014	ENDDO
1015
1016	END SUBROUTINE substi_hybrid
1017
1018	END SUBROUTINE tridia_hybrid
1019
1020
1021	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
1022
1023	USE cpulog
1024
1025	IMPLICIT NONE
1026
1027	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
1028	INTEGER, SAVE :: tag = 10
1029	#if defined( __parallel )
1030	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
1031	INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata
1032	#endif
1033
1034	REAL :: buf, buf1
1035
1036
1037	buf = 1.0
1038	buf1 = 1.1
1039	IF ( me_node == 0 ) THEN ! first node only
1040
1041	SELECT CASE ( loca )
1042
1043	CASE ( 1 ) ! before alltoall
1044
1045	IF( me_task > 0 ) THEN ! first task does not wait
1046	#if defined( __parallel )
1047	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
1048	buf1, 1, MPI_REAL, me_task-1, 0, &
1049	comm_tasks, stat, ierr )
1050	#endif
1051	ELSEIF ( j > nys_p ) THEN
1052	req = 0
1053	tag = MOD( tag-10, 10 ) + 10
1054	#if defined( __parallel )
1055	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
1056	tag, comm_tasks, req, ierr )
1057	reqa = req
1058	CALL MPI_WAITALL( 1, reqa, stata, ierr )
1059	#endif
1060	ENDIF
1061
1062	CASE ( 2 ) ! after alltoall
1063
1064	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
1065	#if defined( __parallel )
1066	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
1067	buf1, 1, MPI_REAL, me_task+1, 0, &
1068	comm_tasks, stat, ierr)
1069	#endif
1070	ELSEIF ( j < nyn_p ) THEN
1071	req = 0
1072	tag = MOD( tag-10, 10 ) + 10
1073	#if defined( __parallel )
1074	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
1075	ierr )
1076	#endif
1077	ENDIF
1078
1079	END SELECT
1080
1081	ENDIF
1082
1083	END SUBROUTINE cascade
1084	#endif
1085	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |