Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 1139

Last change on this file since 1139 was 1112, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 33.4 KB

Line
1	MODULE poisfft_hybrid_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_hybrid.f90 1112 2013-03-09 00:34:37Z raasch $
27	!
28	! 1111 2013-03-08 23:54:10Z raasch
29	! poisfft_hybrid_ini is now called internally from poisfft_hybrid,
30	! ibc_p_b = 2 removed
31	!
32	! 1106 2013-03-04 05:31:38Z raasch
33	! calls of fft_x, fft_y replaced by fft_x_1d, fft_y_1d
34	!
35	! 1036 2012-10-22 13:43:42Z raasch
36	! code put under GPL (PALM 3.9)
37	!
38	! 1013 2012-09-21 07:03:55Z raasch
39	! FLOAT type conversion replaced by REAL
40	!
41	! 809 2012-01-30 13:32:58Z maronga
42	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
43	!
44	! 807 2012-01-25 11:53:51Z maronga
45	! New cpp directive "__check" implemented which is used by check_namelist_files
46	! (most of the code is unneeded by check_namelist_files).
47	!
48	! 667 2010-12-23 12:06:00Z suehring/gryschka
49	! ddzu replaced by ddzu_pres due to changes in zu(0)
50	!
51	! 415 2009-12-15 10:26:23Z raasch
52	! Dimension of array stat in cascade change to prevent type problems with___
53	! mpi2 libraries
54	!
55	! 274 2009-03-26 15:11:21Z heinze
56	! Output of messages replaced by message handling routine.
57	!
58	! Feb. 2007
59	! RCS Log replace by Id keyword, revision history cleaned up
60	!
61	! Revision 1.11 2004/04/30 12:43:14 raasch
62	! Renaming of fft routines, additional argument in calls of fft_y_m
63	!
64	! Revision 1.2 2002/12/19 16:08:31 raasch
65	! Preprocessor directive KKMP introduced (OMP does NOT work),
66	! array tri will be a shared array in OpenMP loop, to get better cache
67	! utilization, the i index (x-direction) will be executed in stride
68	! "istride" as outer loop and in a shorter inner loop,
69	! overlapping of computation and communication realized by new routine
70	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
71	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
72	!
73	!
74	! Description:
75	! ------------
76	! Solution of the Poisson equation with a 2D spectral method.
77	! Hybrid version for parallel computers using a 1D domain decomposition,
78	! realized with MPI, along x and parallelization with OPEN-MP along y
79	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
80	! optimization is realized by overlapping of computation and communication
81	! and by simultaneously executing as many communication calls as switches
82	! per logical partition (LPAR) are available. This version comes into
83	! effect if more than one node is used and if the environment variable
84	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
85	! without any rest.
86	!
87	! WARNING: In case of OpenMP, there are problems with allocating large
88	! arrays in parallel regions.
89	!
90	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
91	!------------------------------------------------------------------------------!
92
93	USE fft_xy
94	USE indices
95	USE pegrid
96
97	IMPLICIT NONE
98
99	INTEGER, PARAMETER :: switch_per_lpar = 2
100
101	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
102	nxl_p, nxr_p, & ! partial x dimension
103	nys_a, nyn_a, & ! total y dimension
104	nys_p, nyn_p, & ! partial y dimension
105
106	npe_s, & ! total number of PEs for solver
107	nwords, & ! number of points to be exchanged
108	! with MPI_ALLTOALL
109	n_omp_threads ! number of OpenMP threads
110
111	!
112	!-- Variables for multi node version (cluster version) using routine
113	!-- poisfft_hybrid_nodes
114	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
115	comm_node_all, & ! communicater all PEs node version
116	comm_tasks, & ! communicater tasks
117	me, me_node, me_task,& ! identity of this PE
118	nodes, & ! number of nodes
119	tasks_per_logical_node = -1 ! default no cluster
120
121	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
122
123
124	PRIVATE
125
126
127	#if ! defined ( __check )
128	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
129
130
131	!
132	!-- Public interfaces
133	INTERFACE poisfft_hybrid_ini
134	MODULE PROCEDURE poisfft_hybrid_ini
135	END INTERFACE poisfft_hybrid_ini
136
137	INTERFACE poisfft_hybrid
138	MODULE PROCEDURE poisfft_hybrid
139	END INTERFACE poisfft_hybrid
140
141	!
142	!-- Private interfaces
143	INTERFACE poisfft_hybrid_omp
144	MODULE PROCEDURE poisfft_hybrid_omp
145	END INTERFACE poisfft_hybrid_omp
146
147	INTERFACE poisfft_hybrid_omp_vec
148	MODULE PROCEDURE poisfft_hybrid_omp_vec
149	END INTERFACE poisfft_hybrid_omp_vec
150
151	INTERFACE poisfft_hybrid_nodes
152	MODULE PROCEDURE poisfft_hybrid_nodes
153	END INTERFACE poisfft_hybrid_nodes
154
155	INTERFACE tridia_hybrid
156	MODULE PROCEDURE tridia_hybrid
157	END INTERFACE tridia_hybrid
158
159	INTERFACE cascade
160	MODULE PROCEDURE cascade
161	END INTERFACE cascade
162	#else
163	PUBLIC poisfft_hybrid_ini
164
165	!
166	!-- Public interfaces
167	INTERFACE poisfft_hybrid_ini
168	MODULE PROCEDURE poisfft_hybrid_ini
169	END INTERFACE poisfft_hybrid_ini
170	#endif
171
172	CONTAINS
173
174
175	SUBROUTINE poisfft_hybrid_ini
176
177	USE control_parameters
178	USE pegrid
179
180	IMPLICIT NONE
181
182	CHARACTER(LEN=8) :: cdummy
183	INTEGER :: idummy, istat
184	INTEGER, DIMENSION(2) :: coords, dims
185
186	LOGICAL, DIMENSION(2) :: period = .false., re_dims
187
188
189	!
190	!-- Set the internal index values for the hybrid solver
191	#if defined( __parallel )
192	npe_s = pdims(1)
193	#else
194	npe_s = 1
195	#endif
196	nxl_a = 0
197	nxr_a = nx
198	nxl_p = 0
199	nxr_p = ( ( nx+1 ) / npe_s ) - 1
200	nys_a = nys
201	nyn_a = nyn
202	nys_p = 0
203	nyn_p = ( ( ny+1 ) / npe_s ) - 1
204
205	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
206
207	#if defined( __KKMP ) && ! defined ( __check )
208	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
209	READ ( cdummy, '(I8)' ) n_omp_threads
210	IF ( n_omp_threads > 1 ) THEN
211	WRITE( message_string, * ) 'Number of OpenMP threads = ', &
212	n_omp_threads
213	CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
214	ENDIF
215	#else
216	n_omp_threads = 1
217	#endif
218	!
219	!-- Initialize the one-dimensional FFT routines
220	CALL fft_init
221
222	!
223	!-- Setup for multi node version (poisfft_hybrid_nodes)
224	IF ( n_omp_threads == 1 .AND. &
225	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
226
227	IF ( tasks_per_node /= -9999 ) THEN
228	!
229	!-- Multi node version requires that the available number of
230	!-- switches per logical partition must be an integral divisor
231	!-- of the chosen number of tasks per node
232	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
233	!
234	!-- Set the switch which decides about usage of the multi node
235	!-- version
236	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
237	numprocs > tasks_per_node ) THEN
238	tasks_per_logical_node = tasks_per_node / switch_per_lpar
239	ENDIF
240
241	IF ( tasks_per_logical_node > -1 ) THEN
242
243	WRITE( message_string, * ) 'running optimized ', &
244	'multinode version', &
245	'&switch_per_lpar = ', &
246	switch_per_lpar, &
247	'&tasks_per_lpar = ', &
248	tasks_per_node, &
249	'tasks_per_logical_node = ', &
250	tasks_per_logical_node
251	CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )
252
253	ENDIF
254
255	ENDIF
256	ENDIF
257	ENDIF
258
259	!
260	!-- Determine sub-topologies for multi node version
261	IF ( tasks_per_logical_node >= 2 ) THEN
262
263	#if defined( __parallel ) && ! defined ( __check )
264	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
265	tasks_per_logical_node
266	dims(1) = nodes
267	dims(2) = tasks_per_logical_node
268
269	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
270	comm_node_all, istat )
271	CALL MPI_COMM_RANK( comm_node_all, me, istat )
272
273	re_dims(1) = .TRUE.
274	re_dims(2) = .FALSE.
275	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
276	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
277
278	re_dims(1) = .FALSE.
279	re_dims(2) = .TRUE.
280	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
281	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
282
283	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
284	! tasks_per_logical_node
285	#elif ! defined( __parallel )
286	message_string = 'parallel environment (MPI) required'
287	CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
288	#endif
289	ENDIF
290
291	poisfft_initialized = .TRUE.
292
293	END SUBROUTINE poisfft_hybrid_ini
294
295	#if ! defined ( __check )
296	SUBROUTINE poisfft_hybrid( ar )
297
298	USE control_parameters
299	USE interfaces
300
301	IMPLICIT NONE
302
303	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
304
305	IF ( .NOT. poisfft_initialized ) CALL poisfft_hybrid_ini
306
307	IF ( host(1:3) == 'nec' ) THEN
308	CALL poisfft_hybrid_omp_vec( ar )
309	ELSE
310	IF ( tasks_per_logical_node == -1 ) THEN
311	CALL poisfft_hybrid_omp( ar )
312	ELSE
313	CALL poisfft_hybrid_nodes( ar )
314	ENDIF
315	ENDIF
316
317	END SUBROUTINE poisfft_hybrid
318
319
320	SUBROUTINE poisfft_hybrid_omp ( ar )
321
322	USE cpulog
323	USE interfaces
324
325	IMPLICIT NONE
326
327	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
328	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
329
330	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
331
332	REAL, DIMENSION(0:nx) :: fftx_ar
333	REAL, DIMENSION(0:ny,istride) :: ffty_ar
334
335	REAL, DIMENSION(0:nx,nz) :: tri_ar
336
337	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
338	#if defined( __KKMP )
339	INTEGER :: omp_get_thread_num
340	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
341	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
342	#else
343	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
344	#endif
345
346
347	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
348
349	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
350
351	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
352	!$OMP DO
353	!
354	!-- Store grid points to be transformed on a 1d-array, do the fft
355	!-- and sample the results on a 4d-array
356	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
357	iei = MIN( iouter+istride-1, nxr_p )
358	DO k = 1, nz
359
360	DO i = iouter, iei
361	ii = nxl + i
362	ir = i - iouter + 1
363
364	DO j = nys_a, nyn_a
365	ffty_ar(j,ir) = ar(k,j,ii)
366	ENDDO
367
368	CALL fft_y_1d( ffty_ar(:,ir), 'forward' )
369	ENDDO
370
371	m = nys_a
372	DO n = 1, npe_s
373	DO j = nys_p, nyn_p
374	DO i = iouter, iei
375	ir = i - iouter + 1
376	work1(i,k,j,n) = ffty_ar(m,ir)
377	ENDDO
378	m = m+1
379	ENDDO
380	ENDDO
381
382	ENDDO
383	ENDDO
384	!$OMP END PARALLEL
385
386	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
387
388	#if defined( __parallel )
389	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
390
391	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
392	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
393	comm2d, istat )
394
395	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
396	#else
397	work2 = work1
398	#endif
399
400	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
401
402	#if defined( __KKMP )
403	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
404	!$OMP DO
405	DO j = nys_p, nyn_p
406	jthread = omp_get_thread_num() + 1
407	#else
408	DO j = nys_p, nyn_p
409	jthread = 1
410	#endif
411	DO k = 1, nz
412
413	m = nxl_a
414	DO n = 1, npe_s
415	DO i = nxl_p, nxr_p
416	fftx_ar(m) = work2(i,k,j,n)
417	m = m+1
418	ENDDO
419	ENDDO
420
421	CALL fft_x_1d( fftx_ar, 'forward' )
422
423	DO i = nxl_a, nxr_a
424	tri_ar(i,k) = fftx_ar(i)
425	ENDDO
426
427	ENDDO
428
429	jj = myid * (nyn_p-nys_p+1) + j
430	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
431
432	DO k = 1, nz
433	DO i = nxl_a, nxr_a
434	fftx_ar(i) = tri_ar (i,k)
435	ENDDO
436
437	CALL fft_x_1d( fftx_ar, 'backward' )
438
439	m = nxl_a
440	DO n = 1, npe_s
441	DO i = nxl_p, nxr_p
442	work2(i,k,j,n) = fftx_ar(m)
443	m = m+1
444	ENDDO
445	ENDDO
446
447	ENDDO
448	ENDDO
449	#if defined( __KKMP )
450	!$OMP END PARALLEL
451	#endif
452
453	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
454
455	#if defined( __parallel )
456	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
457	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
458
459	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
460	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
461	comm2d, istat )
462
463	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
464	#else
465	work1 = work2
466	#endif
467
468	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
469
470	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
471	!$OMP DO
472	DO iouter = nxl_p, nxr_p, istride
473	iei = MIN( iouter+istride-1, nxr_p )
474	DO k = 1, nz
475
476	m = nys_a
477	DO n = 1, npe_s
478	DO j = nys_p, nyn_p
479	DO i = iouter, iei
480	ir = i - iouter + 1
481	ffty_ar(m,ir) = work1 (i,k,j,n)
482	ENDDO
483	m = m+1
484	ENDDO
485	ENDDO
486
487	DO i = iouter, iei
488	ii = nxl + i
489	ir = i - iouter + 1
490	CALL fft_y_1d( ffty_ar(:,ir), 'backward' )
491
492	DO j = nys_a, nyn_a
493	ar(k,j,ii) = ffty_ar(j,ir)
494	ENDDO
495	ENDDO
496
497	ENDDO
498	ENDDO
499	!$OMP END PARALLEL
500
501	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
502
503	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
504
505	#if defined( __KKMP )
506	DEALLOCATE( tri )
507	#endif
508
509	END SUBROUTINE poisfft_hybrid_omp
510
511
512	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
513
514	USE cpulog
515	USE interfaces
516
517	IMPLICIT NONE
518
519	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
520	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
521
522	REAL, DIMENSION(0:nx,nz) :: tri_ar
523
524	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
525
526	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
527	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
528
529	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
530	#if defined( __KKMP )
531	INTEGER :: omp_get_thread_num
532	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
533	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
534	#else
535	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
536	#endif
537
538
539	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
540
541	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
542
543	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
544	!$OMP DO
545	!
546	!-- Store grid points to be transformed on a 1d-array, do the fft
547	!-- and sample the results on a 4d-array
548	DO i = nxl_p, nxr_p
549
550	DO j = nys_a, nyn_a
551	DO k = 1, nz
552	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
553	ENDDO
554	ENDDO
555
556	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
557	ENDDO
558
559	!$OMP DO
560	DO k = 1, nz
561	m = nys_a
562	DO n = 1, npe_s
563	DO j = nys_p, nyn_p
564	DO i = nxl_p, nxr_p
565	work1(i,k,j,n) = ffty_ar3(m,k,i)
566	ENDDO
567	m = m+1
568	ENDDO
569	ENDDO
570	ENDDO
571	!$OMP END PARALLEL
572
573	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
574
575	#if defined( __parallel )
576	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
577	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
578	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
579	comm2d, istat )
580	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
581	#else
582	work2 = work1
583	#endif
584
585	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
586
587	#if defined( __KKMP )
588	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
589	!$OMP DO
590	DO j = nys_p, nyn_p
591	jthread = omp_get_thread_num() + 1
592	#else
593	DO j = nys_p, nyn_p
594	jthread = 1
595	#endif
596	DO k = 1, nz
597
598	m = nxl_a
599	DO n = 1, npe_s
600	DO i = nxl_p, nxr_p
601	fftx_ar3(m,k,j) = work2(i,k,j,n)
602	m = m+1
603	ENDDO
604	ENDDO
605	ENDDO
606
607	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
608
609	DO k = 1, nz
610	DO i = nxl_a, nxr_a
611	tri_ar(i,k) = fftx_ar3(i,k,j)
612	ENDDO
613	ENDDO
614
615	jj = myid * (nyn_p-nys_p+1) + j
616	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
617
618	DO k = 1, nz
619	DO i = nxl_a, nxr_a
620	fftx_ar3(i,k,j) = tri_ar (i,k)
621	ENDDO
622	ENDDO
623
624	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
625
626	DO k = 1, nz
627	m = nxl_a
628	DO n = 1, npe_s
629	DO i = nxl_p, nxr_p
630	work2(i,k,j,n) = fftx_ar3(m,k,j)
631	m = m+1
632	ENDDO
633	ENDDO
634	ENDDO
635
636	ENDDO
637	#if defined( __KKMP )
638	!$OMP END PARALLEL
639	#endif
640
641	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
642
643	#if defined( __parallel )
644	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
645	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
646	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
647	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
648	comm2d, istat )
649	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
650	#else
651	work1 = work2
652	#endif
653
654	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
655
656	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
657	!$OMP DO
658	DO k = 1, nz
659	m = nys_a
660	DO n = 1, npe_s
661	DO j = nys_p, nyn_p
662	DO i = nxl_p, nxr_p
663	ffty_ar3(m,k,i) = work1(i,k,j,n)
664	ENDDO
665	m = m+1
666	ENDDO
667	ENDDO
668	ENDDO
669
670	!$OMP DO
671	DO i = nxl_p, nxr_p
672	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
673	DO j = nys_a, nyn_a
674	DO k = 1, nz
675	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
676	ENDDO
677	ENDDO
678	ENDDO
679	!$OMP END PARALLEL
680
681	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
682
683	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
684
685	#if defined( __KKMP )
686	DEALLOCATE( tri )
687	#endif
688
689	END SUBROUTINE poisfft_hybrid_omp_vec
690
691
692	SUBROUTINE poisfft_hybrid_nodes ( ar )
693
694	USE cpulog
695	USE interfaces
696
697	IMPLICIT NONE
698
699	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
700	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
701	n, nn, nt, nw1, nw2
702
703	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
704
705	REAL, DIMENSION(0:nx) :: fftx_ar
706	REAL, DIMENSION(0:ny,istride) :: ffty_ar
707
708	REAL, DIMENSION(0:nx,nz) :: tri_ar
709
710	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
711	nodes,nys_p:nyn_p) :: work1,work2
712	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
713
714
715	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
716
717	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
718
719	!
720	!-- Store grid points to be transformed on a 1d-array, do the fft
721	!-- and sample the results on a 4d-array
722	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
723	iei = MIN( iouter+istride-1, nxr_p )
724	DO k = 1, nz
725
726	DO i = iouter, iei
727	ii = nxl + i
728	ir = i - iouter + 1
729
730	DO j = nys_a, nyn_a
731	ffty_ar(j,ir) = ar(k,j,ii)
732	ENDDO
733
734	CALL fft_y_1d( ffty_ar(:,ir), 'forward' )
735	ENDDO
736
737	m = nys_a
738	DO nn = 1, nodes
739	DO nt = 1, tasks_per_logical_node
740	DO j = nys_p, nyn_p
741	DO i = iouter, iei
742	ir = i - iouter + 1
743	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
744	ENDDO
745	m = m+1
746	ENDDO
747	ENDDO
748	ENDDO
749
750	ENDDO
751	ENDDO
752
753	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
754
755	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
756	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
757	DO nn = 1, nodes
758	DO j = nys_p, nyn_p
759	#if defined( __parallel )
760	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
761	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
762	comm_tasks, istat )
763	#endif
764	ENDDO
765	ENDDO
766	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
767
768
769	DO j = nys_p, nyn_p
770
771	CALL cascade( 1, j, nys_p, nyn_p )
772	nw2 = nw1 * SIZE( work1, 3 )
773	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
774	#if defined( __parallel )
775	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
776	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
777	comm_nodes, istat )
778	#endif
779	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
780	CALL cascade( 2, j, nys_p, nyn_p )
781
782	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
783	DO k = 1, nz
784
785	m = nxl_a
786	DO nn = 1, nodes
787	DO nt = 1, tasks_per_logical_node
788	DO i = nxl_p, nxr_p
789	fftx_ar(m) = work1(i,k,nt,nn,j)
790	m = m+1
791	ENDDO
792	ENDDO
793	ENDDO
794
795	CALL fft_x_1d( fftx_ar, 'forward' )
796
797	DO i = nxl_a, nxr_a
798	tri_ar(i,k) = fftx_ar(i)
799	ENDDO
800
801	ENDDO
802
803	jj = myid * (nyn_p-nys_p+1) + j
804	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
805
806	DO k = 1, nz
807	DO i = nxl_a, nxr_a
808	fftx_ar(i) = tri_ar(i,k)
809	ENDDO
810
811	CALL fft_x_1d( fftx_ar, 'backward' )
812
813	m = nxl_a
814	DO nn = 1, nodes
815	DO nt = 1, tasks_per_logical_node
816	DO i = nxl_p, nxr_p
817	work1(i,k,nt,nn,j) = fftx_ar(m)
818	m = m+1
819	ENDDO
820	ENDDO
821	ENDDO
822	ENDDO
823
824	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
825	nw2 = nw1 * SIZE( work1, 3 )
826	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
827	#if defined( __parallel )
828	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
829	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
830	comm_nodes, istat )
831	#endif
832	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
833
834	ENDDO
835
836	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
837	DO nn = 1, nodes
838	DO j = nys_p, nyn_p
839	#if defined( __parallel )
840	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
841	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
842	comm_tasks, istat )
843	#endif
844	ENDDO
845	ENDDO
846	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
847
848	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
849
850	DO iouter = nxl_p, nxr_p, istride
851	iei = MIN( iouter+istride-1, nxr_p )
852	DO k = 1, nz
853
854	m = nys_a
855	DO nn = 1, nodes
856	DO nt = 1, tasks_per_logical_node
857	DO j = nys_p, nyn_p
858	DO i = iouter, iei
859	ir = i - iouter + 1
860	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
861	ENDDO
862	m = m+1
863	ENDDO
864	ENDDO
865	ENDDO
866
867	DO i = iouter, iei
868	ii = nxl + i
869	ir = i - iouter + 1
870	CALL fft_y_1d( ffty_ar(:,ir), 'backward' )
871
872	DO j = nys_a, nyn_a
873	ar(k,j,ii) = ffty_ar(j,ir)
874	ENDDO
875	ENDDO
876
877	ENDDO
878	ENDDO
879
880	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
881
882	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
883
884	END SUBROUTINE poisfft_hybrid_nodes
885
886
887
888	SUBROUTINE tridia_hybrid( j, ar, tri )
889
890	USE arrays_3d
891	USE control_parameters
892	USE grid_variables
893
894	IMPLICIT NONE
895
896	INTEGER :: i, j, k, nnyh
897	REAL, DIMENSION(0:nx,nz) :: ar
898	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
899	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
900
901	nnyh = (ny+1) / 2
902
903	tri = 0.0
904	!
905	!-- Define constant elements of the tridiagonal matrix.
906	DO k = 0, nz-1
907	DO i = 0,nx
908	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
909	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
910	ENDDO
911	ENDDO
912
913	IF ( j <= nnyh ) THEN
914	CALL maketri_hybrid( j )
915	ELSE
916	CALL maketri_hybrid( ny+1-j)
917	ENDIF
918	CALL zerleg_hybrid
919	CALL substi_hybrid( ar, tri )
920
921	CONTAINS
922
923	SUBROUTINE maketri_hybrid( j )
924
925	!----------------------------------------------------------------------!
926	! maketri !
927	! !
928	! computes the i- and j-dependent component of the matrix !
929	!----------------------------------------------------------------------!
930
931	USE constants
932
933	IMPLICIT NONE
934
935	INTEGER :: i, j, k, nnxh
936	REAL :: a, c
937
938	REAL, DIMENSION(0:nx) :: l
939
940
941	nnxh = (nx+1) / 2
942	!
943	!-- Provide the tridiagonal matrix for solution of the Poisson equation
944	!-- in Fourier space. The coefficients are computed following the method
945	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
946	!-- Siano's original version.
947	DO i = 0,nx
948	IF ( i >= 0 .AND. i < nnxh ) THEN
949	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
950	REAL( nx+1 ) ) ) / ( dx * dx ) + &
951	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
952	REAL( ny+1 ) ) ) / ( dy * dy )
953	ELSEIF ( i == nnxh ) THEN
954	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
955	REAL( nx+1 ) ) ) / ( dx * dx ) + &
956	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
957	REAL(ny+1) ) ) / ( dy * dy )
958	ELSE
959	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
960	REAL( nx+1 ) ) ) / ( dx * dx ) + &
961	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
962	REAL( ny+1 ) ) ) / ( dy * dy )
963	ENDIF
964	ENDDO
965
966	DO k = 0,nz-1
967	DO i = 0, nx
968	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
969	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
970	tri(1,i,k) = a + c - l(i)
971	ENDDO
972	ENDDO
973	IF ( ibc_p_b == 1 ) THEN
974	DO i = 0,nx
975	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
976	ENDDO
977	ENDIF
978	IF ( ibc_p_t == 1 ) THEN
979	DO i = 0,nx
980	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
981	ENDDO
982	ENDIF
983
984	END SUBROUTINE maketri_hybrid
985
986
987	SUBROUTINE zerleg_hybrid
988
989	!----------------------------------------------------------------------!
990	! zerleg !
991	! !
992	! Splitting of the tridiagonal matrix (Thomas algorithm) !
993	!----------------------------------------------------------------------!
994
995	USE indices
996
997	IMPLICIT NONE
998
999	INTEGER :: i, k
1000
1001	!
1002	!-- Splitting
1003	DO i = 0, nx
1004	tri(4,i,0) = tri(1,i,0)
1005	ENDDO
1006	DO k = 1, nz-1
1007	DO i = 0,nx
1008	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1009	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1010	ENDDO
1011	ENDDO
1012
1013	END SUBROUTINE zerleg_hybrid
1014
1015	SUBROUTINE substi_hybrid( ar, tri )
1016
1017	!----------------------------------------------------------------------!
1018	! substi !
1019	! !
1020	! Substitution (Forward and Backward) (Thomas algorithm) !
1021	!----------------------------------------------------------------------!
1022
1023	IMPLICIT NONE
1024
1025	INTEGER :: i, j, k
1026	REAL, DIMENSION(0:nx,nz) :: ar
1027	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1028	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1029
1030	!
1031	!-- Forward substitution
1032	DO i = 0, nx
1033	ar1(i,0) = ar(i,1)
1034	ENDDO
1035	DO k = 1, nz - 1
1036	DO i = 0,nx
1037	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1038	ENDDO
1039	ENDDO
1040
1041	!
1042	!-- Backward substitution
1043	DO i = 0,nx
1044	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
1045	ENDDO
1046	DO k = nz-2, 0, -1
1047	DO i = 0,nx
1048	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1049	/ tri(4,i,k)
1050	ENDDO
1051	ENDDO
1052
1053	END SUBROUTINE substi_hybrid
1054
1055	END SUBROUTINE tridia_hybrid
1056
1057
1058	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
1059
1060	USE cpulog
1061
1062	IMPLICIT NONE
1063
1064	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
1065	INTEGER, SAVE :: tag = 10
1066	#if defined( __parallel )
1067	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
1068	INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata
1069	#endif
1070
1071	REAL :: buf, buf1
1072
1073
1074	buf = 1.0
1075	buf1 = 1.1
1076	IF ( me_node == 0 ) THEN ! first node only
1077
1078	SELECT CASE ( loca )
1079
1080	CASE ( 1 ) ! before alltoall
1081
1082	IF( me_task > 0 ) THEN ! first task does not wait
1083	#if defined( __parallel )
1084	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
1085	buf1, 1, MPI_REAL, me_task-1, 0, &
1086	comm_tasks, stat, ierr )
1087	#endif
1088	ELSEIF ( j > nys_p ) THEN
1089	req = 0
1090	tag = MOD( tag-10, 10 ) + 10
1091	#if defined( __parallel )
1092	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
1093	tag, comm_tasks, req, ierr )
1094	reqa = req
1095	CALL MPI_WAITALL( 1, reqa, stata, ierr )
1096	#endif
1097	ENDIF
1098
1099	CASE ( 2 ) ! after alltoall
1100
1101	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
1102	#if defined( __parallel )
1103	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
1104	buf1, 1, MPI_REAL, me_task+1, 0, &
1105	comm_tasks, stat, ierr)
1106	#endif
1107	ELSEIF ( j < nyn_p ) THEN
1108	req = 0
1109	tag = MOD( tag-10, 10 ) + 10
1110	#if defined( __parallel )
1111	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
1112	ierr )
1113	#endif
1114	ENDIF
1115
1116	END SELECT
1117
1118	ENDIF
1119
1120	END SUBROUTINE cascade
1121	#endif
1122	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |