Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 1107

Last change on this file since 1107 was 1107, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 33.1 KB

Line
1	MODULE poisfft_hybrid_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_hybrid.f90 1107 2013-03-04 06:23:14Z raasch $
27	!
28	! 1106 2013-03-04 05:31:38Z raasch
29	! calls of fft_x, fft_y replaced by fft_x_1d, fft_y_1d
30	!
31	! 1036 2012-10-22 13:43:42Z raasch
32	! code put under GPL (PALM 3.9)
33	!
34	! 1013 2012-09-21 07:03:55Z raasch
35	! FLOAT type conversion replaced by REAL
36	!
37	! 809 2012-01-30 13:32:58Z maronga
38	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
39	!
40	! 807 2012-01-25 11:53:51Z maronga
41	! New cpp directive "__check" implemented which is used by check_namelist_files
42	! (most of the code is unneeded by check_namelist_files).
43	!
44	! 667 2010-12-23 12:06:00Z suehring/gryschka
45	! ddzu replaced by ddzu_pres due to changes in zu(0)
46	!
47	! 415 2009-12-15 10:26:23Z raasch
48	! Dimension of array stat in cascade change to prevent type problems with___
49	! mpi2 libraries
50	!
51	! 274 2009-03-26 15:11:21Z heinze
52	! Output of messages replaced by message handling routine.
53	!
54	! Feb. 2007
55	! RCS Log replace by Id keyword, revision history cleaned up
56	!
57	! Revision 1.11 2004/04/30 12:43:14 raasch
58	! Renaming of fft routines, additional argument in calls of fft_y_m
59	!
60	! Revision 1.2 2002/12/19 16:08:31 raasch
61	! Preprocessor directive KKMP introduced (OMP does NOT work),
62	! array tri will be a shared array in OpenMP loop, to get better cache
63	! utilization, the i index (x-direction) will be executed in stride
64	! "istride" as outer loop and in a shorter inner loop,
65	! overlapping of computation and communication realized by new routine
66	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
67	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
68	!
69	!
70	! Description:
71	! ------------
72	! Solution of the Poisson equation with a 2D spectral method.
73	! Hybrid version for parallel computers using a 1D domain decomposition,
74	! realized with MPI, along x and parallelization with OPEN-MP along y
75	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
76	! optimization is realized by overlapping of computation and communication
77	! and by simultaneously executing as many communication calls as switches
78	! per logical partition (LPAR) are available. This version comes into
79	! effect if more than one node is used and if the environment variable
80	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
81	! without any rest.
82	!
83	! WARNING: In case of OpenMP, there are problems with allocating large
84	! arrays in parallel regions.
85	!
86	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
87	!------------------------------------------------------------------------------!
88
89	USE fft_xy
90	USE indices
91	USE pegrid
92
93	IMPLICIT NONE
94
95	INTEGER, PARAMETER :: switch_per_lpar = 2
96
97	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
98	nxl_p, nxr_p, & ! partial x dimension
99	nys_a, nyn_a, & ! total y dimension
100	nys_p, nyn_p, & ! partial y dimension
101
102	npe_s, & ! total number of PEs for solver
103	nwords, & ! number of points to be exchanged
104	! with MPI_ALLTOALL
105	n_omp_threads ! number of OpenMP threads
106
107	!
108	!-- Variables for multi node version (cluster version) using routine
109	!-- poisfft_hybrid_nodes
110	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
111	comm_node_all, & ! communicater all PEs node version
112	comm_tasks, & ! communicater tasks
113	me, me_node, me_task,& ! identity of this PE
114	nodes, & ! number of nodes
115	tasks_per_logical_node = -1 ! default no cluster
116
117
118	PRIVATE
119
120
121	#if ! defined ( __check )
122	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
123
124
125	!
126	!-- Public interfaces
127	INTERFACE poisfft_hybrid_ini
128	MODULE PROCEDURE poisfft_hybrid_ini
129	END INTERFACE poisfft_hybrid_ini
130
131	INTERFACE poisfft_hybrid
132	MODULE PROCEDURE poisfft_hybrid
133	END INTERFACE poisfft_hybrid
134
135	!
136	!-- Private interfaces
137	INTERFACE poisfft_hybrid_omp
138	MODULE PROCEDURE poisfft_hybrid_omp
139	END INTERFACE poisfft_hybrid_omp
140
141	INTERFACE poisfft_hybrid_omp_vec
142	MODULE PROCEDURE poisfft_hybrid_omp_vec
143	END INTERFACE poisfft_hybrid_omp_vec
144
145	INTERFACE poisfft_hybrid_nodes
146	MODULE PROCEDURE poisfft_hybrid_nodes
147	END INTERFACE poisfft_hybrid_nodes
148
149	INTERFACE tridia_hybrid
150	MODULE PROCEDURE tridia_hybrid
151	END INTERFACE tridia_hybrid
152
153	INTERFACE cascade
154	MODULE PROCEDURE cascade
155	END INTERFACE cascade
156	#else
157	PUBLIC poisfft_hybrid_ini
158
159	!
160	!-- Public interfaces
161	INTERFACE poisfft_hybrid_ini
162	MODULE PROCEDURE poisfft_hybrid_ini
163	END INTERFACE poisfft_hybrid_ini
164	#endif
165
166	CONTAINS
167
168
169	SUBROUTINE poisfft_hybrid_ini
170
171	USE control_parameters
172	USE pegrid
173
174	IMPLICIT NONE
175
176	CHARACTER(LEN=8) :: cdummy
177	INTEGER :: idummy, istat
178	INTEGER, DIMENSION(2) :: coords, dims
179
180	LOGICAL, DIMENSION(2) :: period = .false., re_dims
181
182
183	!
184	!-- Set the internal index values for the hybrid solver
185	#if defined( __parallel )
186	npe_s = pdims(1)
187	#else
188	npe_s = 1
189	#endif
190	nxl_a = 0
191	nxr_a = nx
192	nxl_p = 0
193	nxr_p = ( ( nx+1 ) / npe_s ) - 1
194	nys_a = nys
195	nyn_a = nyn
196	nys_p = 0
197	nyn_p = ( ( ny+1 ) / npe_s ) - 1
198
199	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
200
201	#if defined( __KKMP ) && ! defined ( __check )
202	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
203	READ ( cdummy, '(I8)' ) n_omp_threads
204	IF ( n_omp_threads > 1 ) THEN
205	WRITE( message_string, * ) 'Number of OpenMP threads = ', &
206	n_omp_threads
207	CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
208	ENDIF
209	#else
210	n_omp_threads = 1
211	#endif
212	!
213	!-- Initialize the one-dimensional FFT routines
214	CALL fft_init
215
216	!
217	!-- Setup for multi node version (poisfft_hybrid_nodes)
218	IF ( n_omp_threads == 1 .AND. &
219	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
220
221	IF ( tasks_per_node /= -9999 ) THEN
222	!
223	!-- Multi node version requires that the available number of
224	!-- switches per logical partition must be an integral divisor
225	!-- of the chosen number of tasks per node
226	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
227	!
228	!-- Set the switch which decides about usage of the multi node
229	!-- version
230	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
231	numprocs > tasks_per_node ) THEN
232	tasks_per_logical_node = tasks_per_node / switch_per_lpar
233	ENDIF
234
235	IF ( tasks_per_logical_node > -1 ) THEN
236
237	WRITE( message_string, * ) 'running optimized ', &
238	'multinode version', &
239	'&switch_per_lpar = ', &
240	switch_per_lpar, &
241	'&tasks_per_lpar = ', &
242	tasks_per_node, &
243	'tasks_per_logical_node = ', &
244	tasks_per_logical_node
245	CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )
246
247	ENDIF
248
249	ENDIF
250	ENDIF
251	ENDIF
252
253	!
254	!-- Determine sub-topologies for multi node version
255	IF ( tasks_per_logical_node >= 2 ) THEN
256
257	#if defined( __parallel ) && ! defined ( __check )
258	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
259	tasks_per_logical_node
260	dims(1) = nodes
261	dims(2) = tasks_per_logical_node
262
263	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
264	comm_node_all, istat )
265	CALL MPI_COMM_RANK( comm_node_all, me, istat )
266
267	re_dims(1) = .TRUE.
268	re_dims(2) = .FALSE.
269	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
270	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
271
272	re_dims(1) = .FALSE.
273	re_dims(2) = .TRUE.
274	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
275	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
276
277	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
278	! tasks_per_logical_node
279	#elif ! defined( __parallel )
280	message_string = 'parallel environment (MPI) required'
281	CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
282	#endif
283	ENDIF
284
285	END SUBROUTINE poisfft_hybrid_ini
286
287	#if ! defined ( __check )
288	SUBROUTINE poisfft_hybrid( ar )
289
290	USE control_parameters
291	USE interfaces
292
293	IMPLICIT NONE
294
295	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
296
297	IF ( host(1:3) == 'nec' ) THEN
298	CALL poisfft_hybrid_omp_vec( ar )
299	ELSE
300	IF ( tasks_per_logical_node == -1 ) THEN
301	CALL poisfft_hybrid_omp( ar )
302	ELSE
303	CALL poisfft_hybrid_nodes( ar )
304	ENDIF
305	ENDIF
306
307	END SUBROUTINE poisfft_hybrid
308
309
310	SUBROUTINE poisfft_hybrid_omp ( ar )
311
312	USE cpulog
313	USE interfaces
314
315	IMPLICIT NONE
316
317	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
318	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
319
320	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
321
322	REAL, DIMENSION(0:nx) :: fftx_ar
323	REAL, DIMENSION(0:ny,istride) :: ffty_ar
324
325	REAL, DIMENSION(0:nx,nz) :: tri_ar
326
327	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
328	#if defined( __KKMP )
329	INTEGER :: omp_get_thread_num
330	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
331	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
332	#else
333	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
334	#endif
335
336
337	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
338
339	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
340
341	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
342	!$OMP DO
343	!
344	!-- Store grid points to be transformed on a 1d-array, do the fft
345	!-- and sample the results on a 4d-array
346	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
347	iei = MIN( iouter+istride-1, nxr_p )
348	DO k = 1, nz
349
350	DO i = iouter, iei
351	ii = nxl + i
352	ir = i - iouter + 1
353
354	DO j = nys_a, nyn_a
355	ffty_ar(j,ir) = ar(k,j,ii)
356	ENDDO
357
358	CALL fft_y_1d( ffty_ar(:,ir), 'forward' )
359	ENDDO
360
361	m = nys_a
362	DO n = 1, npe_s
363	DO j = nys_p, nyn_p
364	DO i = iouter, iei
365	ir = i - iouter + 1
366	work1(i,k,j,n) = ffty_ar(m,ir)
367	ENDDO
368	m = m+1
369	ENDDO
370	ENDDO
371
372	ENDDO
373	ENDDO
374	!$OMP END PARALLEL
375
376	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
377
378	#if defined( __parallel )
379	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
380
381	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
382	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
383	comm2d, istat )
384
385	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
386	#else
387	work2 = work1
388	#endif
389
390	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
391
392	#if defined( __KKMP )
393	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
394	!$OMP DO
395	DO j = nys_p, nyn_p
396	jthread = omp_get_thread_num() + 1
397	#else
398	DO j = nys_p, nyn_p
399	jthread = 1
400	#endif
401	DO k = 1, nz
402
403	m = nxl_a
404	DO n = 1, npe_s
405	DO i = nxl_p, nxr_p
406	fftx_ar(m) = work2(i,k,j,n)
407	m = m+1
408	ENDDO
409	ENDDO
410
411	CALL fft_x_1d( fftx_ar, 'forward' )
412
413	DO i = nxl_a, nxr_a
414	tri_ar(i,k) = fftx_ar(i)
415	ENDDO
416
417	ENDDO
418
419	jj = myid * (nyn_p-nys_p+1) + j
420	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
421
422	DO k = 1, nz
423	DO i = nxl_a, nxr_a
424	fftx_ar(i) = tri_ar (i,k)
425	ENDDO
426
427	CALL fft_x_1d( fftx_ar, 'backward' )
428
429	m = nxl_a
430	DO n = 1, npe_s
431	DO i = nxl_p, nxr_p
432	work2(i,k,j,n) = fftx_ar(m)
433	m = m+1
434	ENDDO
435	ENDDO
436
437	ENDDO
438	ENDDO
439	#if defined( __KKMP )
440	!$OMP END PARALLEL
441	#endif
442
443	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
444
445	#if defined( __parallel )
446	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
447	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
448
449	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
450	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
451	comm2d, istat )
452
453	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
454	#else
455	work1 = work2
456	#endif
457
458	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
459
460	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
461	!$OMP DO
462	DO iouter = nxl_p, nxr_p, istride
463	iei = MIN( iouter+istride-1, nxr_p )
464	DO k = 1, nz
465
466	m = nys_a
467	DO n = 1, npe_s
468	DO j = nys_p, nyn_p
469	DO i = iouter, iei
470	ir = i - iouter + 1
471	ffty_ar(m,ir) = work1 (i,k,j,n)
472	ENDDO
473	m = m+1
474	ENDDO
475	ENDDO
476
477	DO i = iouter, iei
478	ii = nxl + i
479	ir = i - iouter + 1
480	CALL fft_y_1d( ffty_ar(:,ir), 'backward' )
481
482	DO j = nys_a, nyn_a
483	ar(k,j,ii) = ffty_ar(j,ir)
484	ENDDO
485	ENDDO
486
487	ENDDO
488	ENDDO
489	!$OMP END PARALLEL
490
491	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
492
493	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
494
495	#if defined( __KKMP )
496	DEALLOCATE( tri )
497	#endif
498
499	END SUBROUTINE poisfft_hybrid_omp
500
501
502	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
503
504	USE cpulog
505	USE interfaces
506
507	IMPLICIT NONE
508
509	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
510	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
511
512	REAL, DIMENSION(0:nx,nz) :: tri_ar
513
514	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
515
516	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
517	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
518
519	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
520	#if defined( __KKMP )
521	INTEGER :: omp_get_thread_num
522	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
523	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
524	#else
525	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
526	#endif
527
528
529	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
530
531	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
532
533	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
534	!$OMP DO
535	!
536	!-- Store grid points to be transformed on a 1d-array, do the fft
537	!-- and sample the results on a 4d-array
538	DO i = nxl_p, nxr_p
539
540	DO j = nys_a, nyn_a
541	DO k = 1, nz
542	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
543	ENDDO
544	ENDDO
545
546	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
547	ENDDO
548
549	!$OMP DO
550	DO k = 1, nz
551	m = nys_a
552	DO n = 1, npe_s
553	DO j = nys_p, nyn_p
554	DO i = nxl_p, nxr_p
555	work1(i,k,j,n) = ffty_ar3(m,k,i)
556	ENDDO
557	m = m+1
558	ENDDO
559	ENDDO
560	ENDDO
561	!$OMP END PARALLEL
562
563	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
564
565	#if defined( __parallel )
566	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
567	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
568	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
569	comm2d, istat )
570	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
571	#else
572	work2 = work1
573	#endif
574
575	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
576
577	#if defined( __KKMP )
578	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
579	!$OMP DO
580	DO j = nys_p, nyn_p
581	jthread = omp_get_thread_num() + 1
582	#else
583	DO j = nys_p, nyn_p
584	jthread = 1
585	#endif
586	DO k = 1, nz
587
588	m = nxl_a
589	DO n = 1, npe_s
590	DO i = nxl_p, nxr_p
591	fftx_ar3(m,k,j) = work2(i,k,j,n)
592	m = m+1
593	ENDDO
594	ENDDO
595	ENDDO
596
597	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
598
599	DO k = 1, nz
600	DO i = nxl_a, nxr_a
601	tri_ar(i,k) = fftx_ar3(i,k,j)
602	ENDDO
603	ENDDO
604
605	jj = myid * (nyn_p-nys_p+1) + j
606	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
607
608	DO k = 1, nz
609	DO i = nxl_a, nxr_a
610	fftx_ar3(i,k,j) = tri_ar (i,k)
611	ENDDO
612	ENDDO
613
614	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
615
616	DO k = 1, nz
617	m = nxl_a
618	DO n = 1, npe_s
619	DO i = nxl_p, nxr_p
620	work2(i,k,j,n) = fftx_ar3(m,k,j)
621	m = m+1
622	ENDDO
623	ENDDO
624	ENDDO
625
626	ENDDO
627	#if defined( __KKMP )
628	!$OMP END PARALLEL
629	#endif
630
631	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
632
633	#if defined( __parallel )
634	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
635	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
636	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
637	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
638	comm2d, istat )
639	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
640	#else
641	work1 = work2
642	#endif
643
644	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
645
646	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
647	!$OMP DO
648	DO k = 1, nz
649	m = nys_a
650	DO n = 1, npe_s
651	DO j = nys_p, nyn_p
652	DO i = nxl_p, nxr_p
653	ffty_ar3(m,k,i) = work1(i,k,j,n)
654	ENDDO
655	m = m+1
656	ENDDO
657	ENDDO
658	ENDDO
659
660	!$OMP DO
661	DO i = nxl_p, nxr_p
662	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
663	DO j = nys_a, nyn_a
664	DO k = 1, nz
665	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
666	ENDDO
667	ENDDO
668	ENDDO
669	!$OMP END PARALLEL
670
671	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
672
673	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
674
675	#if defined( __KKMP )
676	DEALLOCATE( tri )
677	#endif
678
679	END SUBROUTINE poisfft_hybrid_omp_vec
680
681
682	SUBROUTINE poisfft_hybrid_nodes ( ar )
683
684	USE cpulog
685	USE interfaces
686
687	IMPLICIT NONE
688
689	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
690	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
691	n, nn, nt, nw1, nw2
692
693	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
694
695	REAL, DIMENSION(0:nx) :: fftx_ar
696	REAL, DIMENSION(0:ny,istride) :: ffty_ar
697
698	REAL, DIMENSION(0:nx,nz) :: tri_ar
699
700	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
701	nodes,nys_p:nyn_p) :: work1,work2
702	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
703
704
705	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
706
707	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
708
709	!
710	!-- Store grid points to be transformed on a 1d-array, do the fft
711	!-- and sample the results on a 4d-array
712	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
713	iei = MIN( iouter+istride-1, nxr_p )
714	DO k = 1, nz
715
716	DO i = iouter, iei
717	ii = nxl + i
718	ir = i - iouter + 1
719
720	DO j = nys_a, nyn_a
721	ffty_ar(j,ir) = ar(k,j,ii)
722	ENDDO
723
724	CALL fft_y_1d( ffty_ar(:,ir), 'forward' )
725	ENDDO
726
727	m = nys_a
728	DO nn = 1, nodes
729	DO nt = 1, tasks_per_logical_node
730	DO j = nys_p, nyn_p
731	DO i = iouter, iei
732	ir = i - iouter + 1
733	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
734	ENDDO
735	m = m+1
736	ENDDO
737	ENDDO
738	ENDDO
739
740	ENDDO
741	ENDDO
742
743	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
744
745	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
746	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
747	DO nn = 1, nodes
748	DO j = nys_p, nyn_p
749	#if defined( __parallel )
750	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
751	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
752	comm_tasks, istat )
753	#endif
754	ENDDO
755	ENDDO
756	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
757
758
759	DO j = nys_p, nyn_p
760
761	CALL cascade( 1, j, nys_p, nyn_p )
762	nw2 = nw1 * SIZE( work1, 3 )
763	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
764	#if defined( __parallel )
765	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
766	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
767	comm_nodes, istat )
768	#endif
769	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
770	CALL cascade( 2, j, nys_p, nyn_p )
771
772	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
773	DO k = 1, nz
774
775	m = nxl_a
776	DO nn = 1, nodes
777	DO nt = 1, tasks_per_logical_node
778	DO i = nxl_p, nxr_p
779	fftx_ar(m) = work1(i,k,nt,nn,j)
780	m = m+1
781	ENDDO
782	ENDDO
783	ENDDO
784
785	CALL fft_x_1d( fftx_ar, 'forward' )
786
787	DO i = nxl_a, nxr_a
788	tri_ar(i,k) = fftx_ar(i)
789	ENDDO
790
791	ENDDO
792
793	jj = myid * (nyn_p-nys_p+1) + j
794	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
795
796	DO k = 1, nz
797	DO i = nxl_a, nxr_a
798	fftx_ar(i) = tri_ar(i,k)
799	ENDDO
800
801	CALL fft_x_1d( fftx_ar, 'backward' )
802
803	m = nxl_a
804	DO nn = 1, nodes
805	DO nt = 1, tasks_per_logical_node
806	DO i = nxl_p, nxr_p
807	work1(i,k,nt,nn,j) = fftx_ar(m)
808	m = m+1
809	ENDDO
810	ENDDO
811	ENDDO
812	ENDDO
813
814	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
815	nw2 = nw1 * SIZE( work1, 3 )
816	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
817	#if defined( __parallel )
818	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
819	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
820	comm_nodes, istat )
821	#endif
822	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
823
824	ENDDO
825
826	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
827	DO nn = 1, nodes
828	DO j = nys_p, nyn_p
829	#if defined( __parallel )
830	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
831	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
832	comm_tasks, istat )
833	#endif
834	ENDDO
835	ENDDO
836	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
837
838	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
839
840	DO iouter = nxl_p, nxr_p, istride
841	iei = MIN( iouter+istride-1, nxr_p )
842	DO k = 1, nz
843
844	m = nys_a
845	DO nn = 1, nodes
846	DO nt = 1, tasks_per_logical_node
847	DO j = nys_p, nyn_p
848	DO i = iouter, iei
849	ir = i - iouter + 1
850	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
851	ENDDO
852	m = m+1
853	ENDDO
854	ENDDO
855	ENDDO
856
857	DO i = iouter, iei
858	ii = nxl + i
859	ir = i - iouter + 1
860	CALL fft_y_1d( ffty_ar(:,ir), 'backward' )
861
862	DO j = nys_a, nyn_a
863	ar(k,j,ii) = ffty_ar(j,ir)
864	ENDDO
865	ENDDO
866
867	ENDDO
868	ENDDO
869
870	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
871
872	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
873
874	END SUBROUTINE poisfft_hybrid_nodes
875
876
877
878	SUBROUTINE tridia_hybrid( j, ar, tri )
879
880	USE arrays_3d
881	USE control_parameters
882	USE grid_variables
883
884	IMPLICIT NONE
885
886	INTEGER :: i, j, k, nnyh
887	REAL, DIMENSION(0:nx,nz) :: ar
888	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
889	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
890
891	nnyh = (ny+1) / 2
892
893	tri = 0.0
894	!
895	!-- Define constant elements of the tridiagonal matrix.
896	DO k = 0, nz-1
897	DO i = 0,nx
898	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
899	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
900	ENDDO
901	ENDDO
902
903	IF ( j <= nnyh ) THEN
904	CALL maketri_hybrid( j )
905	ELSE
906	CALL maketri_hybrid( ny+1-j)
907	ENDIF
908	CALL zerleg_hybrid
909	CALL substi_hybrid( ar, tri )
910
911	CONTAINS
912
913	SUBROUTINE maketri_hybrid( j )
914
915	!----------------------------------------------------------------------!
916	! maketri !
917	! !
918	! computes the i- and j-dependent component of the matrix !
919	!----------------------------------------------------------------------!
920
921	USE constants
922
923	IMPLICIT NONE
924
925	INTEGER :: i, j, k, nnxh
926	REAL :: a, c
927
928	REAL, DIMENSION(0:nx) :: l
929
930
931	nnxh = (nx+1) / 2
932	!
933	!-- Provide the tridiagonal matrix for solution of the Poisson equation
934	!-- in Fourier space. The coefficients are computed following the method
935	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
936	!-- Siano's original version.
937	DO i = 0,nx
938	IF ( i >= 0 .AND. i < nnxh ) THEN
939	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
940	REAL( nx+1 ) ) ) / ( dx * dx ) + &
941	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
942	REAL( ny+1 ) ) ) / ( dy * dy )
943	ELSEIF ( i == nnxh ) THEN
944	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
945	REAL( nx+1 ) ) ) / ( dx * dx ) + &
946	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
947	REAL(ny+1) ) ) / ( dy * dy )
948	ELSE
949	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
950	REAL( nx+1 ) ) ) / ( dx * dx ) + &
951	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
952	REAL( ny+1 ) ) ) / ( dy * dy )
953	ENDIF
954	ENDDO
955
956	DO k = 0,nz-1
957	DO i = 0, nx
958	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
959	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
960	tri(1,i,k) = a + c - l(i)
961	ENDDO
962	ENDDO
963	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
964	DO i = 0,nx
965	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
966	ENDDO
967	ENDIF
968	IF ( ibc_p_t == 1 ) THEN
969	DO i = 0,nx
970	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
971	ENDDO
972	ENDIF
973
974	END SUBROUTINE maketri_hybrid
975
976
977	SUBROUTINE zerleg_hybrid
978
979	!----------------------------------------------------------------------!
980	! zerleg !
981	! !
982	! Splitting of the tridiagonal matrix (Thomas algorithm) !
983	!----------------------------------------------------------------------!
984
985	USE indices
986
987	IMPLICIT NONE
988
989	INTEGER :: i, k
990
991	!
992	!-- Splitting
993	DO i = 0, nx
994	tri(4,i,0) = tri(1,i,0)
995	ENDDO
996	DO k = 1, nz-1
997	DO i = 0,nx
998	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
999	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1000	ENDDO
1001	ENDDO
1002
1003	END SUBROUTINE zerleg_hybrid
1004
1005	SUBROUTINE substi_hybrid( ar, tri )
1006
1007	!----------------------------------------------------------------------!
1008	! substi !
1009	! !
1010	! Substitution (Forward and Backward) (Thomas algorithm) !
1011	!----------------------------------------------------------------------!
1012
1013	IMPLICIT NONE
1014
1015	INTEGER :: i, j, k
1016	REAL, DIMENSION(0:nx,nz) :: ar
1017	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1018	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1019
1020	!
1021	!-- Forward substitution
1022	DO i = 0, nx
1023	ar1(i,0) = ar(i,1)
1024	ENDDO
1025	DO k = 1, nz - 1
1026	DO i = 0,nx
1027	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1028	ENDDO
1029	ENDDO
1030
1031	!
1032	!-- Backward substitution
1033	DO i = 0,nx
1034	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
1035	ENDDO
1036	DO k = nz-2, 0, -1
1037	DO i = 0,nx
1038	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1039	/ tri(4,i,k)
1040	ENDDO
1041	ENDDO
1042
1043	END SUBROUTINE substi_hybrid
1044
1045	END SUBROUTINE tridia_hybrid
1046
1047
1048	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
1049
1050	USE cpulog
1051
1052	IMPLICIT NONE
1053
1054	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
1055	INTEGER, SAVE :: tag = 10
1056	#if defined( __parallel )
1057	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
1058	INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata
1059	#endif
1060
1061	REAL :: buf, buf1
1062
1063
1064	buf = 1.0
1065	buf1 = 1.1
1066	IF ( me_node == 0 ) THEN ! first node only
1067
1068	SELECT CASE ( loca )
1069
1070	CASE ( 1 ) ! before alltoall
1071
1072	IF( me_task > 0 ) THEN ! first task does not wait
1073	#if defined( __parallel )
1074	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
1075	buf1, 1, MPI_REAL, me_task-1, 0, &
1076	comm_tasks, stat, ierr )
1077	#endif
1078	ELSEIF ( j > nys_p ) THEN
1079	req = 0
1080	tag = MOD( tag-10, 10 ) + 10
1081	#if defined( __parallel )
1082	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
1083	tag, comm_tasks, req, ierr )
1084	reqa = req
1085	CALL MPI_WAITALL( 1, reqa, stata, ierr )
1086	#endif
1087	ENDIF
1088
1089	CASE ( 2 ) ! after alltoall
1090
1091	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
1092	#if defined( __parallel )
1093	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
1094	buf1, 1, MPI_REAL, me_task+1, 0, &
1095	comm_tasks, stat, ierr)
1096	#endif
1097	ELSEIF ( j < nyn_p ) THEN
1098	req = 0
1099	tag = MOD( tag-10, 10 ) + 10
1100	#if defined( __parallel )
1101	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
1102	ierr )
1103	#endif
1104	ENDIF
1105
1106	END SELECT
1107
1108	ENDIF
1109
1110	END SUBROUTINE cascade
1111	#endif
1112	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |