Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 1096

Last change on this file since 1096 was 1037, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 33.0 KB

Line
1	MODULE poisfft_hybrid_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_hybrid.f90 1037 2012-10-22 14:10:22Z raasch $
27	!
28	! 1036 2012-10-22 13:43:42Z raasch
29	! code put under GPL (PALM 3.9)
30	!
31	! 1013 2012-09-21 07:03:55Z raasch
32	! FLOAT type conversion replaced by REAL
33	!
34	! 809 2012-01-30 13:32:58Z maronga
35	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
36	!
37	! 807 2012-01-25 11:53:51Z maronga
38	! New cpp directive "__check" implemented which is used by check_namelist_files
39	! (most of the code is unneeded by check_namelist_files).
40	!
41	! 667 2010-12-23 12:06:00Z suehring/gryschka
42	! ddzu replaced by ddzu_pres due to changes in zu(0)
43	!
44	! 415 2009-12-15 10:26:23Z raasch
45	! Dimension of array stat in cascade change to prevent type problems with___
46	! mpi2 libraries
47	!
48	! 274 2009-03-26 15:11:21Z heinze
49	! Output of messages replaced by message handling routine.
50	!
51	! Feb. 2007
52	! RCS Log replace by Id keyword, revision history cleaned up
53	!
54	! Revision 1.11 2004/04/30 12:43:14 raasch
55	! Renaming of fft routines, additional argument in calls of fft_y_m
56	!
57	! Revision 1.2 2002/12/19 16:08:31 raasch
58	! Preprocessor directive KKMP introduced (OMP does NOT work),
59	! array tri will be a shared array in OpenMP loop, to get better cache
60	! utilization, the i index (x-direction) will be executed in stride
61	! "istride" as outer loop and in a shorter inner loop,
62	! overlapping of computation and communication realized by new routine
63	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
64	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
65	!
66	!
67	! Description:
68	! ------------
69	! Solution of the Poisson equation with a 2D spectral method.
70	! Hybrid version for parallel computers using a 1D domain decomposition,
71	! realized with MPI, along x and parallelization with OPEN-MP along y
72	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
73	! optimization is realized by overlapping of computation and communication
74	! and by simultaneously executing as many communication calls as switches
75	! per logical partition (LPAR) are available. This version comes into
76	! effect if more than one node is used and if the environment variable
77	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
78	! without any rest.
79	!
80	! WARNING: In case of OpenMP, there are problems with allocating large
81	! arrays in parallel regions.
82	!
83	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
84	!------------------------------------------------------------------------------!
85
86	USE fft_xy
87	USE indices
88	USE pegrid
89
90	IMPLICIT NONE
91
92	INTEGER, PARAMETER :: switch_per_lpar = 2
93
94	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
95	nxl_p, nxr_p, & ! partial x dimension
96	nys_a, nyn_a, & ! total y dimension
97	nys_p, nyn_p, & ! partial y dimension
98
99	npe_s, & ! total number of PEs for solver
100	nwords, & ! number of points to be exchanged
101	! with MPI_ALLTOALL
102	n_omp_threads ! number of OpenMP threads
103
104	!
105	!-- Variables for multi node version (cluster version) using routine
106	!-- poisfft_hybrid_nodes
107	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
108	comm_node_all, & ! communicater all PEs node version
109	comm_tasks, & ! communicater tasks
110	me, me_node, me_task,& ! identity of this PE
111	nodes, & ! number of nodes
112	tasks_per_logical_node = -1 ! default no cluster
113
114
115	PRIVATE
116
117
118	#if ! defined ( __check )
119	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
120
121
122	!
123	!-- Public interfaces
124	INTERFACE poisfft_hybrid_ini
125	MODULE PROCEDURE poisfft_hybrid_ini
126	END INTERFACE poisfft_hybrid_ini
127
128	INTERFACE poisfft_hybrid
129	MODULE PROCEDURE poisfft_hybrid
130	END INTERFACE poisfft_hybrid
131
132	!
133	!-- Private interfaces
134	INTERFACE poisfft_hybrid_omp
135	MODULE PROCEDURE poisfft_hybrid_omp
136	END INTERFACE poisfft_hybrid_omp
137
138	INTERFACE poisfft_hybrid_omp_vec
139	MODULE PROCEDURE poisfft_hybrid_omp_vec
140	END INTERFACE poisfft_hybrid_omp_vec
141
142	INTERFACE poisfft_hybrid_nodes
143	MODULE PROCEDURE poisfft_hybrid_nodes
144	END INTERFACE poisfft_hybrid_nodes
145
146	INTERFACE tridia_hybrid
147	MODULE PROCEDURE tridia_hybrid
148	END INTERFACE tridia_hybrid
149
150	INTERFACE cascade
151	MODULE PROCEDURE cascade
152	END INTERFACE cascade
153	#else
154	PUBLIC poisfft_hybrid_ini
155
156	!
157	!-- Public interfaces
158	INTERFACE poisfft_hybrid_ini
159	MODULE PROCEDURE poisfft_hybrid_ini
160	END INTERFACE poisfft_hybrid_ini
161	#endif
162
163	CONTAINS
164
165
166	SUBROUTINE poisfft_hybrid_ini
167
168	USE control_parameters
169	USE pegrid
170
171	IMPLICIT NONE
172
173	CHARACTER(LEN=8) :: cdummy
174	INTEGER :: idummy, istat
175	INTEGER, DIMENSION(2) :: coords, dims
176
177	LOGICAL, DIMENSION(2) :: period = .false., re_dims
178
179
180	!
181	!-- Set the internal index values for the hybrid solver
182	#if defined( __parallel )
183	npe_s = pdims(1)
184	#else
185	npe_s = 1
186	#endif
187	nxl_a = 0
188	nxr_a = nx
189	nxl_p = 0
190	nxr_p = ( ( nx+1 ) / npe_s ) - 1
191	nys_a = nys
192	nyn_a = nyn
193	nys_p = 0
194	nyn_p = ( ( ny+1 ) / npe_s ) - 1
195
196	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
197
198	#if defined( __KKMP ) && ! defined ( __check )
199	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
200	READ ( cdummy, '(I8)' ) n_omp_threads
201	IF ( n_omp_threads > 1 ) THEN
202	WRITE( message_string, * ) 'Number of OpenMP threads = ', &
203	n_omp_threads
204	CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
205	ENDIF
206	#else
207	n_omp_threads = 1
208	#endif
209	!
210	!-- Initialize the one-dimensional FFT routines
211	CALL fft_init
212
213	!
214	!-- Setup for multi node version (poisfft_hybrid_nodes)
215	IF ( n_omp_threads == 1 .AND. &
216	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
217
218	IF ( tasks_per_node /= -9999 ) THEN
219	!
220	!-- Multi node version requires that the available number of
221	!-- switches per logical partition must be an integral divisor
222	!-- of the chosen number of tasks per node
223	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
224	!
225	!-- Set the switch which decides about usage of the multi node
226	!-- version
227	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
228	numprocs > tasks_per_node ) THEN
229	tasks_per_logical_node = tasks_per_node / switch_per_lpar
230	ENDIF
231
232	IF ( tasks_per_logical_node > -1 ) THEN
233
234	WRITE( message_string, * ) 'running optimized ', &
235	'multinode version', &
236	'&switch_per_lpar = ', &
237	switch_per_lpar, &
238	'&tasks_per_lpar = ', &
239	tasks_per_node, &
240	'tasks_per_logical_node = ', &
241	tasks_per_logical_node
242	CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )
243
244	ENDIF
245
246	ENDIF
247	ENDIF
248	ENDIF
249
250	!
251	!-- Determine sub-topologies for multi node version
252	IF ( tasks_per_logical_node >= 2 ) THEN
253
254	#if defined( __parallel ) && ! defined ( __check )
255	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
256	tasks_per_logical_node
257	dims(1) = nodes
258	dims(2) = tasks_per_logical_node
259
260	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
261	comm_node_all, istat )
262	CALL MPI_COMM_RANK( comm_node_all, me, istat )
263
264	re_dims(1) = .TRUE.
265	re_dims(2) = .FALSE.
266	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
267	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
268
269	re_dims(1) = .FALSE.
270	re_dims(2) = .TRUE.
271	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
272	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
273
274	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
275	! tasks_per_logical_node
276	#elif ! defined( __parallel )
277	message_string = 'parallel environment (MPI) required'
278	CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
279	#endif
280	ENDIF
281
282	END SUBROUTINE poisfft_hybrid_ini
283
284	#if ! defined ( __check )
285	SUBROUTINE poisfft_hybrid( ar )
286
287	USE control_parameters
288	USE interfaces
289
290	IMPLICIT NONE
291
292	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
293
294	IF ( host(1:3) == 'nec' ) THEN
295	CALL poisfft_hybrid_omp_vec( ar )
296	ELSE
297	IF ( tasks_per_logical_node == -1 ) THEN
298	CALL poisfft_hybrid_omp( ar )
299	ELSE
300	CALL poisfft_hybrid_nodes( ar )
301	ENDIF
302	ENDIF
303
304	END SUBROUTINE poisfft_hybrid
305
306
307	SUBROUTINE poisfft_hybrid_omp ( ar )
308
309	USE cpulog
310	USE interfaces
311
312	IMPLICIT NONE
313
314	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
315	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
316
317	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
318
319	REAL, DIMENSION(0:nx) :: fftx_ar
320	REAL, DIMENSION(0:ny,istride) :: ffty_ar
321
322	REAL, DIMENSION(0:nx,nz) :: tri_ar
323
324	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
325	#if defined( __KKMP )
326	INTEGER :: omp_get_thread_num
327	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
328	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
329	#else
330	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
331	#endif
332
333
334	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
335
336	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
337
338	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
339	!$OMP DO
340	!
341	!-- Store grid points to be transformed on a 1d-array, do the fft
342	!-- and sample the results on a 4d-array
343	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
344	iei = MIN( iouter+istride-1, nxr_p )
345	DO k = 1, nz
346
347	DO i = iouter, iei
348	ii = nxl + i
349	ir = i - iouter + 1
350
351	DO j = nys_a, nyn_a
352	ffty_ar(j,ir) = ar(k,j,ii)
353	ENDDO
354
355	CALL fft_y( ffty_ar(:,ir), 'forward' )
356	ENDDO
357
358	m = nys_a
359	DO n = 1, npe_s
360	DO j = nys_p, nyn_p
361	DO i = iouter, iei
362	ir = i - iouter + 1
363	work1(i,k,j,n) = ffty_ar(m,ir)
364	ENDDO
365	m = m+1
366	ENDDO
367	ENDDO
368
369	ENDDO
370	ENDDO
371	!$OMP END PARALLEL
372
373	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
374
375	#if defined( __parallel )
376	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
377
378	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
379	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
380	comm2d, istat )
381
382	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
383	#else
384	work2 = work1
385	#endif
386
387	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
388
389	#if defined( __KKMP )
390	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
391	!$OMP DO
392	DO j = nys_p, nyn_p
393	jthread = omp_get_thread_num() + 1
394	#else
395	DO j = nys_p, nyn_p
396	jthread = 1
397	#endif
398	DO k = 1, nz
399
400	m = nxl_a
401	DO n = 1, npe_s
402	DO i = nxl_p, nxr_p
403	fftx_ar(m) = work2(i,k,j,n)
404	m = m+1
405	ENDDO
406	ENDDO
407
408	CALL fft_x( fftx_ar, 'forward' )
409
410	DO i = nxl_a, nxr_a
411	tri_ar(i,k) = fftx_ar(i)
412	ENDDO
413
414	ENDDO
415
416	jj = myid * (nyn_p-nys_p+1) + j
417	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
418
419	DO k = 1, nz
420	DO i = nxl_a, nxr_a
421	fftx_ar(i) = tri_ar (i,k)
422	ENDDO
423
424	CALL fft_x( fftx_ar, 'backward' )
425
426	m = nxl_a
427	DO n = 1, npe_s
428	DO i = nxl_p, nxr_p
429	work2(i,k,j,n) = fftx_ar(m)
430	m = m+1
431	ENDDO
432	ENDDO
433
434	ENDDO
435	ENDDO
436	#if defined( __KKMP )
437	!$OMP END PARALLEL
438	#endif
439
440	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
441
442	#if defined( __parallel )
443	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
444	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
445
446	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
447	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
448	comm2d, istat )
449
450	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
451	#else
452	work1 = work2
453	#endif
454
455	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
456
457	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
458	!$OMP DO
459	DO iouter = nxl_p, nxr_p, istride
460	iei = MIN( iouter+istride-1, nxr_p )
461	DO k = 1, nz
462
463	m = nys_a
464	DO n = 1, npe_s
465	DO j = nys_p, nyn_p
466	DO i = iouter, iei
467	ir = i - iouter + 1
468	ffty_ar(m,ir) = work1 (i,k,j,n)
469	ENDDO
470	m = m+1
471	ENDDO
472	ENDDO
473
474	DO i = iouter, iei
475	ii = nxl + i
476	ir = i - iouter + 1
477	CALL fft_y( ffty_ar(:,ir), 'backward' )
478
479	DO j = nys_a, nyn_a
480	ar(k,j,ii) = ffty_ar(j,ir)
481	ENDDO
482	ENDDO
483
484	ENDDO
485	ENDDO
486	!$OMP END PARALLEL
487
488	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
489
490	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
491
492	#if defined( __KKMP )
493	DEALLOCATE( tri )
494	#endif
495
496	END SUBROUTINE poisfft_hybrid_omp
497
498
499	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
500
501	USE cpulog
502	USE interfaces
503
504	IMPLICIT NONE
505
506	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
507	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
508
509	REAL, DIMENSION(0:nx,nz) :: tri_ar
510
511	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
512
513	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
514	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
515
516	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
517	#if defined( __KKMP )
518	INTEGER :: omp_get_thread_num
519	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
520	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
521	#else
522	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
523	#endif
524
525
526	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
527
528	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
529
530	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
531	!$OMP DO
532	!
533	!-- Store grid points to be transformed on a 1d-array, do the fft
534	!-- and sample the results on a 4d-array
535	DO i = nxl_p, nxr_p
536
537	DO j = nys_a, nyn_a
538	DO k = 1, nz
539	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
540	ENDDO
541	ENDDO
542
543	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
544	ENDDO
545
546	!$OMP DO
547	DO k = 1, nz
548	m = nys_a
549	DO n = 1, npe_s
550	DO j = nys_p, nyn_p
551	DO i = nxl_p, nxr_p
552	work1(i,k,j,n) = ffty_ar3(m,k,i)
553	ENDDO
554	m = m+1
555	ENDDO
556	ENDDO
557	ENDDO
558	!$OMP END PARALLEL
559
560	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
561
562	#if defined( __parallel )
563	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
564	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
565	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
566	comm2d, istat )
567	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
568	#else
569	work2 = work1
570	#endif
571
572	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
573
574	#if defined( __KKMP )
575	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
576	!$OMP DO
577	DO j = nys_p, nyn_p
578	jthread = omp_get_thread_num() + 1
579	#else
580	DO j = nys_p, nyn_p
581	jthread = 1
582	#endif
583	DO k = 1, nz
584
585	m = nxl_a
586	DO n = 1, npe_s
587	DO i = nxl_p, nxr_p
588	fftx_ar3(m,k,j) = work2(i,k,j,n)
589	m = m+1
590	ENDDO
591	ENDDO
592	ENDDO
593
594	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
595
596	DO k = 1, nz
597	DO i = nxl_a, nxr_a
598	tri_ar(i,k) = fftx_ar3(i,k,j)
599	ENDDO
600	ENDDO
601
602	jj = myid * (nyn_p-nys_p+1) + j
603	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
604
605	DO k = 1, nz
606	DO i = nxl_a, nxr_a
607	fftx_ar3(i,k,j) = tri_ar (i,k)
608	ENDDO
609	ENDDO
610
611	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
612
613	DO k = 1, nz
614	m = nxl_a
615	DO n = 1, npe_s
616	DO i = nxl_p, nxr_p
617	work2(i,k,j,n) = fftx_ar3(m,k,j)
618	m = m+1
619	ENDDO
620	ENDDO
621	ENDDO
622
623	ENDDO
624	#if defined( __KKMP )
625	!$OMP END PARALLEL
626	#endif
627
628	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
629
630	#if defined( __parallel )
631	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
632	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
633	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
634	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
635	comm2d, istat )
636	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
637	#else
638	work1 = work2
639	#endif
640
641	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
642
643	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
644	!$OMP DO
645	DO k = 1, nz
646	m = nys_a
647	DO n = 1, npe_s
648	DO j = nys_p, nyn_p
649	DO i = nxl_p, nxr_p
650	ffty_ar3(m,k,i) = work1(i,k,j,n)
651	ENDDO
652	m = m+1
653	ENDDO
654	ENDDO
655	ENDDO
656
657	!$OMP DO
658	DO i = nxl_p, nxr_p
659	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
660	DO j = nys_a, nyn_a
661	DO k = 1, nz
662	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
663	ENDDO
664	ENDDO
665	ENDDO
666	!$OMP END PARALLEL
667
668	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
669
670	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
671
672	#if defined( __KKMP )
673	DEALLOCATE( tri )
674	#endif
675
676	END SUBROUTINE poisfft_hybrid_omp_vec
677
678
679	SUBROUTINE poisfft_hybrid_nodes ( ar )
680
681	USE cpulog
682	USE interfaces
683
684	IMPLICIT NONE
685
686	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
687	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
688	n, nn, nt, nw1, nw2
689
690	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
691
692	REAL, DIMENSION(0:nx) :: fftx_ar
693	REAL, DIMENSION(0:ny,istride) :: ffty_ar
694
695	REAL, DIMENSION(0:nx,nz) :: tri_ar
696
697	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
698	nodes,nys_p:nyn_p) :: work1,work2
699	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
700
701
702	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
703
704	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
705
706	!
707	!-- Store grid points to be transformed on a 1d-array, do the fft
708	!-- and sample the results on a 4d-array
709	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
710	iei = MIN( iouter+istride-1, nxr_p )
711	DO k = 1, nz
712
713	DO i = iouter, iei
714	ii = nxl + i
715	ir = i - iouter + 1
716
717	DO j = nys_a, nyn_a
718	ffty_ar(j,ir) = ar(k,j,ii)
719	ENDDO
720
721	CALL fft_y( ffty_ar(:,ir), 'forward' )
722	ENDDO
723
724	m = nys_a
725	DO nn = 1, nodes
726	DO nt = 1, tasks_per_logical_node
727	DO j = nys_p, nyn_p
728	DO i = iouter, iei
729	ir = i - iouter + 1
730	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
731	ENDDO
732	m = m+1
733	ENDDO
734	ENDDO
735	ENDDO
736
737	ENDDO
738	ENDDO
739
740	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
741
742	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
743	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
744	DO nn = 1, nodes
745	DO j = nys_p, nyn_p
746	#if defined( __parallel )
747	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
748	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
749	comm_tasks, istat )
750	#endif
751	ENDDO
752	ENDDO
753	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
754
755
756	DO j = nys_p, nyn_p
757
758	CALL cascade( 1, j, nys_p, nyn_p )
759	nw2 = nw1 * SIZE( work1, 3 )
760	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
761	#if defined( __parallel )
762	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
763	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
764	comm_nodes, istat )
765	#endif
766	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
767	CALL cascade( 2, j, nys_p, nyn_p )
768
769	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
770	DO k = 1, nz
771
772	m = nxl_a
773	DO nn = 1, nodes
774	DO nt = 1, tasks_per_logical_node
775	DO i = nxl_p, nxr_p
776	fftx_ar(m) = work1(i,k,nt,nn,j)
777	m = m+1
778	ENDDO
779	ENDDO
780	ENDDO
781
782	CALL fft_x( fftx_ar, 'forward' )
783
784	DO i = nxl_a, nxr_a
785	tri_ar(i,k) = fftx_ar(i)
786	ENDDO
787
788	ENDDO
789
790	jj = myid * (nyn_p-nys_p+1) + j
791	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
792
793	DO k = 1, nz
794	DO i = nxl_a, nxr_a
795	fftx_ar(i) = tri_ar(i,k)
796	ENDDO
797
798	CALL fft_x( fftx_ar, 'backward' )
799
800	m = nxl_a
801	DO nn = 1, nodes
802	DO nt = 1, tasks_per_logical_node
803	DO i = nxl_p, nxr_p
804	work1(i,k,nt,nn,j) = fftx_ar(m)
805	m = m+1
806	ENDDO
807	ENDDO
808	ENDDO
809	ENDDO
810
811	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
812	nw2 = nw1 * SIZE( work1, 3 )
813	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
814	#if defined( __parallel )
815	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
816	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
817	comm_nodes, istat )
818	#endif
819	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
820
821	ENDDO
822
823	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
824	DO nn = 1, nodes
825	DO j = nys_p, nyn_p
826	#if defined( __parallel )
827	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
828	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
829	comm_tasks, istat )
830	#endif
831	ENDDO
832	ENDDO
833	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
834
835	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
836
837	DO iouter = nxl_p, nxr_p, istride
838	iei = MIN( iouter+istride-1, nxr_p )
839	DO k = 1, nz
840
841	m = nys_a
842	DO nn = 1, nodes
843	DO nt = 1, tasks_per_logical_node
844	DO j = nys_p, nyn_p
845	DO i = iouter, iei
846	ir = i - iouter + 1
847	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
848	ENDDO
849	m = m+1
850	ENDDO
851	ENDDO
852	ENDDO
853
854	DO i = iouter, iei
855	ii = nxl + i
856	ir = i - iouter + 1
857	CALL fft_y( ffty_ar(:,ir), 'backward' )
858
859	DO j = nys_a, nyn_a
860	ar(k,j,ii) = ffty_ar(j,ir)
861	ENDDO
862	ENDDO
863
864	ENDDO
865	ENDDO
866
867	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
868
869	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
870
871	END SUBROUTINE poisfft_hybrid_nodes
872
873
874
875	SUBROUTINE tridia_hybrid( j, ar, tri )
876
877	USE arrays_3d
878	USE control_parameters
879	USE grid_variables
880
881	IMPLICIT NONE
882
883	INTEGER :: i, j, k, nnyh
884	REAL, DIMENSION(0:nx,nz) :: ar
885	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
886	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
887
888	nnyh = (ny+1) / 2
889
890	tri = 0.0
891	!
892	!-- Define constant elements of the tridiagonal matrix.
893	DO k = 0, nz-1
894	DO i = 0,nx
895	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
896	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
897	ENDDO
898	ENDDO
899
900	IF ( j <= nnyh ) THEN
901	CALL maketri_hybrid( j )
902	ELSE
903	CALL maketri_hybrid( ny+1-j)
904	ENDIF
905	CALL zerleg_hybrid
906	CALL substi_hybrid( ar, tri )
907
908	CONTAINS
909
910	SUBROUTINE maketri_hybrid( j )
911
912	!----------------------------------------------------------------------!
913	! maketri !
914	! !
915	! computes the i- and j-dependent component of the matrix !
916	!----------------------------------------------------------------------!
917
918	USE constants
919
920	IMPLICIT NONE
921
922	INTEGER :: i, j, k, nnxh
923	REAL :: a, c
924
925	REAL, DIMENSION(0:nx) :: l
926
927
928	nnxh = (nx+1) / 2
929	!
930	!-- Provide the tridiagonal matrix for solution of the Poisson equation
931	!-- in Fourier space. The coefficients are computed following the method
932	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
933	!-- Siano's original version.
934	DO i = 0,nx
935	IF ( i >= 0 .AND. i < nnxh ) THEN
936	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
937	REAL( nx+1 ) ) ) / ( dx * dx ) + &
938	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
939	REAL( ny+1 ) ) ) / ( dy * dy )
940	ELSEIF ( i == nnxh ) THEN
941	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
942	REAL( nx+1 ) ) ) / ( dx * dx ) + &
943	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
944	REAL(ny+1) ) ) / ( dy * dy )
945	ELSE
946	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
947	REAL( nx+1 ) ) ) / ( dx * dx ) + &
948	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
949	REAL( ny+1 ) ) ) / ( dy * dy )
950	ENDIF
951	ENDDO
952
953	DO k = 0,nz-1
954	DO i = 0, nx
955	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
956	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
957	tri(1,i,k) = a + c - l(i)
958	ENDDO
959	ENDDO
960	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
961	DO i = 0,nx
962	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
963	ENDDO
964	ENDIF
965	IF ( ibc_p_t == 1 ) THEN
966	DO i = 0,nx
967	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
968	ENDDO
969	ENDIF
970
971	END SUBROUTINE maketri_hybrid
972
973
974	SUBROUTINE zerleg_hybrid
975
976	!----------------------------------------------------------------------!
977	! zerleg !
978	! !
979	! Splitting of the tridiagonal matrix (Thomas algorithm) !
980	!----------------------------------------------------------------------!
981
982	USE indices
983
984	IMPLICIT NONE
985
986	INTEGER :: i, k
987
988	!
989	!-- Splitting
990	DO i = 0, nx
991	tri(4,i,0) = tri(1,i,0)
992	ENDDO
993	DO k = 1, nz-1
994	DO i = 0,nx
995	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
996	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
997	ENDDO
998	ENDDO
999
1000	END SUBROUTINE zerleg_hybrid
1001
1002	SUBROUTINE substi_hybrid( ar, tri )
1003
1004	!----------------------------------------------------------------------!
1005	! substi !
1006	! !
1007	! Substitution (Forward and Backward) (Thomas algorithm) !
1008	!----------------------------------------------------------------------!
1009
1010	IMPLICIT NONE
1011
1012	INTEGER :: i, j, k
1013	REAL, DIMENSION(0:nx,nz) :: ar
1014	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1015	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1016
1017	!
1018	!-- Forward substitution
1019	DO i = 0, nx
1020	ar1(i,0) = ar(i,1)
1021	ENDDO
1022	DO k = 1, nz - 1
1023	DO i = 0,nx
1024	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1025	ENDDO
1026	ENDDO
1027
1028	!
1029	!-- Backward substitution
1030	DO i = 0,nx
1031	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
1032	ENDDO
1033	DO k = nz-2, 0, -1
1034	DO i = 0,nx
1035	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1036	/ tri(4,i,k)
1037	ENDDO
1038	ENDDO
1039
1040	END SUBROUTINE substi_hybrid
1041
1042	END SUBROUTINE tridia_hybrid
1043
1044
1045	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
1046
1047	USE cpulog
1048
1049	IMPLICIT NONE
1050
1051	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
1052	INTEGER, SAVE :: tag = 10
1053	#if defined( __parallel )
1054	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
1055	INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata
1056	#endif
1057
1058	REAL :: buf, buf1
1059
1060
1061	buf = 1.0
1062	buf1 = 1.1
1063	IF ( me_node == 0 ) THEN ! first node only
1064
1065	SELECT CASE ( loca )
1066
1067	CASE ( 1 ) ! before alltoall
1068
1069	IF( me_task > 0 ) THEN ! first task does not wait
1070	#if defined( __parallel )
1071	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
1072	buf1, 1, MPI_REAL, me_task-1, 0, &
1073	comm_tasks, stat, ierr )
1074	#endif
1075	ELSEIF ( j > nys_p ) THEN
1076	req = 0
1077	tag = MOD( tag-10, 10 ) + 10
1078	#if defined( __parallel )
1079	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
1080	tag, comm_tasks, req, ierr )
1081	reqa = req
1082	CALL MPI_WAITALL( 1, reqa, stata, ierr )
1083	#endif
1084	ENDIF
1085
1086	CASE ( 2 ) ! after alltoall
1087
1088	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
1089	#if defined( __parallel )
1090	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
1091	buf1, 1, MPI_REAL, me_task+1, 0, &
1092	comm_tasks, stat, ierr)
1093	#endif
1094	ELSEIF ( j < nyn_p ) THEN
1095	req = 0
1096	tag = MOD( tag-10, 10 ) + 10
1097	#if defined( __parallel )
1098	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
1099	ierr )
1100	#endif
1101	ENDIF
1102
1103	END SELECT
1104
1105	ENDIF
1106
1107	END SUBROUTINE cascade
1108	#endif
1109	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |