Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_hybrid.f90 @ 1014

Last change on this file since 1014 was 1014, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 32.1 KB

Line
1	MODULE poisfft_hybrid_mod
2
3	!------------------------------------------------------------------------------
4	! Current revisions:
5	! -----------------
6	!
7	!
8	! Former revisions:
9	! -----------------
10	! $Id: poisfft_hybrid.f90 1014 2012-09-21 07:09:03Z raasch $
11	!
12	! 1013 2012-09-21 07:03:55Z raasch
13	! FLOAT type conversion replaced by REAL
14	!
15	! 809 2012-01-30 13:32:58Z maronga
16	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
17	!
18	! 807 2012-01-25 11:53:51Z maronga
19	! New cpp directive "__check" implemented which is used by check_namelist_files
20	! (most of the code is unneeded by check_namelist_files).
21	!
22	! 667 2010-12-23 12:06:00Z suehring/gryschka
23	! ddzu replaced by ddzu_pres due to changes in zu(0)
24	!
25	! 415 2009-12-15 10:26:23Z raasch
26	! Dimension of array stat in cascade change to prevent type problems with___
27	! mpi2 libraries
28	!
29	! 274 2009-03-26 15:11:21Z heinze
30	! Output of messages replaced by message handling routine.
31	!
32	! Feb. 2007
33	! RCS Log replace by Id keyword, revision history cleaned up
34	!
35	! Revision 1.11 2004/04/30 12:43:14 raasch
36	! Renaming of fft routines, additional argument in calls of fft_y_m
37	!
38	! Revision 1.2 2002/12/19 16:08:31 raasch
39	! Preprocessor directive KKMP introduced (OMP does NOT work),
40	! array tri will be a shared array in OpenMP loop, to get better cache
41	! utilization, the i index (x-direction) will be executed in stride
42	! "istride" as outer loop and in a shorter inner loop,
43	! overlapping of computation and communication realized by new routine
44	! poisfft_hybrid_nodes, name of old routine poisfft_hybrid changed to
45	! poisfft_hybrid_omp, STOP statement replaced by call of subroutine local_stop
46	!
47	!
48	! Description:
49	! ------------
50	! Solution of the Poisson equation with a 2D spectral method.
51	! Hybrid version for parallel computers using a 1D domain decomposition,
52	! realized with MPI, along x and parallelization with OPEN-MP along y
53	! (routine poisfft_hybrid_omp). In a second version (poisfft_hybrid_nodes),
54	! optimization is realized by overlapping of computation and communication
55	! and by simultaneously executing as many communication calls as switches
56	! per logical partition (LPAR) are available. This version comes into
57	! effect if more than one node is used and if the environment variable
58	! tasks_per_node is set in a way that it can be devided by switch_per_lpar
59	! without any rest.
60	!
61	! WARNING: In case of OpenMP, there are problems with allocating large
62	! arrays in parallel regions.
63	!
64	! Copyright Klaus Ketelsen / Siegfried Raasch May 2002
65	!------------------------------------------------------------------------------!
66
67	USE fft_xy
68	USE indices
69	USE pegrid
70
71	IMPLICIT NONE
72
73	INTEGER, PARAMETER :: switch_per_lpar = 2
74
75	INTEGER, SAVE :: nxl_a, nxr_a, & ! total x dimension
76	nxl_p, nxr_p, & ! partial x dimension
77	nys_a, nyn_a, & ! total y dimension
78	nys_p, nyn_p, & ! partial y dimension
79
80	npe_s, & ! total number of PEs for solver
81	nwords, & ! number of points to be exchanged
82	! with MPI_ALLTOALL
83	n_omp_threads ! number of OpenMP threads
84
85	!
86	!-- Variables for multi node version (cluster version) using routine
87	!-- poisfft_hybrid_nodes
88	INTEGER, SAVE :: comm_nodes, & ! communicater nodes
89	comm_node_all, & ! communicater all PEs node version
90	comm_tasks, & ! communicater tasks
91	me, me_node, me_task,& ! identity of this PE
92	nodes, & ! number of nodes
93	tasks_per_logical_node = -1 ! default no cluster
94
95
96	PRIVATE
97
98
99	#if ! defined ( __check )
100	PUBLIC poisfft_hybrid, poisfft_hybrid_ini
101
102
103	!
104	!-- Public interfaces
105	INTERFACE poisfft_hybrid_ini
106	MODULE PROCEDURE poisfft_hybrid_ini
107	END INTERFACE poisfft_hybrid_ini
108
109	INTERFACE poisfft_hybrid
110	MODULE PROCEDURE poisfft_hybrid
111	END INTERFACE poisfft_hybrid
112
113	!
114	!-- Private interfaces
115	INTERFACE poisfft_hybrid_omp
116	MODULE PROCEDURE poisfft_hybrid_omp
117	END INTERFACE poisfft_hybrid_omp
118
119	INTERFACE poisfft_hybrid_omp_vec
120	MODULE PROCEDURE poisfft_hybrid_omp_vec
121	END INTERFACE poisfft_hybrid_omp_vec
122
123	INTERFACE poisfft_hybrid_nodes
124	MODULE PROCEDURE poisfft_hybrid_nodes
125	END INTERFACE poisfft_hybrid_nodes
126
127	INTERFACE tridia_hybrid
128	MODULE PROCEDURE tridia_hybrid
129	END INTERFACE tridia_hybrid
130
131	INTERFACE cascade
132	MODULE PROCEDURE cascade
133	END INTERFACE cascade
134	#else
135	PUBLIC poisfft_hybrid_ini
136
137	!
138	!-- Public interfaces
139	INTERFACE poisfft_hybrid_ini
140	MODULE PROCEDURE poisfft_hybrid_ini
141	END INTERFACE poisfft_hybrid_ini
142	#endif
143
144	CONTAINS
145
146
147	SUBROUTINE poisfft_hybrid_ini
148
149	USE control_parameters
150	USE pegrid
151
152	IMPLICIT NONE
153
154	CHARACTER(LEN=8) :: cdummy
155	INTEGER :: idummy, istat
156	INTEGER, DIMENSION(2) :: coords, dims
157
158	LOGICAL, DIMENSION(2) :: period = .false., re_dims
159
160
161	!
162	!-- Set the internal index values for the hybrid solver
163	#if defined( __parallel )
164	npe_s = pdims(1)
165	#else
166	npe_s = 1
167	#endif
168	nxl_a = 0
169	nxr_a = nx
170	nxl_p = 0
171	nxr_p = ( ( nx+1 ) / npe_s ) - 1
172	nys_a = nys
173	nyn_a = nyn
174	nys_p = 0
175	nyn_p = ( ( ny+1 ) / npe_s ) - 1
176
177	nwords = ( nxr_p-nxl_p+1 ) * nz * ( nyn_p-nys_p+1 )
178
179	#if defined( __KKMP ) && ! defined ( __check )
180	CALL LOCAL_GETENV( 'OMP_NUM_THREADS', 15, cdummy, idummy )
181	READ ( cdummy, '(I8)' ) n_omp_threads
182	IF ( n_omp_threads > 1 ) THEN
183	WRITE( message_string, * ) 'Number of OpenMP threads = ', &
184	n_omp_threads
185	CALL message( 'poisfft_hybrid_ini', 'PA0280', 0, 0, 0, 6, 0 )
186	ENDIF
187	#else
188	n_omp_threads = 1
189	#endif
190	!
191	!-- Initialize the one-dimensional FFT routines
192	CALL fft_init
193
194	!
195	!-- Setup for multi node version (poisfft_hybrid_nodes)
196	IF ( n_omp_threads == 1 .AND. &
197	( host(1:4) == 'ibmh' .OR. host(1:4) == 'ibmb' ) ) THEN
198
199	IF ( tasks_per_node /= -9999 ) THEN
200	!
201	!-- Multi node version requires that the available number of
202	!-- switches per logical partition must be an integral divisor
203	!-- of the chosen number of tasks per node
204	IF ( MOD( tasks_per_node, switch_per_lpar ) == 0 ) THEN
205	!
206	!-- Set the switch which decides about usage of the multi node
207	!-- version
208	IF ( tasks_per_node / switch_per_lpar > 1 .AND. &
209	numprocs > tasks_per_node ) THEN
210	tasks_per_logical_node = tasks_per_node / switch_per_lpar
211	ENDIF
212
213	IF ( tasks_per_logical_node > -1 ) THEN
214
215	WRITE( message_string, * ) 'running optimized ', &
216	'multinode version', &
217	'&switch_per_lpar = ', &
218	switch_per_lpar, &
219	'&tasks_per_lpar = ', &
220	tasks_per_node, &
221	'tasks_per_logical_node = ', &
222	tasks_per_logical_node
223	CALL message( 'poisfft_hybrid_ini', 'PA0281', 0, 0, 0, 6, 0 )
224
225	ENDIF
226
227	ENDIF
228	ENDIF
229	ENDIF
230
231	!
232	!-- Determine sub-topologies for multi node version
233	IF ( tasks_per_logical_node >= 2 ) THEN
234
235	#if defined( __parallel ) && ! defined ( __check )
236	nodes = ( numprocs + tasks_per_logical_node - 1 ) / &
237	tasks_per_logical_node
238	dims(1) = nodes
239	dims(2) = tasks_per_logical_node
240
241	CALL MPI_CART_CREATE( comm2d, 2, dims, period, .FALSE., &
242	comm_node_all, istat )
243	CALL MPI_COMM_RANK( comm_node_all, me, istat )
244
245	re_dims(1) = .TRUE.
246	re_dims(2) = .FALSE.
247	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_nodes, istat )
248	CALL MPI_COMM_RANK( comm_nodes, me_node, istat )
249
250	re_dims(1) = .FALSE.
251	re_dims(2) = .TRUE.
252	CALL MPI_CART_SUB( comm_node_all, re_dims, comm_tasks, istat )
253	CALL MPI_COMM_RANK( comm_tasks, me_task, istat )
254
255	! write(0,*) 'who am i',myid,me,me_node,me_task,nodes,&
256	! tasks_per_logical_node
257	#elif ! defined( __parallel )
258	message_string = 'parallel environment (MPI) required'
259	CALL message( 'poisfft_hybrid_ini', 'PA0282', 1, 2, 0, 6, 0 )
260	#endif
261	ENDIF
262
263	END SUBROUTINE poisfft_hybrid_ini
264
265	#if ! defined ( __check )
266	SUBROUTINE poisfft_hybrid( ar )
267
268	USE control_parameters
269	USE interfaces
270
271	IMPLICIT NONE
272
273	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
274
275	IF ( host(1:3) == 'nec' ) THEN
276	CALL poisfft_hybrid_omp_vec( ar )
277	ELSE
278	IF ( tasks_per_logical_node == -1 ) THEN
279	CALL poisfft_hybrid_omp( ar )
280	ELSE
281	CALL poisfft_hybrid_nodes( ar )
282	ENDIF
283	ENDIF
284
285	END SUBROUTINE poisfft_hybrid
286
287
288	SUBROUTINE poisfft_hybrid_omp ( ar )
289
290	USE cpulog
291	USE interfaces
292
293	IMPLICIT NONE
294
295	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
296	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
297
298	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
299
300	REAL, DIMENSION(0:nx) :: fftx_ar
301	REAL, DIMENSION(0:ny,istride) :: ffty_ar
302
303	REAL, DIMENSION(0:nx,nz) :: tri_ar
304
305	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
306	#if defined( __KKMP )
307	INTEGER :: omp_get_thread_num
308	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
309	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
310	#else
311	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
312	#endif
313
314
315	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'start' )
316
317	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
318
319	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
320	!$OMP DO
321	!
322	!-- Store grid points to be transformed on a 1d-array, do the fft
323	!-- and sample the results on a 4d-array
324	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
325	iei = MIN( iouter+istride-1, nxr_p )
326	DO k = 1, nz
327
328	DO i = iouter, iei
329	ii = nxl + i
330	ir = i - iouter + 1
331
332	DO j = nys_a, nyn_a
333	ffty_ar(j,ir) = ar(k,j,ii)
334	ENDDO
335
336	CALL fft_y( ffty_ar(:,ir), 'forward' )
337	ENDDO
338
339	m = nys_a
340	DO n = 1, npe_s
341	DO j = nys_p, nyn_p
342	DO i = iouter, iei
343	ir = i - iouter + 1
344	work1(i,k,j,n) = ffty_ar(m,ir)
345	ENDDO
346	m = m+1
347	ENDDO
348	ENDDO
349
350	ENDDO
351	ENDDO
352	!$OMP END PARALLEL
353
354	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
355
356	#if defined( __parallel )
357	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
358
359	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
360	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
361	comm2d, istat )
362
363	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
364	#else
365	work2 = work1
366	#endif
367
368	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
369
370	#if defined( __KKMP )
371	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,fftx_ar,tri_ar,jthread)
372	!$OMP DO
373	DO j = nys_p, nyn_p
374	jthread = omp_get_thread_num() + 1
375	#else
376	DO j = nys_p, nyn_p
377	jthread = 1
378	#endif
379	DO k = 1, nz
380
381	m = nxl_a
382	DO n = 1, npe_s
383	DO i = nxl_p, nxr_p
384	fftx_ar(m) = work2(i,k,j,n)
385	m = m+1
386	ENDDO
387	ENDDO
388
389	CALL fft_x( fftx_ar, 'forward' )
390
391	DO i = nxl_a, nxr_a
392	tri_ar(i,k) = fftx_ar(i)
393	ENDDO
394
395	ENDDO
396
397	jj = myid * (nyn_p-nys_p+1) + j
398	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
399
400	DO k = 1, nz
401	DO i = nxl_a, nxr_a
402	fftx_ar(i) = tri_ar (i,k)
403	ENDDO
404
405	CALL fft_x( fftx_ar, 'backward' )
406
407	m = nxl_a
408	DO n = 1, npe_s
409	DO i = nxl_p, nxr_p
410	work2(i,k,j,n) = fftx_ar(m)
411	m = m+1
412	ENDDO
413	ENDDO
414
415	ENDDO
416	ENDDO
417	#if defined( __KKMP )
418	!$OMP END PARALLEL
419	#endif
420
421	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
422
423	#if defined( __parallel )
424	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
425	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
426
427	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
428	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
429	comm2d, istat )
430
431	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
432	#else
433	work1 = work2
434	#endif
435
436	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
437
438	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n,ffty_ar)
439	!$OMP DO
440	DO iouter = nxl_p, nxr_p, istride
441	iei = MIN( iouter+istride-1, nxr_p )
442	DO k = 1, nz
443
444	m = nys_a
445	DO n = 1, npe_s
446	DO j = nys_p, nyn_p
447	DO i = iouter, iei
448	ir = i - iouter + 1
449	ffty_ar(m,ir) = work1 (i,k,j,n)
450	ENDDO
451	m = m+1
452	ENDDO
453	ENDDO
454
455	DO i = iouter, iei
456	ii = nxl + i
457	ir = i - iouter + 1
458	CALL fft_y( ffty_ar(:,ir), 'backward' )
459
460	DO j = nys_a, nyn_a
461	ar(k,j,ii) = ffty_ar(j,ir)
462	ENDDO
463	ENDDO
464
465	ENDDO
466	ENDDO
467	!$OMP END PARALLEL
468
469	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
470
471	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_omp', 'stop' )
472
473	#if defined( __KKMP )
474	DEALLOCATE( tri )
475	#endif
476
477	END SUBROUTINE poisfft_hybrid_omp
478
479
480	SUBROUTINE poisfft_hybrid_omp_vec ( ar )
481
482	USE cpulog
483	USE interfaces
484
485	IMPLICIT NONE
486
487	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
488	INTEGER :: i, ii, ir, iei, iouter, istat, j, jj, k, m, n, jthread
489
490	REAL, DIMENSION(0:nx,nz) :: tri_ar
491
492	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
493
494	REAL, DIMENSION(0:ny+3,nz,nxl_p:nxr_p) :: ffty_ar3
495	REAL, DIMENSION(0:nx+3,nz,nys_p:nyn_p) :: fftx_ar3
496
497	REAL, DIMENSION(nxl_p:nxr_p,nz,nys_p:nyn_p,npe_s) :: work1, work2
498	#if defined( __KKMP )
499	INTEGER :: omp_get_thread_num
500	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
501	ALLOCATE( tri(5,0:nx,0:nz-1,n_omp_threads ) )
502	#else
503	REAL, DIMENSION(5,0:nx,0:nz-1,1) :: tri
504	#endif
505
506
507	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'start' )
508
509	CALL cpu_log( log_point_s(7), 'fft_y_m', 'start' )
510
511	!$OMP PARALLEL PRIVATE (i,j,k,m,n)
512	!$OMP DO
513	!
514	!-- Store grid points to be transformed on a 1d-array, do the fft
515	!-- and sample the results on a 4d-array
516	DO i = nxl_p, nxr_p
517
518	DO j = nys_a, nyn_a
519	DO k = 1, nz
520	ffty_ar3(j,k,i) = ar(k,j,i+nxl)
521	ENDDO
522	ENDDO
523
524	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'forward' )
525	ENDDO
526
527	!$OMP DO
528	DO k = 1, nz
529	m = nys_a
530	DO n = 1, npe_s
531	DO j = nys_p, nyn_p
532	DO i = nxl_p, nxr_p
533	work1(i,k,j,n) = ffty_ar3(m,k,i)
534	ENDDO
535	m = m+1
536	ENDDO
537	ENDDO
538	ENDDO
539	!$OMP END PARALLEL
540
541	CALL cpu_log( log_point_s(7), 'fft_y_m', 'pause' )
542
543	#if defined( __parallel )
544	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
545	CALL MPI_ALLTOALL( work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
546	work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
547	comm2d, istat )
548	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
549	#else
550	work2 = work1
551	#endif
552
553	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'start' )
554
555	#if defined( __KKMP )
556	!$OMP PARALLEL PRIVATE (i,j,jj,k,m,n,tri_ar,jthread)
557	!$OMP DO
558	DO j = nys_p, nyn_p
559	jthread = omp_get_thread_num() + 1
560	#else
561	DO j = nys_p, nyn_p
562	jthread = 1
563	#endif
564	DO k = 1, nz
565
566	m = nxl_a
567	DO n = 1, npe_s
568	DO i = nxl_p, nxr_p
569	fftx_ar3(m,k,j) = work2(i,k,j,n)
570	m = m+1
571	ENDDO
572	ENDDO
573	ENDDO
574
575	CALL fft_x_m( fftx_ar3(:,:,j), 'forward' )
576
577	DO k = 1, nz
578	DO i = nxl_a, nxr_a
579	tri_ar(i,k) = fftx_ar3(i,k,j)
580	ENDDO
581	ENDDO
582
583	jj = myid * (nyn_p-nys_p+1) + j
584	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:,jthread))
585
586	DO k = 1, nz
587	DO i = nxl_a, nxr_a
588	fftx_ar3(i,k,j) = tri_ar (i,k)
589	ENDDO
590	ENDDO
591
592	CALL fft_x_m( fftx_ar3(:,:,j), 'backward' )
593
594	DO k = 1, nz
595	m = nxl_a
596	DO n = 1, npe_s
597	DO i = nxl_p, nxr_p
598	work2(i,k,j,n) = fftx_ar3(m,k,j)
599	m = m+1
600	ENDDO
601	ENDDO
602	ENDDO
603
604	ENDDO
605	#if defined( __KKMP )
606	!$OMP END PARALLEL
607	#endif
608
609	CALL cpu_log( log_point_s(33), 'fft_x_m + tridia', 'stop' )
610
611	#if defined( __parallel )
612	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
613	nwords = (nxr_p-nxl_p+1) * nz * (nyn_p-nys_p+1)
614	CALL MPI_ALLTOALL( work2(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
615	work1(nxl_p,1,nys_p,1), nwords, MPI_REAL, &
616	comm2d, istat )
617	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
618	#else
619	work1 = work2
620	#endif
621
622	CALL cpu_log( log_point_s(7), 'fft_y_m', 'continue' )
623
624	!$OMP PARALLEL PRIVATE (i,iouter,ii,ir,iei,j,k,m,n)
625	!$OMP DO
626	DO k = 1, nz
627	m = nys_a
628	DO n = 1, npe_s
629	DO j = nys_p, nyn_p
630	DO i = nxl_p, nxr_p
631	ffty_ar3(m,k,i) = work1(i,k,j,n)
632	ENDDO
633	m = m+1
634	ENDDO
635	ENDDO
636	ENDDO
637
638	!$OMP DO
639	DO i = nxl_p, nxr_p
640	CALL fft_y_m( ffty_ar3(:,:,i), ny+3, 'backward' )
641	DO j = nys_a, nyn_a
642	DO k = 1, nz
643	ar(k,j,i+nxl) = ffty_ar3(j,k,i)
644	ENDDO
645	ENDDO
646	ENDDO
647	!$OMP END PARALLEL
648
649	CALL cpu_log( log_point_s(7), 'fft_y_m', 'stop' )
650
651	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_vec', 'stop' )
652
653	#if defined( __KKMP )
654	DEALLOCATE( tri )
655	#endif
656
657	END SUBROUTINE poisfft_hybrid_omp_vec
658
659
660	SUBROUTINE poisfft_hybrid_nodes ( ar )
661
662	USE cpulog
663	USE interfaces
664
665	IMPLICIT NONE
666
667	INTEGER, PARAMETER :: istride = 4 ! stride of i loop
668	INTEGER :: i, iei, ii, iouter, ir, istat, j, jj, k, m, &
669	n, nn, nt, nw1, nw2
670
671	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
672
673	REAL, DIMENSION(0:nx) :: fftx_ar
674	REAL, DIMENSION(0:ny,istride) :: ffty_ar
675
676	REAL, DIMENSION(0:nx,nz) :: tri_ar
677
678	REAL, DIMENSION(nxl_p:nxr_p,nz,tasks_per_logical_node, &
679	nodes,nys_p:nyn_p) :: work1,work2
680	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
681
682
683	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'start' )
684
685	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
686
687	!
688	!-- Store grid points to be transformed on a 1d-array, do the fft
689	!-- and sample the results on a 4d-array
690	DO iouter = nxl_p, nxr_p, istride ! stride loop, better cache
691	iei = MIN( iouter+istride-1, nxr_p )
692	DO k = 1, nz
693
694	DO i = iouter, iei
695	ii = nxl + i
696	ir = i - iouter + 1
697
698	DO j = nys_a, nyn_a
699	ffty_ar(j,ir) = ar(k,j,ii)
700	ENDDO
701
702	CALL fft_y( ffty_ar(:,ir), 'forward' )
703	ENDDO
704
705	m = nys_a
706	DO nn = 1, nodes
707	DO nt = 1, tasks_per_logical_node
708	DO j = nys_p, nyn_p
709	DO i = iouter, iei
710	ir = i - iouter + 1
711	work1(i,k,nt,nn,j) = ffty_ar(m,ir)
712	ENDDO
713	m = m+1
714	ENDDO
715	ENDDO
716	ENDDO
717
718	ENDDO
719	ENDDO
720
721	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
722
723	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
724	nw1 = SIZE( work1, 1 ) * SIZE( work1, 2 )
725	DO nn = 1, nodes
726	DO j = nys_p, nyn_p
727	#if defined( __parallel )
728	CALL MPI_ALLTOALL( work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
729	work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
730	comm_tasks, istat )
731	#endif
732	ENDDO
733	ENDDO
734	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
735
736
737	DO j = nys_p, nyn_p
738
739	CALL cascade( 1, j, nys_p, nyn_p )
740	nw2 = nw1 * SIZE( work1, 3 )
741	CALL cpu_log( log_point_s(37), 'alltoall_node', 'start' )
742	#if defined( __parallel )
743	CALL MPI_ALLTOALL( work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
744	work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
745	comm_nodes, istat )
746	#endif
747	CALL cpu_log( log_point_s(37), 'alltoall_node', 'pause' )
748	CALL cascade( 2, j, nys_p, nyn_p )
749
750	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'start' )
751	DO k = 1, nz
752
753	m = nxl_a
754	DO nn = 1, nodes
755	DO nt = 1, tasks_per_logical_node
756	DO i = nxl_p, nxr_p
757	fftx_ar(m) = work1(i,k,nt,nn,j)
758	m = m+1
759	ENDDO
760	ENDDO
761	ENDDO
762
763	CALL fft_x( fftx_ar, 'forward' )
764
765	DO i = nxl_a, nxr_a
766	tri_ar(i,k) = fftx_ar(i)
767	ENDDO
768
769	ENDDO
770
771	jj = myid * (nyn_p-nys_p+1) + j
772	CALL tridia_hybrid( jj, tri_ar, tri(:,:,:) )
773
774	DO k = 1, nz
775	DO i = nxl_a, nxr_a
776	fftx_ar(i) = tri_ar(i,k)
777	ENDDO
778
779	CALL fft_x( fftx_ar, 'backward' )
780
781	m = nxl_a
782	DO nn = 1, nodes
783	DO nt = 1, tasks_per_logical_node
784	DO i = nxl_p, nxr_p
785	work1(i,k,nt,nn,j) = fftx_ar(m)
786	m = m+1
787	ENDDO
788	ENDDO
789	ENDDO
790	ENDDO
791
792	CALL cpu_log( log_point_s(33), 'fft_x + tridia', 'stop' )
793	nw2 = nw1 * SIZE( work1, 3 )
794	CALL cpu_log( log_point_s(37), 'alltoall_node', 'continue' )
795	#if defined( __parallel )
796	CALL MPI_ALLTOALL( work1(nxl_p,1,1,1,j), nw2, MPI_REAL, &
797	work2(nxl_p,1,1,1,j), nw2, MPI_REAL, &
798	comm_nodes, istat )
799	#endif
800	CALL cpu_log( log_point_s(37), 'alltoall_node', 'stop' )
801
802	ENDDO
803
804	CALL cpu_log( log_point_s(32), 'alltoall_task', 'start' )
805	DO nn = 1, nodes
806	DO j = nys_p, nyn_p
807	#if defined( __parallel )
808	CALL MPI_ALLTOALL( work2(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
809	work1(nxl_p,1,1,nn,j), nw1, MPI_REAL, &
810	comm_tasks, istat )
811	#endif
812	ENDDO
813	ENDDO
814	CALL cpu_log( log_point_s(32), 'alltoall_task', 'stop' )
815
816	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
817
818	DO iouter = nxl_p, nxr_p, istride
819	iei = MIN( iouter+istride-1, nxr_p )
820	DO k = 1, nz
821
822	m = nys_a
823	DO nn = 1, nodes
824	DO nt = 1, tasks_per_logical_node
825	DO j = nys_p, nyn_p
826	DO i = iouter, iei
827	ir = i - iouter + 1
828	ffty_ar(m,ir) = work1(i,k,nt,nn,j)
829	ENDDO
830	m = m+1
831	ENDDO
832	ENDDO
833	ENDDO
834
835	DO i = iouter, iei
836	ii = nxl + i
837	ir = i - iouter + 1
838	CALL fft_y( ffty_ar(:,ir), 'backward' )
839
840	DO j = nys_a, nyn_a
841	ar(k,j,ii) = ffty_ar(j,ir)
842	ENDDO
843	ENDDO
844
845	ENDDO
846	ENDDO
847
848	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
849
850	CALL cpu_log( log_point_s(30), 'poisfft_hybrid_nodes', 'stop' )
851
852	END SUBROUTINE poisfft_hybrid_nodes
853
854
855
856	SUBROUTINE tridia_hybrid( j, ar, tri )
857
858	USE arrays_3d
859	USE control_parameters
860	USE grid_variables
861
862	IMPLICIT NONE
863
864	INTEGER :: i, j, k, nnyh
865	REAL, DIMENSION(0:nx,nz) :: ar
866	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
867	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
868
869	nnyh = (ny+1) / 2
870
871	tri = 0.0
872	!
873	!-- Define constant elements of the tridiagonal matrix.
874	DO k = 0, nz-1
875	DO i = 0,nx
876	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
877	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
878	ENDDO
879	ENDDO
880
881	IF ( j <= nnyh ) THEN
882	CALL maketri_hybrid( j )
883	ELSE
884	CALL maketri_hybrid( ny+1-j)
885	ENDIF
886	CALL zerleg_hybrid
887	CALL substi_hybrid( ar, tri )
888
889	CONTAINS
890
891	SUBROUTINE maketri_hybrid( j )
892
893	!----------------------------------------------------------------------!
894	! maketri !
895	! !
896	! computes the i- and j-dependent component of the matrix !
897	!----------------------------------------------------------------------!
898
899	USE constants
900
901	IMPLICIT NONE
902
903	INTEGER :: i, j, k, nnxh
904	REAL :: a, c
905
906	REAL, DIMENSION(0:nx) :: l
907
908
909	nnxh = (nx+1) / 2
910	!
911	!-- Provide the tridiagonal matrix for solution of the Poisson equation
912	!-- in Fourier space. The coefficients are computed following the method
913	!-- of Schmidt et al. (DFVLR-Mitteilung 84-15) --> departs from Stephan
914	!-- Siano's original version.
915	DO i = 0,nx
916	IF ( i >= 0 .AND. i < nnxh ) THEN
917	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
918	REAL( nx+1 ) ) ) / ( dx * dx ) + &
919	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
920	REAL( ny+1 ) ) ) / ( dy * dy )
921	ELSEIF ( i == nnxh ) THEN
922	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
923	REAL( nx+1 ) ) ) / ( dx * dx ) + &
924	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
925	REAL(ny+1) ) ) / ( dy * dy )
926	ELSE
927	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
928	REAL( nx+1 ) ) ) / ( dx * dx ) + &
929	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
930	REAL( ny+1 ) ) ) / ( dy * dy )
931	ENDIF
932	ENDDO
933
934	DO k = 0,nz-1
935	DO i = 0, nx
936	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
937	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
938	tri(1,i,k) = a + c - l(i)
939	ENDDO
940	ENDDO
941	IF ( ibc_p_b == 1 .OR. ibc_p_b == 2 ) THEN
942	DO i = 0,nx
943	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
944	ENDDO
945	ENDIF
946	IF ( ibc_p_t == 1 ) THEN
947	DO i = 0,nx
948	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
949	ENDDO
950	ENDIF
951
952	END SUBROUTINE maketri_hybrid
953
954
955	SUBROUTINE zerleg_hybrid
956
957	!----------------------------------------------------------------------!
958	! zerleg !
959	! !
960	! Splitting of the tridiagonal matrix (Thomas algorithm) !
961	!----------------------------------------------------------------------!
962
963	USE indices
964
965	IMPLICIT NONE
966
967	INTEGER :: i, k
968
969	!
970	!-- Splitting
971	DO i = 0, nx
972	tri(4,i,0) = tri(1,i,0)
973	ENDDO
974	DO k = 1, nz-1
975	DO i = 0,nx
976	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
977	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
978	ENDDO
979	ENDDO
980
981	END SUBROUTINE zerleg_hybrid
982
983	SUBROUTINE substi_hybrid( ar, tri )
984
985	!----------------------------------------------------------------------!
986	! substi !
987	! !
988	! Substitution (Forward and Backward) (Thomas algorithm) !
989	!----------------------------------------------------------------------!
990
991	IMPLICIT NONE
992
993	INTEGER :: i, j, k
994	REAL, DIMENSION(0:nx,nz) :: ar
995	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
996	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
997
998	!
999	!-- Forward substitution
1000	DO i = 0, nx
1001	ar1(i,0) = ar(i,1)
1002	ENDDO
1003	DO k = 1, nz - 1
1004	DO i = 0,nx
1005	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1006	ENDDO
1007	ENDDO
1008
1009	!
1010	!-- Backward substitution
1011	DO i = 0,nx
1012	ar(i,nz) = ar1(i,nz-1) / tri(4,i,nz-1)
1013	ENDDO
1014	DO k = nz-2, 0, -1
1015	DO i = 0,nx
1016	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1017	/ tri(4,i,k)
1018	ENDDO
1019	ENDDO
1020
1021	END SUBROUTINE substi_hybrid
1022
1023	END SUBROUTINE tridia_hybrid
1024
1025
1026	SUBROUTINE cascade( loca, j, nys_p, nyn_p )
1027
1028	USE cpulog
1029
1030	IMPLICIT NONE
1031
1032	INTEGER :: ier, j, loca, nyn_p, nys_p, req, reqa(1)
1033	INTEGER, SAVE :: tag = 10
1034	#if defined( __parallel )
1035	INTEGER, DIMENSION(MPI_STATUS_SIZE) :: stat
1036	INTEGER, DIMENSION(MPI_STATUS_SIZE,1) :: stata
1037	#endif
1038
1039	REAL :: buf, buf1
1040
1041
1042	buf = 1.0
1043	buf1 = 1.1
1044	IF ( me_node == 0 ) THEN ! first node only
1045
1046	SELECT CASE ( loca )
1047
1048	CASE ( 1 ) ! before alltoall
1049
1050	IF( me_task > 0 ) THEN ! first task does not wait
1051	#if defined( __parallel )
1052	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task-1, 0, &
1053	buf1, 1, MPI_REAL, me_task-1, 0, &
1054	comm_tasks, stat, ierr )
1055	#endif
1056	ELSEIF ( j > nys_p ) THEN
1057	req = 0
1058	tag = MOD( tag-10, 10 ) + 10
1059	#if defined( __parallel )
1060	CALL MPI_IRECV( buf, 1, MPI_REAL, tasks_per_logical_node-1,&
1061	tag, comm_tasks, req, ierr )
1062	reqa = req
1063	CALL MPI_WAITALL( 1, reqa, stata, ierr )
1064	#endif
1065	ENDIF
1066
1067	CASE ( 2 ) ! after alltoall
1068
1069	IF ( me_task < tasks_per_logical_node-1 ) THEN ! last task
1070	#if defined( __parallel )
1071	CALL MPI_SENDRECV( buf, 1, MPI_REAL, me_task+1, 0, &
1072	buf1, 1, MPI_REAL, me_task+1, 0, &
1073	comm_tasks, stat, ierr)
1074	#endif
1075	ELSEIF ( j < nyn_p ) THEN
1076	req = 0
1077	tag = MOD( tag-10, 10 ) + 10
1078	#if defined( __parallel )
1079	CALL MPI_ISEND( buf, 1, MPI_REAL, 0, tag, comm_tasks, req, &
1080	ierr )
1081	#endif
1082	ENDIF
1083
1084	END SELECT
1085
1086	ENDIF
1087
1088	END SUBROUTINE cascade
1089	#endif
1090	END MODULE poisfft_hybrid_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |