Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1171

Last change on this file since 1171 was 1112, checked in by raasch, 12 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.8 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1112 2013-03-09 00:34:37Z raasch $
27	!
28	! 1111 2013-03-08 23:54:10Z raasch
29	! further openACC porting of non-parallel (MPI) branch:
30	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
31	! no distinction between parallel/non-parallel in poisfft and tridia any more,
32	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
33	! (otherwise "invalid device function" is indicated during runtime),
34	! optimization of tridia routines: constant elements and coefficients of tri are
35	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
36	! to 2,
37	! poisfft_init is now called internally from poisfft, maketri is called from
38	! poisfft_init,
39	! ibc_p_b = 2 removed
40	!
41	! 1106 2013-03-04 05:31:38Z raasch
42	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
43	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
44	! fft_y_1d
45	!
46	! 1103 2013-02-20 02:15:53Z raasch
47	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
48	! sometimes cause segmentation faults with intel 12.1 compiler
49	!
50	! 1092 2013-02-02 11:24:22Z raasch
51	! unused variables removed
52	!
53	! 1036 2012-10-22 13:43:42Z raasch
54	! code put under GPL (PALM 3.9)
55	!
56	! 2012-09-21 07:03:55Z raasch
57	! FLOAT type conversion replaced by REAL
58	!
59	! 1003 2012-09-14 14:35:53Z raasch
60	! indices nxa, nya, etc. replaced by nx, ny, etc.
61	!
62	! 940 2012-07-09 14:31:00Z raasch
63	! special handling of tri-array as an argument in tridia_1dd routines switched
64	! off because it caused segmentation faults with intel 12.1 compiler
65	!
66	! 877 2012-04-03 11:21:44Z suehring
67	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
68	! pressure at the top of the model domain.
69	!
70	! 809 2012-01-30 13:32:58Z maronga
71	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
72	!
73	! 807 2012-01-25 11:53:51Z maronga
74	! New cpp directive "__check" implemented which is used by check_namelist_files
75	! (most of the code is unneeded by check_namelist_files).
76	!
77	! 763 2011-10-06 09:32:09Z suehring
78	! Comment added concerning the last change.
79	!
80	! 761 2011-10-05 17:58:52Z suehring
81	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
82	! pressure at the top of the model domain.
83	!
84	! 696 2011-03-18 07:03:49Z raasch
85	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
86	!
87	! 683 2011-02-09 14:25:15Z raasch
88	! openMP parallelization for 2d-domain-decomposition
89	!
90	! 667 2010-12-23 12:06:00Z suehring/gryschka
91	! ddzu replaced by ddzu_pres due to changes in zu(0)
92	!
93	! 622 2010-12-10 08:08:13Z raasch
94	! optional barriers included in order to speed up collective operations
95	!
96	! 377 2009-09-04 11:09:00Z raasch
97	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
98	!
99	! 164 2008-05-15 08:46:15Z raasch
100	! Arguments removed from transpose routines
101	!
102	! 128 2007-10-26 13:11:14Z raasch
103	! Bugfix: wavenumber calculation for even nx in routines maketri
104	!
105	! 85 2007-05-11 09:35:14Z raasch
106	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
107	!
108	! 76 2007-03-29 00:58:32Z raasch
109	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
110	! the bottom and the top.
111	!
112	! RCS Log replace by Id keyword, revision history cleaned up
113	!
114	! Revision 1.24 2006/08/04 15:00:24 raasch
115	! Default setting of the thread number tn in case of not using OpenMP
116	!
117	! Revision 1.23 2006/02/23 12:48:38 raasch
118	! Additional compiler directive in routine tridia_1dd for preventing loop
119	! exchange on NEC-SX6
120	!
121	! Revision 1.20 2004/04/30 12:38:09 raasch
122	! Parts of former poisfft_hybrid moved to this subroutine,
123	! former subroutine changed to a module, renaming of FFT-subroutines and
124	! -module, FFTs completely substituted by calls of fft_x and fft_y,
125	! NAG fft used in the non-parallel case completely removed, l in maketri
126	! is now a 1d-array, variables passed by modules instead of using parameter
127	! lists, enlarged transposition arrays introduced
128	!
129	! Revision 1.1 1997/07/24 11:24:14 raasch
130	! Initial revision
131	!
132	!
133	! Description:
134	! ------------
135	! See below.
136	!------------------------------------------------------------------------------!
137
138	!--------------------------------------------------------------------------!
139	! poisfft !
140	! !
141	! Original version: Stephan Siano (pois3d) !
142	! !
143	! Institute of Meteorology and Climatology, University of Hannover !
144	! Germany !
145	! !
146	! Version as of July 23,1996 !
147	! !
148	! !
149	! Version for parallel computers: Siegfried Raasch !
150	! !
151	! Version as of July 03,1997 !
152	! !
153	! Solves the Poisson equation with a 2D spectral method !
154	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
155	! !
156	! Input: !
157	! real ar contains in the (nnx,nny,nnz) elements, !
158	! starting from the element (1,nys,nxl), the !
159	! values for s !
160	! real work Temporary array !
161	! !
162	! Output: !
163	! real ar contains the solution for p !
164	!--------------------------------------------------------------------------!
165
166	USE fft_xy
167	USE indices
168	USE transpose_indices
169
170	IMPLICIT NONE
171
172	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
173
174	REAL, DIMENSION(:,:), ALLOCATABLE :: ddzuw
175
176	PRIVATE
177
178	#if ! defined ( __check )
179	PUBLIC poisfft, poisfft_init
180
181	INTERFACE poisfft
182	MODULE PROCEDURE poisfft
183	END INTERFACE poisfft
184
185	INTERFACE poisfft_init
186	MODULE PROCEDURE poisfft_init
187	END INTERFACE poisfft_init
188	#else
189	PUBLIC poisfft_init
190
191	INTERFACE poisfft_init
192	MODULE PROCEDURE poisfft_init
193	END INTERFACE poisfft_init
194	#endif
195
196	CONTAINS
197
198	SUBROUTINE poisfft_init
199
200	USE arrays_3d, ONLY: ddzu_pres, ddzw
201
202	IMPLICIT NONE
203
204	INTEGER :: k
205
206
207	CALL fft_init
208
209	ALLOCATE( ddzuw(0:nz-1,3) )
210
211	DO k = 0, nz-1
212	ddzuw(k,1) = ddzu_pres(k+1) * ddzw(k+1)
213	ddzuw(k,2) = ddzu_pres(k+2) * ddzw(k+1)
214	ddzuw(k,3) = -1.0 * &
215	( ddzu_pres(k+2) * ddzw(k+1) + ddzu_pres(k+1) * ddzw(k+1) )
216	ENDDO
217	!
218	!-- Calculate constant coefficients of the tridiagonal matrix
219	#if ! defined ( __check )
220	CALL maketri
221	#endif
222
223	poisfft_initialized = .TRUE.
224
225	END SUBROUTINE poisfft_init
226
227
228	#if ! defined ( __check )
229	SUBROUTINE poisfft( ar, work )
230
231	USE cpulog
232	USE interfaces
233	USE pegrid
234
235	IMPLICIT NONE
236
237	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
238
239
240	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
241
242	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
243
244	!
245	!-- Two-dimensional Fourier Transformation in x- and y-direction.
246	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
247
248	!
249	!-- 1d-domain-decomposition along x:
250	!-- FFT along y and transposition y --> x
251	CALL ffty_tr_yx( ar, work, ar )
252
253	!
254	!-- FFT along x, solving the tridiagonal system and backward FFT
255	CALL fftx_tri_fftx( ar )
256
257	!
258	!-- Transposition x --> y and backward FFT along y
259	CALL tr_xy_ffty( ar, work, ar )
260
261	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
262
263	!
264	!-- 1d-domain-decomposition along y:
265	!-- FFT along x and transposition x --> y
266	CALL fftx_tr_xy( ar, work, ar )
267
268	!
269	!-- FFT along y, solving the tridiagonal system and backward FFT
270	CALL ffty_tri_ffty( ar )
271
272	!
273	!-- Transposition y --> x and backward FFT along x
274	CALL tr_yx_fftx( ar, work, ar )
275
276	ELSE
277
278	!
279	!-- 2d-domain-decomposition or no decomposition (1 PE run)
280	!-- Transposition z --> x
281	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
282	CALL transpose_zx( ar, work, ar )
283	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
284
285	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
286	CALL fft_x( ar, 'forward' )
287	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
288
289	!
290	!-- Transposition x --> y
291	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
292	CALL transpose_xy( ar, work, ar )
293	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
294
295	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
296	CALL fft_y( ar, 'forward' )
297	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
298
299	!
300	!-- Transposition y --> z
301	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
302	CALL transpose_yz( ar, work, ar )
303	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
304
305	!
306	!-- Solve the tridiagonal equation system along z
307	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
308	CALL tridia( ar )
309	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
310
311	!
312	!-- Inverse Fourier Transformation
313	!-- Transposition z --> y
314	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
315	CALL transpose_zy( ar, work, ar )
316	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
317
318	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
319	CALL fft_y( ar, 'backward' )
320	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
321
322	!
323	!-- Transposition y --> x
324	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
325	CALL transpose_yx( ar, work, ar )
326	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
327
328	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
329	CALL fft_x( ar, 'backward' )
330	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
331
332	!
333	!-- Transposition x --> z
334	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
335	CALL transpose_xz( ar, work, ar )
336	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
337
338	ENDIF
339
340	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
341
342	END SUBROUTINE poisfft
343
344
345
346	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
347
348	!------------------------------------------------------------------------------!
349	! Fourier-transformation along y with subsequent transposition y --> x for
350	! a 1d-decomposition along x
351	!
352	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
353	! if the first index of work_ffty_vec is odd. Otherwise
354	! memory bank conflicts may occur (especially if the index is a
355	! multiple of 128). That's why work_ffty_vec is dimensioned as
356	! 0:ny+1.
357	! Of course, this will not work if users are using an odd number
358	! of gridpoints along y.
359	!------------------------------------------------------------------------------!
360
361	USE control_parameters
362	USE cpulog
363	USE indices
364	USE interfaces
365	USE pegrid
366	USE transpose_indices
367
368	IMPLICIT NONE
369
370	INTEGER :: i, iend, iouter, ir, j, k
371	INTEGER, PARAMETER :: stridex = 4
372
373	REAL, DIMENSION(0:ny,stridex) :: work_ffty
374	#if defined( __nec )
375	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
376	#endif
377	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
378	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
379	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
380
381	!
382	!-- Carry out the FFT along y, where all data are present due to the
383	!-- 1d-decomposition along x. Resort the data in a way that x becomes
384	!-- the first index.
385	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
386
387	IF ( host(1:3) == 'nec' ) THEN
388	#if defined( __nec )
389	!
390	!-- Code optimized for vector processors
391	!$OMP PARALLEL PRIVATE ( i, j, k )
392	!$OMP DO
393	DO i = nxl, nxr
394
395	DO j = 0, ny
396	DO k = 1, nz
397	work_ffty_vec(j,k,i) = f_in(k,j,i)
398	ENDDO
399	ENDDO
400
401	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
402
403	ENDDO
404
405	!$OMP DO
406	DO k = 1, nz
407	DO j = 0, ny
408	DO i = nxl, nxr
409	work(i,k,j) = work_ffty_vec(j,k,i)
410	ENDDO
411	ENDDO
412	ENDDO
413	!$OMP END PARALLEL
414	#endif
415
416	ELSE
417
418	!
419	!-- Cache optimized code.
420	!-- The i-(x-)direction is split into a strided outer loop and an inner
421	!-- loop for better cache performance
422	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
423	!$OMP DO
424	DO iouter = nxl, nxr, stridex
425
426	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
427
428	DO k = 1, nz
429
430	DO i = iouter, iend
431
432	ir = i-iouter+1 ! counter within a stride
433	DO j = 0, ny
434	work_ffty(j,ir) = f_in(k,j,i)
435	ENDDO
436	!
437	!-- FFT along y
438	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
439
440	ENDDO
441
442	!
443	!-- Resort
444	DO j = 0, ny
445	DO i = iouter, iend
446	work(i,k,j) = work_ffty(j,i-iouter+1)
447	ENDDO
448	ENDDO
449
450	ENDDO
451
452	ENDDO
453	!$OMP END PARALLEL
454
455	ENDIF
456	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
457
458	!
459	!-- Transpose array
460	#if defined( __parallel )
461	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
462	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
463	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
464	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
465	comm1dx, ierr )
466	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
467	#endif
468
469	END SUBROUTINE ffty_tr_yx
470
471
472	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
473
474	!------------------------------------------------------------------------------!
475	! Transposition x --> y with a subsequent backward Fourier transformation for
476	! a 1d-decomposition along x
477	!------------------------------------------------------------------------------!
478
479	USE control_parameters
480	USE cpulog
481	USE indices
482	USE interfaces
483	USE pegrid
484	USE transpose_indices
485
486	IMPLICIT NONE
487
488	INTEGER :: i, iend, iouter, ir, j, k
489	INTEGER, PARAMETER :: stridex = 4
490
491	REAL, DIMENSION(0:ny,stridex) :: work_ffty
492	#if defined( __nec )
493	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
494	#endif
495	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
496	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
497	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
498
499	!
500	!-- Transpose array
501	#if defined( __parallel )
502	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
503	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
504	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
505	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
506	comm1dx, ierr )
507	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
508	#endif
509
510	!
511	!-- Resort the data in a way that y becomes the first index and carry out the
512	!-- backward fft along y.
513	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
514
515	IF ( host(1:3) == 'nec' ) THEN
516	#if defined( __nec )
517	!
518	!-- Code optimized for vector processors
519	!$OMP PARALLEL PRIVATE ( i, j, k )
520	!$OMP DO
521	DO k = 1, nz
522	DO j = 0, ny
523	DO i = nxl, nxr
524	work_ffty_vec(j,k,i) = work(i,k,j)
525	ENDDO
526	ENDDO
527	ENDDO
528
529	!$OMP DO
530	DO i = nxl, nxr
531
532	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
533
534	DO j = 0, ny
535	DO k = 1, nz
536	f_out(k,j,i) = work_ffty_vec(j,k,i)
537	ENDDO
538	ENDDO
539
540	ENDDO
541	!$OMP END PARALLEL
542	#endif
543
544	ELSE
545
546	!
547	!-- Cache optimized code.
548	!-- The i-(x-)direction is split into a strided outer loop and an inner
549	!-- loop for better cache performance
550	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
551	!$OMP DO
552	DO iouter = nxl, nxr, stridex
553
554	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
555
556	DO k = 1, nz
557	!
558	!-- Resort
559	DO j = 0, ny
560	DO i = iouter, iend
561	work_ffty(j,i-iouter+1) = work(i,k,j)
562	ENDDO
563	ENDDO
564
565	DO i = iouter, iend
566
567	!
568	!-- FFT along y
569	ir = i-iouter+1 ! counter within a stride
570	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
571
572	DO j = 0, ny
573	f_out(k,j,i) = work_ffty(j,ir)
574	ENDDO
575	ENDDO
576
577	ENDDO
578
579	ENDDO
580	!$OMP END PARALLEL
581
582	ENDIF
583
584	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
585
586	END SUBROUTINE tr_xy_ffty
587
588
589	SUBROUTINE fftx_tri_fftx( ar )
590
591	!------------------------------------------------------------------------------!
592	! FFT along x, solution of the tridiagonal system and backward FFT for
593	! a 1d-decomposition along x
594	!
595	! WARNING: this subroutine may still not work for hybrid parallelization
596	! with OpenMP (for possible necessary changes see the original
597	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
598	!------------------------------------------------------------------------------!
599
600	USE control_parameters
601	USE cpulog
602	USE grid_variables
603	USE indices
604	USE interfaces
605	USE pegrid
606	USE transpose_indices
607
608	IMPLICIT NONE
609
610	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
611
612	REAL, DIMENSION(0:nx) :: work_fftx
613	REAL, DIMENSION(0:nx,1:nz) :: work_trix
614	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
615	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
616
617
618	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
619
620	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
621
622	tn = 0 ! Default thread number in case of one thread
623	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
624	DO j = nys_x, nyn_x
625
626	!$ tn = omp_get_thread_num()
627
628	IF ( host(1:3) == 'nec' ) THEN
629	!
630	!-- Code optimized for vector processors
631	DO k = 1, nz
632
633	m = 0
634	DO n = 1, pdims(1)
635	DO i = 1, nnx
636	work_trix(m,k) = ar(i,k,j,n)
637	m = m + 1
638	ENDDO
639	ENDDO
640
641	ENDDO
642
643	CALL fft_x_m( work_trix, 'forward' )
644
645	ELSE
646	!
647	!-- Cache optimized code
648	DO k = 1, nz
649
650	m = 0
651	DO n = 1, pdims(1)
652	DO i = 1, nnx
653	work_fftx(m) = ar(i,k,j,n)
654	m = m + 1
655	ENDDO
656	ENDDO
657
658	CALL fft_x_1d( work_fftx, 'forward' )
659
660	DO i = 0, nx
661	work_trix(i,k) = work_fftx(i)
662	ENDDO
663
664	ENDDO
665
666	ENDIF
667
668	!
669	!-- Solve the linear equation system
670	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
671
672	IF ( host(1:3) == 'nec' ) THEN
673	!
674	!-- Code optimized for vector processors
675	CALL fft_x_m( work_trix, 'backward' )
676
677	DO k = 1, nz
678
679	m = 0
680	DO n = 1, pdims(1)
681	DO i = 1, nnx
682	ar(i,k,j,n) = work_trix(m,k)
683	m = m + 1
684	ENDDO
685	ENDDO
686
687	ENDDO
688
689	ELSE
690	!
691	!-- Cache optimized code
692	DO k = 1, nz
693
694	DO i = 0, nx
695	work_fftx(i) = work_trix(i,k)
696	ENDDO
697
698	CALL fft_x_1d( work_fftx, 'backward' )
699
700	m = 0
701	DO n = 1, pdims(1)
702	DO i = 1, nnx
703	ar(i,k,j,n) = work_fftx(m)
704	m = m + 1
705	ENDDO
706	ENDDO
707
708	ENDDO
709
710	ENDIF
711
712	ENDDO
713
714	DEALLOCATE( tri )
715
716	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
717
718	END SUBROUTINE fftx_tri_fftx
719
720
721	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
722
723	!------------------------------------------------------------------------------!
724	! Fourier-transformation along x with subsequent transposition x --> y for
725	! a 1d-decomposition along y
726	!
727	! ATTENTION: The NEC-branch of this routine may significantly profit from
728	! further optimizations. So far, performance is much worse than
729	! for routine ffty_tr_yx (more than three times slower).
730	!------------------------------------------------------------------------------!
731
732	USE control_parameters
733	USE cpulog
734	USE indices
735	USE interfaces
736	USE pegrid
737	USE transpose_indices
738
739	IMPLICIT NONE
740
741	INTEGER :: i, j, k
742
743	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
744	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
745	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
746	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
747
748	!
749	!-- Carry out the FFT along x, where all data are present due to the
750	!-- 1d-decomposition along y. Resort the data in a way that y becomes
751	!-- the first index.
752	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
753
754	IF ( host(1:3) == 'nec' ) THEN
755	!
756	!-- Code for vector processors
757	!$OMP PARALLEL PRIVATE ( i, j, k )
758	!$OMP DO
759	DO i = 0, nx
760
761	DO j = nys, nyn
762	DO k = 1, nz
763	work_fftx(i,k,j) = f_in(k,j,i)
764	ENDDO
765	ENDDO
766
767	ENDDO
768
769	!$OMP DO
770	DO j = nys, nyn
771
772	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
773
774	DO k = 1, nz
775	DO i = 0, nx
776	work(j,k,i) = work_fftx(i,k,j)
777	ENDDO
778	ENDDO
779
780	ENDDO
781	!$OMP END PARALLEL
782
783	ELSE
784
785	!
786	!-- Cache optimized code (there might be still a potential for better
787	!-- optimization).
788	!$OMP PARALLEL PRIVATE (i,j,k)
789	!$OMP DO
790	DO i = 0, nx
791
792	DO j = nys, nyn
793	DO k = 1, nz
794	work_fftx(i,k,j) = f_in(k,j,i)
795	ENDDO
796	ENDDO
797
798	ENDDO
799
800	!$OMP DO
801	DO j = nys, nyn
802	DO k = 1, nz
803
804	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
805
806	DO i = 0, nx
807	work(j,k,i) = work_fftx(i,k,j)
808	ENDDO
809	ENDDO
810
811	ENDDO
812	!$OMP END PARALLEL
813
814	ENDIF
815	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
816
817	!
818	!-- Transpose array
819	#if defined( __parallel )
820	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
821	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
822	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
823	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
824	comm1dy, ierr )
825	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
826	#endif
827
828	END SUBROUTINE fftx_tr_xy
829
830
831	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
832
833	!------------------------------------------------------------------------------!
834	! Transposition y --> x with a subsequent backward Fourier transformation for
835	! a 1d-decomposition along x
836	!------------------------------------------------------------------------------!
837
838	USE control_parameters
839	USE cpulog
840	USE indices
841	USE interfaces
842	USE pegrid
843	USE transpose_indices
844
845	IMPLICIT NONE
846
847	INTEGER :: i, j, k
848
849	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
850	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
851	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
852	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
853
854	!
855	!-- Transpose array
856	#if defined( __parallel )
857	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
858	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
859	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
860	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
861	comm1dy, ierr )
862	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
863	#endif
864
865	!
866	!-- Carry out the FFT along x, where all data are present due to the
867	!-- 1d-decomposition along y. Resort the data in a way that y becomes
868	!-- the first index.
869	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
870
871	IF ( host(1:3) == 'nec' ) THEN
872	!
873	!-- Code optimized for vector processors
874	!$OMP PARALLEL PRIVATE ( i, j, k )
875	!$OMP DO
876	DO j = nys, nyn
877
878	DO k = 1, nz
879	DO i = 0, nx
880	work_fftx(i,k,j) = work(j,k,i)
881	ENDDO
882	ENDDO
883
884	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
885
886	ENDDO
887
888	!$OMP DO
889	DO i = 0, nx
890	DO j = nys, nyn
891	DO k = 1, nz
892	f_out(k,j,i) = work_fftx(i,k,j)
893	ENDDO
894	ENDDO
895	ENDDO
896	!$OMP END PARALLEL
897
898	ELSE
899
900	!
901	!-- Cache optimized code (there might be still a potential for better
902	!-- optimization).
903	!$OMP PARALLEL PRIVATE (i,j,k)
904	!$OMP DO
905	DO j = nys, nyn
906	DO k = 1, nz
907
908	DO i = 0, nx
909	work_fftx(i,k,j) = work(j,k,i)
910	ENDDO
911
912	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
913
914	ENDDO
915	ENDDO
916
917	!$OMP DO
918	DO i = 0, nx
919	DO j = nys, nyn
920	DO k = 1, nz
921	f_out(k,j,i) = work_fftx(i,k,j)
922	ENDDO
923	ENDDO
924	ENDDO
925	!$OMP END PARALLEL
926
927	ENDIF
928	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
929
930	END SUBROUTINE tr_yx_fftx
931
932
933	SUBROUTINE ffty_tri_ffty( ar )
934
935	!------------------------------------------------------------------------------!
936	! FFT along y, solution of the tridiagonal system and backward FFT for
937	! a 1d-decomposition along y
938	!
939	! WARNING: this subroutine may still not work for hybrid parallelization
940	! with OpenMP (for possible necessary changes see the original
941	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
942	!------------------------------------------------------------------------------!
943
944	USE control_parameters
945	USE cpulog
946	USE grid_variables
947	USE indices
948	USE interfaces
949	USE pegrid
950	USE transpose_indices
951
952	IMPLICIT NONE
953
954	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
955
956	REAL, DIMENSION(0:ny) :: work_ffty
957	REAL, DIMENSION(0:ny,1:nz) :: work_triy
958	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
959	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
960
961
962	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
963
964	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
965
966	tn = 0 ! Default thread number in case of one thread
967	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
968	DO i = nxl_y, nxr_y
969
970	!$ tn = omp_get_thread_num()
971
972	IF ( host(1:3) == 'nec' ) THEN
973	!
974	!-- Code optimized for vector processors
975	DO k = 1, nz
976
977	m = 0
978	DO n = 1, pdims(2)
979	DO j = 1, nny
980	work_triy(m,k) = ar(j,k,i,n)
981	m = m + 1
982	ENDDO
983	ENDDO
984
985	ENDDO
986
987	CALL fft_y_m( work_triy, ny, 'forward' )
988
989	ELSE
990	!
991	!-- Cache optimized code
992	DO k = 1, nz
993
994	m = 0
995	DO n = 1, pdims(2)
996	DO j = 1, nny
997	work_ffty(m) = ar(j,k,i,n)
998	m = m + 1
999	ENDDO
1000	ENDDO
1001
1002	CALL fft_y_1d( work_ffty, 'forward' )
1003
1004	DO j = 0, ny
1005	work_triy(j,k) = work_ffty(j)
1006	ENDDO
1007
1008	ENDDO
1009
1010	ENDIF
1011
1012	!
1013	!-- Solve the linear equation system
1014	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1015
1016	IF ( host(1:3) == 'nec' ) THEN
1017	!
1018	!-- Code optimized for vector processors
1019	CALL fft_y_m( work_triy, ny, 'backward' )
1020
1021	DO k = 1, nz
1022
1023	m = 0
1024	DO n = 1, pdims(2)
1025	DO j = 1, nny
1026	ar(j,k,i,n) = work_triy(m,k)
1027	m = m + 1
1028	ENDDO
1029	ENDDO
1030
1031	ENDDO
1032
1033	ELSE
1034	!
1035	!-- Cache optimized code
1036	DO k = 1, nz
1037
1038	DO j = 0, ny
1039	work_ffty(j) = work_triy(j,k)
1040	ENDDO
1041
1042	CALL fft_y_1d( work_ffty, 'backward' )
1043
1044	m = 0
1045	DO n = 1, pdims(2)
1046	DO j = 1, nny
1047	ar(j,k,i,n) = work_ffty(m)
1048	m = m + 1
1049	ENDDO
1050	ENDDO
1051
1052	ENDDO
1053
1054	ENDIF
1055
1056	ENDDO
1057
1058	DEALLOCATE( tri )
1059
1060	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1061
1062	END SUBROUTINE ffty_tri_ffty
1063
1064
1065	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1066
1067	!------------------------------------------------------------------------------!
1068	! Solves the linear system of equations for a 1d-decomposition along x (see
1069	! tridia)
1070	!
1071	! Attention: when using the intel compilers older than 12.0, array tri must
1072	! be passed as an argument to the contained subroutines. Otherwise
1073	! addres faults will occur. This feature can be activated with
1074	! cpp-switch __intel11
1075	! On NEC, tri should not be passed (except for routine substi_1dd)
1076	! because this causes very bad performance.
1077	!------------------------------------------------------------------------------!
1078
1079	USE arrays_3d
1080	USE control_parameters
1081
1082	USE pegrid
1083
1084	IMPLICIT NONE
1085
1086	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1087
1088	REAL :: ddx2, ddy2
1089
1090	REAL, DIMENSION(0:nx,1:nz) :: ar
1091	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1092
1093
1094	nnyh = ( ny + 1 ) / 2
1095
1096	!
1097	!-- Define constant elements of the tridiagonal matrix.
1098	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1099	!-- the exchanged loops create bank conflicts. The following directive
1100	!-- prohibits loop exchange and the loops perform much better.
1101	! tn = omp_get_thread_num()
1102	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1103	! CALL local_flush( 120+tn )
1104	!CDIR NOLOOPCHG
1105	DO k = 0, nz-1
1106	DO i = 0,nx
1107	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1108	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1109	ENDDO
1110	ENDDO
1111	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1112	! CALL local_flush( 120+tn )
1113
1114	IF ( j <= nnyh ) THEN
1115	#if defined( __intel11 )
1116	CALL maketri_1dd( j, tri )
1117	#else
1118	CALL maketri_1dd( j )
1119	#endif
1120	ELSE
1121	#if defined( __intel11 )
1122	CALL maketri_1dd( ny+1-j, tri )
1123	#else
1124	CALL maketri_1dd( ny+1-j )
1125	#endif
1126	ENDIF
1127	#if defined( __intel11 )
1128	CALL split_1dd( tri )
1129	#else
1130	CALL split_1dd
1131	#endif
1132	CALL substi_1dd( ar, tri )
1133
1134	CONTAINS
1135
1136	#if defined( __intel11 )
1137	SUBROUTINE maketri_1dd( j, tri )
1138	#else
1139	SUBROUTINE maketri_1dd( j )
1140	#endif
1141
1142	!------------------------------------------------------------------------------!
1143	! computes the i- and j-dependent component of the matrix
1144	!------------------------------------------------------------------------------!
1145
1146	USE constants
1147
1148	IMPLICIT NONE
1149
1150	INTEGER :: i, j, k, nnxh
1151	REAL :: a, c
1152
1153	REAL, DIMENSION(0:nx) :: l
1154
1155	#if defined( __intel11 )
1156	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1157	#endif
1158
1159
1160	nnxh = ( nx + 1 ) / 2
1161	!
1162	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1163	!-- Fourier space. The coefficients are computed following the method of
1164	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1165	!-- Siano's original version by discretizing the Poisson equation,
1166	!-- before it is Fourier-transformed
1167	DO i = 0, nx
1168	IF ( i >= 0 .AND. i <= nnxh ) THEN
1169	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1170	REAL( nx+1 ) ) ) * ddx2 + &
1171	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1172	REAL( ny+1 ) ) ) * ddy2
1173	ELSE
1174	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1175	REAL( nx+1 ) ) ) * ddx2 + &
1176	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1177	REAL( ny+1 ) ) ) * ddy2
1178	ENDIF
1179	ENDDO
1180
1181	DO k = 0, nz-1
1182	DO i = 0, nx
1183	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1184	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1185	tri(1,i,k) = a + c - l(i)
1186	ENDDO
1187	ENDDO
1188	IF ( ibc_p_b == 1 ) THEN
1189	DO i = 0, nx
1190	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1191	ENDDO
1192	ENDIF
1193	IF ( ibc_p_t == 1 ) THEN
1194	DO i = 0, nx
1195	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1196	ENDDO
1197	ENDIF
1198
1199	END SUBROUTINE maketri_1dd
1200
1201
1202	#if defined( __intel11 )
1203	SUBROUTINE split_1dd( tri )
1204	#else
1205	SUBROUTINE split_1dd
1206	#endif
1207
1208	!------------------------------------------------------------------------------!
1209	! Splitting of the tridiagonal matrix (Thomas algorithm)
1210	!------------------------------------------------------------------------------!
1211
1212	IMPLICIT NONE
1213
1214	INTEGER :: i, k
1215
1216	#if defined( __intel11 )
1217	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1218	#endif
1219
1220
1221	!
1222	!-- Splitting
1223	DO i = 0, nx
1224	tri(4,i,0) = tri(1,i,0)
1225	ENDDO
1226	DO k = 1, nz-1
1227	DO i = 0, nx
1228	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1229	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1230	ENDDO
1231	ENDDO
1232
1233	END SUBROUTINE split_1dd
1234
1235
1236	SUBROUTINE substi_1dd( ar, tri )
1237
1238	!------------------------------------------------------------------------------!
1239	! Substitution (Forward and Backward) (Thomas algorithm)
1240	!------------------------------------------------------------------------------!
1241
1242	IMPLICIT NONE
1243
1244	INTEGER :: i, k
1245
1246	REAL, DIMENSION(0:nx,nz) :: ar
1247	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1248	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1249
1250	!
1251	!-- Forward substitution
1252	DO i = 0, nx
1253	ar1(i,0) = ar(i,1)
1254	ENDDO
1255	DO k = 1, nz-1
1256	DO i = 0, nx
1257	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1258	ENDDO
1259	ENDDO
1260
1261	!
1262	!-- Backward substitution
1263	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1264	!-- by zero appearing if the pressure bc is set to neumann at the top of
1265	!-- the model domain.
1266	DO i = 0, nx
1267	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1268	ENDDO
1269	DO k = nz-2, 0, -1
1270	DO i = 0, nx
1271	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1272	/ tri(4,i,k)
1273	ENDDO
1274	ENDDO
1275
1276	!
1277	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1278	!-- The respective values of ar should be zero at all k-levels if
1279	!-- acceleration of horizontally averaged vertical velocity is zero.
1280	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1281	IF ( j == 0 ) THEN
1282	DO k = 1, nz
1283	ar(0,k) = 0.0
1284	ENDDO
1285	ENDIF
1286	ENDIF
1287
1288	END SUBROUTINE substi_1dd
1289
1290	END SUBROUTINE tridia_1dd
1291
1292
1293	SUBROUTINE tridia( ar )
1294
1295	!------------------------------------------------------------------------------!
1296	! solves the linear system of equations:
1297	!
1298	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
1299	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
1300	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
1301	!
1302	! by using the Thomas algorithm
1303	!------------------------------------------------------------------------------!
1304
1305	USE arrays_3d
1306
1307	IMPLICIT NONE
1308
1309	INTEGER :: i, j, k
1310
1311	!$acc declare create( tri )
1312	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri
1313
1314	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
1315
1316
1317	CALL split( tri )
1318	CALL substi( ar, tri )
1319
1320	END SUBROUTINE tridia
1321
1322
1323	SUBROUTINE maketri
1324
1325	!------------------------------------------------------------------------------!
1326	! Computes the i- and j-dependent component of the matrix
1327	!------------------------------------------------------------------------------!
1328
1329	USE arrays_3d, ONLY: tric
1330	USE constants
1331	USE control_parameters
1332	USE grid_variables
1333
1334	IMPLICIT NONE
1335
1336	INTEGER :: i, j, k, nnxh, nnyh
1337
1338	!$acc declare create( ll )
1339	REAL :: ll(nxl_z:nxr_z,nys_z:nyn_z)
1340
1341
1342	nnxh = ( nx + 1 ) / 2
1343	nnyh = ( ny + 1 ) / 2
1344
1345	!
1346	!-- Provide the constant coefficients of the tridiagonal matrix for solution
1347	!-- of the Poisson equation in Fourier space.
1348	!-- The coefficients are computed following the method of
1349	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1350	!-- Siano's original version by discretizing the Poisson equation,
1351	!-- before it is Fourier-transformed.
1352
1353	!$acc kernels present( tric )
1354	!$acc loop vector( 32 )
1355	DO j = nys_z, nyn_z
1356	DO i = nxl_z, nxr_z
1357	IF ( j >= 0 .AND. j <= nnyh ) THEN
1358	IF ( i >= 0 .AND. i <= nnxh ) THEN
1359	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1360	REAL( nx+1 ) ) ) / ( dx * dx ) + &
1361	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1362	REAL( ny+1 ) ) ) / ( dy * dy )
1363	ELSE
1364	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1365	REAL( nx+1 ) ) ) / ( dx * dx ) + &
1366	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1367	REAL( ny+1 ) ) ) / ( dy * dy )
1368	ENDIF
1369	ELSE
1370	IF ( i >= 0 .AND. i <= nnxh ) THEN
1371	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1372	REAL( nx+1 ) ) ) / ( dx * dx ) + &
1373	2.0 * ( 1.0 - COS( ( 2.0 * pi * ( ny+1-j ) ) / &
1374	REAL( ny+1 ) ) ) / ( dy * dy )
1375	ELSE
1376	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1377	REAL( nx+1 ) ) ) / ( dx * dx ) + &
1378	2.0 * ( 1.0 - COS( ( 2.0 * pi * ( ny+1-j ) ) / &
1379	REAL( ny+1 ) ) ) / ( dy * dy )
1380	ENDIF
1381	ENDIF
1382	ENDDO
1383	ENDDO
1384
1385	!$acc loop
1386	DO k = 0, nz-1
1387	DO j = nys_z, nyn_z
1388	!$acc loop vector( 32 )
1389	DO i = nxl_z, nxr_z
1390	tric(i,j,k) = ddzuw(k,3) - ll(i,j)
1391	ENDDO
1392	ENDDO
1393	ENDDO
1394	!$acc end kernels
1395
1396	IF ( ibc_p_b == 1 ) THEN
1397	!$acc kernels present( tric )
1398	!$acc loop
1399	DO j = nys_z, nyn_z
1400	DO i = nxl_z, nxr_z
1401	tric(i,j,0) = tric(i,j,0) + ddzuw(0,1)
1402	ENDDO
1403	ENDDO
1404	!$acc end kernels
1405	ENDIF
1406	IF ( ibc_p_t == 1 ) THEN
1407	!$acc kernels present( tric )
1408	!$acc loop
1409	DO j = nys_z, nyn_z
1410	DO i = nxl_z, nxr_z
1411	tric(i,j,nz-1) = tric(i,j,nz-1) + ddzuw(nz-1,2)
1412	ENDDO
1413	ENDDO
1414	!$acc end kernels
1415	ENDIF
1416
1417	END SUBROUTINE maketri
1418
1419
1420	SUBROUTINE substi( ar, tri )
1421
1422	!------------------------------------------------------------------------------!
1423	! Substitution (Forward and Backward) (Thomas algorithm)
1424	!------------------------------------------------------------------------------!
1425
1426	USE control_parameters
1427
1428	IMPLICIT NONE
1429
1430	INTEGER :: i, j, k
1431
1432	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
1433	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri
1434
1435	!$acc declare create( ar1 )
1436	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1) :: ar1
1437
1438	!
1439	!-- Forward substitution
1440	DO k = 0, nz - 1
1441	!$acc kernels present( ar, tri )
1442	!$acc loop
1443	DO j = nys_z, nyn_z
1444	DO i = nxl_z, nxr_z
1445
1446	IF ( k == 0 ) THEN
1447	ar1(i,j,k) = ar(i,j,k+1)
1448	ELSE
1449	ar1(i,j,k) = ar(i,j,k+1) - tri(i,j,k,2) * ar1(i,j,k-1)
1450	ENDIF
1451
1452	ENDDO
1453	ENDDO
1454	!$acc end kernels
1455	ENDDO
1456
1457	!
1458	!-- Backward substitution
1459	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
1460	!-- by zero appearing if the pressure bc is set to neumann at the top of
1461	!-- the model domain.
1462	DO k = nz-1, 0, -1
1463	!$acc kernels present( ar, tri )
1464	!$acc loop
1465	DO j = nys_z, nyn_z
1466	DO i = nxl_z, nxr_z
1467
1468	IF ( k == nz-1 ) THEN
1469	ar(i,j,k+1) = ar1(i,j,k) / ( tri(i,j,k,1) + 1.0E-20 )
1470	ELSE
1471	ar(i,j,k+1) = ( ar1(i,j,k) - ddzuw(k,2) * ar(i,j,k+2) ) &
1472	/ tri(i,j,k,1)
1473	ENDIF
1474	ENDDO
1475	ENDDO
1476	!$acc end kernels
1477	ENDDO
1478
1479	!
1480	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1481	!-- The respective values of ar should be zero at all k-levels if
1482	!-- acceleration of horizontally averaged vertical velocity is zero.
1483	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1484	IF ( nys_z == 0 .AND. nxl_z == 0 ) THEN
1485	!$acc kernels loop present( ar )
1486	DO k = 1, nz
1487	ar(nxl_z,nys_z,k) = 0.0
1488	ENDDO
1489	ENDIF
1490	ENDIF
1491
1492	END SUBROUTINE substi
1493
1494
1495	SUBROUTINE split( tri )
1496
1497	!------------------------------------------------------------------------------!
1498	! Splitting of the tridiagonal matrix (Thomas algorithm)
1499	!------------------------------------------------------------------------------!
1500
1501	USE arrays_3d, ONLY: tric
1502
1503	IMPLICIT NONE
1504
1505	INTEGER :: i, j, k
1506
1507	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri
1508
1509	!
1510	!-- Splitting
1511	!$acc kernels present( tri, tric )
1512	!$acc loop
1513	DO j = nys_z, nyn_z
1514	!$acc loop vector( 32 )
1515	DO i = nxl_z, nxr_z
1516	tri(i,j,0,1) = tric(i,j,0)
1517	ENDDO
1518	ENDDO
1519	!$acc end kernels
1520
1521	DO k = 1, nz-1
1522	!$acc kernels present( tri, tric )
1523	!$acc loop
1524	DO j = nys_z, nyn_z
1525	!$acc loop vector( 32 )
1526	DO i = nxl_z, nxr_z
1527	tri(i,j,k,2) = ddzuw(k,1) / tri(i,j,k-1,1)
1528	tri(i,j,k,1) = tric(i,j,k) - ddzuw(k-1,2) * tri(i,j,k,2)
1529	ENDDO
1530	ENDDO
1531	!$acc end kernels
1532	ENDDO
1533
1534	END SUBROUTINE split
1535
1536	#endif
1537
1538	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |