Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1211

Last change on this file since 1211 was 1209, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 46.9 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1209 2013-08-13 06:45:24Z raasch $
27	!
28	! 1208 2013-08-13 06:41:49Z raasch
29	! acc-update clauses added for "ar" so that ffts other than cufft can also be
30	! used (although they are not ported and will give a poor performance)
31	!
32	! 1111 2013-03-08 23:54:10Z raasch
33	! further openACC porting of non-parallel (MPI) branch:
34	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
35	! no distinction between parallel/non-parallel in poisfft and tridia any more,
36	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
37	! (otherwise "invalid device function" is indicated during runtime),
38	! optimization of tridia routines: constant elements and coefficients of tri are
39	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
40	! to 2,
41	! poisfft_init is now called internally from poisfft, maketri is called from
42	! poisfft_init,
43	! ibc_p_b = 2 removed
44	!
45	! 1106 2013-03-04 05:31:38Z raasch
46	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
47	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
48	! fft_y_1d
49	!
50	! 1103 2013-02-20 02:15:53Z raasch
51	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
52	! sometimes cause segmentation faults with intel 12.1 compiler
53	!
54	! 1092 2013-02-02 11:24:22Z raasch
55	! unused variables removed
56	!
57	! 1036 2012-10-22 13:43:42Z raasch
58	! code put under GPL (PALM 3.9)
59	!
60	! 2012-09-21 07:03:55Z raasch
61	! FLOAT type conversion replaced by REAL
62	!
63	! 1003 2012-09-14 14:35:53Z raasch
64	! indices nxa, nya, etc. replaced by nx, ny, etc.
65	!
66	! 940 2012-07-09 14:31:00Z raasch
67	! special handling of tri-array as an argument in tridia_1dd routines switched
68	! off because it caused segmentation faults with intel 12.1 compiler
69	!
70	! 877 2012-04-03 11:21:44Z suehring
71	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
72	! pressure at the top of the model domain.
73	!
74	! 809 2012-01-30 13:32:58Z maronga
75	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
76	!
77	! 807 2012-01-25 11:53:51Z maronga
78	! New cpp directive "__check" implemented which is used by check_namelist_files
79	! (most of the code is unneeded by check_namelist_files).
80	!
81	! 763 2011-10-06 09:32:09Z suehring
82	! Comment added concerning the last change.
83	!
84	! 761 2011-10-05 17:58:52Z suehring
85	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
86	! pressure at the top of the model domain.
87	!
88	! 696 2011-03-18 07:03:49Z raasch
89	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
90	!
91	! 683 2011-02-09 14:25:15Z raasch
92	! openMP parallelization for 2d-domain-decomposition
93	!
94	! 667 2010-12-23 12:06:00Z suehring/gryschka
95	! ddzu replaced by ddzu_pres due to changes in zu(0)
96	!
97	! 622 2010-12-10 08:08:13Z raasch
98	! optional barriers included in order to speed up collective operations
99	!
100	! 377 2009-09-04 11:09:00Z raasch
101	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
102	!
103	! 164 2008-05-15 08:46:15Z raasch
104	! Arguments removed from transpose routines
105	!
106	! 128 2007-10-26 13:11:14Z raasch
107	! Bugfix: wavenumber calculation for even nx in routines maketri
108	!
109	! 85 2007-05-11 09:35:14Z raasch
110	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
111	!
112	! 76 2007-03-29 00:58:32Z raasch
113	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
114	! the bottom and the top.
115	!
116	! RCS Log replace by Id keyword, revision history cleaned up
117	!
118	! Revision 1.24 2006/08/04 15:00:24 raasch
119	! Default setting of the thread number tn in case of not using OpenMP
120	!
121	! Revision 1.23 2006/02/23 12:48:38 raasch
122	! Additional compiler directive in routine tridia_1dd for preventing loop
123	! exchange on NEC-SX6
124	!
125	! Revision 1.20 2004/04/30 12:38:09 raasch
126	! Parts of former poisfft_hybrid moved to this subroutine,
127	! former subroutine changed to a module, renaming of FFT-subroutines and
128	! -module, FFTs completely substituted by calls of fft_x and fft_y,
129	! NAG fft used in the non-parallel case completely removed, l in maketri
130	! is now a 1d-array, variables passed by modules instead of using parameter
131	! lists, enlarged transposition arrays introduced
132	!
133	! Revision 1.1 1997/07/24 11:24:14 raasch
134	! Initial revision
135	!
136	!
137	! Description:
138	! ------------
139	! See below.
140	!------------------------------------------------------------------------------!
141
142	!--------------------------------------------------------------------------!
143	! poisfft !
144	! !
145	! Original version: Stephan Siano (pois3d) !
146	! !
147	! Institute of Meteorology and Climatology, University of Hannover !
148	! Germany !
149	! !
150	! Version as of July 23,1996 !
151	! !
152	! !
153	! Version for parallel computers: Siegfried Raasch !
154	! !
155	! Version as of July 03,1997 !
156	! !
157	! Solves the Poisson equation with a 2D spectral method !
158	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
159	! !
160	! Input: !
161	! real ar contains in the (nnx,nny,nnz) elements, !
162	! starting from the element (1,nys,nxl), the !
163	! values for s !
164	! real work Temporary array !
165	! !
166	! Output: !
167	! real ar contains the solution for p !
168	!--------------------------------------------------------------------------!
169
170	USE fft_xy
171	USE indices
172	USE transpose_indices
173
174	IMPLICIT NONE
175
176	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
177
178	REAL, DIMENSION(:,:), ALLOCATABLE :: ddzuw
179
180	PRIVATE
181
182	#if ! defined ( __check )
183	PUBLIC poisfft, poisfft_init
184
185	INTERFACE poisfft
186	MODULE PROCEDURE poisfft
187	END INTERFACE poisfft
188
189	INTERFACE poisfft_init
190	MODULE PROCEDURE poisfft_init
191	END INTERFACE poisfft_init
192	#else
193	PUBLIC poisfft_init
194
195	INTERFACE poisfft_init
196	MODULE PROCEDURE poisfft_init
197	END INTERFACE poisfft_init
198	#endif
199
200	CONTAINS
201
202	SUBROUTINE poisfft_init
203
204	USE arrays_3d, ONLY: ddzu_pres, ddzw
205
206	IMPLICIT NONE
207
208	INTEGER :: k
209
210
211	CALL fft_init
212
213	ALLOCATE( ddzuw(0:nz-1,3) )
214
215	DO k = 0, nz-1
216	ddzuw(k,1) = ddzu_pres(k+1) * ddzw(k+1)
217	ddzuw(k,2) = ddzu_pres(k+2) * ddzw(k+1)
218	ddzuw(k,3) = -1.0 * &
219	( ddzu_pres(k+2) * ddzw(k+1) + ddzu_pres(k+1) * ddzw(k+1) )
220	ENDDO
221	!
222	!-- Calculate constant coefficients of the tridiagonal matrix
223	#if ! defined ( __check )
224	CALL maketri
225	#endif
226
227	poisfft_initialized = .TRUE.
228
229	END SUBROUTINE poisfft_init
230
231
232	#if ! defined ( __check )
233	SUBROUTINE poisfft( ar, work )
234
235	USE control_parameters, ONLY : fft_method
236	USE cpulog
237	USE interfaces
238	USE pegrid
239
240	IMPLICIT NONE
241
242	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
243
244
245	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
246
247	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
248
249	!
250	!-- Two-dimensional Fourier Transformation in x- and y-direction.
251	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
252
253	!
254	!-- 1d-domain-decomposition along x:
255	!-- FFT along y and transposition y --> x
256	CALL ffty_tr_yx( ar, work, ar )
257
258	!
259	!-- FFT along x, solving the tridiagonal system and backward FFT
260	CALL fftx_tri_fftx( ar )
261
262	!
263	!-- Transposition x --> y and backward FFT along y
264	CALL tr_xy_ffty( ar, work, ar )
265
266	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
267
268	!
269	!-- 1d-domain-decomposition along y:
270	!-- FFT along x and transposition x --> y
271	CALL fftx_tr_xy( ar, work, ar )
272
273	!
274	!-- FFT along y, solving the tridiagonal system and backward FFT
275	CALL ffty_tri_ffty( ar )
276
277	!
278	!-- Transposition y --> x and backward FFT along x
279	CALL tr_yx_fftx( ar, work, ar )
280
281	ELSE
282
283	!
284	!-- 2d-domain-decomposition or no decomposition (1 PE run)
285	!-- Transposition z --> x
286	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
287	CALL transpose_zx( ar, work, ar )
288	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
289
290	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
291	IF ( fft_method /= 'system-specific' ) THEN
292	!$acc update host( ar )
293	ENDIF
294	CALL fft_x( ar, 'forward' )
295	IF ( fft_method /= 'system-specific' ) THEN
296	!$acc update device( ar )
297	ENDIF
298	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
299
300	!
301	!-- Transposition x --> y
302	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
303	CALL transpose_xy( ar, work, ar )
304	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
305
306	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
307	IF ( fft_method /= 'system-specific' ) THEN
308	!$acc update host( ar )
309	ENDIF
310	CALL fft_y( ar, 'forward' )
311	IF ( fft_method /= 'system-specific' ) THEN
312	!$acc update device( ar )
313	ENDIF
314	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
315
316	!
317	!-- Transposition y --> z
318	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
319	CALL transpose_yz( ar, work, ar )
320	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
321
322	!
323	!-- Solve the tridiagonal equation system along z
324	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
325	CALL tridia( ar )
326	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
327
328	!
329	!-- Inverse Fourier Transformation
330	!-- Transposition z --> y
331	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
332	CALL transpose_zy( ar, work, ar )
333	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
334
335	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
336	IF ( fft_method /= 'system-specific' ) THEN
337	!$acc update host( ar )
338	ENDIF
339	CALL fft_y( ar, 'backward' )
340	IF ( fft_method /= 'system-specific' ) THEN
341	!$acc update device( ar )
342	ENDIF
343	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
344
345	!
346	!-- Transposition y --> x
347	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
348	CALL transpose_yx( ar, work, ar )
349	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
350
351	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
352	IF ( fft_method /= 'system-specific' ) THEN
353	!$acc update host( ar )
354	ENDIF
355	CALL fft_x( ar, 'backward' )
356	IF ( fft_method /= 'system-specific' ) THEN
357	!$acc update device( ar )
358	ENDIF
359	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
360
361	!
362	!-- Transposition x --> z
363	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
364	CALL transpose_xz( ar, work, ar )
365	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
366
367	ENDIF
368
369	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
370
371	END SUBROUTINE poisfft
372
373
374
375	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
376
377	!------------------------------------------------------------------------------!
378	! Fourier-transformation along y with subsequent transposition y --> x for
379	! a 1d-decomposition along x
380	!
381	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
382	! if the first index of work_ffty_vec is odd. Otherwise
383	! memory bank conflicts may occur (especially if the index is a
384	! multiple of 128). That's why work_ffty_vec is dimensioned as
385	! 0:ny+1.
386	! Of course, this will not work if users are using an odd number
387	! of gridpoints along y.
388	!------------------------------------------------------------------------------!
389
390	USE control_parameters
391	USE cpulog
392	USE indices
393	USE interfaces
394	USE pegrid
395	USE transpose_indices
396
397	IMPLICIT NONE
398
399	INTEGER :: i, iend, iouter, ir, j, k
400	INTEGER, PARAMETER :: stridex = 4
401
402	REAL, DIMENSION(0:ny,stridex) :: work_ffty
403	#if defined( __nec )
404	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
405	#endif
406	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
407	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
408	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
409
410	!
411	!-- Carry out the FFT along y, where all data are present due to the
412	!-- 1d-decomposition along x. Resort the data in a way that x becomes
413	!-- the first index.
414	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
415
416	IF ( host(1:3) == 'nec' ) THEN
417	#if defined( __nec )
418	!
419	!-- Code optimized for vector processors
420	!$OMP PARALLEL PRIVATE ( i, j, k )
421	!$OMP DO
422	DO i = nxl, nxr
423
424	DO j = 0, ny
425	DO k = 1, nz
426	work_ffty_vec(j,k,i) = f_in(k,j,i)
427	ENDDO
428	ENDDO
429
430	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
431
432	ENDDO
433
434	!$OMP DO
435	DO k = 1, nz
436	DO j = 0, ny
437	DO i = nxl, nxr
438	work(i,k,j) = work_ffty_vec(j,k,i)
439	ENDDO
440	ENDDO
441	ENDDO
442	!$OMP END PARALLEL
443	#endif
444
445	ELSE
446
447	!
448	!-- Cache optimized code.
449	!-- The i-(x-)direction is split into a strided outer loop and an inner
450	!-- loop for better cache performance
451	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
452	!$OMP DO
453	DO iouter = nxl, nxr, stridex
454
455	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
456
457	DO k = 1, nz
458
459	DO i = iouter, iend
460
461	ir = i-iouter+1 ! counter within a stride
462	DO j = 0, ny
463	work_ffty(j,ir) = f_in(k,j,i)
464	ENDDO
465	!
466	!-- FFT along y
467	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
468
469	ENDDO
470
471	!
472	!-- Resort
473	DO j = 0, ny
474	DO i = iouter, iend
475	work(i,k,j) = work_ffty(j,i-iouter+1)
476	ENDDO
477	ENDDO
478
479	ENDDO
480
481	ENDDO
482	!$OMP END PARALLEL
483
484	ENDIF
485	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
486
487	!
488	!-- Transpose array
489	#if defined( __parallel )
490	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
491	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
492	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
493	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
494	comm1dx, ierr )
495	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
496	#endif
497
498	END SUBROUTINE ffty_tr_yx
499
500
501	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
502
503	!------------------------------------------------------------------------------!
504	! Transposition x --> y with a subsequent backward Fourier transformation for
505	! a 1d-decomposition along x
506	!------------------------------------------------------------------------------!
507
508	USE control_parameters
509	USE cpulog
510	USE indices
511	USE interfaces
512	USE pegrid
513	USE transpose_indices
514
515	IMPLICIT NONE
516
517	INTEGER :: i, iend, iouter, ir, j, k
518	INTEGER, PARAMETER :: stridex = 4
519
520	REAL, DIMENSION(0:ny,stridex) :: work_ffty
521	#if defined( __nec )
522	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
523	#endif
524	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
525	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
526	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
527
528	!
529	!-- Transpose array
530	#if defined( __parallel )
531	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
532	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
533	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
534	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
535	comm1dx, ierr )
536	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
537	#endif
538
539	!
540	!-- Resort the data in a way that y becomes the first index and carry out the
541	!-- backward fft along y.
542	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
543
544	IF ( host(1:3) == 'nec' ) THEN
545	#if defined( __nec )
546	!
547	!-- Code optimized for vector processors
548	!$OMP PARALLEL PRIVATE ( i, j, k )
549	!$OMP DO
550	DO k = 1, nz
551	DO j = 0, ny
552	DO i = nxl, nxr
553	work_ffty_vec(j,k,i) = work(i,k,j)
554	ENDDO
555	ENDDO
556	ENDDO
557
558	!$OMP DO
559	DO i = nxl, nxr
560
561	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
562
563	DO j = 0, ny
564	DO k = 1, nz
565	f_out(k,j,i) = work_ffty_vec(j,k,i)
566	ENDDO
567	ENDDO
568
569	ENDDO
570	!$OMP END PARALLEL
571	#endif
572
573	ELSE
574
575	!
576	!-- Cache optimized code.
577	!-- The i-(x-)direction is split into a strided outer loop and an inner
578	!-- loop for better cache performance
579	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
580	!$OMP DO
581	DO iouter = nxl, nxr, stridex
582
583	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
584
585	DO k = 1, nz
586	!
587	!-- Resort
588	DO j = 0, ny
589	DO i = iouter, iend
590	work_ffty(j,i-iouter+1) = work(i,k,j)
591	ENDDO
592	ENDDO
593
594	DO i = iouter, iend
595
596	!
597	!-- FFT along y
598	ir = i-iouter+1 ! counter within a stride
599	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
600
601	DO j = 0, ny
602	f_out(k,j,i) = work_ffty(j,ir)
603	ENDDO
604	ENDDO
605
606	ENDDO
607
608	ENDDO
609	!$OMP END PARALLEL
610
611	ENDIF
612
613	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
614
615	END SUBROUTINE tr_xy_ffty
616
617
618	SUBROUTINE fftx_tri_fftx( ar )
619
620	!------------------------------------------------------------------------------!
621	! FFT along x, solution of the tridiagonal system and backward FFT for
622	! a 1d-decomposition along x
623	!
624	! WARNING: this subroutine may still not work for hybrid parallelization
625	! with OpenMP (for possible necessary changes see the original
626	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
627	!------------------------------------------------------------------------------!
628
629	USE control_parameters
630	USE cpulog
631	USE grid_variables
632	USE indices
633	USE interfaces
634	USE pegrid
635	USE transpose_indices
636
637	IMPLICIT NONE
638
639	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
640
641	REAL, DIMENSION(0:nx) :: work_fftx
642	REAL, DIMENSION(0:nx,1:nz) :: work_trix
643	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
644	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
645
646
647	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
648
649	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
650
651	tn = 0 ! Default thread number in case of one thread
652	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
653	DO j = nys_x, nyn_x
654
655	!$ tn = omp_get_thread_num()
656
657	IF ( host(1:3) == 'nec' ) THEN
658	!
659	!-- Code optimized for vector processors
660	DO k = 1, nz
661
662	m = 0
663	DO n = 1, pdims(1)
664	DO i = 1, nnx
665	work_trix(m,k) = ar(i,k,j,n)
666	m = m + 1
667	ENDDO
668	ENDDO
669
670	ENDDO
671
672	CALL fft_x_m( work_trix, 'forward' )
673
674	ELSE
675	!
676	!-- Cache optimized code
677	DO k = 1, nz
678
679	m = 0
680	DO n = 1, pdims(1)
681	DO i = 1, nnx
682	work_fftx(m) = ar(i,k,j,n)
683	m = m + 1
684	ENDDO
685	ENDDO
686
687	CALL fft_x_1d( work_fftx, 'forward' )
688
689	DO i = 0, nx
690	work_trix(i,k) = work_fftx(i)
691	ENDDO
692
693	ENDDO
694
695	ENDIF
696
697	!
698	!-- Solve the linear equation system
699	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
700
701	IF ( host(1:3) == 'nec' ) THEN
702	!
703	!-- Code optimized for vector processors
704	CALL fft_x_m( work_trix, 'backward' )
705
706	DO k = 1, nz
707
708	m = 0
709	DO n = 1, pdims(1)
710	DO i = 1, nnx
711	ar(i,k,j,n) = work_trix(m,k)
712	m = m + 1
713	ENDDO
714	ENDDO
715
716	ENDDO
717
718	ELSE
719	!
720	!-- Cache optimized code
721	DO k = 1, nz
722
723	DO i = 0, nx
724	work_fftx(i) = work_trix(i,k)
725	ENDDO
726
727	CALL fft_x_1d( work_fftx, 'backward' )
728
729	m = 0
730	DO n = 1, pdims(1)
731	DO i = 1, nnx
732	ar(i,k,j,n) = work_fftx(m)
733	m = m + 1
734	ENDDO
735	ENDDO
736
737	ENDDO
738
739	ENDIF
740
741	ENDDO
742
743	DEALLOCATE( tri )
744
745	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
746
747	END SUBROUTINE fftx_tri_fftx
748
749
750	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
751
752	!------------------------------------------------------------------------------!
753	! Fourier-transformation along x with subsequent transposition x --> y for
754	! a 1d-decomposition along y
755	!
756	! ATTENTION: The NEC-branch of this routine may significantly profit from
757	! further optimizations. So far, performance is much worse than
758	! for routine ffty_tr_yx (more than three times slower).
759	!------------------------------------------------------------------------------!
760
761	USE control_parameters
762	USE cpulog
763	USE indices
764	USE interfaces
765	USE pegrid
766	USE transpose_indices
767
768	IMPLICIT NONE
769
770	INTEGER :: i, j, k
771
772	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
773	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
774	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
775	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
776
777	!
778	!-- Carry out the FFT along x, where all data are present due to the
779	!-- 1d-decomposition along y. Resort the data in a way that y becomes
780	!-- the first index.
781	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
782
783	IF ( host(1:3) == 'nec' ) THEN
784	!
785	!-- Code for vector processors
786	!$OMP PARALLEL PRIVATE ( i, j, k )
787	!$OMP DO
788	DO i = 0, nx
789
790	DO j = nys, nyn
791	DO k = 1, nz
792	work_fftx(i,k,j) = f_in(k,j,i)
793	ENDDO
794	ENDDO
795
796	ENDDO
797
798	!$OMP DO
799	DO j = nys, nyn
800
801	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
802
803	DO k = 1, nz
804	DO i = 0, nx
805	work(j,k,i) = work_fftx(i,k,j)
806	ENDDO
807	ENDDO
808
809	ENDDO
810	!$OMP END PARALLEL
811
812	ELSE
813
814	!
815	!-- Cache optimized code (there might be still a potential for better
816	!-- optimization).
817	!$OMP PARALLEL PRIVATE (i,j,k)
818	!$OMP DO
819	DO i = 0, nx
820
821	DO j = nys, nyn
822	DO k = 1, nz
823	work_fftx(i,k,j) = f_in(k,j,i)
824	ENDDO
825	ENDDO
826
827	ENDDO
828
829	!$OMP DO
830	DO j = nys, nyn
831	DO k = 1, nz
832
833	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
834
835	DO i = 0, nx
836	work(j,k,i) = work_fftx(i,k,j)
837	ENDDO
838	ENDDO
839
840	ENDDO
841	!$OMP END PARALLEL
842
843	ENDIF
844	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
845
846	!
847	!-- Transpose array
848	#if defined( __parallel )
849	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
850	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
851	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
852	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
853	comm1dy, ierr )
854	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
855	#endif
856
857	END SUBROUTINE fftx_tr_xy
858
859
860	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
861
862	!------------------------------------------------------------------------------!
863	! Transposition y --> x with a subsequent backward Fourier transformation for
864	! a 1d-decomposition along x
865	!------------------------------------------------------------------------------!
866
867	USE control_parameters
868	USE cpulog
869	USE indices
870	USE interfaces
871	USE pegrid
872	USE transpose_indices
873
874	IMPLICIT NONE
875
876	INTEGER :: i, j, k
877
878	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
879	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
880	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
881	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
882
883	!
884	!-- Transpose array
885	#if defined( __parallel )
886	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
887	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
888	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
889	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
890	comm1dy, ierr )
891	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
892	#endif
893
894	!
895	!-- Carry out the FFT along x, where all data are present due to the
896	!-- 1d-decomposition along y. Resort the data in a way that y becomes
897	!-- the first index.
898	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
899
900	IF ( host(1:3) == 'nec' ) THEN
901	!
902	!-- Code optimized for vector processors
903	!$OMP PARALLEL PRIVATE ( i, j, k )
904	!$OMP DO
905	DO j = nys, nyn
906
907	DO k = 1, nz
908	DO i = 0, nx
909	work_fftx(i,k,j) = work(j,k,i)
910	ENDDO
911	ENDDO
912
913	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
914
915	ENDDO
916
917	!$OMP DO
918	DO i = 0, nx
919	DO j = nys, nyn
920	DO k = 1, nz
921	f_out(k,j,i) = work_fftx(i,k,j)
922	ENDDO
923	ENDDO
924	ENDDO
925	!$OMP END PARALLEL
926
927	ELSE
928
929	!
930	!-- Cache optimized code (there might be still a potential for better
931	!-- optimization).
932	!$OMP PARALLEL PRIVATE (i,j,k)
933	!$OMP DO
934	DO j = nys, nyn
935	DO k = 1, nz
936
937	DO i = 0, nx
938	work_fftx(i,k,j) = work(j,k,i)
939	ENDDO
940
941	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
942
943	ENDDO
944	ENDDO
945
946	!$OMP DO
947	DO i = 0, nx
948	DO j = nys, nyn
949	DO k = 1, nz
950	f_out(k,j,i) = work_fftx(i,k,j)
951	ENDDO
952	ENDDO
953	ENDDO
954	!$OMP END PARALLEL
955
956	ENDIF
957	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
958
959	END SUBROUTINE tr_yx_fftx
960
961
962	SUBROUTINE ffty_tri_ffty( ar )
963
964	!------------------------------------------------------------------------------!
965	! FFT along y, solution of the tridiagonal system and backward FFT for
966	! a 1d-decomposition along y
967	!
968	! WARNING: this subroutine may still not work for hybrid parallelization
969	! with OpenMP (for possible necessary changes see the original
970	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
971	!------------------------------------------------------------------------------!
972
973	USE control_parameters
974	USE cpulog
975	USE grid_variables
976	USE indices
977	USE interfaces
978	USE pegrid
979	USE transpose_indices
980
981	IMPLICIT NONE
982
983	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
984
985	REAL, DIMENSION(0:ny) :: work_ffty
986	REAL, DIMENSION(0:ny,1:nz) :: work_triy
987	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
988	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
989
990
991	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
992
993	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
994
995	tn = 0 ! Default thread number in case of one thread
996	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
997	DO i = nxl_y, nxr_y
998
999	!$ tn = omp_get_thread_num()
1000
1001	IF ( host(1:3) == 'nec' ) THEN
1002	!
1003	!-- Code optimized for vector processors
1004	DO k = 1, nz
1005
1006	m = 0
1007	DO n = 1, pdims(2)
1008	DO j = 1, nny
1009	work_triy(m,k) = ar(j,k,i,n)
1010	m = m + 1
1011	ENDDO
1012	ENDDO
1013
1014	ENDDO
1015
1016	CALL fft_y_m( work_triy, ny, 'forward' )
1017
1018	ELSE
1019	!
1020	!-- Cache optimized code
1021	DO k = 1, nz
1022
1023	m = 0
1024	DO n = 1, pdims(2)
1025	DO j = 1, nny
1026	work_ffty(m) = ar(j,k,i,n)
1027	m = m + 1
1028	ENDDO
1029	ENDDO
1030
1031	CALL fft_y_1d( work_ffty, 'forward' )
1032
1033	DO j = 0, ny
1034	work_triy(j,k) = work_ffty(j)
1035	ENDDO
1036
1037	ENDDO
1038
1039	ENDIF
1040
1041	!
1042	!-- Solve the linear equation system
1043	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1044
1045	IF ( host(1:3) == 'nec' ) THEN
1046	!
1047	!-- Code optimized for vector processors
1048	CALL fft_y_m( work_triy, ny, 'backward' )
1049
1050	DO k = 1, nz
1051
1052	m = 0
1053	DO n = 1, pdims(2)
1054	DO j = 1, nny
1055	ar(j,k,i,n) = work_triy(m,k)
1056	m = m + 1
1057	ENDDO
1058	ENDDO
1059
1060	ENDDO
1061
1062	ELSE
1063	!
1064	!-- Cache optimized code
1065	DO k = 1, nz
1066
1067	DO j = 0, ny
1068	work_ffty(j) = work_triy(j,k)
1069	ENDDO
1070
1071	CALL fft_y_1d( work_ffty, 'backward' )
1072
1073	m = 0
1074	DO n = 1, pdims(2)
1075	DO j = 1, nny
1076	ar(j,k,i,n) = work_ffty(m)
1077	m = m + 1
1078	ENDDO
1079	ENDDO
1080
1081	ENDDO
1082
1083	ENDIF
1084
1085	ENDDO
1086
1087	DEALLOCATE( tri )
1088
1089	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1090
1091	END SUBROUTINE ffty_tri_ffty
1092
1093
1094	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1095
1096	!------------------------------------------------------------------------------!
1097	! Solves the linear system of equations for a 1d-decomposition along x (see
1098	! tridia)
1099	!
1100	! Attention: when using the intel compilers older than 12.0, array tri must
1101	! be passed as an argument to the contained subroutines. Otherwise
1102	! addres faults will occur. This feature can be activated with
1103	! cpp-switch __intel11
1104	! On NEC, tri should not be passed (except for routine substi_1dd)
1105	! because this causes very bad performance.
1106	!------------------------------------------------------------------------------!
1107
1108	USE arrays_3d
1109	USE control_parameters
1110
1111	USE pegrid
1112
1113	IMPLICIT NONE
1114
1115	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1116
1117	REAL :: ddx2, ddy2
1118
1119	REAL, DIMENSION(0:nx,1:nz) :: ar
1120	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1121
1122
1123	nnyh = ( ny + 1 ) / 2
1124
1125	!
1126	!-- Define constant elements of the tridiagonal matrix.
1127	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1128	!-- the exchanged loops create bank conflicts. The following directive
1129	!-- prohibits loop exchange and the loops perform much better.
1130	! tn = omp_get_thread_num()
1131	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1132	! CALL local_flush( 120+tn )
1133	!CDIR NOLOOPCHG
1134	DO k = 0, nz-1
1135	DO i = 0,nx
1136	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1137	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1138	ENDDO
1139	ENDDO
1140	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1141	! CALL local_flush( 120+tn )
1142
1143	IF ( j <= nnyh ) THEN
1144	#if defined( __intel11 )
1145	CALL maketri_1dd( j, tri )
1146	#else
1147	CALL maketri_1dd( j )
1148	#endif
1149	ELSE
1150	#if defined( __intel11 )
1151	CALL maketri_1dd( ny+1-j, tri )
1152	#else
1153	CALL maketri_1dd( ny+1-j )
1154	#endif
1155	ENDIF
1156	#if defined( __intel11 )
1157	CALL split_1dd( tri )
1158	#else
1159	CALL split_1dd
1160	#endif
1161	CALL substi_1dd( ar, tri )
1162
1163	CONTAINS
1164
1165	#if defined( __intel11 )
1166	SUBROUTINE maketri_1dd( j, tri )
1167	#else
1168	SUBROUTINE maketri_1dd( j )
1169	#endif
1170
1171	!------------------------------------------------------------------------------!
1172	! computes the i- and j-dependent component of the matrix
1173	!------------------------------------------------------------------------------!
1174
1175	USE constants
1176
1177	IMPLICIT NONE
1178
1179	INTEGER :: i, j, k, nnxh
1180	REAL :: a, c
1181
1182	REAL, DIMENSION(0:nx) :: l
1183
1184	#if defined( __intel11 )
1185	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1186	#endif
1187
1188
1189	nnxh = ( nx + 1 ) / 2
1190	!
1191	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1192	!-- Fourier space. The coefficients are computed following the method of
1193	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1194	!-- Siano's original version by discretizing the Poisson equation,
1195	!-- before it is Fourier-transformed
1196	DO i = 0, nx
1197	IF ( i >= 0 .AND. i <= nnxh ) THEN
1198	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1199	REAL( nx+1 ) ) ) * ddx2 + &
1200	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1201	REAL( ny+1 ) ) ) * ddy2
1202	ELSE
1203	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1204	REAL( nx+1 ) ) ) * ddx2 + &
1205	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1206	REAL( ny+1 ) ) ) * ddy2
1207	ENDIF
1208	ENDDO
1209
1210	DO k = 0, nz-1
1211	DO i = 0, nx
1212	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1213	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1214	tri(1,i,k) = a + c - l(i)
1215	ENDDO
1216	ENDDO
1217	IF ( ibc_p_b == 1 ) THEN
1218	DO i = 0, nx
1219	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1220	ENDDO
1221	ENDIF
1222	IF ( ibc_p_t == 1 ) THEN
1223	DO i = 0, nx
1224	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1225	ENDDO
1226	ENDIF
1227
1228	END SUBROUTINE maketri_1dd
1229
1230
1231	#if defined( __intel11 )
1232	SUBROUTINE split_1dd( tri )
1233	#else
1234	SUBROUTINE split_1dd
1235	#endif
1236
1237	!------------------------------------------------------------------------------!
1238	! Splitting of the tridiagonal matrix (Thomas algorithm)
1239	!------------------------------------------------------------------------------!
1240
1241	IMPLICIT NONE
1242
1243	INTEGER :: i, k
1244
1245	#if defined( __intel11 )
1246	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1247	#endif
1248
1249
1250	!
1251	!-- Splitting
1252	DO i = 0, nx
1253	tri(4,i,0) = tri(1,i,0)
1254	ENDDO
1255	DO k = 1, nz-1
1256	DO i = 0, nx
1257	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1258	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1259	ENDDO
1260	ENDDO
1261
1262	END SUBROUTINE split_1dd
1263
1264
1265	SUBROUTINE substi_1dd( ar, tri )
1266
1267	!------------------------------------------------------------------------------!
1268	! Substitution (Forward and Backward) (Thomas algorithm)
1269	!------------------------------------------------------------------------------!
1270
1271	IMPLICIT NONE
1272
1273	INTEGER :: i, k
1274
1275	REAL, DIMENSION(0:nx,nz) :: ar
1276	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1277	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1278
1279	!
1280	!-- Forward substitution
1281	DO i = 0, nx
1282	ar1(i,0) = ar(i,1)
1283	ENDDO
1284	DO k = 1, nz-1
1285	DO i = 0, nx
1286	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1287	ENDDO
1288	ENDDO
1289
1290	!
1291	!-- Backward substitution
1292	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1293	!-- by zero appearing if the pressure bc is set to neumann at the top of
1294	!-- the model domain.
1295	DO i = 0, nx
1296	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1297	ENDDO
1298	DO k = nz-2, 0, -1
1299	DO i = 0, nx
1300	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1301	/ tri(4,i,k)
1302	ENDDO
1303	ENDDO
1304
1305	!
1306	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1307	!-- The respective values of ar should be zero at all k-levels if
1308	!-- acceleration of horizontally averaged vertical velocity is zero.
1309	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1310	IF ( j == 0 ) THEN
1311	DO k = 1, nz
1312	ar(0,k) = 0.0
1313	ENDDO
1314	ENDIF
1315	ENDIF
1316
1317	END SUBROUTINE substi_1dd
1318
1319	END SUBROUTINE tridia_1dd
1320
1321
1322	SUBROUTINE tridia( ar )
1323
1324	!------------------------------------------------------------------------------!
1325	! solves the linear system of equations:
1326	!
1327	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
1328	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
1329	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
1330	!
1331	! by using the Thomas algorithm
1332	!------------------------------------------------------------------------------!
1333
1334	USE arrays_3d
1335
1336	IMPLICIT NONE
1337
1338	INTEGER :: i, j, k
1339
1340	!$acc declare create( tri )
1341	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri
1342
1343	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
1344
1345
1346	CALL split( tri )
1347	CALL substi( ar, tri )
1348
1349	END SUBROUTINE tridia
1350
1351
1352	SUBROUTINE maketri
1353
1354	!------------------------------------------------------------------------------!
1355	! Computes the i- and j-dependent component of the matrix
1356	!------------------------------------------------------------------------------!
1357
1358	USE arrays_3d, ONLY: tric
1359	USE constants
1360	USE control_parameters
1361	USE grid_variables
1362
1363	IMPLICIT NONE
1364
1365	INTEGER :: i, j, k, nnxh, nnyh
1366
1367	!$acc declare create( ll )
1368	REAL :: ll(nxl_z:nxr_z,nys_z:nyn_z)
1369
1370
1371	nnxh = ( nx + 1 ) / 2
1372	nnyh = ( ny + 1 ) / 2
1373
1374	!
1375	!-- Provide the constant coefficients of the tridiagonal matrix for solution
1376	!-- of the Poisson equation in Fourier space.
1377	!-- The coefficients are computed following the method of
1378	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1379	!-- Siano's original version by discretizing the Poisson equation,
1380	!-- before it is Fourier-transformed.
1381
1382	!$acc kernels present( tric )
1383	!$acc loop vector( 32 )
1384	DO j = nys_z, nyn_z
1385	DO i = nxl_z, nxr_z
1386	IF ( j >= 0 .AND. j <= nnyh ) THEN
1387	IF ( i >= 0 .AND. i <= nnxh ) THEN
1388	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1389	REAL( nx+1 ) ) ) / ( dx * dx ) + &
1390	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1391	REAL( ny+1 ) ) ) / ( dy * dy )
1392	ELSE
1393	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1394	REAL( nx+1 ) ) ) / ( dx * dx ) + &
1395	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1396	REAL( ny+1 ) ) ) / ( dy * dy )
1397	ENDIF
1398	ELSE
1399	IF ( i >= 0 .AND. i <= nnxh ) THEN
1400	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1401	REAL( nx+1 ) ) ) / ( dx * dx ) + &
1402	2.0 * ( 1.0 - COS( ( 2.0 * pi * ( ny+1-j ) ) / &
1403	REAL( ny+1 ) ) ) / ( dy * dy )
1404	ELSE
1405	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1406	REAL( nx+1 ) ) ) / ( dx * dx ) + &
1407	2.0 * ( 1.0 - COS( ( 2.0 * pi * ( ny+1-j ) ) / &
1408	REAL( ny+1 ) ) ) / ( dy * dy )
1409	ENDIF
1410	ENDIF
1411	ENDDO
1412	ENDDO
1413
1414	!$acc loop
1415	DO k = 0, nz-1
1416	DO j = nys_z, nyn_z
1417	!$acc loop vector( 32 )
1418	DO i = nxl_z, nxr_z
1419	tric(i,j,k) = ddzuw(k,3) - ll(i,j)
1420	ENDDO
1421	ENDDO
1422	ENDDO
1423	!$acc end kernels
1424
1425	IF ( ibc_p_b == 1 ) THEN
1426	!$acc kernels present( tric )
1427	!$acc loop
1428	DO j = nys_z, nyn_z
1429	DO i = nxl_z, nxr_z
1430	tric(i,j,0) = tric(i,j,0) + ddzuw(0,1)
1431	ENDDO
1432	ENDDO
1433	!$acc end kernels
1434	ENDIF
1435	IF ( ibc_p_t == 1 ) THEN
1436	!$acc kernels present( tric )
1437	!$acc loop
1438	DO j = nys_z, nyn_z
1439	DO i = nxl_z, nxr_z
1440	tric(i,j,nz-1) = tric(i,j,nz-1) + ddzuw(nz-1,2)
1441	ENDDO
1442	ENDDO
1443	!$acc end kernels
1444	ENDIF
1445
1446	END SUBROUTINE maketri
1447
1448
1449	SUBROUTINE substi( ar, tri )
1450
1451	!------------------------------------------------------------------------------!
1452	! Substitution (Forward and Backward) (Thomas algorithm)
1453	!------------------------------------------------------------------------------!
1454
1455	USE control_parameters
1456
1457	IMPLICIT NONE
1458
1459	INTEGER :: i, j, k
1460
1461	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
1462	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri
1463
1464	!$acc declare create( ar1 )
1465	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1) :: ar1
1466
1467	!
1468	!-- Forward substitution
1469	DO k = 0, nz - 1
1470	!$acc kernels present( ar, tri )
1471	!$acc loop
1472	DO j = nys_z, nyn_z
1473	DO i = nxl_z, nxr_z
1474
1475	IF ( k == 0 ) THEN
1476	ar1(i,j,k) = ar(i,j,k+1)
1477	ELSE
1478	ar1(i,j,k) = ar(i,j,k+1) - tri(i,j,k,2) * ar1(i,j,k-1)
1479	ENDIF
1480
1481	ENDDO
1482	ENDDO
1483	!$acc end kernels
1484	ENDDO
1485
1486	!
1487	!-- Backward substitution
1488	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
1489	!-- by zero appearing if the pressure bc is set to neumann at the top of
1490	!-- the model domain.
1491	DO k = nz-1, 0, -1
1492	!$acc kernels present( ar, tri )
1493	!$acc loop
1494	DO j = nys_z, nyn_z
1495	DO i = nxl_z, nxr_z
1496
1497	IF ( k == nz-1 ) THEN
1498	ar(i,j,k+1) = ar1(i,j,k) / ( tri(i,j,k,1) + 1.0E-20 )
1499	ELSE
1500	ar(i,j,k+1) = ( ar1(i,j,k) - ddzuw(k,2) * ar(i,j,k+2) ) &
1501	/ tri(i,j,k,1)
1502	ENDIF
1503	ENDDO
1504	ENDDO
1505	!$acc end kernels
1506	ENDDO
1507
1508	!
1509	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1510	!-- The respective values of ar should be zero at all k-levels if
1511	!-- acceleration of horizontally averaged vertical velocity is zero.
1512	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1513	IF ( nys_z == 0 .AND. nxl_z == 0 ) THEN
1514	!$acc kernels loop present( ar )
1515	DO k = 1, nz
1516	ar(nxl_z,nys_z,k) = 0.0
1517	ENDDO
1518	ENDIF
1519	ENDIF
1520
1521	END SUBROUTINE substi
1522
1523
1524	SUBROUTINE split( tri )
1525
1526	!------------------------------------------------------------------------------!
1527	! Splitting of the tridiagonal matrix (Thomas algorithm)
1528	!------------------------------------------------------------------------------!
1529
1530	USE arrays_3d, ONLY: tric
1531
1532	IMPLICIT NONE
1533
1534	INTEGER :: i, j, k
1535
1536	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri
1537
1538	!
1539	!-- Splitting
1540	!$acc kernels present( tri, tric )
1541	!$acc loop
1542	DO j = nys_z, nyn_z
1543	!$acc loop vector( 32 )
1544	DO i = nxl_z, nxr_z
1545	tri(i,j,0,1) = tric(i,j,0)
1546	ENDDO
1547	ENDDO
1548	!$acc end kernels
1549
1550	DO k = 1, nz-1
1551	!$acc kernels present( tri, tric )
1552	!$acc loop
1553	DO j = nys_z, nyn_z
1554	!$acc loop vector( 32 )
1555	DO i = nxl_z, nxr_z
1556	tri(i,j,k,2) = ddzuw(k,1) / tri(i,j,k-1,1)
1557	tri(i,j,k,1) = tric(i,j,k) - ddzuw(k-1,2) * tri(i,j,k,2)
1558	ENDDO
1559	ENDDO
1560	!$acc end kernels
1561	ENDDO
1562
1563	END SUBROUTINE split
1564
1565	#endif
1566
1567	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |