Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1208

Last change on this file since 1208 was 1208, checked in by raasch, 12 years ago
acc-update clauses added for "ar" so that ffts other than cufft can also be used
Property svn:keywords set to `Id`
File size: 46.8 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	! acc-update clauses added for "ar" so that ffts other than cufft can also be
23	! used (although they are not ported and will give a poor performance)
24	!
25	! Former revisions:
26	! -----------------
27	! $Id: poisfft.f90 1208 2013-08-13 06:41:49Z raasch $
28	!
29	! 1111 2013-03-08 23:54:10Z raasch
30	! further openACC porting of non-parallel (MPI) branch:
31	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
32	! no distinction between parallel/non-parallel in poisfft and tridia any more,
33	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
34	! (otherwise "invalid device function" is indicated during runtime),
35	! optimization of tridia routines: constant elements and coefficients of tri are
36	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
37	! to 2,
38	! poisfft_init is now called internally from poisfft, maketri is called from
39	! poisfft_init,
40	! ibc_p_b = 2 removed
41	!
42	! 1106 2013-03-04 05:31:38Z raasch
43	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
44	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
45	! fft_y_1d
46	!
47	! 1103 2013-02-20 02:15:53Z raasch
48	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
49	! sometimes cause segmentation faults with intel 12.1 compiler
50	!
51	! 1092 2013-02-02 11:24:22Z raasch
52	! unused variables removed
53	!
54	! 1036 2012-10-22 13:43:42Z raasch
55	! code put under GPL (PALM 3.9)
56	!
57	! 2012-09-21 07:03:55Z raasch
58	! FLOAT type conversion replaced by REAL
59	!
60	! 1003 2012-09-14 14:35:53Z raasch
61	! indices nxa, nya, etc. replaced by nx, ny, etc.
62	!
63	! 940 2012-07-09 14:31:00Z raasch
64	! special handling of tri-array as an argument in tridia_1dd routines switched
65	! off because it caused segmentation faults with intel 12.1 compiler
66	!
67	! 877 2012-04-03 11:21:44Z suehring
68	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
69	! pressure at the top of the model domain.
70	!
71	! 809 2012-01-30 13:32:58Z maronga
72	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
73	!
74	! 807 2012-01-25 11:53:51Z maronga
75	! New cpp directive "__check" implemented which is used by check_namelist_files
76	! (most of the code is unneeded by check_namelist_files).
77	!
78	! 763 2011-10-06 09:32:09Z suehring
79	! Comment added concerning the last change.
80	!
81	! 761 2011-10-05 17:58:52Z suehring
82	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
83	! pressure at the top of the model domain.
84	!
85	! 696 2011-03-18 07:03:49Z raasch
86	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
87	!
88	! 683 2011-02-09 14:25:15Z raasch
89	! openMP parallelization for 2d-domain-decomposition
90	!
91	! 667 2010-12-23 12:06:00Z suehring/gryschka
92	! ddzu replaced by ddzu_pres due to changes in zu(0)
93	!
94	! 622 2010-12-10 08:08:13Z raasch
95	! optional barriers included in order to speed up collective operations
96	!
97	! 377 2009-09-04 11:09:00Z raasch
98	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
99	!
100	! 164 2008-05-15 08:46:15Z raasch
101	! Arguments removed from transpose routines
102	!
103	! 128 2007-10-26 13:11:14Z raasch
104	! Bugfix: wavenumber calculation for even nx in routines maketri
105	!
106	! 85 2007-05-11 09:35:14Z raasch
107	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
108	!
109	! 76 2007-03-29 00:58:32Z raasch
110	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
111	! the bottom and the top.
112	!
113	! RCS Log replace by Id keyword, revision history cleaned up
114	!
115	! Revision 1.24 2006/08/04 15:00:24 raasch
116	! Default setting of the thread number tn in case of not using OpenMP
117	!
118	! Revision 1.23 2006/02/23 12:48:38 raasch
119	! Additional compiler directive in routine tridia_1dd for preventing loop
120	! exchange on NEC-SX6
121	!
122	! Revision 1.20 2004/04/30 12:38:09 raasch
123	! Parts of former poisfft_hybrid moved to this subroutine,
124	! former subroutine changed to a module, renaming of FFT-subroutines and
125	! -module, FFTs completely substituted by calls of fft_x and fft_y,
126	! NAG fft used in the non-parallel case completely removed, l in maketri
127	! is now a 1d-array, variables passed by modules instead of using parameter
128	! lists, enlarged transposition arrays introduced
129	!
130	! Revision 1.1 1997/07/24 11:24:14 raasch
131	! Initial revision
132	!
133	!
134	! Description:
135	! ------------
136	! See below.
137	!------------------------------------------------------------------------------!
138
139	!--------------------------------------------------------------------------!
140	! poisfft !
141	! !
142	! Original version: Stephan Siano (pois3d) !
143	! !
144	! Institute of Meteorology and Climatology, University of Hannover !
145	! Germany !
146	! !
147	! Version as of July 23,1996 !
148	! !
149	! !
150	! Version for parallel computers: Siegfried Raasch !
151	! !
152	! Version as of July 03,1997 !
153	! !
154	! Solves the Poisson equation with a 2D spectral method !
155	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s !
156	! !
157	! Input: !
158	! real ar contains in the (nnx,nny,nnz) elements, !
159	! starting from the element (1,nys,nxl), the !
160	! values for s !
161	! real work Temporary array !
162	! !
163	! Output: !
164	! real ar contains the solution for p !
165	!--------------------------------------------------------------------------!
166
167	USE fft_xy
168	USE indices
169	USE transpose_indices
170
171	IMPLICIT NONE
172
173	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
174
175	REAL, DIMENSION(:,:), ALLOCATABLE :: ddzuw
176
177	PRIVATE
178
179	#if ! defined ( __check )
180	PUBLIC poisfft, poisfft_init
181
182	INTERFACE poisfft
183	MODULE PROCEDURE poisfft
184	END INTERFACE poisfft
185
186	INTERFACE poisfft_init
187	MODULE PROCEDURE poisfft_init
188	END INTERFACE poisfft_init
189	#else
190	PUBLIC poisfft_init
191
192	INTERFACE poisfft_init
193	MODULE PROCEDURE poisfft_init
194	END INTERFACE poisfft_init
195	#endif
196
197	CONTAINS
198
199	SUBROUTINE poisfft_init
200
201	USE arrays_3d, ONLY: ddzu_pres, ddzw
202
203	IMPLICIT NONE
204
205	INTEGER :: k
206
207
208	CALL fft_init
209
210	ALLOCATE( ddzuw(0:nz-1,3) )
211
212	DO k = 0, nz-1
213	ddzuw(k,1) = ddzu_pres(k+1) * ddzw(k+1)
214	ddzuw(k,2) = ddzu_pres(k+2) * ddzw(k+1)
215	ddzuw(k,3) = -1.0 * &
216	( ddzu_pres(k+2) * ddzw(k+1) + ddzu_pres(k+1) * ddzw(k+1) )
217	ENDDO
218	!
219	!-- Calculate constant coefficients of the tridiagonal matrix
220	#if ! defined ( __check )
221	CALL maketri
222	#endif
223
224	poisfft_initialized = .TRUE.
225
226	END SUBROUTINE poisfft_init
227
228
229	#if ! defined ( __check )
230	SUBROUTINE poisfft( ar, work )
231
232	USE control_parameters, ONLY : fft_method
233	USE cpulog
234	USE interfaces
235	USE pegrid
236
237	IMPLICIT NONE
238
239	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar, work
240
241
242	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
243
244	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
245
246	!
247	!-- Two-dimensional Fourier Transformation in x- and y-direction.
248	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
249
250	!
251	!-- 1d-domain-decomposition along x:
252	!-- FFT along y and transposition y --> x
253	CALL ffty_tr_yx( ar, work, ar )
254
255	!
256	!-- FFT along x, solving the tridiagonal system and backward FFT
257	CALL fftx_tri_fftx( ar )
258
259	!
260	!-- Transposition x --> y and backward FFT along y
261	CALL tr_xy_ffty( ar, work, ar )
262
263	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
264
265	!
266	!-- 1d-domain-decomposition along y:
267	!-- FFT along x and transposition x --> y
268	CALL fftx_tr_xy( ar, work, ar )
269
270	!
271	!-- FFT along y, solving the tridiagonal system and backward FFT
272	CALL ffty_tri_ffty( ar )
273
274	!
275	!-- Transposition y --> x and backward FFT along x
276	CALL tr_yx_fftx( ar, work, ar )
277
278	ELSE
279
280	!
281	!-- 2d-domain-decomposition or no decomposition (1 PE run)
282	!-- Transposition z --> x
283	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
284	CALL transpose_zx( ar, work, ar )
285	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
286
287	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
288	IF ( fft_method /= 'system-specific' ) THEN
289	!$acc update host( ar )
290	ENDIF
291	CALL fft_x( ar, 'forward' )
292	IF ( fft_method /= 'system-specific' ) THEN
293	!$acc update device( ar )
294	ENDIF
295	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
296
297	!
298	!-- Transposition x --> y
299	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
300	CALL transpose_xy( ar, work, ar )
301	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
302
303	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
304	IF ( fft_method /= 'system-specific' ) THEN
305	!$acc update host( ar )
306	ENDIF
307	CALL fft_y( ar, 'forward' )
308	IF ( fft_method /= 'system-specific' ) THEN
309	!$acc update device( ar )
310	ENDIF
311	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
312
313	!
314	!-- Transposition y --> z
315	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
316	CALL transpose_yz( ar, work, ar )
317	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
318
319	!
320	!-- Solve the tridiagonal equation system along z
321	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
322	CALL tridia( ar )
323	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
324
325	!
326	!-- Inverse Fourier Transformation
327	!-- Transposition z --> y
328	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
329	CALL transpose_zy( ar, work, ar )
330	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
331
332	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
333	IF ( fft_method /= 'system-specific' ) THEN
334	!$acc update host( ar )
335	ENDIF
336	CALL fft_y( ar, 'backward' )
337	IF ( fft_method /= 'system-specific' ) THEN
338	!$acc update device( ar )
339	ENDIF
340	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
341
342	!
343	!-- Transposition y --> x
344	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
345	CALL transpose_yx( ar, work, ar )
346	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
347
348	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
349	IF ( fft_method /= 'system-specific' ) THEN
350	!$acc update host( ar )
351	ENDIF
352	CALL fft_x( ar, 'backward' )
353	IF ( fft_method /= 'system-specific' ) THEN
354	!$acc update device( ar )
355	ENDIF
356	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
357
358	!
359	!-- Transposition x --> z
360	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
361	CALL transpose_xz( ar, work, ar )
362	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
363
364	ENDIF
365
366	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
367
368	END SUBROUTINE poisfft
369
370
371
372	SUBROUTINE ffty_tr_yx( f_in, work, f_out )
373
374	!------------------------------------------------------------------------------!
375	! Fourier-transformation along y with subsequent transposition y --> x for
376	! a 1d-decomposition along x
377	!
378	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
379	! if the first index of work_ffty_vec is odd. Otherwise
380	! memory bank conflicts may occur (especially if the index is a
381	! multiple of 128). That's why work_ffty_vec is dimensioned as
382	! 0:ny+1.
383	! Of course, this will not work if users are using an odd number
384	! of gridpoints along y.
385	!------------------------------------------------------------------------------!
386
387	USE control_parameters
388	USE cpulog
389	USE indices
390	USE interfaces
391	USE pegrid
392	USE transpose_indices
393
394	IMPLICIT NONE
395
396	INTEGER :: i, iend, iouter, ir, j, k
397	INTEGER, PARAMETER :: stridex = 4
398
399	REAL, DIMENSION(0:ny,stridex) :: work_ffty
400	#if defined( __nec )
401	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
402	#endif
403	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
404	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
405	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
406
407	!
408	!-- Carry out the FFT along y, where all data are present due to the
409	!-- 1d-decomposition along x. Resort the data in a way that x becomes
410	!-- the first index.
411	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
412
413	IF ( host(1:3) == 'nec' ) THEN
414	#if defined( __nec )
415	!
416	!-- Code optimized for vector processors
417	!$OMP PARALLEL PRIVATE ( i, j, k )
418	!$OMP DO
419	DO i = nxl, nxr
420
421	DO j = 0, ny
422	DO k = 1, nz
423	work_ffty_vec(j,k,i) = f_in(k,j,i)
424	ENDDO
425	ENDDO
426
427	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
428
429	ENDDO
430
431	!$OMP DO
432	DO k = 1, nz
433	DO j = 0, ny
434	DO i = nxl, nxr
435	work(i,k,j) = work_ffty_vec(j,k,i)
436	ENDDO
437	ENDDO
438	ENDDO
439	!$OMP END PARALLEL
440	#endif
441
442	ELSE
443
444	!
445	!-- Cache optimized code.
446	!-- The i-(x-)direction is split into a strided outer loop and an inner
447	!-- loop for better cache performance
448	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
449	!$OMP DO
450	DO iouter = nxl, nxr, stridex
451
452	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
453
454	DO k = 1, nz
455
456	DO i = iouter, iend
457
458	ir = i-iouter+1 ! counter within a stride
459	DO j = 0, ny
460	work_ffty(j,ir) = f_in(k,j,i)
461	ENDDO
462	!
463	!-- FFT along y
464	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
465
466	ENDDO
467
468	!
469	!-- Resort
470	DO j = 0, ny
471	DO i = iouter, iend
472	work(i,k,j) = work_ffty(j,i-iouter+1)
473	ENDDO
474	ENDDO
475
476	ENDDO
477
478	ENDDO
479	!$OMP END PARALLEL
480
481	ENDIF
482	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
483
484	!
485	!-- Transpose array
486	#if defined( __parallel )
487	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
488	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
489	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
490	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
491	comm1dx, ierr )
492	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
493	#endif
494
495	END SUBROUTINE ffty_tr_yx
496
497
498	SUBROUTINE tr_xy_ffty( f_in, work, f_out )
499
500	!------------------------------------------------------------------------------!
501	! Transposition x --> y with a subsequent backward Fourier transformation for
502	! a 1d-decomposition along x
503	!------------------------------------------------------------------------------!
504
505	USE control_parameters
506	USE cpulog
507	USE indices
508	USE interfaces
509	USE pegrid
510	USE transpose_indices
511
512	IMPLICIT NONE
513
514	INTEGER :: i, iend, iouter, ir, j, k
515	INTEGER, PARAMETER :: stridex = 4
516
517	REAL, DIMENSION(0:ny,stridex) :: work_ffty
518	#if defined( __nec )
519	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
520	#endif
521	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
522	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
523	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
524
525	!
526	!-- Transpose array
527	#if defined( __parallel )
528	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
529	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
530	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
531	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
532	comm1dx, ierr )
533	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
534	#endif
535
536	!
537	!-- Resort the data in a way that y becomes the first index and carry out the
538	!-- backward fft along y.
539	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
540
541	IF ( host(1:3) == 'nec' ) THEN
542	#if defined( __nec )
543	!
544	!-- Code optimized for vector processors
545	!$OMP PARALLEL PRIVATE ( i, j, k )
546	!$OMP DO
547	DO k = 1, nz
548	DO j = 0, ny
549	DO i = nxl, nxr
550	work_ffty_vec(j,k,i) = work(i,k,j)
551	ENDDO
552	ENDDO
553	ENDDO
554
555	!$OMP DO
556	DO i = nxl, nxr
557
558	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
559
560	DO j = 0, ny
561	DO k = 1, nz
562	f_out(k,j,i) = work_ffty_vec(j,k,i)
563	ENDDO
564	ENDDO
565
566	ENDDO
567	!$OMP END PARALLEL
568	#endif
569
570	ELSE
571
572	!
573	!-- Cache optimized code.
574	!-- The i-(x-)direction is split into a strided outer loop and an inner
575	!-- loop for better cache performance
576	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
577	!$OMP DO
578	DO iouter = nxl, nxr, stridex
579
580	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
581
582	DO k = 1, nz
583	!
584	!-- Resort
585	DO j = 0, ny
586	DO i = iouter, iend
587	work_ffty(j,i-iouter+1) = work(i,k,j)
588	ENDDO
589	ENDDO
590
591	DO i = iouter, iend
592
593	!
594	!-- FFT along y
595	ir = i-iouter+1 ! counter within a stride
596	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
597
598	DO j = 0, ny
599	f_out(k,j,i) = work_ffty(j,ir)
600	ENDDO
601	ENDDO
602
603	ENDDO
604
605	ENDDO
606	!$OMP END PARALLEL
607
608	ENDIF
609
610	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
611
612	END SUBROUTINE tr_xy_ffty
613
614
615	SUBROUTINE fftx_tri_fftx( ar )
616
617	!------------------------------------------------------------------------------!
618	! FFT along x, solution of the tridiagonal system and backward FFT for
619	! a 1d-decomposition along x
620	!
621	! WARNING: this subroutine may still not work for hybrid parallelization
622	! with OpenMP (for possible necessary changes see the original
623	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
624	!------------------------------------------------------------------------------!
625
626	USE control_parameters
627	USE cpulog
628	USE grid_variables
629	USE indices
630	USE interfaces
631	USE pegrid
632	USE transpose_indices
633
634	IMPLICIT NONE
635
636	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
637
638	REAL, DIMENSION(0:nx) :: work_fftx
639	REAL, DIMENSION(0:nx,1:nz) :: work_trix
640	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
641	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
642
643
644	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
645
646	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
647
648	tn = 0 ! Default thread number in case of one thread
649	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
650	DO j = nys_x, nyn_x
651
652	!$ tn = omp_get_thread_num()
653
654	IF ( host(1:3) == 'nec' ) THEN
655	!
656	!-- Code optimized for vector processors
657	DO k = 1, nz
658
659	m = 0
660	DO n = 1, pdims(1)
661	DO i = 1, nnx
662	work_trix(m,k) = ar(i,k,j,n)
663	m = m + 1
664	ENDDO
665	ENDDO
666
667	ENDDO
668
669	CALL fft_x_m( work_trix, 'forward' )
670
671	ELSE
672	!
673	!-- Cache optimized code
674	DO k = 1, nz
675
676	m = 0
677	DO n = 1, pdims(1)
678	DO i = 1, nnx
679	work_fftx(m) = ar(i,k,j,n)
680	m = m + 1
681	ENDDO
682	ENDDO
683
684	CALL fft_x_1d( work_fftx, 'forward' )
685
686	DO i = 0, nx
687	work_trix(i,k) = work_fftx(i)
688	ENDDO
689
690	ENDDO
691
692	ENDIF
693
694	!
695	!-- Solve the linear equation system
696	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
697
698	IF ( host(1:3) == 'nec' ) THEN
699	!
700	!-- Code optimized for vector processors
701	CALL fft_x_m( work_trix, 'backward' )
702
703	DO k = 1, nz
704
705	m = 0
706	DO n = 1, pdims(1)
707	DO i = 1, nnx
708	ar(i,k,j,n) = work_trix(m,k)
709	m = m + 1
710	ENDDO
711	ENDDO
712
713	ENDDO
714
715	ELSE
716	!
717	!-- Cache optimized code
718	DO k = 1, nz
719
720	DO i = 0, nx
721	work_fftx(i) = work_trix(i,k)
722	ENDDO
723
724	CALL fft_x_1d( work_fftx, 'backward' )
725
726	m = 0
727	DO n = 1, pdims(1)
728	DO i = 1, nnx
729	ar(i,k,j,n) = work_fftx(m)
730	m = m + 1
731	ENDDO
732	ENDDO
733
734	ENDDO
735
736	ENDIF
737
738	ENDDO
739
740	DEALLOCATE( tri )
741
742	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
743
744	END SUBROUTINE fftx_tri_fftx
745
746
747	SUBROUTINE fftx_tr_xy( f_in, work, f_out )
748
749	!------------------------------------------------------------------------------!
750	! Fourier-transformation along x with subsequent transposition x --> y for
751	! a 1d-decomposition along y
752	!
753	! ATTENTION: The NEC-branch of this routine may significantly profit from
754	! further optimizations. So far, performance is much worse than
755	! for routine ffty_tr_yx (more than three times slower).
756	!------------------------------------------------------------------------------!
757
758	USE control_parameters
759	USE cpulog
760	USE indices
761	USE interfaces
762	USE pegrid
763	USE transpose_indices
764
765	IMPLICIT NONE
766
767	INTEGER :: i, j, k
768
769	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
770	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
771	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
772	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
773
774	!
775	!-- Carry out the FFT along x, where all data are present due to the
776	!-- 1d-decomposition along y. Resort the data in a way that y becomes
777	!-- the first index.
778	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
779
780	IF ( host(1:3) == 'nec' ) THEN
781	!
782	!-- Code for vector processors
783	!$OMP PARALLEL PRIVATE ( i, j, k )
784	!$OMP DO
785	DO i = 0, nx
786
787	DO j = nys, nyn
788	DO k = 1, nz
789	work_fftx(i,k,j) = f_in(k,j,i)
790	ENDDO
791	ENDDO
792
793	ENDDO
794
795	!$OMP DO
796	DO j = nys, nyn
797
798	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
799
800	DO k = 1, nz
801	DO i = 0, nx
802	work(j,k,i) = work_fftx(i,k,j)
803	ENDDO
804	ENDDO
805
806	ENDDO
807	!$OMP END PARALLEL
808
809	ELSE
810
811	!
812	!-- Cache optimized code (there might be still a potential for better
813	!-- optimization).
814	!$OMP PARALLEL PRIVATE (i,j,k)
815	!$OMP DO
816	DO i = 0, nx
817
818	DO j = nys, nyn
819	DO k = 1, nz
820	work_fftx(i,k,j) = f_in(k,j,i)
821	ENDDO
822	ENDDO
823
824	ENDDO
825
826	!$OMP DO
827	DO j = nys, nyn
828	DO k = 1, nz
829
830	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
831
832	DO i = 0, nx
833	work(j,k,i) = work_fftx(i,k,j)
834	ENDDO
835	ENDDO
836
837	ENDDO
838	!$OMP END PARALLEL
839
840	ENDIF
841	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
842
843	!
844	!-- Transpose array
845	#if defined( __parallel )
846	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
847	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
848	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
849	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
850	comm1dy, ierr )
851	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
852	#endif
853
854	END SUBROUTINE fftx_tr_xy
855
856
857	SUBROUTINE tr_yx_fftx( f_in, work, f_out )
858
859	!------------------------------------------------------------------------------!
860	! Transposition y --> x with a subsequent backward Fourier transformation for
861	! a 1d-decomposition along x
862	!------------------------------------------------------------------------------!
863
864	USE control_parameters
865	USE cpulog
866	USE indices
867	USE interfaces
868	USE pegrid
869	USE transpose_indices
870
871	IMPLICIT NONE
872
873	INTEGER :: i, j, k
874
875	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
876	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
877	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
878	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
879
880	!
881	!-- Transpose array
882	#if defined( __parallel )
883	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
884	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
885	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
886	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
887	comm1dy, ierr )
888	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
889	#endif
890
891	!
892	!-- Carry out the FFT along x, where all data are present due to the
893	!-- 1d-decomposition along y. Resort the data in a way that y becomes
894	!-- the first index.
895	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
896
897	IF ( host(1:3) == 'nec' ) THEN
898	!
899	!-- Code optimized for vector processors
900	!$OMP PARALLEL PRIVATE ( i, j, k )
901	!$OMP DO
902	DO j = nys, nyn
903
904	DO k = 1, nz
905	DO i = 0, nx
906	work_fftx(i,k,j) = work(j,k,i)
907	ENDDO
908	ENDDO
909
910	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
911
912	ENDDO
913
914	!$OMP DO
915	DO i = 0, nx
916	DO j = nys, nyn
917	DO k = 1, nz
918	f_out(k,j,i) = work_fftx(i,k,j)
919	ENDDO
920	ENDDO
921	ENDDO
922	!$OMP END PARALLEL
923
924	ELSE
925
926	!
927	!-- Cache optimized code (there might be still a potential for better
928	!-- optimization).
929	!$OMP PARALLEL PRIVATE (i,j,k)
930	!$OMP DO
931	DO j = nys, nyn
932	DO k = 1, nz
933
934	DO i = 0, nx
935	work_fftx(i,k,j) = work(j,k,i)
936	ENDDO
937
938	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
939
940	ENDDO
941	ENDDO
942
943	!$OMP DO
944	DO i = 0, nx
945	DO j = nys, nyn
946	DO k = 1, nz
947	f_out(k,j,i) = work_fftx(i,k,j)
948	ENDDO
949	ENDDO
950	ENDDO
951	!$OMP END PARALLEL
952
953	ENDIF
954	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
955
956	END SUBROUTINE tr_yx_fftx
957
958
959	SUBROUTINE ffty_tri_ffty( ar )
960
961	!------------------------------------------------------------------------------!
962	! FFT along y, solution of the tridiagonal system and backward FFT for
963	! a 1d-decomposition along y
964	!
965	! WARNING: this subroutine may still not work for hybrid parallelization
966	! with OpenMP (for possible necessary changes see the original
967	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
968	!------------------------------------------------------------------------------!
969
970	USE control_parameters
971	USE cpulog
972	USE grid_variables
973	USE indices
974	USE interfaces
975	USE pegrid
976	USE transpose_indices
977
978	IMPLICIT NONE
979
980	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
981
982	REAL, DIMENSION(0:ny) :: work_ffty
983	REAL, DIMENSION(0:ny,1:nz) :: work_triy
984	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
985	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
986
987
988	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
989
990	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
991
992	tn = 0 ! Default thread number in case of one thread
993	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
994	DO i = nxl_y, nxr_y
995
996	!$ tn = omp_get_thread_num()
997
998	IF ( host(1:3) == 'nec' ) THEN
999	!
1000	!-- Code optimized for vector processors
1001	DO k = 1, nz
1002
1003	m = 0
1004	DO n = 1, pdims(2)
1005	DO j = 1, nny
1006	work_triy(m,k) = ar(j,k,i,n)
1007	m = m + 1
1008	ENDDO
1009	ENDDO
1010
1011	ENDDO
1012
1013	CALL fft_y_m( work_triy, ny, 'forward' )
1014
1015	ELSE
1016	!
1017	!-- Cache optimized code
1018	DO k = 1, nz
1019
1020	m = 0
1021	DO n = 1, pdims(2)
1022	DO j = 1, nny
1023	work_ffty(m) = ar(j,k,i,n)
1024	m = m + 1
1025	ENDDO
1026	ENDDO
1027
1028	CALL fft_y_1d( work_ffty, 'forward' )
1029
1030	DO j = 0, ny
1031	work_triy(j,k) = work_ffty(j)
1032	ENDDO
1033
1034	ENDDO
1035
1036	ENDIF
1037
1038	!
1039	!-- Solve the linear equation system
1040	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1041
1042	IF ( host(1:3) == 'nec' ) THEN
1043	!
1044	!-- Code optimized for vector processors
1045	CALL fft_y_m( work_triy, ny, 'backward' )
1046
1047	DO k = 1, nz
1048
1049	m = 0
1050	DO n = 1, pdims(2)
1051	DO j = 1, nny
1052	ar(j,k,i,n) = work_triy(m,k)
1053	m = m + 1
1054	ENDDO
1055	ENDDO
1056
1057	ENDDO
1058
1059	ELSE
1060	!
1061	!-- Cache optimized code
1062	DO k = 1, nz
1063
1064	DO j = 0, ny
1065	work_ffty(j) = work_triy(j,k)
1066	ENDDO
1067
1068	CALL fft_y_1d( work_ffty, 'backward' )
1069
1070	m = 0
1071	DO n = 1, pdims(2)
1072	DO j = 1, nny
1073	ar(j,k,i,n) = work_ffty(m)
1074	m = m + 1
1075	ENDDO
1076	ENDDO
1077
1078	ENDDO
1079
1080	ENDIF
1081
1082	ENDDO
1083
1084	DEALLOCATE( tri )
1085
1086	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1087
1088	END SUBROUTINE ffty_tri_ffty
1089
1090
1091	SUBROUTINE tridia_1dd( ddx2, ddy2, nx, ny, j, ar, tri )
1092
1093	!------------------------------------------------------------------------------!
1094	! Solves the linear system of equations for a 1d-decomposition along x (see
1095	! tridia)
1096	!
1097	! Attention: when using the intel compilers older than 12.0, array tri must
1098	! be passed as an argument to the contained subroutines. Otherwise
1099	! addres faults will occur. This feature can be activated with
1100	! cpp-switch __intel11
1101	! On NEC, tri should not be passed (except for routine substi_1dd)
1102	! because this causes very bad performance.
1103	!------------------------------------------------------------------------------!
1104
1105	USE arrays_3d
1106	USE control_parameters
1107
1108	USE pegrid
1109
1110	IMPLICIT NONE
1111
1112	INTEGER :: i, j, k, nnyh, nx, ny, omp_get_thread_num, tn
1113
1114	REAL :: ddx2, ddy2
1115
1116	REAL, DIMENSION(0:nx,1:nz) :: ar
1117	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1118
1119
1120	nnyh = ( ny + 1 ) / 2
1121
1122	!
1123	!-- Define constant elements of the tridiagonal matrix.
1124	!-- The compiler on SX6 does loop exchange. If 0:nx is a high power of 2,
1125	!-- the exchanged loops create bank conflicts. The following directive
1126	!-- prohibits loop exchange and the loops perform much better.
1127	! tn = omp_get_thread_num()
1128	! WRITE( 120+tn, * ) '+++ id=',myid,' nx=',nx,' thread=', omp_get_thread_num()
1129	! CALL local_flush( 120+tn )
1130	!CDIR NOLOOPCHG
1131	DO k = 0, nz-1
1132	DO i = 0,nx
1133	tri(2,i,k) = ddzu_pres(k+1) * ddzw(k+1)
1134	tri(3,i,k) = ddzu_pres(k+2) * ddzw(k+1)
1135	ENDDO
1136	ENDDO
1137	! WRITE( 120+tn, * ) '+++ id=',myid,' end of first tridia loop thread=', omp_get_thread_num()
1138	! CALL local_flush( 120+tn )
1139
1140	IF ( j <= nnyh ) THEN
1141	#if defined( __intel11 )
1142	CALL maketri_1dd( j, tri )
1143	#else
1144	CALL maketri_1dd( j )
1145	#endif
1146	ELSE
1147	#if defined( __intel11 )
1148	CALL maketri_1dd( ny+1-j, tri )
1149	#else
1150	CALL maketri_1dd( ny+1-j )
1151	#endif
1152	ENDIF
1153	#if defined( __intel11 )
1154	CALL split_1dd( tri )
1155	#else
1156	CALL split_1dd
1157	#endif
1158	CALL substi_1dd( ar, tri )
1159
1160	CONTAINS
1161
1162	#if defined( __intel11 )
1163	SUBROUTINE maketri_1dd( j, tri )
1164	#else
1165	SUBROUTINE maketri_1dd( j )
1166	#endif
1167
1168	!------------------------------------------------------------------------------!
1169	! computes the i- and j-dependent component of the matrix
1170	!------------------------------------------------------------------------------!
1171
1172	USE constants
1173
1174	IMPLICIT NONE
1175
1176	INTEGER :: i, j, k, nnxh
1177	REAL :: a, c
1178
1179	REAL, DIMENSION(0:nx) :: l
1180
1181	#if defined( __intel11 )
1182	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1183	#endif
1184
1185
1186	nnxh = ( nx + 1 ) / 2
1187	!
1188	!-- Provide the tridiagonal matrix for solution of the Poisson equation in
1189	!-- Fourier space. The coefficients are computed following the method of
1190	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1191	!-- Siano's original version by discretizing the Poisson equation,
1192	!-- before it is Fourier-transformed
1193	DO i = 0, nx
1194	IF ( i >= 0 .AND. i <= nnxh ) THEN
1195	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1196	REAL( nx+1 ) ) ) * ddx2 + &
1197	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1198	REAL( ny+1 ) ) ) * ddy2
1199	ELSE
1200	l(i) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1201	REAL( nx+1 ) ) ) * ddx2 + &
1202	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1203	REAL( ny+1 ) ) ) * ddy2
1204	ENDIF
1205	ENDDO
1206
1207	DO k = 0, nz-1
1208	DO i = 0, nx
1209	a = -1.0 * ddzu_pres(k+2) * ddzw(k+1)
1210	c = -1.0 * ddzu_pres(k+1) * ddzw(k+1)
1211	tri(1,i,k) = a + c - l(i)
1212	ENDDO
1213	ENDDO
1214	IF ( ibc_p_b == 1 ) THEN
1215	DO i = 0, nx
1216	tri(1,i,0) = tri(1,i,0) + tri(2,i,0)
1217	ENDDO
1218	ENDIF
1219	IF ( ibc_p_t == 1 ) THEN
1220	DO i = 0, nx
1221	tri(1,i,nz-1) = tri(1,i,nz-1) + tri(3,i,nz-1)
1222	ENDDO
1223	ENDIF
1224
1225	END SUBROUTINE maketri_1dd
1226
1227
1228	#if defined( __intel11 )
1229	SUBROUTINE split_1dd( tri )
1230	#else
1231	SUBROUTINE split_1dd
1232	#endif
1233
1234	!------------------------------------------------------------------------------!
1235	! Splitting of the tridiagonal matrix (Thomas algorithm)
1236	!------------------------------------------------------------------------------!
1237
1238	IMPLICIT NONE
1239
1240	INTEGER :: i, k
1241
1242	#if defined( __intel11 )
1243	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1244	#endif
1245
1246
1247	!
1248	!-- Splitting
1249	DO i = 0, nx
1250	tri(4,i,0) = tri(1,i,0)
1251	ENDDO
1252	DO k = 1, nz-1
1253	DO i = 0, nx
1254	tri(5,i,k) = tri(2,i,k) / tri(4,i,k-1)
1255	tri(4,i,k) = tri(1,i,k) - tri(3,i,k-1) * tri(5,i,k)
1256	ENDDO
1257	ENDDO
1258
1259	END SUBROUTINE split_1dd
1260
1261
1262	SUBROUTINE substi_1dd( ar, tri )
1263
1264	!------------------------------------------------------------------------------!
1265	! Substitution (Forward and Backward) (Thomas algorithm)
1266	!------------------------------------------------------------------------------!
1267
1268	IMPLICIT NONE
1269
1270	INTEGER :: i, k
1271
1272	REAL, DIMENSION(0:nx,nz) :: ar
1273	REAL, DIMENSION(0:nx,0:nz-1) :: ar1
1274	REAL, DIMENSION(5,0:nx,0:nz-1) :: tri
1275
1276	!
1277	!-- Forward substitution
1278	DO i = 0, nx
1279	ar1(i,0) = ar(i,1)
1280	ENDDO
1281	DO k = 1, nz-1
1282	DO i = 0, nx
1283	ar1(i,k) = ar(i,k+1) - tri(5,i,k) * ar1(i,k-1)
1284	ENDDO
1285	ENDDO
1286
1287	!
1288	!-- Backward substitution
1289	!-- Note, the add of 1.0E-20 in the denominator is due to avoid divisions
1290	!-- by zero appearing if the pressure bc is set to neumann at the top of
1291	!-- the model domain.
1292	DO i = 0, nx
1293	ar(i,nz) = ar1(i,nz-1) / ( tri(4,i,nz-1) + 1.0E-20 )
1294	ENDDO
1295	DO k = nz-2, 0, -1
1296	DO i = 0, nx
1297	ar(i,k+1) = ( ar1(i,k) - tri(3,i,k) * ar(i,k+2) ) &
1298	/ tri(4,i,k)
1299	ENDDO
1300	ENDDO
1301
1302	!
1303	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1304	!-- The respective values of ar should be zero at all k-levels if
1305	!-- acceleration of horizontally averaged vertical velocity is zero.
1306	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1307	IF ( j == 0 ) THEN
1308	DO k = 1, nz
1309	ar(0,k) = 0.0
1310	ENDDO
1311	ENDIF
1312	ENDIF
1313
1314	END SUBROUTINE substi_1dd
1315
1316	END SUBROUTINE tridia_1dd
1317
1318
1319	SUBROUTINE tridia( ar )
1320
1321	!------------------------------------------------------------------------------!
1322	! solves the linear system of equations:
1323	!
1324	! -(4 pi^2(i^2/(dx^2nnx^2)+j^2/(dy^2nny^2))+
1325	! 1/(dzu(k)dzw(k))+1/(dzu(k-1)dzw(k)))*p(i,j,k)+
1326	! 1/(dzu(k)dzw(k))p(i,j,k+1)+1/(dzu(k-1)dzw(k))p(i,j,k-1)=d(i,j,k)
1327	!
1328	! by using the Thomas algorithm
1329	!------------------------------------------------------------------------------!
1330
1331	USE arrays_3d
1332
1333	IMPLICIT NONE
1334
1335	INTEGER :: i, j, k
1336
1337	!$acc declare create( tri )
1338	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri
1339
1340	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
1341
1342
1343	CALL split( tri )
1344	CALL substi( ar, tri )
1345
1346	END SUBROUTINE tridia
1347
1348
1349	SUBROUTINE maketri
1350
1351	!------------------------------------------------------------------------------!
1352	! Computes the i- and j-dependent component of the matrix
1353	!------------------------------------------------------------------------------!
1354
1355	USE arrays_3d, ONLY: tric
1356	USE constants
1357	USE control_parameters
1358	USE grid_variables
1359
1360	IMPLICIT NONE
1361
1362	INTEGER :: i, j, k, nnxh, nnyh
1363
1364	!$acc declare create( ll )
1365	REAL :: ll(nxl_z:nxr_z,nys_z:nyn_z)
1366
1367
1368	nnxh = ( nx + 1 ) / 2
1369	nnyh = ( ny + 1 ) / 2
1370
1371	!
1372	!-- Provide the constant coefficients of the tridiagonal matrix for solution
1373	!-- of the Poisson equation in Fourier space.
1374	!-- The coefficients are computed following the method of
1375	!-- Schmidt et al. (DFVLR-Mitteilung 84-15), which departs from Stephan
1376	!-- Siano's original version by discretizing the Poisson equation,
1377	!-- before it is Fourier-transformed.
1378
1379	!$acc kernels present( tric )
1380	!$acc loop vector( 32 )
1381	DO j = nys_z, nyn_z
1382	DO i = nxl_z, nxr_z
1383	IF ( j >= 0 .AND. j <= nnyh ) THEN
1384	IF ( i >= 0 .AND. i <= nnxh ) THEN
1385	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1386	REAL( nx+1 ) ) ) / ( dx * dx ) + &
1387	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1388	REAL( ny+1 ) ) ) / ( dy * dy )
1389	ELSE
1390	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1391	REAL( nx+1 ) ) ) / ( dx * dx ) + &
1392	2.0 * ( 1.0 - COS( ( 2.0 * pi * j ) / &
1393	REAL( ny+1 ) ) ) / ( dy * dy )
1394	ENDIF
1395	ELSE
1396	IF ( i >= 0 .AND. i <= nnxh ) THEN
1397	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * i ) / &
1398	REAL( nx+1 ) ) ) / ( dx * dx ) + &
1399	2.0 * ( 1.0 - COS( ( 2.0 * pi * ( ny+1-j ) ) / &
1400	REAL( ny+1 ) ) ) / ( dy * dy )
1401	ELSE
1402	ll(i,j) = 2.0 * ( 1.0 - COS( ( 2.0 * pi * ( nx+1-i ) ) / &
1403	REAL( nx+1 ) ) ) / ( dx * dx ) + &
1404	2.0 * ( 1.0 - COS( ( 2.0 * pi * ( ny+1-j ) ) / &
1405	REAL( ny+1 ) ) ) / ( dy * dy )
1406	ENDIF
1407	ENDIF
1408	ENDDO
1409	ENDDO
1410
1411	!$acc loop
1412	DO k = 0, nz-1
1413	DO j = nys_z, nyn_z
1414	!$acc loop vector( 32 )
1415	DO i = nxl_z, nxr_z
1416	tric(i,j,k) = ddzuw(k,3) - ll(i,j)
1417	ENDDO
1418	ENDDO
1419	ENDDO
1420	!$acc end kernels
1421
1422	IF ( ibc_p_b == 1 ) THEN
1423	!$acc kernels present( tric )
1424	!$acc loop
1425	DO j = nys_z, nyn_z
1426	DO i = nxl_z, nxr_z
1427	tric(i,j,0) = tric(i,j,0) + ddzuw(0,1)
1428	ENDDO
1429	ENDDO
1430	!$acc end kernels
1431	ENDIF
1432	IF ( ibc_p_t == 1 ) THEN
1433	!$acc kernels present( tric )
1434	!$acc loop
1435	DO j = nys_z, nyn_z
1436	DO i = nxl_z, nxr_z
1437	tric(i,j,nz-1) = tric(i,j,nz-1) + ddzuw(nz-1,2)
1438	ENDDO
1439	ENDDO
1440	!$acc end kernels
1441	ENDIF
1442
1443	END SUBROUTINE maketri
1444
1445
1446	SUBROUTINE substi( ar, tri )
1447
1448	!------------------------------------------------------------------------------!
1449	! Substitution (Forward and Backward) (Thomas algorithm)
1450	!------------------------------------------------------------------------------!
1451
1452	USE control_parameters
1453
1454	IMPLICIT NONE
1455
1456	INTEGER :: i, j, k
1457
1458	REAL :: ar(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
1459	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri
1460
1461	!$acc declare create( ar1 )
1462	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1) :: ar1
1463
1464	!
1465	!-- Forward substitution
1466	DO k = 0, nz - 1
1467	!$acc kernels present( ar, tri )
1468	!$acc loop
1469	DO j = nys_z, nyn_z
1470	DO i = nxl_z, nxr_z
1471
1472	IF ( k == 0 ) THEN
1473	ar1(i,j,k) = ar(i,j,k+1)
1474	ELSE
1475	ar1(i,j,k) = ar(i,j,k+1) - tri(i,j,k,2) * ar1(i,j,k-1)
1476	ENDIF
1477
1478	ENDDO
1479	ENDDO
1480	!$acc end kernels
1481	ENDDO
1482
1483	!
1484	!-- Backward substitution
1485	!-- Note, the 1.0E-20 in the denominator is due to avoid divisions
1486	!-- by zero appearing if the pressure bc is set to neumann at the top of
1487	!-- the model domain.
1488	DO k = nz-1, 0, -1
1489	!$acc kernels present( ar, tri )
1490	!$acc loop
1491	DO j = nys_z, nyn_z
1492	DO i = nxl_z, nxr_z
1493
1494	IF ( k == nz-1 ) THEN
1495	ar(i,j,k+1) = ar1(i,j,k) / ( tri(i,j,k,1) + 1.0E-20 )
1496	ELSE
1497	ar(i,j,k+1) = ( ar1(i,j,k) - ddzuw(k,2) * ar(i,j,k+2) ) &
1498	/ tri(i,j,k,1)
1499	ENDIF
1500	ENDDO
1501	ENDDO
1502	!$acc end kernels
1503	ENDDO
1504
1505	!
1506	!-- Indices i=0, j=0 correspond to horizontally averaged pressure.
1507	!-- The respective values of ar should be zero at all k-levels if
1508	!-- acceleration of horizontally averaged vertical velocity is zero.
1509	IF ( ibc_p_b == 1 .AND. ibc_p_t == 1 ) THEN
1510	IF ( nys_z == 0 .AND. nxl_z == 0 ) THEN
1511	!$acc kernels loop present( ar )
1512	DO k = 1, nz
1513	ar(nxl_z,nys_z,k) = 0.0
1514	ENDDO
1515	ENDIF
1516	ENDIF
1517
1518	END SUBROUTINE substi
1519
1520
1521	SUBROUTINE split( tri )
1522
1523	!------------------------------------------------------------------------------!
1524	! Splitting of the tridiagonal matrix (Thomas algorithm)
1525	!------------------------------------------------------------------------------!
1526
1527	USE arrays_3d, ONLY: tric
1528
1529	IMPLICIT NONE
1530
1531	INTEGER :: i, j, k
1532
1533	REAL, DIMENSION(nxl_z:nxr_z,nys_z:nyn_z,0:nz-1,2) :: tri
1534
1535	!
1536	!-- Splitting
1537	!$acc kernels present( tri, tric )
1538	!$acc loop
1539	DO j = nys_z, nyn_z
1540	!$acc loop vector( 32 )
1541	DO i = nxl_z, nxr_z
1542	tri(i,j,0,1) = tric(i,j,0)
1543	ENDDO
1544	ENDDO
1545	!$acc end kernels
1546
1547	DO k = 1, nz-1
1548	!$acc kernels present( tri, tric )
1549	!$acc loop
1550	DO j = nys_z, nyn_z
1551	!$acc loop vector( 32 )
1552	DO i = nxl_z, nxr_z
1553	tri(i,j,k,2) = ddzuw(k,1) / tri(i,j,k-1,1)
1554	tri(i,j,k,1) = tric(i,j,k) - ddzuw(k-1,2) * tri(i,j,k,2)
1555	ENDDO
1556	ENDDO
1557	!$acc end kernels
1558	ENDDO
1559
1560	END SUBROUTINE split
1561
1562	#endif
1563
1564	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |