Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 2062

Last change on this file since 2062 was 2001, checked in by knoop, 8 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.7 KB

Line
1	!> @file poisfft_mod.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of PALM.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2016 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 2001 2016-08-20 18:41:22Z maronga $
27	!
28	! 2000 2016-08-20 18:09:15Z knoop
29	! Forced header and separation lines into 80 columns
30	!
31	! 1850 2016-04-08 13:29:27Z maronga
32	! Module renamed
33	!
34	!
35	! 1804 2016-04-05 16:30:18Z maronga
36	! Removed code for parameter file check (__check)
37	!
38	! 1682 2015-10-07 23:56:08Z knoop
39	! Code annotations made doxygen readable
40	!
41	! 1482 2014-10-18 12:34:45Z raasch
42	! use 2d-decomposition, if accelerator boards are used
43	!
44	! 1406 2014-05-16 13:47:01Z raasch
45	! bugfix for pgi 14.4: declare create moved after array declaration
46	!
47	! 1320 2014-03-20 08:40:49Z raasch
48	! ONLY-attribute added to USE-statements,
49	! kind-parameters added to all INTEGER and REAL declaration statements,
50	! kinds are defined in new module kinds,
51	! old module precision_kind is removed,
52	! revision history before 2012 removed,
53	! comment fields (!:) to be used for variable explanations added to
54	! all variable declaration statements
55	!
56	! 1318 2014-03-17 13:35:16Z raasch
57	! module interfaces removed
58	!
59	! 1306 2014-03-13 14:30:59Z raasch
60	! openmp sections removed from the overlap branch,
61	! second argument removed from parameter list
62	!
63	! 1216 2013-08-26 09:31:42Z raasch
64	! resorting of arrays moved to separate routines resort_for_...,
65	! one argument, used as temporary work array, removed from all transpose
66	! routines
67	! overlapping fft / transposition implemented
68	!
69	! 1212 2013-08-15 08:46:27Z raasch
70	! tridia routines moved to seperate module tridia_solver
71	!
72	! 1208 2013-08-13 06:41:49Z raasch
73	! acc-update clauses added for "ar" so that ffts other than cufft can also be
74	! used (although they are not ported and will give a poor performance)
75	!
76	! 1111 2013-03-08 23:54:10Z raasch
77	! further openACC porting of non-parallel (MPI) branch:
78	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
79	! no distinction between parallel/non-parallel in poisfft and tridia any more,
80	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
81	! (otherwise "invalid device function" is indicated during runtime),
82	! optimization of tridia routines: constant elements and coefficients of tri are
83	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
84	! to 2,
85	! poisfft_init is now called internally from poisfft, maketri is called from
86	! poisfft_init,
87	! ibc_p_b = 2 removed
88	!
89	! 1106 2013-03-04 05:31:38Z raasch
90	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
91	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
92	! fft_y_1d
93	!
94	! 1103 2013-02-20 02:15:53Z raasch
95	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
96	! sometimes cause segmentation faults with intel 12.1 compiler
97	!
98	! 1092 2013-02-02 11:24:22Z raasch
99	! unused variables removed
100	!
101	! 1036 2012-10-22 13:43:42Z raasch
102	! code put under GPL (PALM 3.9)
103	!
104	! 2012-09-21 07:03:55Z raasch
105	! FLOAT type conversion replaced by REAL
106	!
107	! 1003 2012-09-14 14:35:53Z raasch
108	! indices nxa, nya, etc. replaced by nx, ny, etc.
109	!
110	! 940 2012-07-09 14:31:00Z raasch
111	! special handling of tri-array as an argument in tridia_1dd routines switched
112	! off because it caused segmentation faults with intel 12.1 compiler
113	!
114	! 877 2012-04-03 11:21:44Z suehring
115	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
116	! pressure at the top of the model domain.
117	!
118	! 809 2012-01-30 13:32:58Z maronga
119	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
120	!
121	! 807 2012-01-25 11:53:51Z maronga
122	! New cpp directive "__check" implemented which is used by check_namelist_files
123	! (most of the code is unneeded by check_namelist_files).
124	!
125	! Revision 1.1 1997/07/24 11:24:14 raasch
126	! Initial revision
127	!
128	!
129	! Description:
130	! ------------
131	!> Solves the Poisson equation with a 2D spectral method
132	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
133	!>
134	!> Input:
135	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
136	!> starting from (1,nys,nxl)
137	!>
138	!> Output:
139	!> real ar contains the solution for perturbation pressure p
140	!------------------------------------------------------------------------------!
141	MODULE poisfft_mod
142
143
144	USE fft_xy, &
145	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
146
147	USE indices, &
148	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
149
150	USE transpose_indices, &
151	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
152	nzb_y, nzt_x, nzt_y
153
154	USE tridia_solver, &
155	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
156
157	IMPLICIT NONE
158
159	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
160
161	PRIVATE
162
163	PUBLIC poisfft, poisfft_init
164
165	INTERFACE poisfft
166	MODULE PROCEDURE poisfft
167	END INTERFACE poisfft
168
169	INTERFACE poisfft_init
170	MODULE PROCEDURE poisfft_init
171	END INTERFACE poisfft_init
172
173
174	CONTAINS
175
176	!------------------------------------------------------------------------------!
177	! Description:
178	! ------------
179	!> @todo Missing subroutine description.
180	!------------------------------------------------------------------------------!
181	SUBROUTINE poisfft_init
182
183	USE arrays_3d, &
184	ONLY: ddzu_pres, ddzw
185
186	USE kinds
187
188	IMPLICIT NONE
189
190	INTEGER(iwp) :: k !<
191
192
193	CALL fft_init
194
195	CALL tridia_init
196
197	poisfft_initialized = .TRUE.
198
199	END SUBROUTINE poisfft_init
200
201
202
203	!------------------------------------------------------------------------------!
204	! Description:
205	! ------------
206	!> Two-dimensional Fourier Transformation in x- and y-direction.
207	!------------------------------------------------------------------------------!
208	SUBROUTINE poisfft( ar )
209
210	USE control_parameters, &
211	ONLY: fft_method, transpose_compute_overlap
212
213	USE cpulog, &
214	ONLY: cpu_log, cpu_log_nowait, log_point_s
215
216	USE kinds
217
218	USE pegrid
219
220	IMPLICIT NONE
221
222	INTEGER(iwp) :: ii !<
223	INTEGER(iwp) :: iind !<
224	INTEGER(iwp) :: inew !<
225	INTEGER(iwp) :: jj !<
226	INTEGER(iwp) :: jind !<
227	INTEGER(iwp) :: jnew !<
228	INTEGER(iwp) :: ki !<
229	INTEGER(iwp) :: kk !<
230	INTEGER(iwp) :: knew !<
231	INTEGER(iwp) :: n !<
232	INTEGER(iwp) :: nblk !<
233	INTEGER(iwp) :: nnx_y !<
234	INTEGER(iwp) :: nny_z !<
235	INTEGER(iwp) :: nnz_t !<
236	INTEGER(iwp) :: nnz_x !<
237	INTEGER(iwp) :: nxl_y_bound !<
238	INTEGER(iwp) :: nxr_y_bound !<
239
240	INTEGER(iwp), DIMENSION(4) :: isave !<
241
242	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
243	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
244	!$acc declare create( ar_inv )
245
246	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
247	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
248	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
249	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
250	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
251
252
253	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
254
255	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
256
257	!
258	!-- Two-dimensional Fourier Transformation in x- and y-direction.
259	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
260	THEN
261
262	!
263	!-- 1d-domain-decomposition along x:
264	!-- FFT along y and transposition y --> x
265	CALL ffty_tr_yx( ar, ar )
266
267	!
268	!-- FFT along x, solving the tridiagonal system and backward FFT
269	CALL fftx_tri_fftx( ar )
270
271	!
272	!-- Transposition x --> y and backward FFT along y
273	CALL tr_xy_ffty( ar, ar )
274
275	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
276	THEN
277
278	!
279	!-- 1d-domain-decomposition along y:
280	!-- FFT along x and transposition x --> y
281	CALL fftx_tr_xy( ar, ar )
282
283	!
284	!-- FFT along y, solving the tridiagonal system and backward FFT
285	CALL ffty_tri_ffty( ar )
286
287	!
288	!-- Transposition y --> x and backward FFT along x
289	CALL tr_yx_fftx( ar, ar )
290
291	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
292
293	!
294	!-- 2d-domain-decomposition or no decomposition (1 PE run)
295	!-- Transposition z --> x
296	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
297	CALL resort_for_zx( ar, ar_inv )
298	CALL transpose_zx( ar_inv, ar )
299	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
300
301	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
302	IF ( fft_method /= 'system-specific' ) THEN
303	!$acc update host( ar )
304	ENDIF
305	CALL fft_x( ar, 'forward' )
306	IF ( fft_method /= 'system-specific' ) THEN
307	!$acc update device( ar )
308	ENDIF
309	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
310
311	!
312	!-- Transposition x --> y
313	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
314	CALL resort_for_xy( ar, ar_inv )
315	CALL transpose_xy( ar_inv, ar )
316	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
317
318	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
319	IF ( fft_method /= 'system-specific' ) THEN
320	!$acc update host( ar )
321	ENDIF
322	CALL fft_y( ar, 'forward', ar_tr = ar, &
323	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
324	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
325	IF ( fft_method /= 'system-specific' ) THEN
326	!$acc update device( ar )
327	ENDIF
328	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
329
330	!
331	!-- Transposition y --> z
332	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
333	CALL resort_for_yz( ar, ar_inv )
334	CALL transpose_yz( ar_inv, ar )
335	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
336
337	!
338	!-- Solve the tridiagonal equation system along z
339	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
340	CALL tridia_substi( ar )
341	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
342
343	!
344	!-- Inverse Fourier Transformation
345	!-- Transposition z --> y
346	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
347	CALL transpose_zy( ar, ar_inv )
348	CALL resort_for_zy( ar_inv, ar )
349	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
350
351	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
352	IF ( fft_method /= 'system-specific' ) THEN
353	!$acc update host( ar )
354	ENDIF
355	CALL fft_y( ar, 'backward', ar_tr = ar, &
356	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
357	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
358	IF ( fft_method /= 'system-specific' ) THEN
359	!$acc update device( ar )
360	ENDIF
361	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
362
363	!
364	!-- Transposition y --> x
365	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
366	CALL transpose_yx( ar, ar_inv )
367	CALL resort_for_yx( ar_inv, ar )
368	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
369
370	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
371	IF ( fft_method /= 'system-specific' ) THEN
372	!$acc update host( ar )
373	ENDIF
374	CALL fft_x( ar, 'backward' )
375	IF ( fft_method /= 'system-specific' ) THEN
376	!$acc update device( ar )
377	ENDIF
378	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
379
380	!
381	!-- Transposition x --> z
382	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
383	CALL transpose_xz( ar, ar_inv )
384	CALL resort_for_xz( ar_inv, ar )
385	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
386
387	ELSE
388
389	!
390	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
391	!-- overlapping transposition / fft
392	!-- cputime logging must not use barriers, which would prevent overlapping
393	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
394	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
395	!
396	!-- Transposition z --> x + subsequent fft along x
397	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
398	CALL resort_for_zx( ar, f_inv )
399	!
400	!-- Save original indices and gridpoint counter
401	isave(1) = nz
402	isave(2) = nzb_x
403	isave(3) = nzt_x
404	isave(4) = sendrecvcount_zx
405	!
406	!-- Set new indices for transformation
407	nblk = nz / pdims(1)
408	nz = pdims(1)
409	nnz_x = 1
410	nzb_x = 1 + myidx * nnz_x
411	nzt_x = ( myidx + 1 ) * nnz_x
412	sendrecvcount_zx = nnx * nny * nnz_x
413
414	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
415	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
416
417	DO kk = 1, nblk
418
419	IF ( kk == 1 ) THEN
420	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
421	ELSE
422	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
423	ENDIF
424
425	DO knew = 1, nz
426	ki = kk + nblk * ( knew - 1 )
427	f_in(:,:,knew) = f_inv(:,:,ki)
428	ENDDO
429
430	CALL transpose_zx( f_in, ar1(:,:,:))
431	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
432
433	IF ( kk == 1 ) THEN
434	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
435	ELSE
436	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
437	ENDIF
438
439	n = isave(2) + kk - 1
440	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
441	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
442
443	ENDDO
444	!
445	!-- Restore original indices/counters
446	nz = isave(1)
447	nzb_x = isave(2)
448	nzt_x = isave(3)
449	sendrecvcount_zx = isave(4)
450
451	DEALLOCATE( ar1, f_in, f_inv )
452
453	!
454	!-- Transposition x --> y + subsequent fft along y
455	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
456	CALL resort_for_xy( f_out_z, f_inv )
457	!
458	!-- Save original indices and gridpoint counter
459	isave(1) = nx
460	isave(2) = nxl_y
461	isave(3) = nxr_y
462	isave(4) = sendrecvcount_xy
463	!
464	!-- Set new indices for transformation
465	nblk = ( ( nx+1 ) / pdims(2) ) - 1
466	nx = pdims(2)
467	nnx_y = 1
468	nxl_y = myidy * nnx_y
469	nxr_y = ( myidy + 1 ) * nnx_y - 1
470	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
471
472	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
473	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
474
475	DO ii = 0, nblk
476
477	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
478
479	DO inew = 0, nx-1
480	iind = ii + ( nblk + 1 ) * inew
481	f_in(:,:,inew) = f_inv(:,:,iind)
482	ENDDO
483
484	CALL transpose_xy( f_in, ar1(:,:,:) )
485
486	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
487
488	IF ( ii == 1 ) THEN
489	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
490	ELSE
491	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
492	ENDIF
493
494	nxl_y_bound = isave(2)
495	nxr_y_bound = isave(3)
496	n = isave(2) + ii
497	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
498	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
499	nxl_y_l = n, nxr_y_l = n )
500
501	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
502
503	ENDDO
504	!
505	!-- Restore original indices/counters
506	nx = isave(1)
507	nxl_y = isave(2)
508	nxr_y = isave(3)
509	sendrecvcount_xy = isave(4)
510
511	DEALLOCATE( ar1, f_in, f_inv )
512
513	!
514	!-- Transposition y --> z + subsequent tridia + resort for z --> y
515	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
516	CALL resort_for_yz( f_out_y, f_inv )
517	!
518	!-- Save original indices and gridpoint counter
519	isave(1) = ny
520	isave(2) = nys_z
521	isave(3) = nyn_z
522	isave(4) = sendrecvcount_yz
523	!
524	!-- Set new indices for transformation
525	nblk = ( ( ny+1 ) / pdims(1) ) - 1
526	ny = pdims(1)
527	nny_z = 1
528	nys_z = myidx * nny_z
529	nyn_z = ( myidx + 1 ) * nny_z - 1
530	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
531
532	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
533	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
534
535	DO jj = 0, nblk
536	!
537	!-- Forward Fourier Transformation
538	!-- Transposition y --> z
539	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
540
541	DO jnew = 0, ny-1
542	jind = jj + ( nblk + 1 ) * jnew
543	f_in(:,:,jnew) = f_inv(:,:,jind)
544	ENDDO
545
546	CALL transpose_yz( f_in, ar1(:,:,:) )
547
548	IF ( jj == nblk ) THEN
549	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
550	ELSE
551	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
552	ENDIF
553
554	!
555	!-- Solve the tridiagonal equation system along z
556	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
557
558	n = isave(2) + jj
559	CALL tridia_substi_overlap( ar1(:,:,:), n )
560
561	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
562
563	!
564	!-- Inverse Fourier Transformation
565	!-- Transposition z --> y
566	!-- Only one thread should call MPI routines, therefore forward and
567	!-- backward tranpose are in the same section
568	IF ( jj == 0 ) THEN
569	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
570	ELSE
571	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
572	ENDIF
573
574	CALL transpose_zy( ar1(:,:,:), f_in )
575
576	DO jnew = 0, ny-1
577	jind = jj + ( nblk + 1 ) * jnew
578	f_inv(:,:,jind) = f_in(:,:,jnew)
579	ENDDO
580
581	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
582
583	ENDDO
584	!
585	!-- Restore original indices/counters
586	ny = isave(1)
587	nys_z = isave(2)
588	nyn_z = isave(3)
589	sendrecvcount_yz = isave(4)
590
591	CALL resort_for_zy( f_inv, f_out_y )
592
593	DEALLOCATE( ar1, f_in, f_inv )
594
595	!
596	!-- fft along y backward + subsequent transposition y --> x
597	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
598	!
599	!-- Save original indices and gridpoint counter
600	isave(1) = nx
601	isave(2) = nxl_y
602	isave(3) = nxr_y
603	isave(4) = sendrecvcount_xy
604	!
605	!-- Set new indices for transformation
606	nblk = (( nx+1 ) / pdims(2) ) - 1
607	nx = pdims(2)
608	nnx_y = 1
609	nxl_y = myidy * nnx_y
610	nxr_y = ( myidy + 1 ) * nnx_y - 1
611	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
612
613	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
614	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
615
616	DO ii = 0, nblk
617
618	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
619
620	n = isave(2) + ii
621	nxl_y_bound = isave(2)
622	nxr_y_bound = isave(3)
623
624	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
625	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
626	nxl_y_l = n, nxr_y_l = n )
627
628	IF ( ii == nblk ) THEN
629	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
630	ELSE
631	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
632	ENDIF
633
634	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
635
636	CALL transpose_yx( ar1(:,:,:), f_in )
637
638	DO inew = 0, nx-1
639	iind = ii + (nblk+1) * inew
640	f_inv(:,:,iind) = f_in(:,:,inew)
641	ENDDO
642
643	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
644
645	ENDDO
646	!
647	!-- Restore original indices/counters
648	nx = isave(1)
649	nxl_y = isave(2)
650	nxr_y = isave(3)
651	sendrecvcount_xy = isave(4)
652
653	CALL resort_for_yx( f_inv, f_out_z )
654
655	DEALLOCATE( ar1, f_in, f_inv )
656
657	!
658	!-- fft along x backward + subsequent final transposition x --> z
659	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
660	!
661	!-- Save original indices and gridpoint counter
662	isave(1) = nz
663	isave(2) = nzb_x
664	isave(3) = nzt_x
665	isave(4) = sendrecvcount_zx
666	!
667	!-- Set new indices for transformation
668	nblk = nz / pdims(1)
669	nz = pdims(1)
670	nnz_x = 1
671	nzb_x = 1 + myidx * nnz_x
672	nzt_x = ( myidx + 1 ) * nnz_x
673	sendrecvcount_zx = nnx * nny * nnz_x
674
675	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
676	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
677
678	DO kk = 1, nblk
679
680	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
681
682	n = isave(2) + kk - 1
683	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
684
685	IF ( kk == nblk ) THEN
686	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
687	ELSE
688	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
689	ENDIF
690
691	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
692
693	CALL transpose_xz( ar1(:,:,:), f_in )
694
695	DO knew = 1, nz
696	ki = kk + nblk * (knew-1)
697	f_inv(:,:,ki) = f_in(:,:,knew)
698	ENDDO
699
700	IF ( kk == nblk ) THEN
701	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
702	ELSE
703	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
704	ENDIF
705
706	ENDDO
707	!
708	!-- Restore original indices/counters
709	nz = isave(1)
710	nzb_x = isave(2)
711	nzt_x = isave(3)
712	sendrecvcount_zx = isave(4)
713
714	CALL resort_for_xz( f_inv, ar )
715
716	DEALLOCATE( ar1, f_in, f_inv )
717
718	ENDIF
719
720	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
721
722	END SUBROUTINE poisfft
723
724
725	!------------------------------------------------------------------------------!
726	! Description:
727	! ------------
728	!> Fourier-transformation along y with subsequent transposition y --> x for
729	!> a 1d-decomposition along x.
730	!>
731	!> @attention The performance of this routine is much faster on the NEC-SX6,
732	!> if the first index of work_ffty_vec is odd. Otherwise
733	!> memory bank conflicts may occur (especially if the index is a
734	!> multiple of 128). That's why work_ffty_vec is dimensioned as
735	!> 0:ny+1.
736	!> Of course, this will not work if users are using an odd number
737	!> of gridpoints along y.
738	!------------------------------------------------------------------------------!
739	SUBROUTINE ffty_tr_yx( f_in, f_out )
740
741
742	USE control_parameters, &
743	ONLY: host
744
745	USE cpulog, &
746	ONLY: cpu_log, log_point_s
747
748	USE kinds
749
750	USE pegrid
751
752	IMPLICIT NONE
753
754	INTEGER(iwp) :: i !<
755	INTEGER(iwp) :: iend !<
756	INTEGER(iwp) :: iouter !<
757	INTEGER(iwp) :: ir !<
758	INTEGER(iwp) :: j !<
759	INTEGER(iwp) :: k !<
760
761	INTEGER(iwp), PARAMETER :: stridex = 4 !<
762
763	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
764	#if defined( __nec )
765	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
766	#endif
767	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
768	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
769	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
770
771	!
772	!-- Carry out the FFT along y, where all data are present due to the
773	!-- 1d-decomposition along x. Resort the data in a way that x becomes
774	!-- the first index.
775	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
776
777	IF ( host(1:3) == 'nec' ) THEN
778	#if defined( __nec )
779	!
780	!-- Code optimized for vector processors
781	!$OMP PARALLEL PRIVATE ( i, j, k )
782	!$OMP DO
783	DO i = nxl, nxr
784
785	DO j = 0, ny
786	DO k = 1, nz
787	work_ffty_vec(j,k,i) = f_in(k,j,i)
788	ENDDO
789	ENDDO
790
791	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
792
793	ENDDO
794
795	!$OMP DO
796	DO k = 1, nz
797	DO j = 0, ny
798	DO i = nxl, nxr
799	work(i,k,j) = work_ffty_vec(j,k,i)
800	ENDDO
801	ENDDO
802	ENDDO
803	!$OMP END PARALLEL
804	#endif
805
806	ELSE
807
808	!
809	!-- Cache optimized code.
810	!-- The i-(x-)direction is split into a strided outer loop and an inner
811	!-- loop for better cache performance
812	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
813	!$OMP DO
814	DO iouter = nxl, nxr, stridex
815
816	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
817
818	DO k = 1, nz
819
820	DO i = iouter, iend
821
822	ir = i-iouter+1 ! counter within a stride
823	DO j = 0, ny
824	work_ffty(j,ir) = f_in(k,j,i)
825	ENDDO
826	!
827	!-- FFT along y
828	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
829
830	ENDDO
831
832	!
833	!-- Resort
834	DO j = 0, ny
835	DO i = iouter, iend
836	work(i,k,j) = work_ffty(j,i-iouter+1)
837	ENDDO
838	ENDDO
839
840	ENDDO
841
842	ENDDO
843	!$OMP END PARALLEL
844
845	ENDIF
846	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
847
848	!
849	!-- Transpose array
850	#if defined( __parallel )
851	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
852	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
853	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
854	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
855	comm1dx, ierr )
856	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
857	#endif
858
859	END SUBROUTINE ffty_tr_yx
860
861
862	!------------------------------------------------------------------------------!
863	! Description:
864	! ------------
865	!> Transposition x --> y with a subsequent backward Fourier transformation for
866	!> a 1d-decomposition along x
867	!------------------------------------------------------------------------------!
868	SUBROUTINE tr_xy_ffty( f_in, f_out )
869
870
871	USE control_parameters, &
872	ONLY: host
873
874	USE cpulog, &
875	ONLY: cpu_log, log_point_s
876
877	USE kinds
878
879	USE pegrid
880
881	IMPLICIT NONE
882
883	INTEGER(iwp) :: i !<
884	INTEGER(iwp) :: iend !<
885	INTEGER(iwp) :: iouter !<
886	INTEGER(iwp) :: ir !<
887	INTEGER(iwp) :: j !<
888	INTEGER(iwp) :: k !<
889
890	INTEGER(iwp), PARAMETER :: stridex = 4 !<
891
892	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
893	#if defined( __nec )
894	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
895	#endif
896	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
897	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
898	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
899
900	!
901	!-- Transpose array
902	#if defined( __parallel )
903	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
904	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
905	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
906	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
907	comm1dx, ierr )
908	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
909	#endif
910
911	!
912	!-- Resort the data in a way that y becomes the first index and carry out the
913	!-- backward fft along y.
914	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
915
916	IF ( host(1:3) == 'nec' ) THEN
917	#if defined( __nec )
918	!
919	!-- Code optimized for vector processors
920	!$OMP PARALLEL PRIVATE ( i, j, k )
921	!$OMP DO
922	DO k = 1, nz
923	DO j = 0, ny
924	DO i = nxl, nxr
925	work_ffty_vec(j,k,i) = work(i,k,j)
926	ENDDO
927	ENDDO
928	ENDDO
929
930	!$OMP DO
931	DO i = nxl, nxr
932
933	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
934
935	DO j = 0, ny
936	DO k = 1, nz
937	f_out(k,j,i) = work_ffty_vec(j,k,i)
938	ENDDO
939	ENDDO
940
941	ENDDO
942	!$OMP END PARALLEL
943	#endif
944
945	ELSE
946
947	!
948	!-- Cache optimized code.
949	!-- The i-(x-)direction is split into a strided outer loop and an inner
950	!-- loop for better cache performance
951	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
952	!$OMP DO
953	DO iouter = nxl, nxr, stridex
954
955	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
956
957	DO k = 1, nz
958	!
959	!-- Resort
960	DO j = 0, ny
961	DO i = iouter, iend
962	work_ffty(j,i-iouter+1) = work(i,k,j)
963	ENDDO
964	ENDDO
965
966	DO i = iouter, iend
967
968	!
969	!-- FFT along y
970	ir = i-iouter+1 ! counter within a stride
971	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
972
973	DO j = 0, ny
974	f_out(k,j,i) = work_ffty(j,ir)
975	ENDDO
976	ENDDO
977
978	ENDDO
979
980	ENDDO
981	!$OMP END PARALLEL
982
983	ENDIF
984
985	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
986
987	END SUBROUTINE tr_xy_ffty
988
989
990	!------------------------------------------------------------------------------!
991	! Description:
992	! ------------
993	!> FFT along x, solution of the tridiagonal system and backward FFT for
994	!> a 1d-decomposition along x
995	!>
996	!> @warning this subroutine may still not work for hybrid parallelization
997	!> with OpenMP (for possible necessary changes see the original
998	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
999	!------------------------------------------------------------------------------!
1000	SUBROUTINE fftx_tri_fftx( ar )
1001
1002
1003	USE control_parameters, &
1004	ONLY: host
1005
1006	USE cpulog, &
1007	ONLY: cpu_log, log_point_s
1008
1009	USE grid_variables, &
1010	ONLY: ddx2, ddy2
1011
1012	USE kinds
1013
1014	USE pegrid
1015
1016	IMPLICIT NONE
1017
1018	INTEGER(iwp) :: i !<
1019	INTEGER(iwp) :: j !<
1020	INTEGER(iwp) :: k !<
1021	INTEGER(iwp) :: m !<
1022	INTEGER(iwp) :: n !<
1023	INTEGER(iwp) :: omp_get_thread_num !<
1024	INTEGER(iwp) :: tn !<
1025
1026	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
1027	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
1028	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
1029	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1030
1031
1032	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1033
1034	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1035
1036	tn = 0 ! Default thread number in case of one thread
1037	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1038	DO j = nys_x, nyn_x
1039
1040	!$ tn = omp_get_thread_num()
1041
1042	IF ( host(1:3) == 'nec' ) THEN
1043	!
1044	!-- Code optimized for vector processors
1045	DO k = 1, nz
1046
1047	m = 0
1048	DO n = 1, pdims(1)
1049	DO i = 1, nnx
1050	work_trix(m,k) = ar(i,k,j,n)
1051	m = m + 1
1052	ENDDO
1053	ENDDO
1054
1055	ENDDO
1056
1057	CALL fft_x_m( work_trix, 'forward' )
1058
1059	ELSE
1060	!
1061	!-- Cache optimized code
1062	DO k = 1, nz
1063
1064	m = 0
1065	DO n = 1, pdims(1)
1066	DO i = 1, nnx
1067	work_fftx(m) = ar(i,k,j,n)
1068	m = m + 1
1069	ENDDO
1070	ENDDO
1071
1072	CALL fft_x_1d( work_fftx, 'forward' )
1073
1074	DO i = 0, nx
1075	work_trix(i,k) = work_fftx(i)
1076	ENDDO
1077
1078	ENDDO
1079
1080	ENDIF
1081
1082	!
1083	!-- Solve the linear equation system
1084	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1085
1086	IF ( host(1:3) == 'nec' ) THEN
1087	!
1088	!-- Code optimized for vector processors
1089	CALL fft_x_m( work_trix, 'backward' )
1090
1091	DO k = 1, nz
1092
1093	m = 0
1094	DO n = 1, pdims(1)
1095	DO i = 1, nnx
1096	ar(i,k,j,n) = work_trix(m,k)
1097	m = m + 1
1098	ENDDO
1099	ENDDO
1100
1101	ENDDO
1102
1103	ELSE
1104	!
1105	!-- Cache optimized code
1106	DO k = 1, nz
1107
1108	DO i = 0, nx
1109	work_fftx(i) = work_trix(i,k)
1110	ENDDO
1111
1112	CALL fft_x_1d( work_fftx, 'backward' )
1113
1114	m = 0
1115	DO n = 1, pdims(1)
1116	DO i = 1, nnx
1117	ar(i,k,j,n) = work_fftx(m)
1118	m = m + 1
1119	ENDDO
1120	ENDDO
1121
1122	ENDDO
1123
1124	ENDIF
1125
1126	ENDDO
1127
1128	DEALLOCATE( tri )
1129
1130	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1131
1132	END SUBROUTINE fftx_tri_fftx
1133
1134
1135	!------------------------------------------------------------------------------!
1136	! Description:
1137	! ------------
1138	!> Fourier-transformation along x with subsequent transposition x --> y for
1139	!> a 1d-decomposition along y.
1140	!>
1141	!> @attention NEC-branch of this routine may significantly profit from
1142	!> further optimizations. So far, performance is much worse than
1143	!> for routine ffty_tr_yx (more than three times slower).
1144	!------------------------------------------------------------------------------!
1145	SUBROUTINE fftx_tr_xy( f_in, f_out )
1146
1147
1148	USE control_parameters, &
1149	ONLY: host
1150
1151	USE cpulog, &
1152	ONLY: cpu_log, log_point_s
1153
1154	USE kinds
1155
1156	USE pegrid
1157
1158	IMPLICIT NONE
1159
1160	INTEGER(iwp) :: i !<
1161	INTEGER(iwp) :: j !<
1162	INTEGER(iwp) :: k !<
1163
1164	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1165	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1166	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1167	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1168
1169	!
1170	!-- Carry out the FFT along x, where all data are present due to the
1171	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1172	!-- the first index.
1173	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1174
1175	IF ( host(1:3) == 'nec' ) THEN
1176	!
1177	!-- Code for vector processors
1178	!$OMP PARALLEL PRIVATE ( i, j, k )
1179	!$OMP DO
1180	DO i = 0, nx
1181
1182	DO j = nys, nyn
1183	DO k = 1, nz
1184	work_fftx(i,k,j) = f_in(k,j,i)
1185	ENDDO
1186	ENDDO
1187
1188	ENDDO
1189
1190	!$OMP DO
1191	DO j = nys, nyn
1192
1193	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1194
1195	DO k = 1, nz
1196	DO i = 0, nx
1197	work(j,k,i) = work_fftx(i,k,j)
1198	ENDDO
1199	ENDDO
1200
1201	ENDDO
1202	!$OMP END PARALLEL
1203
1204	ELSE
1205
1206	!
1207	!-- Cache optimized code (there might be still a potential for better
1208	!-- optimization).
1209	!$OMP PARALLEL PRIVATE (i,j,k)
1210	!$OMP DO
1211	DO i = 0, nx
1212
1213	DO j = nys, nyn
1214	DO k = 1, nz
1215	work_fftx(i,k,j) = f_in(k,j,i)
1216	ENDDO
1217	ENDDO
1218
1219	ENDDO
1220
1221	!$OMP DO
1222	DO j = nys, nyn
1223	DO k = 1, nz
1224
1225	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1226
1227	DO i = 0, nx
1228	work(j,k,i) = work_fftx(i,k,j)
1229	ENDDO
1230	ENDDO
1231
1232	ENDDO
1233	!$OMP END PARALLEL
1234
1235	ENDIF
1236	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1237
1238	!
1239	!-- Transpose array
1240	#if defined( __parallel )
1241	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1242	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1243	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1244	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1245	comm1dy, ierr )
1246	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1247	#endif
1248
1249	END SUBROUTINE fftx_tr_xy
1250
1251
1252	!------------------------------------------------------------------------------!
1253	! Description:
1254	! ------------
1255	!> Transposition y --> x with a subsequent backward Fourier transformation for
1256	!> a 1d-decomposition along x.
1257	!------------------------------------------------------------------------------!
1258	SUBROUTINE tr_yx_fftx( f_in, f_out )
1259
1260
1261	USE control_parameters, &
1262	ONLY: host
1263
1264	USE cpulog, &
1265	ONLY: cpu_log, log_point_s
1266
1267	USE kinds
1268
1269	USE pegrid
1270
1271	IMPLICIT NONE
1272
1273	INTEGER(iwp) :: i !<
1274	INTEGER(iwp) :: j !<
1275	INTEGER(iwp) :: k !<
1276
1277	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1278	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1279	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1280	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1281
1282	!
1283	!-- Transpose array
1284	#if defined( __parallel )
1285	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1286	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1287	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1288	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1289	comm1dy, ierr )
1290	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1291	#endif
1292
1293	!
1294	!-- Carry out the FFT along x, where all data are present due to the
1295	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1296	!-- the first index.
1297	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1298
1299	IF ( host(1:3) == 'nec' ) THEN
1300	!
1301	!-- Code optimized for vector processors
1302	!$OMP PARALLEL PRIVATE ( i, j, k )
1303	!$OMP DO
1304	DO j = nys, nyn
1305
1306	DO k = 1, nz
1307	DO i = 0, nx
1308	work_fftx(i,k,j) = work(j,k,i)
1309	ENDDO
1310	ENDDO
1311
1312	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1313
1314	ENDDO
1315
1316	!$OMP DO
1317	DO i = 0, nx
1318	DO j = nys, nyn
1319	DO k = 1, nz
1320	f_out(k,j,i) = work_fftx(i,k,j)
1321	ENDDO
1322	ENDDO
1323	ENDDO
1324	!$OMP END PARALLEL
1325
1326	ELSE
1327
1328	!
1329	!-- Cache optimized code (there might be still a potential for better
1330	!-- optimization).
1331	!$OMP PARALLEL PRIVATE (i,j,k)
1332	!$OMP DO
1333	DO j = nys, nyn
1334	DO k = 1, nz
1335
1336	DO i = 0, nx
1337	work_fftx(i,k,j) = work(j,k,i)
1338	ENDDO
1339
1340	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1341
1342	ENDDO
1343	ENDDO
1344
1345	!$OMP DO
1346	DO i = 0, nx
1347	DO j = nys, nyn
1348	DO k = 1, nz
1349	f_out(k,j,i) = work_fftx(i,k,j)
1350	ENDDO
1351	ENDDO
1352	ENDDO
1353	!$OMP END PARALLEL
1354
1355	ENDIF
1356	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1357
1358	END SUBROUTINE tr_yx_fftx
1359
1360
1361	!------------------------------------------------------------------------------!
1362	! Description:
1363	! ------------
1364	!> FFT along y, solution of the tridiagonal system and backward FFT for
1365	!> a 1d-decomposition along y.
1366	!>
1367	!> @warning this subroutine may still not work for hybrid parallelization
1368	!> with OpenMP (for possible necessary changes see the original
1369	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1370	!------------------------------------------------------------------------------!
1371	SUBROUTINE ffty_tri_ffty( ar )
1372
1373
1374	USE control_parameters, &
1375	ONLY: host
1376
1377	USE cpulog, &
1378	ONLY: cpu_log, log_point_s
1379
1380	USE grid_variables, &
1381	ONLY: ddx2, ddy2
1382
1383	USE kinds
1384
1385	USE pegrid
1386
1387	IMPLICIT NONE
1388
1389	INTEGER(iwp) :: i !<
1390	INTEGER(iwp) :: j !<
1391	INTEGER(iwp) :: k !<
1392	INTEGER(iwp) :: m !<
1393	INTEGER(iwp) :: n !<
1394	INTEGER(iwp) :: omp_get_thread_num !<
1395	INTEGER(iwp) :: tn !<
1396
1397	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1398	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1399	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1400	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1401
1402
1403	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1404
1405	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1406
1407	tn = 0 ! Default thread number in case of one thread
1408	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1409	DO i = nxl_y, nxr_y
1410
1411	!$ tn = omp_get_thread_num()
1412
1413	IF ( host(1:3) == 'nec' ) THEN
1414	!
1415	!-- Code optimized for vector processors
1416	DO k = 1, nz
1417
1418	m = 0
1419	DO n = 1, pdims(2)
1420	DO j = 1, nny
1421	work_triy(m,k) = ar(j,k,i,n)
1422	m = m + 1
1423	ENDDO
1424	ENDDO
1425
1426	ENDDO
1427
1428	CALL fft_y_m( work_triy, ny, 'forward' )
1429
1430	ELSE
1431	!
1432	!-- Cache optimized code
1433	DO k = 1, nz
1434
1435	m = 0
1436	DO n = 1, pdims(2)
1437	DO j = 1, nny
1438	work_ffty(m) = ar(j,k,i,n)
1439	m = m + 1
1440	ENDDO
1441	ENDDO
1442
1443	CALL fft_y_1d( work_ffty, 'forward' )
1444
1445	DO j = 0, ny
1446	work_triy(j,k) = work_ffty(j)
1447	ENDDO
1448
1449	ENDDO
1450
1451	ENDIF
1452
1453	!
1454	!-- Solve the linear equation system
1455	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1456
1457	IF ( host(1:3) == 'nec' ) THEN
1458	!
1459	!-- Code optimized for vector processors
1460	CALL fft_y_m( work_triy, ny, 'backward' )
1461
1462	DO k = 1, nz
1463
1464	m = 0
1465	DO n = 1, pdims(2)
1466	DO j = 1, nny
1467	ar(j,k,i,n) = work_triy(m,k)
1468	m = m + 1
1469	ENDDO
1470	ENDDO
1471
1472	ENDDO
1473
1474	ELSE
1475	!
1476	!-- Cache optimized code
1477	DO k = 1, nz
1478
1479	DO j = 0, ny
1480	work_ffty(j) = work_triy(j,k)
1481	ENDDO
1482
1483	CALL fft_y_1d( work_ffty, 'backward' )
1484
1485	m = 0
1486	DO n = 1, pdims(2)
1487	DO j = 1, nny
1488	ar(j,k,i,n) = work_ffty(m)
1489	m = m + 1
1490	ENDDO
1491	ENDDO
1492
1493	ENDDO
1494
1495	ENDIF
1496
1497	ENDDO
1498
1499	DEALLOCATE( tri )
1500
1501	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1502
1503	END SUBROUTINE ffty_tri_ffty
1504
1505	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |