Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 2000

Last change on this file since 2000 was 2000, checked in by knoop, 8 years ago
Forced header and separation lines into 80 columns
Property svn:keywords set to `Id`
File size: 45.7 KB

Line
1	!> @file poisfft_mod.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of PALM.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2016 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	! Forced header and separation lines into 80 columns
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 2000 2016-08-20 18:09:15Z knoop $
27	!
28	! 1850 2016-04-08 13:29:27Z maronga
29	! Module renamed
30	!
31	!
32	! 1804 2016-04-05 16:30:18Z maronga
33	! Removed code for parameter file check (__check)
34	!
35	! 1682 2015-10-07 23:56:08Z knoop
36	! Code annotations made doxygen readable
37	!
38	! 1482 2014-10-18 12:34:45Z raasch
39	! use 2d-decomposition, if accelerator boards are used
40	!
41	! 1406 2014-05-16 13:47:01Z raasch
42	! bugfix for pgi 14.4: declare create moved after array declaration
43	!
44	! 1320 2014-03-20 08:40:49Z raasch
45	! ONLY-attribute added to USE-statements,
46	! kind-parameters added to all INTEGER and REAL declaration statements,
47	! kinds are defined in new module kinds,
48	! old module precision_kind is removed,
49	! revision history before 2012 removed,
50	! comment fields (!:) to be used for variable explanations added to
51	! all variable declaration statements
52	!
53	! 1318 2014-03-17 13:35:16Z raasch
54	! module interfaces removed
55	!
56	! 1306 2014-03-13 14:30:59Z raasch
57	! openmp sections removed from the overlap branch,
58	! second argument removed from parameter list
59	!
60	! 1216 2013-08-26 09:31:42Z raasch
61	! resorting of arrays moved to separate routines resort_for_...,
62	! one argument, used as temporary work array, removed from all transpose
63	! routines
64	! overlapping fft / transposition implemented
65	!
66	! 1212 2013-08-15 08:46:27Z raasch
67	! tridia routines moved to seperate module tridia_solver
68	!
69	! 1208 2013-08-13 06:41:49Z raasch
70	! acc-update clauses added for "ar" so that ffts other than cufft can also be
71	! used (although they are not ported and will give a poor performance)
72	!
73	! 1111 2013-03-08 23:54:10Z raasch
74	! further openACC porting of non-parallel (MPI) branch:
75	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
76	! no distinction between parallel/non-parallel in poisfft and tridia any more,
77	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
78	! (otherwise "invalid device function" is indicated during runtime),
79	! optimization of tridia routines: constant elements and coefficients of tri are
80	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
81	! to 2,
82	! poisfft_init is now called internally from poisfft, maketri is called from
83	! poisfft_init,
84	! ibc_p_b = 2 removed
85	!
86	! 1106 2013-03-04 05:31:38Z raasch
87	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
88	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
89	! fft_y_1d
90	!
91	! 1103 2013-02-20 02:15:53Z raasch
92	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
93	! sometimes cause segmentation faults with intel 12.1 compiler
94	!
95	! 1092 2013-02-02 11:24:22Z raasch
96	! unused variables removed
97	!
98	! 1036 2012-10-22 13:43:42Z raasch
99	! code put under GPL (PALM 3.9)
100	!
101	! 2012-09-21 07:03:55Z raasch
102	! FLOAT type conversion replaced by REAL
103	!
104	! 1003 2012-09-14 14:35:53Z raasch
105	! indices nxa, nya, etc. replaced by nx, ny, etc.
106	!
107	! 940 2012-07-09 14:31:00Z raasch
108	! special handling of tri-array as an argument in tridia_1dd routines switched
109	! off because it caused segmentation faults with intel 12.1 compiler
110	!
111	! 877 2012-04-03 11:21:44Z suehring
112	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
113	! pressure at the top of the model domain.
114	!
115	! 809 2012-01-30 13:32:58Z maronga
116	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
117	!
118	! 807 2012-01-25 11:53:51Z maronga
119	! New cpp directive "__check" implemented which is used by check_namelist_files
120	! (most of the code is unneeded by check_namelist_files).
121	!
122	! Revision 1.1 1997/07/24 11:24:14 raasch
123	! Initial revision
124	!
125	!
126	! Description:
127	! ------------
128	!> Solves the Poisson equation with a 2D spectral method
129	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
130	!>
131	!> Input:
132	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
133	!> starting from (1,nys,nxl)
134	!>
135	!> Output:
136	!> real ar contains the solution for perturbation pressure p
137	!------------------------------------------------------------------------------!
138	MODULE poisfft_mod
139
140
141	USE fft_xy, &
142	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
143
144	USE indices, &
145	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
146
147	USE transpose_indices, &
148	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
149	nzb_y, nzt_x, nzt_y
150
151	USE tridia_solver, &
152	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
153
154	IMPLICIT NONE
155
156	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
157
158	PRIVATE
159
160	PUBLIC poisfft, poisfft_init
161
162	INTERFACE poisfft
163	MODULE PROCEDURE poisfft
164	END INTERFACE poisfft
165
166	INTERFACE poisfft_init
167	MODULE PROCEDURE poisfft_init
168	END INTERFACE poisfft_init
169
170
171	CONTAINS
172
173	!------------------------------------------------------------------------------!
174	! Description:
175	! ------------
176	!> @todo Missing subroutine description.
177	!------------------------------------------------------------------------------!
178	SUBROUTINE poisfft_init
179
180	USE arrays_3d, &
181	ONLY: ddzu_pres, ddzw
182
183	USE kinds
184
185	IMPLICIT NONE
186
187	INTEGER(iwp) :: k !<
188
189
190	CALL fft_init
191
192	CALL tridia_init
193
194	poisfft_initialized = .TRUE.
195
196	END SUBROUTINE poisfft_init
197
198
199
200	!------------------------------------------------------------------------------!
201	! Description:
202	! ------------
203	!> Two-dimensional Fourier Transformation in x- and y-direction.
204	!------------------------------------------------------------------------------!
205	SUBROUTINE poisfft( ar )
206
207	USE control_parameters, &
208	ONLY: fft_method, transpose_compute_overlap
209
210	USE cpulog, &
211	ONLY: cpu_log, cpu_log_nowait, log_point_s
212
213	USE kinds
214
215	USE pegrid
216
217	IMPLICIT NONE
218
219	INTEGER(iwp) :: ii !<
220	INTEGER(iwp) :: iind !<
221	INTEGER(iwp) :: inew !<
222	INTEGER(iwp) :: jj !<
223	INTEGER(iwp) :: jind !<
224	INTEGER(iwp) :: jnew !<
225	INTEGER(iwp) :: ki !<
226	INTEGER(iwp) :: kk !<
227	INTEGER(iwp) :: knew !<
228	INTEGER(iwp) :: n !<
229	INTEGER(iwp) :: nblk !<
230	INTEGER(iwp) :: nnx_y !<
231	INTEGER(iwp) :: nny_z !<
232	INTEGER(iwp) :: nnz_t !<
233	INTEGER(iwp) :: nnz_x !<
234	INTEGER(iwp) :: nxl_y_bound !<
235	INTEGER(iwp) :: nxr_y_bound !<
236
237	INTEGER(iwp), DIMENSION(4) :: isave !<
238
239	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
240	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
241	!$acc declare create( ar_inv )
242
243	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
244	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
245	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
246	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
247	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
248
249
250	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
251
252	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
253
254	!
255	!-- Two-dimensional Fourier Transformation in x- and y-direction.
256	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
257	THEN
258
259	!
260	!-- 1d-domain-decomposition along x:
261	!-- FFT along y and transposition y --> x
262	CALL ffty_tr_yx( ar, ar )
263
264	!
265	!-- FFT along x, solving the tridiagonal system and backward FFT
266	CALL fftx_tri_fftx( ar )
267
268	!
269	!-- Transposition x --> y and backward FFT along y
270	CALL tr_xy_ffty( ar, ar )
271
272	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
273	THEN
274
275	!
276	!-- 1d-domain-decomposition along y:
277	!-- FFT along x and transposition x --> y
278	CALL fftx_tr_xy( ar, ar )
279
280	!
281	!-- FFT along y, solving the tridiagonal system and backward FFT
282	CALL ffty_tri_ffty( ar )
283
284	!
285	!-- Transposition y --> x and backward FFT along x
286	CALL tr_yx_fftx( ar, ar )
287
288	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
289
290	!
291	!-- 2d-domain-decomposition or no decomposition (1 PE run)
292	!-- Transposition z --> x
293	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
294	CALL resort_for_zx( ar, ar_inv )
295	CALL transpose_zx( ar_inv, ar )
296	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
297
298	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
299	IF ( fft_method /= 'system-specific' ) THEN
300	!$acc update host( ar )
301	ENDIF
302	CALL fft_x( ar, 'forward' )
303	IF ( fft_method /= 'system-specific' ) THEN
304	!$acc update device( ar )
305	ENDIF
306	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
307
308	!
309	!-- Transposition x --> y
310	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
311	CALL resort_for_xy( ar, ar_inv )
312	CALL transpose_xy( ar_inv, ar )
313	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
314
315	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
316	IF ( fft_method /= 'system-specific' ) THEN
317	!$acc update host( ar )
318	ENDIF
319	CALL fft_y( ar, 'forward', ar_tr = ar, &
320	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
321	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
322	IF ( fft_method /= 'system-specific' ) THEN
323	!$acc update device( ar )
324	ENDIF
325	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
326
327	!
328	!-- Transposition y --> z
329	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
330	CALL resort_for_yz( ar, ar_inv )
331	CALL transpose_yz( ar_inv, ar )
332	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
333
334	!
335	!-- Solve the tridiagonal equation system along z
336	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
337	CALL tridia_substi( ar )
338	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
339
340	!
341	!-- Inverse Fourier Transformation
342	!-- Transposition z --> y
343	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
344	CALL transpose_zy( ar, ar_inv )
345	CALL resort_for_zy( ar_inv, ar )
346	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
347
348	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
349	IF ( fft_method /= 'system-specific' ) THEN
350	!$acc update host( ar )
351	ENDIF
352	CALL fft_y( ar, 'backward', ar_tr = ar, &
353	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
354	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
355	IF ( fft_method /= 'system-specific' ) THEN
356	!$acc update device( ar )
357	ENDIF
358	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
359
360	!
361	!-- Transposition y --> x
362	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
363	CALL transpose_yx( ar, ar_inv )
364	CALL resort_for_yx( ar_inv, ar )
365	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
366
367	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
368	IF ( fft_method /= 'system-specific' ) THEN
369	!$acc update host( ar )
370	ENDIF
371	CALL fft_x( ar, 'backward' )
372	IF ( fft_method /= 'system-specific' ) THEN
373	!$acc update device( ar )
374	ENDIF
375	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
376
377	!
378	!-- Transposition x --> z
379	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
380	CALL transpose_xz( ar, ar_inv )
381	CALL resort_for_xz( ar_inv, ar )
382	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
383
384	ELSE
385
386	!
387	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
388	!-- overlapping transposition / fft
389	!-- cputime logging must not use barriers, which would prevent overlapping
390	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
391	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
392	!
393	!-- Transposition z --> x + subsequent fft along x
394	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
395	CALL resort_for_zx( ar, f_inv )
396	!
397	!-- Save original indices and gridpoint counter
398	isave(1) = nz
399	isave(2) = nzb_x
400	isave(3) = nzt_x
401	isave(4) = sendrecvcount_zx
402	!
403	!-- Set new indices for transformation
404	nblk = nz / pdims(1)
405	nz = pdims(1)
406	nnz_x = 1
407	nzb_x = 1 + myidx * nnz_x
408	nzt_x = ( myidx + 1 ) * nnz_x
409	sendrecvcount_zx = nnx * nny * nnz_x
410
411	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
412	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
413
414	DO kk = 1, nblk
415
416	IF ( kk == 1 ) THEN
417	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
418	ELSE
419	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
420	ENDIF
421
422	DO knew = 1, nz
423	ki = kk + nblk * ( knew - 1 )
424	f_in(:,:,knew) = f_inv(:,:,ki)
425	ENDDO
426
427	CALL transpose_zx( f_in, ar1(:,:,:))
428	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
429
430	IF ( kk == 1 ) THEN
431	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
432	ELSE
433	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
434	ENDIF
435
436	n = isave(2) + kk - 1
437	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
438	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
439
440	ENDDO
441	!
442	!-- Restore original indices/counters
443	nz = isave(1)
444	nzb_x = isave(2)
445	nzt_x = isave(3)
446	sendrecvcount_zx = isave(4)
447
448	DEALLOCATE( ar1, f_in, f_inv )
449
450	!
451	!-- Transposition x --> y + subsequent fft along y
452	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
453	CALL resort_for_xy( f_out_z, f_inv )
454	!
455	!-- Save original indices and gridpoint counter
456	isave(1) = nx
457	isave(2) = nxl_y
458	isave(3) = nxr_y
459	isave(4) = sendrecvcount_xy
460	!
461	!-- Set new indices for transformation
462	nblk = ( ( nx+1 ) / pdims(2) ) - 1
463	nx = pdims(2)
464	nnx_y = 1
465	nxl_y = myidy * nnx_y
466	nxr_y = ( myidy + 1 ) * nnx_y - 1
467	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
468
469	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
470	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
471
472	DO ii = 0, nblk
473
474	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
475
476	DO inew = 0, nx-1
477	iind = ii + ( nblk + 1 ) * inew
478	f_in(:,:,inew) = f_inv(:,:,iind)
479	ENDDO
480
481	CALL transpose_xy( f_in, ar1(:,:,:) )
482
483	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
484
485	IF ( ii == 1 ) THEN
486	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
487	ELSE
488	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
489	ENDIF
490
491	nxl_y_bound = isave(2)
492	nxr_y_bound = isave(3)
493	n = isave(2) + ii
494	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
495	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
496	nxl_y_l = n, nxr_y_l = n )
497
498	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
499
500	ENDDO
501	!
502	!-- Restore original indices/counters
503	nx = isave(1)
504	nxl_y = isave(2)
505	nxr_y = isave(3)
506	sendrecvcount_xy = isave(4)
507
508	DEALLOCATE( ar1, f_in, f_inv )
509
510	!
511	!-- Transposition y --> z + subsequent tridia + resort for z --> y
512	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
513	CALL resort_for_yz( f_out_y, f_inv )
514	!
515	!-- Save original indices and gridpoint counter
516	isave(1) = ny
517	isave(2) = nys_z
518	isave(3) = nyn_z
519	isave(4) = sendrecvcount_yz
520	!
521	!-- Set new indices for transformation
522	nblk = ( ( ny+1 ) / pdims(1) ) - 1
523	ny = pdims(1)
524	nny_z = 1
525	nys_z = myidx * nny_z
526	nyn_z = ( myidx + 1 ) * nny_z - 1
527	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
528
529	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
530	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
531
532	DO jj = 0, nblk
533	!
534	!-- Forward Fourier Transformation
535	!-- Transposition y --> z
536	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
537
538	DO jnew = 0, ny-1
539	jind = jj + ( nblk + 1 ) * jnew
540	f_in(:,:,jnew) = f_inv(:,:,jind)
541	ENDDO
542
543	CALL transpose_yz( f_in, ar1(:,:,:) )
544
545	IF ( jj == nblk ) THEN
546	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
547	ELSE
548	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
549	ENDIF
550
551	!
552	!-- Solve the tridiagonal equation system along z
553	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
554
555	n = isave(2) + jj
556	CALL tridia_substi_overlap( ar1(:,:,:), n )
557
558	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
559
560	!
561	!-- Inverse Fourier Transformation
562	!-- Transposition z --> y
563	!-- Only one thread should call MPI routines, therefore forward and
564	!-- backward tranpose are in the same section
565	IF ( jj == 0 ) THEN
566	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
567	ELSE
568	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
569	ENDIF
570
571	CALL transpose_zy( ar1(:,:,:), f_in )
572
573	DO jnew = 0, ny-1
574	jind = jj + ( nblk + 1 ) * jnew
575	f_inv(:,:,jind) = f_in(:,:,jnew)
576	ENDDO
577
578	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
579
580	ENDDO
581	!
582	!-- Restore original indices/counters
583	ny = isave(1)
584	nys_z = isave(2)
585	nyn_z = isave(3)
586	sendrecvcount_yz = isave(4)
587
588	CALL resort_for_zy( f_inv, f_out_y )
589
590	DEALLOCATE( ar1, f_in, f_inv )
591
592	!
593	!-- fft along y backward + subsequent transposition y --> x
594	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
595	!
596	!-- Save original indices and gridpoint counter
597	isave(1) = nx
598	isave(2) = nxl_y
599	isave(3) = nxr_y
600	isave(4) = sendrecvcount_xy
601	!
602	!-- Set new indices for transformation
603	nblk = (( nx+1 ) / pdims(2) ) - 1
604	nx = pdims(2)
605	nnx_y = 1
606	nxl_y = myidy * nnx_y
607	nxr_y = ( myidy + 1 ) * nnx_y - 1
608	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
609
610	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
611	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
612
613	DO ii = 0, nblk
614
615	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
616
617	n = isave(2) + ii
618	nxl_y_bound = isave(2)
619	nxr_y_bound = isave(3)
620
621	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
622	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
623	nxl_y_l = n, nxr_y_l = n )
624
625	IF ( ii == nblk ) THEN
626	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
627	ELSE
628	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
629	ENDIF
630
631	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
632
633	CALL transpose_yx( ar1(:,:,:), f_in )
634
635	DO inew = 0, nx-1
636	iind = ii + (nblk+1) * inew
637	f_inv(:,:,iind) = f_in(:,:,inew)
638	ENDDO
639
640	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
641
642	ENDDO
643	!
644	!-- Restore original indices/counters
645	nx = isave(1)
646	nxl_y = isave(2)
647	nxr_y = isave(3)
648	sendrecvcount_xy = isave(4)
649
650	CALL resort_for_yx( f_inv, f_out_z )
651
652	DEALLOCATE( ar1, f_in, f_inv )
653
654	!
655	!-- fft along x backward + subsequent final transposition x --> z
656	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
657	!
658	!-- Save original indices and gridpoint counter
659	isave(1) = nz
660	isave(2) = nzb_x
661	isave(3) = nzt_x
662	isave(4) = sendrecvcount_zx
663	!
664	!-- Set new indices for transformation
665	nblk = nz / pdims(1)
666	nz = pdims(1)
667	nnz_x = 1
668	nzb_x = 1 + myidx * nnz_x
669	nzt_x = ( myidx + 1 ) * nnz_x
670	sendrecvcount_zx = nnx * nny * nnz_x
671
672	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
673	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
674
675	DO kk = 1, nblk
676
677	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
678
679	n = isave(2) + kk - 1
680	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
681
682	IF ( kk == nblk ) THEN
683	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
684	ELSE
685	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
686	ENDIF
687
688	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
689
690	CALL transpose_xz( ar1(:,:,:), f_in )
691
692	DO knew = 1, nz
693	ki = kk + nblk * (knew-1)
694	f_inv(:,:,ki) = f_in(:,:,knew)
695	ENDDO
696
697	IF ( kk == nblk ) THEN
698	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
699	ELSE
700	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
701	ENDIF
702
703	ENDDO
704	!
705	!-- Restore original indices/counters
706	nz = isave(1)
707	nzb_x = isave(2)
708	nzt_x = isave(3)
709	sendrecvcount_zx = isave(4)
710
711	CALL resort_for_xz( f_inv, ar )
712
713	DEALLOCATE( ar1, f_in, f_inv )
714
715	ENDIF
716
717	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
718
719	END SUBROUTINE poisfft
720
721
722	!------------------------------------------------------------------------------!
723	! Description:
724	! ------------
725	!> Fourier-transformation along y with subsequent transposition y --> x for
726	!> a 1d-decomposition along x.
727	!>
728	!> @attention The performance of this routine is much faster on the NEC-SX6,
729	!> if the first index of work_ffty_vec is odd. Otherwise
730	!> memory bank conflicts may occur (especially if the index is a
731	!> multiple of 128). That's why work_ffty_vec is dimensioned as
732	!> 0:ny+1.
733	!> Of course, this will not work if users are using an odd number
734	!> of gridpoints along y.
735	!------------------------------------------------------------------------------!
736	SUBROUTINE ffty_tr_yx( f_in, f_out )
737
738
739	USE control_parameters, &
740	ONLY: host
741
742	USE cpulog, &
743	ONLY: cpu_log, log_point_s
744
745	USE kinds
746
747	USE pegrid
748
749	IMPLICIT NONE
750
751	INTEGER(iwp) :: i !<
752	INTEGER(iwp) :: iend !<
753	INTEGER(iwp) :: iouter !<
754	INTEGER(iwp) :: ir !<
755	INTEGER(iwp) :: j !<
756	INTEGER(iwp) :: k !<
757
758	INTEGER(iwp), PARAMETER :: stridex = 4 !<
759
760	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
761	#if defined( __nec )
762	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
763	#endif
764	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
765	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
766	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
767
768	!
769	!-- Carry out the FFT along y, where all data are present due to the
770	!-- 1d-decomposition along x. Resort the data in a way that x becomes
771	!-- the first index.
772	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
773
774	IF ( host(1:3) == 'nec' ) THEN
775	#if defined( __nec )
776	!
777	!-- Code optimized for vector processors
778	!$OMP PARALLEL PRIVATE ( i, j, k )
779	!$OMP DO
780	DO i = nxl, nxr
781
782	DO j = 0, ny
783	DO k = 1, nz
784	work_ffty_vec(j,k,i) = f_in(k,j,i)
785	ENDDO
786	ENDDO
787
788	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
789
790	ENDDO
791
792	!$OMP DO
793	DO k = 1, nz
794	DO j = 0, ny
795	DO i = nxl, nxr
796	work(i,k,j) = work_ffty_vec(j,k,i)
797	ENDDO
798	ENDDO
799	ENDDO
800	!$OMP END PARALLEL
801	#endif
802
803	ELSE
804
805	!
806	!-- Cache optimized code.
807	!-- The i-(x-)direction is split into a strided outer loop and an inner
808	!-- loop for better cache performance
809	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
810	!$OMP DO
811	DO iouter = nxl, nxr, stridex
812
813	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
814
815	DO k = 1, nz
816
817	DO i = iouter, iend
818
819	ir = i-iouter+1 ! counter within a stride
820	DO j = 0, ny
821	work_ffty(j,ir) = f_in(k,j,i)
822	ENDDO
823	!
824	!-- FFT along y
825	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
826
827	ENDDO
828
829	!
830	!-- Resort
831	DO j = 0, ny
832	DO i = iouter, iend
833	work(i,k,j) = work_ffty(j,i-iouter+1)
834	ENDDO
835	ENDDO
836
837	ENDDO
838
839	ENDDO
840	!$OMP END PARALLEL
841
842	ENDIF
843	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
844
845	!
846	!-- Transpose array
847	#if defined( __parallel )
848	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
849	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
850	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
851	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
852	comm1dx, ierr )
853	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
854	#endif
855
856	END SUBROUTINE ffty_tr_yx
857
858
859	!------------------------------------------------------------------------------!
860	! Description:
861	! ------------
862	!> Transposition x --> y with a subsequent backward Fourier transformation for
863	!> a 1d-decomposition along x
864	!------------------------------------------------------------------------------!
865	SUBROUTINE tr_xy_ffty( f_in, f_out )
866
867
868	USE control_parameters, &
869	ONLY: host
870
871	USE cpulog, &
872	ONLY: cpu_log, log_point_s
873
874	USE kinds
875
876	USE pegrid
877
878	IMPLICIT NONE
879
880	INTEGER(iwp) :: i !<
881	INTEGER(iwp) :: iend !<
882	INTEGER(iwp) :: iouter !<
883	INTEGER(iwp) :: ir !<
884	INTEGER(iwp) :: j !<
885	INTEGER(iwp) :: k !<
886
887	INTEGER(iwp), PARAMETER :: stridex = 4 !<
888
889	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
890	#if defined( __nec )
891	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
892	#endif
893	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
894	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
895	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
896
897	!
898	!-- Transpose array
899	#if defined( __parallel )
900	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
901	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
902	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
903	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
904	comm1dx, ierr )
905	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
906	#endif
907
908	!
909	!-- Resort the data in a way that y becomes the first index and carry out the
910	!-- backward fft along y.
911	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
912
913	IF ( host(1:3) == 'nec' ) THEN
914	#if defined( __nec )
915	!
916	!-- Code optimized for vector processors
917	!$OMP PARALLEL PRIVATE ( i, j, k )
918	!$OMP DO
919	DO k = 1, nz
920	DO j = 0, ny
921	DO i = nxl, nxr
922	work_ffty_vec(j,k,i) = work(i,k,j)
923	ENDDO
924	ENDDO
925	ENDDO
926
927	!$OMP DO
928	DO i = nxl, nxr
929
930	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
931
932	DO j = 0, ny
933	DO k = 1, nz
934	f_out(k,j,i) = work_ffty_vec(j,k,i)
935	ENDDO
936	ENDDO
937
938	ENDDO
939	!$OMP END PARALLEL
940	#endif
941
942	ELSE
943
944	!
945	!-- Cache optimized code.
946	!-- The i-(x-)direction is split into a strided outer loop and an inner
947	!-- loop for better cache performance
948	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
949	!$OMP DO
950	DO iouter = nxl, nxr, stridex
951
952	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
953
954	DO k = 1, nz
955	!
956	!-- Resort
957	DO j = 0, ny
958	DO i = iouter, iend
959	work_ffty(j,i-iouter+1) = work(i,k,j)
960	ENDDO
961	ENDDO
962
963	DO i = iouter, iend
964
965	!
966	!-- FFT along y
967	ir = i-iouter+1 ! counter within a stride
968	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
969
970	DO j = 0, ny
971	f_out(k,j,i) = work_ffty(j,ir)
972	ENDDO
973	ENDDO
974
975	ENDDO
976
977	ENDDO
978	!$OMP END PARALLEL
979
980	ENDIF
981
982	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
983
984	END SUBROUTINE tr_xy_ffty
985
986
987	!------------------------------------------------------------------------------!
988	! Description:
989	! ------------
990	!> FFT along x, solution of the tridiagonal system and backward FFT for
991	!> a 1d-decomposition along x
992	!>
993	!> @warning this subroutine may still not work for hybrid parallelization
994	!> with OpenMP (for possible necessary changes see the original
995	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
996	!------------------------------------------------------------------------------!
997	SUBROUTINE fftx_tri_fftx( ar )
998
999
1000	USE control_parameters, &
1001	ONLY: host
1002
1003	USE cpulog, &
1004	ONLY: cpu_log, log_point_s
1005
1006	USE grid_variables, &
1007	ONLY: ddx2, ddy2
1008
1009	USE kinds
1010
1011	USE pegrid
1012
1013	IMPLICIT NONE
1014
1015	INTEGER(iwp) :: i !<
1016	INTEGER(iwp) :: j !<
1017	INTEGER(iwp) :: k !<
1018	INTEGER(iwp) :: m !<
1019	INTEGER(iwp) :: n !<
1020	INTEGER(iwp) :: omp_get_thread_num !<
1021	INTEGER(iwp) :: tn !<
1022
1023	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
1024	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
1025	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
1026	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1027
1028
1029	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1030
1031	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1032
1033	tn = 0 ! Default thread number in case of one thread
1034	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1035	DO j = nys_x, nyn_x
1036
1037	!$ tn = omp_get_thread_num()
1038
1039	IF ( host(1:3) == 'nec' ) THEN
1040	!
1041	!-- Code optimized for vector processors
1042	DO k = 1, nz
1043
1044	m = 0
1045	DO n = 1, pdims(1)
1046	DO i = 1, nnx
1047	work_trix(m,k) = ar(i,k,j,n)
1048	m = m + 1
1049	ENDDO
1050	ENDDO
1051
1052	ENDDO
1053
1054	CALL fft_x_m( work_trix, 'forward' )
1055
1056	ELSE
1057	!
1058	!-- Cache optimized code
1059	DO k = 1, nz
1060
1061	m = 0
1062	DO n = 1, pdims(1)
1063	DO i = 1, nnx
1064	work_fftx(m) = ar(i,k,j,n)
1065	m = m + 1
1066	ENDDO
1067	ENDDO
1068
1069	CALL fft_x_1d( work_fftx, 'forward' )
1070
1071	DO i = 0, nx
1072	work_trix(i,k) = work_fftx(i)
1073	ENDDO
1074
1075	ENDDO
1076
1077	ENDIF
1078
1079	!
1080	!-- Solve the linear equation system
1081	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1082
1083	IF ( host(1:3) == 'nec' ) THEN
1084	!
1085	!-- Code optimized for vector processors
1086	CALL fft_x_m( work_trix, 'backward' )
1087
1088	DO k = 1, nz
1089
1090	m = 0
1091	DO n = 1, pdims(1)
1092	DO i = 1, nnx
1093	ar(i,k,j,n) = work_trix(m,k)
1094	m = m + 1
1095	ENDDO
1096	ENDDO
1097
1098	ENDDO
1099
1100	ELSE
1101	!
1102	!-- Cache optimized code
1103	DO k = 1, nz
1104
1105	DO i = 0, nx
1106	work_fftx(i) = work_trix(i,k)
1107	ENDDO
1108
1109	CALL fft_x_1d( work_fftx, 'backward' )
1110
1111	m = 0
1112	DO n = 1, pdims(1)
1113	DO i = 1, nnx
1114	ar(i,k,j,n) = work_fftx(m)
1115	m = m + 1
1116	ENDDO
1117	ENDDO
1118
1119	ENDDO
1120
1121	ENDIF
1122
1123	ENDDO
1124
1125	DEALLOCATE( tri )
1126
1127	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1128
1129	END SUBROUTINE fftx_tri_fftx
1130
1131
1132	!------------------------------------------------------------------------------!
1133	! Description:
1134	! ------------
1135	!> Fourier-transformation along x with subsequent transposition x --> y for
1136	!> a 1d-decomposition along y.
1137	!>
1138	!> @attention NEC-branch of this routine may significantly profit from
1139	!> further optimizations. So far, performance is much worse than
1140	!> for routine ffty_tr_yx (more than three times slower).
1141	!------------------------------------------------------------------------------!
1142	SUBROUTINE fftx_tr_xy( f_in, f_out )
1143
1144
1145	USE control_parameters, &
1146	ONLY: host
1147
1148	USE cpulog, &
1149	ONLY: cpu_log, log_point_s
1150
1151	USE kinds
1152
1153	USE pegrid
1154
1155	IMPLICIT NONE
1156
1157	INTEGER(iwp) :: i !<
1158	INTEGER(iwp) :: j !<
1159	INTEGER(iwp) :: k !<
1160
1161	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1162	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1163	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1164	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1165
1166	!
1167	!-- Carry out the FFT along x, where all data are present due to the
1168	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1169	!-- the first index.
1170	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1171
1172	IF ( host(1:3) == 'nec' ) THEN
1173	!
1174	!-- Code for vector processors
1175	!$OMP PARALLEL PRIVATE ( i, j, k )
1176	!$OMP DO
1177	DO i = 0, nx
1178
1179	DO j = nys, nyn
1180	DO k = 1, nz
1181	work_fftx(i,k,j) = f_in(k,j,i)
1182	ENDDO
1183	ENDDO
1184
1185	ENDDO
1186
1187	!$OMP DO
1188	DO j = nys, nyn
1189
1190	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1191
1192	DO k = 1, nz
1193	DO i = 0, nx
1194	work(j,k,i) = work_fftx(i,k,j)
1195	ENDDO
1196	ENDDO
1197
1198	ENDDO
1199	!$OMP END PARALLEL
1200
1201	ELSE
1202
1203	!
1204	!-- Cache optimized code (there might be still a potential for better
1205	!-- optimization).
1206	!$OMP PARALLEL PRIVATE (i,j,k)
1207	!$OMP DO
1208	DO i = 0, nx
1209
1210	DO j = nys, nyn
1211	DO k = 1, nz
1212	work_fftx(i,k,j) = f_in(k,j,i)
1213	ENDDO
1214	ENDDO
1215
1216	ENDDO
1217
1218	!$OMP DO
1219	DO j = nys, nyn
1220	DO k = 1, nz
1221
1222	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1223
1224	DO i = 0, nx
1225	work(j,k,i) = work_fftx(i,k,j)
1226	ENDDO
1227	ENDDO
1228
1229	ENDDO
1230	!$OMP END PARALLEL
1231
1232	ENDIF
1233	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1234
1235	!
1236	!-- Transpose array
1237	#if defined( __parallel )
1238	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1239	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1240	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1241	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1242	comm1dy, ierr )
1243	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1244	#endif
1245
1246	END SUBROUTINE fftx_tr_xy
1247
1248
1249	!------------------------------------------------------------------------------!
1250	! Description:
1251	! ------------
1252	!> Transposition y --> x with a subsequent backward Fourier transformation for
1253	!> a 1d-decomposition along x.
1254	!------------------------------------------------------------------------------!
1255	SUBROUTINE tr_yx_fftx( f_in, f_out )
1256
1257
1258	USE control_parameters, &
1259	ONLY: host
1260
1261	USE cpulog, &
1262	ONLY: cpu_log, log_point_s
1263
1264	USE kinds
1265
1266	USE pegrid
1267
1268	IMPLICIT NONE
1269
1270	INTEGER(iwp) :: i !<
1271	INTEGER(iwp) :: j !<
1272	INTEGER(iwp) :: k !<
1273
1274	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1275	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1276	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1277	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1278
1279	!
1280	!-- Transpose array
1281	#if defined( __parallel )
1282	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1283	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1284	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1285	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1286	comm1dy, ierr )
1287	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1288	#endif
1289
1290	!
1291	!-- Carry out the FFT along x, where all data are present due to the
1292	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1293	!-- the first index.
1294	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1295
1296	IF ( host(1:3) == 'nec' ) THEN
1297	!
1298	!-- Code optimized for vector processors
1299	!$OMP PARALLEL PRIVATE ( i, j, k )
1300	!$OMP DO
1301	DO j = nys, nyn
1302
1303	DO k = 1, nz
1304	DO i = 0, nx
1305	work_fftx(i,k,j) = work(j,k,i)
1306	ENDDO
1307	ENDDO
1308
1309	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1310
1311	ENDDO
1312
1313	!$OMP DO
1314	DO i = 0, nx
1315	DO j = nys, nyn
1316	DO k = 1, nz
1317	f_out(k,j,i) = work_fftx(i,k,j)
1318	ENDDO
1319	ENDDO
1320	ENDDO
1321	!$OMP END PARALLEL
1322
1323	ELSE
1324
1325	!
1326	!-- Cache optimized code (there might be still a potential for better
1327	!-- optimization).
1328	!$OMP PARALLEL PRIVATE (i,j,k)
1329	!$OMP DO
1330	DO j = nys, nyn
1331	DO k = 1, nz
1332
1333	DO i = 0, nx
1334	work_fftx(i,k,j) = work(j,k,i)
1335	ENDDO
1336
1337	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1338
1339	ENDDO
1340	ENDDO
1341
1342	!$OMP DO
1343	DO i = 0, nx
1344	DO j = nys, nyn
1345	DO k = 1, nz
1346	f_out(k,j,i) = work_fftx(i,k,j)
1347	ENDDO
1348	ENDDO
1349	ENDDO
1350	!$OMP END PARALLEL
1351
1352	ENDIF
1353	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1354
1355	END SUBROUTINE tr_yx_fftx
1356
1357
1358	!------------------------------------------------------------------------------!
1359	! Description:
1360	! ------------
1361	!> FFT along y, solution of the tridiagonal system and backward FFT for
1362	!> a 1d-decomposition along y.
1363	!>
1364	!> @warning this subroutine may still not work for hybrid parallelization
1365	!> with OpenMP (for possible necessary changes see the original
1366	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1367	!------------------------------------------------------------------------------!
1368	SUBROUTINE ffty_tri_ffty( ar )
1369
1370
1371	USE control_parameters, &
1372	ONLY: host
1373
1374	USE cpulog, &
1375	ONLY: cpu_log, log_point_s
1376
1377	USE grid_variables, &
1378	ONLY: ddx2, ddy2
1379
1380	USE kinds
1381
1382	USE pegrid
1383
1384	IMPLICIT NONE
1385
1386	INTEGER(iwp) :: i !<
1387	INTEGER(iwp) :: j !<
1388	INTEGER(iwp) :: k !<
1389	INTEGER(iwp) :: m !<
1390	INTEGER(iwp) :: n !<
1391	INTEGER(iwp) :: omp_get_thread_num !<
1392	INTEGER(iwp) :: tn !<
1393
1394	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1395	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1396	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1397	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1398
1399
1400	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1401
1402	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1403
1404	tn = 0 ! Default thread number in case of one thread
1405	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1406	DO i = nxl_y, nxr_y
1407
1408	!$ tn = omp_get_thread_num()
1409
1410	IF ( host(1:3) == 'nec' ) THEN
1411	!
1412	!-- Code optimized for vector processors
1413	DO k = 1, nz
1414
1415	m = 0
1416	DO n = 1, pdims(2)
1417	DO j = 1, nny
1418	work_triy(m,k) = ar(j,k,i,n)
1419	m = m + 1
1420	ENDDO
1421	ENDDO
1422
1423	ENDDO
1424
1425	CALL fft_y_m( work_triy, ny, 'forward' )
1426
1427	ELSE
1428	!
1429	!-- Cache optimized code
1430	DO k = 1, nz
1431
1432	m = 0
1433	DO n = 1, pdims(2)
1434	DO j = 1, nny
1435	work_ffty(m) = ar(j,k,i,n)
1436	m = m + 1
1437	ENDDO
1438	ENDDO
1439
1440	CALL fft_y_1d( work_ffty, 'forward' )
1441
1442	DO j = 0, ny
1443	work_triy(j,k) = work_ffty(j)
1444	ENDDO
1445
1446	ENDDO
1447
1448	ENDIF
1449
1450	!
1451	!-- Solve the linear equation system
1452	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1453
1454	IF ( host(1:3) == 'nec' ) THEN
1455	!
1456	!-- Code optimized for vector processors
1457	CALL fft_y_m( work_triy, ny, 'backward' )
1458
1459	DO k = 1, nz
1460
1461	m = 0
1462	DO n = 1, pdims(2)
1463	DO j = 1, nny
1464	ar(j,k,i,n) = work_triy(m,k)
1465	m = m + 1
1466	ENDDO
1467	ENDDO
1468
1469	ENDDO
1470
1471	ELSE
1472	!
1473	!-- Cache optimized code
1474	DO k = 1, nz
1475
1476	DO j = 0, ny
1477	work_ffty(j) = work_triy(j,k)
1478	ENDDO
1479
1480	CALL fft_y_1d( work_ffty, 'backward' )
1481
1482	m = 0
1483	DO n = 1, pdims(2)
1484	DO j = 1, nny
1485	ar(j,k,i,n) = work_ffty(m)
1486	m = m + 1
1487	ENDDO
1488	ENDDO
1489
1490	ENDDO
1491
1492	ENDIF
1493
1494	ENDDO
1495
1496	DEALLOCATE( tri )
1497
1498	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1499
1500	END SUBROUTINE ffty_tri_ffty
1501
1502	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |