Home

Context Navigation

poisfft.f90 @ 1836

Last change on this file since 1836 was 1805, checked in by maronga, 9 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.5 KB

Line
1	!> @file poisfft.f90
2	!--------------------------------------------------------------------------------!
3	! This file is part of PALM.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the terms
6	! of the GNU General Public License as published by the Free Software Foundation,
7	! either version 3 of the License, or (at your option) any later version.
8	!
9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
10	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
11	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12	!
13	! You should have received a copy of the GNU General Public License along with
14	! PALM. If not, see <http://www.gnu.org/licenses/>.
15	!
16	! Copyright 1997-2014 Leibniz Universitaet Hannover
17	!--------------------------------------------------------------------------------!
18	!
19	! Current revisions:
20	! -----------------
21	!
22	!
23	! Former revisions:
24	! -----------------
25	! $Id: poisfft.f90 1805 2016-04-05 16:32:34Z raasch $
26	!
27	! 1804 2016-04-05 16:30:18Z maronga
28	! Removed code for parameter file check (__check)
29	!
30	! 1682 2015-10-07 23:56:08Z knoop
31	! Code annotations made doxygen readable
32	!
33	! 1482 2014-10-18 12:34:45Z raasch
34	! use 2d-decomposition, if accelerator boards are used
35	!
36	! 1406 2014-05-16 13:47:01Z raasch
37	! bugfix for pgi 14.4: declare create moved after array declaration
38	!
39	! 1320 2014-03-20 08:40:49Z raasch
40	! ONLY-attribute added to USE-statements,
41	! kind-parameters added to all INTEGER and REAL declaration statements,
42	! kinds are defined in new module kinds,
43	! old module precision_kind is removed,
44	! revision history before 2012 removed,
45	! comment fields (!:) to be used for variable explanations added to
46	! all variable declaration statements
47	!
48	! 1318 2014-03-17 13:35:16Z raasch
49	! module interfaces removed
50	!
51	! 1306 2014-03-13 14:30:59Z raasch
52	! openmp sections removed from the overlap branch,
53	! second argument removed from parameter list
54	!
55	! 1216 2013-08-26 09:31:42Z raasch
56	! resorting of arrays moved to separate routines resort_for_...,
57	! one argument, used as temporary work array, removed from all transpose
58	! routines
59	! overlapping fft / transposition implemented
60	!
61	! 1212 2013-08-15 08:46:27Z raasch
62	! tridia routines moved to seperate module tridia_solver
63	!
64	! 1208 2013-08-13 06:41:49Z raasch
65	! acc-update clauses added for "ar" so that ffts other than cufft can also be
66	! used (although they are not ported and will give a poor performance)
67	!
68	! 1111 2013-03-08 23:54:10Z raasch
69	! further openACC porting of non-parallel (MPI) branch:
70	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
71	! no distinction between parallel/non-parallel in poisfft and tridia any more,
72	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
73	! (otherwise "invalid device function" is indicated during runtime),
74	! optimization of tridia routines: constant elements and coefficients of tri are
75	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
76	! to 2,
77	! poisfft_init is now called internally from poisfft, maketri is called from
78	! poisfft_init,
79	! ibc_p_b = 2 removed
80	!
81	! 1106 2013-03-04 05:31:38Z raasch
82	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
83	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
84	! fft_y_1d
85	!
86	! 1103 2013-02-20 02:15:53Z raasch
87	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
88	! sometimes cause segmentation faults with intel 12.1 compiler
89	!
90	! 1092 2013-02-02 11:24:22Z raasch
91	! unused variables removed
92	!
93	! 1036 2012-10-22 13:43:42Z raasch
94	! code put under GPL (PALM 3.9)
95	!
96	! 2012-09-21 07:03:55Z raasch
97	! FLOAT type conversion replaced by REAL
98	!
99	! 1003 2012-09-14 14:35:53Z raasch
100	! indices nxa, nya, etc. replaced by nx, ny, etc.
101	!
102	! 940 2012-07-09 14:31:00Z raasch
103	! special handling of tri-array as an argument in tridia_1dd routines switched
104	! off because it caused segmentation faults with intel 12.1 compiler
105	!
106	! 877 2012-04-03 11:21:44Z suehring
107	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
108	! pressure at the top of the model domain.
109	!
110	! 809 2012-01-30 13:32:58Z maronga
111	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
112	!
113	! 807 2012-01-25 11:53:51Z maronga
114	! New cpp directive "__check" implemented which is used by check_namelist_files
115	! (most of the code is unneeded by check_namelist_files).
116	!
117	! Revision 1.1 1997/07/24 11:24:14 raasch
118	! Initial revision
119	!
120	!
121	! Description:
122	! ------------
123	!> Solves the Poisson equation with a 2D spectral method
124	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
125	!>
126	!> Input:
127	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
128	!> starting from (1,nys,nxl)
129	!>
130	!> Output:
131	!> real ar contains the solution for perturbation pressure p
132	!------------------------------------------------------------------------------!
133	MODULE poisfft_mod
134
135
136	USE fft_xy, &
137	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
138
139	USE indices, &
140	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
141
142	USE transpose_indices, &
143	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
144	nzb_y, nzt_x, nzt_y
145
146	USE tridia_solver, &
147	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
148
149	IMPLICIT NONE
150
151	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
152
153	PRIVATE
154
155	PUBLIC poisfft, poisfft_init
156
157	INTERFACE poisfft
158	MODULE PROCEDURE poisfft
159	END INTERFACE poisfft
160
161	INTERFACE poisfft_init
162	MODULE PROCEDURE poisfft_init
163	END INTERFACE poisfft_init
164
165
166	CONTAINS
167
168	!------------------------------------------------------------------------------!
169	! Description:
170	! ------------
171	!> @todo Missing subroutine description.
172	!------------------------------------------------------------------------------!
173	SUBROUTINE poisfft_init
174
175	USE arrays_3d, &
176	ONLY: ddzu_pres, ddzw
177
178	USE kinds
179
180	IMPLICIT NONE
181
182	INTEGER(iwp) :: k !<
183
184
185	CALL fft_init
186
187	CALL tridia_init
188
189	poisfft_initialized = .TRUE.
190
191	END SUBROUTINE poisfft_init
192
193
194
195	!------------------------------------------------------------------------------!
196	! Description:
197	! ------------
198	!> Two-dimensional Fourier Transformation in x- and y-direction.
199	!------------------------------------------------------------------------------!
200	SUBROUTINE poisfft( ar )
201
202	USE control_parameters, &
203	ONLY: fft_method, transpose_compute_overlap
204
205	USE cpulog, &
206	ONLY: cpu_log, cpu_log_nowait, log_point_s
207
208	USE kinds
209
210	USE pegrid
211
212	IMPLICIT NONE
213
214	INTEGER(iwp) :: ii !<
215	INTEGER(iwp) :: iind !<
216	INTEGER(iwp) :: inew !<
217	INTEGER(iwp) :: jj !<
218	INTEGER(iwp) :: jind !<
219	INTEGER(iwp) :: jnew !<
220	INTEGER(iwp) :: ki !<
221	INTEGER(iwp) :: kk !<
222	INTEGER(iwp) :: knew !<
223	INTEGER(iwp) :: n !<
224	INTEGER(iwp) :: nblk !<
225	INTEGER(iwp) :: nnx_y !<
226	INTEGER(iwp) :: nny_z !<
227	INTEGER(iwp) :: nnz_t !<
228	INTEGER(iwp) :: nnz_x !<
229	INTEGER(iwp) :: nxl_y_bound !<
230	INTEGER(iwp) :: nxr_y_bound !<
231
232	INTEGER(iwp), DIMENSION(4) :: isave !<
233
234	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
235	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
236	!$acc declare create( ar_inv )
237
238	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
239	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
240	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
241	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
242	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
243
244
245	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
246
247	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
248
249	!
250	!-- Two-dimensional Fourier Transformation in x- and y-direction.
251	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
252	THEN
253
254	!
255	!-- 1d-domain-decomposition along x:
256	!-- FFT along y and transposition y --> x
257	CALL ffty_tr_yx( ar, ar )
258
259	!
260	!-- FFT along x, solving the tridiagonal system and backward FFT
261	CALL fftx_tri_fftx( ar )
262
263	!
264	!-- Transposition x --> y and backward FFT along y
265	CALL tr_xy_ffty( ar, ar )
266
267	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
268	THEN
269
270	!
271	!-- 1d-domain-decomposition along y:
272	!-- FFT along x and transposition x --> y
273	CALL fftx_tr_xy( ar, ar )
274
275	!
276	!-- FFT along y, solving the tridiagonal system and backward FFT
277	CALL ffty_tri_ffty( ar )
278
279	!
280	!-- Transposition y --> x and backward FFT along x
281	CALL tr_yx_fftx( ar, ar )
282
283	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
284
285	!
286	!-- 2d-domain-decomposition or no decomposition (1 PE run)
287	!-- Transposition z --> x
288	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
289	CALL resort_for_zx( ar, ar_inv )
290	CALL transpose_zx( ar_inv, ar )
291	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
292
293	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
294	IF ( fft_method /= 'system-specific' ) THEN
295	!$acc update host( ar )
296	ENDIF
297	CALL fft_x( ar, 'forward' )
298	IF ( fft_method /= 'system-specific' ) THEN
299	!$acc update device( ar )
300	ENDIF
301	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
302
303	!
304	!-- Transposition x --> y
305	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
306	CALL resort_for_xy( ar, ar_inv )
307	CALL transpose_xy( ar_inv, ar )
308	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
309
310	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
311	IF ( fft_method /= 'system-specific' ) THEN
312	!$acc update host( ar )
313	ENDIF
314	CALL fft_y( ar, 'forward', ar_tr = ar, &
315	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
316	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
317	IF ( fft_method /= 'system-specific' ) THEN
318	!$acc update device( ar )
319	ENDIF
320	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
321
322	!
323	!-- Transposition y --> z
324	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
325	CALL resort_for_yz( ar, ar_inv )
326	CALL transpose_yz( ar_inv, ar )
327	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
328
329	!
330	!-- Solve the tridiagonal equation system along z
331	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
332	CALL tridia_substi( ar )
333	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
334
335	!
336	!-- Inverse Fourier Transformation
337	!-- Transposition z --> y
338	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
339	CALL transpose_zy( ar, ar_inv )
340	CALL resort_for_zy( ar_inv, ar )
341	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
342
343	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
344	IF ( fft_method /= 'system-specific' ) THEN
345	!$acc update host( ar )
346	ENDIF
347	CALL fft_y( ar, 'backward', ar_tr = ar, &
348	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
349	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
350	IF ( fft_method /= 'system-specific' ) THEN
351	!$acc update device( ar )
352	ENDIF
353	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
354
355	!
356	!-- Transposition y --> x
357	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
358	CALL transpose_yx( ar, ar_inv )
359	CALL resort_for_yx( ar_inv, ar )
360	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
361
362	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
363	IF ( fft_method /= 'system-specific' ) THEN
364	!$acc update host( ar )
365	ENDIF
366	CALL fft_x( ar, 'backward' )
367	IF ( fft_method /= 'system-specific' ) THEN
368	!$acc update device( ar )
369	ENDIF
370	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
371
372	!
373	!-- Transposition x --> z
374	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
375	CALL transpose_xz( ar, ar_inv )
376	CALL resort_for_xz( ar_inv, ar )
377	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
378
379	ELSE
380
381	!
382	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
383	!-- overlapping transposition / fft
384	!-- cputime logging must not use barriers, which would prevent overlapping
385	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
386	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
387	!
388	!-- Transposition z --> x + subsequent fft along x
389	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
390	CALL resort_for_zx( ar, f_inv )
391	!
392	!-- Save original indices and gridpoint counter
393	isave(1) = nz
394	isave(2) = nzb_x
395	isave(3) = nzt_x
396	isave(4) = sendrecvcount_zx
397	!
398	!-- Set new indices for transformation
399	nblk = nz / pdims(1)
400	nz = pdims(1)
401	nnz_x = 1
402	nzb_x = 1 + myidx * nnz_x
403	nzt_x = ( myidx + 1 ) * nnz_x
404	sendrecvcount_zx = nnx * nny * nnz_x
405
406	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
407	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
408
409	DO kk = 1, nblk
410
411	IF ( kk == 1 ) THEN
412	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
413	ELSE
414	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
415	ENDIF
416
417	DO knew = 1, nz
418	ki = kk + nblk * ( knew - 1 )
419	f_in(:,:,knew) = f_inv(:,:,ki)
420	ENDDO
421
422	CALL transpose_zx( f_in, ar1(:,:,:))
423	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
424
425	IF ( kk == 1 ) THEN
426	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
427	ELSE
428	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
429	ENDIF
430
431	n = isave(2) + kk - 1
432	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
433	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
434
435	ENDDO
436	!
437	!-- Restore original indices/counters
438	nz = isave(1)
439	nzb_x = isave(2)
440	nzt_x = isave(3)
441	sendrecvcount_zx = isave(4)
442
443	DEALLOCATE( ar1, f_in, f_inv )
444
445	!
446	!-- Transposition x --> y + subsequent fft along y
447	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
448	CALL resort_for_xy( f_out_z, f_inv )
449	!
450	!-- Save original indices and gridpoint counter
451	isave(1) = nx
452	isave(2) = nxl_y
453	isave(3) = nxr_y
454	isave(4) = sendrecvcount_xy
455	!
456	!-- Set new indices for transformation
457	nblk = ( ( nx+1 ) / pdims(2) ) - 1
458	nx = pdims(2)
459	nnx_y = 1
460	nxl_y = myidy * nnx_y
461	nxr_y = ( myidy + 1 ) * nnx_y - 1
462	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
463
464	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
465	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
466
467	DO ii = 0, nblk
468
469	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
470
471	DO inew = 0, nx-1
472	iind = ii + ( nblk + 1 ) * inew
473	f_in(:,:,inew) = f_inv(:,:,iind)
474	ENDDO
475
476	CALL transpose_xy( f_in, ar1(:,:,:) )
477
478	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
479
480	IF ( ii == 1 ) THEN
481	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
482	ELSE
483	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
484	ENDIF
485
486	nxl_y_bound = isave(2)
487	nxr_y_bound = isave(3)
488	n = isave(2) + ii
489	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
490	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
491	nxl_y_l = n, nxr_y_l = n )
492
493	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
494
495	ENDDO
496	!
497	!-- Restore original indices/counters
498	nx = isave(1)
499	nxl_y = isave(2)
500	nxr_y = isave(3)
501	sendrecvcount_xy = isave(4)
502
503	DEALLOCATE( ar1, f_in, f_inv )
504
505	!
506	!-- Transposition y --> z + subsequent tridia + resort for z --> y
507	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
508	CALL resort_for_yz( f_out_y, f_inv )
509	!
510	!-- Save original indices and gridpoint counter
511	isave(1) = ny
512	isave(2) = nys_z
513	isave(3) = nyn_z
514	isave(4) = sendrecvcount_yz
515	!
516	!-- Set new indices for transformation
517	nblk = ( ( ny+1 ) / pdims(1) ) - 1
518	ny = pdims(1)
519	nny_z = 1
520	nys_z = myidx * nny_z
521	nyn_z = ( myidx + 1 ) * nny_z - 1
522	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
523
524	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
525	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
526
527	DO jj = 0, nblk
528	!
529	!-- Forward Fourier Transformation
530	!-- Transposition y --> z
531	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
532
533	DO jnew = 0, ny-1
534	jind = jj + ( nblk + 1 ) * jnew
535	f_in(:,:,jnew) = f_inv(:,:,jind)
536	ENDDO
537
538	CALL transpose_yz( f_in, ar1(:,:,:) )
539
540	IF ( jj == nblk ) THEN
541	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
542	ELSE
543	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
544	ENDIF
545
546	!
547	!-- Solve the tridiagonal equation system along z
548	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
549
550	n = isave(2) + jj
551	CALL tridia_substi_overlap( ar1(:,:,:), n )
552
553	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
554
555	!
556	!-- Inverse Fourier Transformation
557	!-- Transposition z --> y
558	!-- Only one thread should call MPI routines, therefore forward and
559	!-- backward tranpose are in the same section
560	IF ( jj == 0 ) THEN
561	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
562	ELSE
563	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
564	ENDIF
565
566	CALL transpose_zy( ar1(:,:,:), f_in )
567
568	DO jnew = 0, ny-1
569	jind = jj + ( nblk + 1 ) * jnew
570	f_inv(:,:,jind) = f_in(:,:,jnew)
571	ENDDO
572
573	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
574
575	ENDDO
576	!
577	!-- Restore original indices/counters
578	ny = isave(1)
579	nys_z = isave(2)
580	nyn_z = isave(3)
581	sendrecvcount_yz = isave(4)
582
583	CALL resort_for_zy( f_inv, f_out_y )
584
585	DEALLOCATE( ar1, f_in, f_inv )
586
587	!
588	!-- fft along y backward + subsequent transposition y --> x
589	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
590	!
591	!-- Save original indices and gridpoint counter
592	isave(1) = nx
593	isave(2) = nxl_y
594	isave(3) = nxr_y
595	isave(4) = sendrecvcount_xy
596	!
597	!-- Set new indices for transformation
598	nblk = (( nx+1 ) / pdims(2) ) - 1
599	nx = pdims(2)
600	nnx_y = 1
601	nxl_y = myidy * nnx_y
602	nxr_y = ( myidy + 1 ) * nnx_y - 1
603	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
604
605	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
606	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
607
608	DO ii = 0, nblk
609
610	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
611
612	n = isave(2) + ii
613	nxl_y_bound = isave(2)
614	nxr_y_bound = isave(3)
615
616	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
617	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
618	nxl_y_l = n, nxr_y_l = n )
619
620	IF ( ii == nblk ) THEN
621	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
622	ELSE
623	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
624	ENDIF
625
626	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
627
628	CALL transpose_yx( ar1(:,:,:), f_in )
629
630	DO inew = 0, nx-1
631	iind = ii + (nblk+1) * inew
632	f_inv(:,:,iind) = f_in(:,:,inew)
633	ENDDO
634
635	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
636
637	ENDDO
638	!
639	!-- Restore original indices/counters
640	nx = isave(1)
641	nxl_y = isave(2)
642	nxr_y = isave(3)
643	sendrecvcount_xy = isave(4)
644
645	CALL resort_for_yx( f_inv, f_out_z )
646
647	DEALLOCATE( ar1, f_in, f_inv )
648
649	!
650	!-- fft along x backward + subsequent final transposition x --> z
651	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
652	!
653	!-- Save original indices and gridpoint counter
654	isave(1) = nz
655	isave(2) = nzb_x
656	isave(3) = nzt_x
657	isave(4) = sendrecvcount_zx
658	!
659	!-- Set new indices for transformation
660	nblk = nz / pdims(1)
661	nz = pdims(1)
662	nnz_x = 1
663	nzb_x = 1 + myidx * nnz_x
664	nzt_x = ( myidx + 1 ) * nnz_x
665	sendrecvcount_zx = nnx * nny * nnz_x
666
667	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
668	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
669
670	DO kk = 1, nblk
671
672	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
673
674	n = isave(2) + kk - 1
675	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
676
677	IF ( kk == nblk ) THEN
678	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
679	ELSE
680	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
681	ENDIF
682
683	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
684
685	CALL transpose_xz( ar1(:,:,:), f_in )
686
687	DO knew = 1, nz
688	ki = kk + nblk * (knew-1)
689	f_inv(:,:,ki) = f_in(:,:,knew)
690	ENDDO
691
692	IF ( kk == nblk ) THEN
693	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
694	ELSE
695	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
696	ENDIF
697
698	ENDDO
699	!
700	!-- Restore original indices/counters
701	nz = isave(1)
702	nzb_x = isave(2)
703	nzt_x = isave(3)
704	sendrecvcount_zx = isave(4)
705
706	CALL resort_for_xz( f_inv, ar )
707
708	DEALLOCATE( ar1, f_in, f_inv )
709
710	ENDIF
711
712	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
713
714	END SUBROUTINE poisfft
715
716
717	!------------------------------------------------------------------------------!
718	! Description:
719	! ------------
720	!> Fourier-transformation along y with subsequent transposition y --> x for
721	!> a 1d-decomposition along x.
722	!>
723	!> @attention The performance of this routine is much faster on the NEC-SX6,
724	!> if the first index of work_ffty_vec is odd. Otherwise
725	!> memory bank conflicts may occur (especially if the index is a
726	!> multiple of 128). That's why work_ffty_vec is dimensioned as
727	!> 0:ny+1.
728	!> Of course, this will not work if users are using an odd number
729	!> of gridpoints along y.
730	!------------------------------------------------------------------------------!
731	SUBROUTINE ffty_tr_yx( f_in, f_out )
732
733
734	USE control_parameters, &
735	ONLY: host
736
737	USE cpulog, &
738	ONLY: cpu_log, log_point_s
739
740	USE kinds
741
742	USE pegrid
743
744	IMPLICIT NONE
745
746	INTEGER(iwp) :: i !<
747	INTEGER(iwp) :: iend !<
748	INTEGER(iwp) :: iouter !<
749	INTEGER(iwp) :: ir !<
750	INTEGER(iwp) :: j !<
751	INTEGER(iwp) :: k !<
752
753	INTEGER(iwp), PARAMETER :: stridex = 4 !<
754
755	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
756	#if defined( __nec )
757	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
758	#endif
759	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
760	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
761	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
762
763	!
764	!-- Carry out the FFT along y, where all data are present due to the
765	!-- 1d-decomposition along x. Resort the data in a way that x becomes
766	!-- the first index.
767	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
768
769	IF ( host(1:3) == 'nec' ) THEN
770	#if defined( __nec )
771	!
772	!-- Code optimized for vector processors
773	!$OMP PARALLEL PRIVATE ( i, j, k )
774	!$OMP DO
775	DO i = nxl, nxr
776
777	DO j = 0, ny
778	DO k = 1, nz
779	work_ffty_vec(j,k,i) = f_in(k,j,i)
780	ENDDO
781	ENDDO
782
783	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
784
785	ENDDO
786
787	!$OMP DO
788	DO k = 1, nz
789	DO j = 0, ny
790	DO i = nxl, nxr
791	work(i,k,j) = work_ffty_vec(j,k,i)
792	ENDDO
793	ENDDO
794	ENDDO
795	!$OMP END PARALLEL
796	#endif
797
798	ELSE
799
800	!
801	!-- Cache optimized code.
802	!-- The i-(x-)direction is split into a strided outer loop and an inner
803	!-- loop for better cache performance
804	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
805	!$OMP DO
806	DO iouter = nxl, nxr, stridex
807
808	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
809
810	DO k = 1, nz
811
812	DO i = iouter, iend
813
814	ir = i-iouter+1 ! counter within a stride
815	DO j = 0, ny
816	work_ffty(j,ir) = f_in(k,j,i)
817	ENDDO
818	!
819	!-- FFT along y
820	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
821
822	ENDDO
823
824	!
825	!-- Resort
826	DO j = 0, ny
827	DO i = iouter, iend
828	work(i,k,j) = work_ffty(j,i-iouter+1)
829	ENDDO
830	ENDDO
831
832	ENDDO
833
834	ENDDO
835	!$OMP END PARALLEL
836
837	ENDIF
838	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
839
840	!
841	!-- Transpose array
842	#if defined( __parallel )
843	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
844	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
845	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
846	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
847	comm1dx, ierr )
848	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
849	#endif
850
851	END SUBROUTINE ffty_tr_yx
852
853
854	!------------------------------------------------------------------------------!
855	! Description:
856	! ------------
857	!> Transposition x --> y with a subsequent backward Fourier transformation for
858	!> a 1d-decomposition along x
859	!------------------------------------------------------------------------------!
860	SUBROUTINE tr_xy_ffty( f_in, f_out )
861
862
863	USE control_parameters, &
864	ONLY: host
865
866	USE cpulog, &
867	ONLY: cpu_log, log_point_s
868
869	USE kinds
870
871	USE pegrid
872
873	IMPLICIT NONE
874
875	INTEGER(iwp) :: i !<
876	INTEGER(iwp) :: iend !<
877	INTEGER(iwp) :: iouter !<
878	INTEGER(iwp) :: ir !<
879	INTEGER(iwp) :: j !<
880	INTEGER(iwp) :: k !<
881
882	INTEGER(iwp), PARAMETER :: stridex = 4 !<
883
884	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
885	#if defined( __nec )
886	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
887	#endif
888	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
889	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
890	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
891
892	!
893	!-- Transpose array
894	#if defined( __parallel )
895	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
896	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
897	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
898	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
899	comm1dx, ierr )
900	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
901	#endif
902
903	!
904	!-- Resort the data in a way that y becomes the first index and carry out the
905	!-- backward fft along y.
906	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
907
908	IF ( host(1:3) == 'nec' ) THEN
909	#if defined( __nec )
910	!
911	!-- Code optimized for vector processors
912	!$OMP PARALLEL PRIVATE ( i, j, k )
913	!$OMP DO
914	DO k = 1, nz
915	DO j = 0, ny
916	DO i = nxl, nxr
917	work_ffty_vec(j,k,i) = work(i,k,j)
918	ENDDO
919	ENDDO
920	ENDDO
921
922	!$OMP DO
923	DO i = nxl, nxr
924
925	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
926
927	DO j = 0, ny
928	DO k = 1, nz
929	f_out(k,j,i) = work_ffty_vec(j,k,i)
930	ENDDO
931	ENDDO
932
933	ENDDO
934	!$OMP END PARALLEL
935	#endif
936
937	ELSE
938
939	!
940	!-- Cache optimized code.
941	!-- The i-(x-)direction is split into a strided outer loop and an inner
942	!-- loop for better cache performance
943	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
944	!$OMP DO
945	DO iouter = nxl, nxr, stridex
946
947	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
948
949	DO k = 1, nz
950	!
951	!-- Resort
952	DO j = 0, ny
953	DO i = iouter, iend
954	work_ffty(j,i-iouter+1) = work(i,k,j)
955	ENDDO
956	ENDDO
957
958	DO i = iouter, iend
959
960	!
961	!-- FFT along y
962	ir = i-iouter+1 ! counter within a stride
963	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
964
965	DO j = 0, ny
966	f_out(k,j,i) = work_ffty(j,ir)
967	ENDDO
968	ENDDO
969
970	ENDDO
971
972	ENDDO
973	!$OMP END PARALLEL
974
975	ENDIF
976
977	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
978
979	END SUBROUTINE tr_xy_ffty
980
981
982	!------------------------------------------------------------------------------!
983	! Description:
984	! ------------
985	!> FFT along x, solution of the tridiagonal system and backward FFT for
986	!> a 1d-decomposition along x
987	!>
988	!> @warning this subroutine may still not work for hybrid parallelization
989	!> with OpenMP (for possible necessary changes see the original
990	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
991	!------------------------------------------------------------------------------!
992	SUBROUTINE fftx_tri_fftx( ar )
993
994
995	USE control_parameters, &
996	ONLY: host
997
998	USE cpulog, &
999	ONLY: cpu_log, log_point_s
1000
1001	USE grid_variables, &
1002	ONLY: ddx2, ddy2
1003
1004	USE kinds
1005
1006	USE pegrid
1007
1008	IMPLICIT NONE
1009
1010	INTEGER(iwp) :: i !<
1011	INTEGER(iwp) :: j !<
1012	INTEGER(iwp) :: k !<
1013	INTEGER(iwp) :: m !<
1014	INTEGER(iwp) :: n !<
1015	INTEGER(iwp) :: omp_get_thread_num !<
1016	INTEGER(iwp) :: tn !<
1017
1018	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
1019	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
1020	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
1021	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1022
1023
1024	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1025
1026	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1027
1028	tn = 0 ! Default thread number in case of one thread
1029	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1030	DO j = nys_x, nyn_x
1031
1032	!$ tn = omp_get_thread_num()
1033
1034	IF ( host(1:3) == 'nec' ) THEN
1035	!
1036	!-- Code optimized for vector processors
1037	DO k = 1, nz
1038
1039	m = 0
1040	DO n = 1, pdims(1)
1041	DO i = 1, nnx
1042	work_trix(m,k) = ar(i,k,j,n)
1043	m = m + 1
1044	ENDDO
1045	ENDDO
1046
1047	ENDDO
1048
1049	CALL fft_x_m( work_trix, 'forward' )
1050
1051	ELSE
1052	!
1053	!-- Cache optimized code
1054	DO k = 1, nz
1055
1056	m = 0
1057	DO n = 1, pdims(1)
1058	DO i = 1, nnx
1059	work_fftx(m) = ar(i,k,j,n)
1060	m = m + 1
1061	ENDDO
1062	ENDDO
1063
1064	CALL fft_x_1d( work_fftx, 'forward' )
1065
1066	DO i = 0, nx
1067	work_trix(i,k) = work_fftx(i)
1068	ENDDO
1069
1070	ENDDO
1071
1072	ENDIF
1073
1074	!
1075	!-- Solve the linear equation system
1076	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1077
1078	IF ( host(1:3) == 'nec' ) THEN
1079	!
1080	!-- Code optimized for vector processors
1081	CALL fft_x_m( work_trix, 'backward' )
1082
1083	DO k = 1, nz
1084
1085	m = 0
1086	DO n = 1, pdims(1)
1087	DO i = 1, nnx
1088	ar(i,k,j,n) = work_trix(m,k)
1089	m = m + 1
1090	ENDDO
1091	ENDDO
1092
1093	ENDDO
1094
1095	ELSE
1096	!
1097	!-- Cache optimized code
1098	DO k = 1, nz
1099
1100	DO i = 0, nx
1101	work_fftx(i) = work_trix(i,k)
1102	ENDDO
1103
1104	CALL fft_x_1d( work_fftx, 'backward' )
1105
1106	m = 0
1107	DO n = 1, pdims(1)
1108	DO i = 1, nnx
1109	ar(i,k,j,n) = work_fftx(m)
1110	m = m + 1
1111	ENDDO
1112	ENDDO
1113
1114	ENDDO
1115
1116	ENDIF
1117
1118	ENDDO
1119
1120	DEALLOCATE( tri )
1121
1122	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1123
1124	END SUBROUTINE fftx_tri_fftx
1125
1126
1127	!------------------------------------------------------------------------------!
1128	! Description:
1129	! ------------
1130	!> Fourier-transformation along x with subsequent transposition x --> y for
1131	!> a 1d-decomposition along y.
1132	!>
1133	!> @attention NEC-branch of this routine may significantly profit from
1134	!> further optimizations. So far, performance is much worse than
1135	!> for routine ffty_tr_yx (more than three times slower).
1136	!------------------------------------------------------------------------------!
1137	SUBROUTINE fftx_tr_xy( f_in, f_out )
1138
1139
1140	USE control_parameters, &
1141	ONLY: host
1142
1143	USE cpulog, &
1144	ONLY: cpu_log, log_point_s
1145
1146	USE kinds
1147
1148	USE pegrid
1149
1150	IMPLICIT NONE
1151
1152	INTEGER(iwp) :: i !<
1153	INTEGER(iwp) :: j !<
1154	INTEGER(iwp) :: k !<
1155
1156	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1157	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1158	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1159	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1160
1161	!
1162	!-- Carry out the FFT along x, where all data are present due to the
1163	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1164	!-- the first index.
1165	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1166
1167	IF ( host(1:3) == 'nec' ) THEN
1168	!
1169	!-- Code for vector processors
1170	!$OMP PARALLEL PRIVATE ( i, j, k )
1171	!$OMP DO
1172	DO i = 0, nx
1173
1174	DO j = nys, nyn
1175	DO k = 1, nz
1176	work_fftx(i,k,j) = f_in(k,j,i)
1177	ENDDO
1178	ENDDO
1179
1180	ENDDO
1181
1182	!$OMP DO
1183	DO j = nys, nyn
1184
1185	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1186
1187	DO k = 1, nz
1188	DO i = 0, nx
1189	work(j,k,i) = work_fftx(i,k,j)
1190	ENDDO
1191	ENDDO
1192
1193	ENDDO
1194	!$OMP END PARALLEL
1195
1196	ELSE
1197
1198	!
1199	!-- Cache optimized code (there might be still a potential for better
1200	!-- optimization).
1201	!$OMP PARALLEL PRIVATE (i,j,k)
1202	!$OMP DO
1203	DO i = 0, nx
1204
1205	DO j = nys, nyn
1206	DO k = 1, nz
1207	work_fftx(i,k,j) = f_in(k,j,i)
1208	ENDDO
1209	ENDDO
1210
1211	ENDDO
1212
1213	!$OMP DO
1214	DO j = nys, nyn
1215	DO k = 1, nz
1216
1217	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1218
1219	DO i = 0, nx
1220	work(j,k,i) = work_fftx(i,k,j)
1221	ENDDO
1222	ENDDO
1223
1224	ENDDO
1225	!$OMP END PARALLEL
1226
1227	ENDIF
1228	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1229
1230	!
1231	!-- Transpose array
1232	#if defined( __parallel )
1233	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1234	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1235	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1236	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1237	comm1dy, ierr )
1238	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1239	#endif
1240
1241	END SUBROUTINE fftx_tr_xy
1242
1243
1244	!------------------------------------------------------------------------------!
1245	! Description:
1246	! ------------
1247	!> Transposition y --> x with a subsequent backward Fourier transformation for
1248	!> a 1d-decomposition along x.
1249	!------------------------------------------------------------------------------!
1250	SUBROUTINE tr_yx_fftx( f_in, f_out )
1251
1252
1253	USE control_parameters, &
1254	ONLY: host
1255
1256	USE cpulog, &
1257	ONLY: cpu_log, log_point_s
1258
1259	USE kinds
1260
1261	USE pegrid
1262
1263	IMPLICIT NONE
1264
1265	INTEGER(iwp) :: i !<
1266	INTEGER(iwp) :: j !<
1267	INTEGER(iwp) :: k !<
1268
1269	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1270	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1271	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1272	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1273
1274	!
1275	!-- Transpose array
1276	#if defined( __parallel )
1277	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1278	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1279	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1280	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1281	comm1dy, ierr )
1282	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1283	#endif
1284
1285	!
1286	!-- Carry out the FFT along x, where all data are present due to the
1287	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1288	!-- the first index.
1289	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1290
1291	IF ( host(1:3) == 'nec' ) THEN
1292	!
1293	!-- Code optimized for vector processors
1294	!$OMP PARALLEL PRIVATE ( i, j, k )
1295	!$OMP DO
1296	DO j = nys, nyn
1297
1298	DO k = 1, nz
1299	DO i = 0, nx
1300	work_fftx(i,k,j) = work(j,k,i)
1301	ENDDO
1302	ENDDO
1303
1304	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1305
1306	ENDDO
1307
1308	!$OMP DO
1309	DO i = 0, nx
1310	DO j = nys, nyn
1311	DO k = 1, nz
1312	f_out(k,j,i) = work_fftx(i,k,j)
1313	ENDDO
1314	ENDDO
1315	ENDDO
1316	!$OMP END PARALLEL
1317
1318	ELSE
1319
1320	!
1321	!-- Cache optimized code (there might be still a potential for better
1322	!-- optimization).
1323	!$OMP PARALLEL PRIVATE (i,j,k)
1324	!$OMP DO
1325	DO j = nys, nyn
1326	DO k = 1, nz
1327
1328	DO i = 0, nx
1329	work_fftx(i,k,j) = work(j,k,i)
1330	ENDDO
1331
1332	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1333
1334	ENDDO
1335	ENDDO
1336
1337	!$OMP DO
1338	DO i = 0, nx
1339	DO j = nys, nyn
1340	DO k = 1, nz
1341	f_out(k,j,i) = work_fftx(i,k,j)
1342	ENDDO
1343	ENDDO
1344	ENDDO
1345	!$OMP END PARALLEL
1346
1347	ENDIF
1348	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1349
1350	END SUBROUTINE tr_yx_fftx
1351
1352
1353	!------------------------------------------------------------------------------!
1354	! Description:
1355	! ------------
1356	!> FFT along y, solution of the tridiagonal system and backward FFT for
1357	!> a 1d-decomposition along y.
1358	!>
1359	!> @warning this subroutine may still not work for hybrid parallelization
1360	!> with OpenMP (for possible necessary changes see the original
1361	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1362	!------------------------------------------------------------------------------!
1363	SUBROUTINE ffty_tri_ffty( ar )
1364
1365
1366	USE control_parameters, &
1367	ONLY: host
1368
1369	USE cpulog, &
1370	ONLY: cpu_log, log_point_s
1371
1372	USE grid_variables, &
1373	ONLY: ddx2, ddy2
1374
1375	USE kinds
1376
1377	USE pegrid
1378
1379	IMPLICIT NONE
1380
1381	INTEGER(iwp) :: i !<
1382	INTEGER(iwp) :: j !<
1383	INTEGER(iwp) :: k !<
1384	INTEGER(iwp) :: m !<
1385	INTEGER(iwp) :: n !<
1386	INTEGER(iwp) :: omp_get_thread_num !<
1387	INTEGER(iwp) :: tn !<
1388
1389	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1390	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1391	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1392	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1393
1394
1395	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1396
1397	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1398
1399	tn = 0 ! Default thread number in case of one thread
1400	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1401	DO i = nxl_y, nxr_y
1402
1403	!$ tn = omp_get_thread_num()
1404
1405	IF ( host(1:3) == 'nec' ) THEN
1406	!
1407	!-- Code optimized for vector processors
1408	DO k = 1, nz
1409
1410	m = 0
1411	DO n = 1, pdims(2)
1412	DO j = 1, nny
1413	work_triy(m,k) = ar(j,k,i,n)
1414	m = m + 1
1415	ENDDO
1416	ENDDO
1417
1418	ENDDO
1419
1420	CALL fft_y_m( work_triy, ny, 'forward' )
1421
1422	ELSE
1423	!
1424	!-- Cache optimized code
1425	DO k = 1, nz
1426
1427	m = 0
1428	DO n = 1, pdims(2)
1429	DO j = 1, nny
1430	work_ffty(m) = ar(j,k,i,n)
1431	m = m + 1
1432	ENDDO
1433	ENDDO
1434
1435	CALL fft_y_1d( work_ffty, 'forward' )
1436
1437	DO j = 0, ny
1438	work_triy(j,k) = work_ffty(j)
1439	ENDDO
1440
1441	ENDDO
1442
1443	ENDIF
1444
1445	!
1446	!-- Solve the linear equation system
1447	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1448
1449	IF ( host(1:3) == 'nec' ) THEN
1450	!
1451	!-- Code optimized for vector processors
1452	CALL fft_y_m( work_triy, ny, 'backward' )
1453
1454	DO k = 1, nz
1455
1456	m = 0
1457	DO n = 1, pdims(2)
1458	DO j = 1, nny
1459	ar(j,k,i,n) = work_triy(m,k)
1460	m = m + 1
1461	ENDDO
1462	ENDDO
1463
1464	ENDDO
1465
1466	ELSE
1467	!
1468	!-- Cache optimized code
1469	DO k = 1, nz
1470
1471	DO j = 0, ny
1472	work_ffty(j) = work_triy(j,k)
1473	ENDDO
1474
1475	CALL fft_y_1d( work_ffty, 'backward' )
1476
1477	m = 0
1478	DO n = 1, pdims(2)
1479	DO j = 1, nny
1480	ar(j,k,i,n) = work_ffty(m)
1481	m = m + 1
1482	ENDDO
1483	ENDDO
1484
1485	ENDDO
1486
1487	ENDIF
1488
1489	ENDDO
1490
1491	DEALLOCATE( tri )
1492
1493	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1494
1495	END SUBROUTINE ffty_tri_ffty
1496
1497	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1836

Download in other formats: