Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 2241

Last change on this file since 2241 was 2119, checked in by raasch, 8 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 44.8 KB

Line
1	!> @file poisfft_mod.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of PALM.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2017 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 2119 2017-01-17 16:51:50Z hellstea $
27	!
28	! 2118 2017-01-17 16:38:49Z raasch
29	! OpenACC directives and related code removed
30	!
31	! 2000 2016-08-20 18:09:15Z knoop
32	! Forced header and separation lines into 80 columns
33	!
34	! 1850 2016-04-08 13:29:27Z maronga
35	! Module renamed
36	!
37	!
38	! 1804 2016-04-05 16:30:18Z maronga
39	! Removed code for parameter file check (__check)
40	!
41	! 1682 2015-10-07 23:56:08Z knoop
42	! Code annotations made doxygen readable
43	!
44	! 1482 2014-10-18 12:34:45Z raasch
45	! use 2d-decomposition, if accelerator boards are used
46	!
47	! 1406 2014-05-16 13:47:01Z raasch
48	! bugfix for pgi 14.4: declare create moved after array declaration
49	!
50	! 1320 2014-03-20 08:40:49Z raasch
51	! ONLY-attribute added to USE-statements,
52	! kind-parameters added to all INTEGER and REAL declaration statements,
53	! kinds are defined in new module kinds,
54	! old module precision_kind is removed,
55	! revision history before 2012 removed,
56	! comment fields (!:) to be used for variable explanations added to
57	! all variable declaration statements
58	!
59	! 1318 2014-03-17 13:35:16Z raasch
60	! module interfaces removed
61	!
62	! 1306 2014-03-13 14:30:59Z raasch
63	! openmp sections removed from the overlap branch,
64	! second argument removed from parameter list
65	!
66	! 1216 2013-08-26 09:31:42Z raasch
67	! resorting of arrays moved to separate routines resort_for_...,
68	! one argument, used as temporary work array, removed from all transpose
69	! routines
70	! overlapping fft / transposition implemented
71	!
72	! 1212 2013-08-15 08:46:27Z raasch
73	! tridia routines moved to seperate module tridia_solver
74	!
75	! 1208 2013-08-13 06:41:49Z raasch
76	! acc-update clauses added for "ar" so that ffts other than cufft can also be
77	! used (although they are not ported and will give a poor performance)
78	!
79	! 1111 2013-03-08 23:54:10Z raasch
80	! further openACC porting of non-parallel (MPI) branch:
81	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
82	! no distinction between parallel/non-parallel in poisfft and tridia any more,
83	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
84	! (otherwise "invalid device function" is indicated during runtime),
85	! optimization of tridia routines: constant elements and coefficients of tri are
86	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
87	! to 2,
88	! poisfft_init is now called internally from poisfft, maketri is called from
89	! poisfft_init,
90	! ibc_p_b = 2 removed
91	!
92	! 1106 2013-03-04 05:31:38Z raasch
93	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
94	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
95	! fft_y_1d
96	!
97	! 1103 2013-02-20 02:15:53Z raasch
98	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
99	! sometimes cause segmentation faults with intel 12.1 compiler
100	!
101	! 1092 2013-02-02 11:24:22Z raasch
102	! unused variables removed
103	!
104	! 1036 2012-10-22 13:43:42Z raasch
105	! code put under GPL (PALM 3.9)
106	!
107	! 2012-09-21 07:03:55Z raasch
108	! FLOAT type conversion replaced by REAL
109	!
110	! 1003 2012-09-14 14:35:53Z raasch
111	! indices nxa, nya, etc. replaced by nx, ny, etc.
112	!
113	! 940 2012-07-09 14:31:00Z raasch
114	! special handling of tri-array as an argument in tridia_1dd routines switched
115	! off because it caused segmentation faults with intel 12.1 compiler
116	!
117	! 877 2012-04-03 11:21:44Z suehring
118	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
119	! pressure at the top of the model domain.
120	!
121	! 809 2012-01-30 13:32:58Z maronga
122	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
123	!
124	! 807 2012-01-25 11:53:51Z maronga
125	! New cpp directive "__check" implemented which is used by check_namelist_files
126	! (most of the code is unneeded by check_namelist_files).
127	!
128	! Revision 1.1 1997/07/24 11:24:14 raasch
129	! Initial revision
130	!
131	!
132	! Description:
133	! ------------
134	!> Solves the Poisson equation with a 2D spectral method
135	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
136	!>
137	!> Input:
138	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
139	!> starting from (1,nys,nxl)
140	!>
141	!> Output:
142	!> real ar contains the solution for perturbation pressure p
143	!------------------------------------------------------------------------------!
144	MODULE poisfft_mod
145
146
147	USE fft_xy, &
148	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
149
150	USE indices, &
151	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
152
153	USE transpose_indices, &
154	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
155	nzb_y, nzt_x, nzt_y
156
157	USE tridia_solver, &
158	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
159
160	IMPLICIT NONE
161
162	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
163
164	PRIVATE
165
166	PUBLIC poisfft, poisfft_init
167
168	INTERFACE poisfft
169	MODULE PROCEDURE poisfft
170	END INTERFACE poisfft
171
172	INTERFACE poisfft_init
173	MODULE PROCEDURE poisfft_init
174	END INTERFACE poisfft_init
175
176
177	CONTAINS
178
179	!------------------------------------------------------------------------------!
180	! Description:
181	! ------------
182	!> @todo Missing subroutine description.
183	!------------------------------------------------------------------------------!
184	SUBROUTINE poisfft_init
185
186	USE arrays_3d, &
187	ONLY: ddzu_pres, ddzw
188
189	USE kinds
190
191	IMPLICIT NONE
192
193	INTEGER(iwp) :: k !<
194
195
196	CALL fft_init
197
198	CALL tridia_init
199
200	poisfft_initialized = .TRUE.
201
202	END SUBROUTINE poisfft_init
203
204
205
206	!------------------------------------------------------------------------------!
207	! Description:
208	! ------------
209	!> Two-dimensional Fourier Transformation in x- and y-direction.
210	!------------------------------------------------------------------------------!
211	SUBROUTINE poisfft( ar )
212
213	USE control_parameters, &
214	ONLY: fft_method, transpose_compute_overlap
215
216	USE cpulog, &
217	ONLY: cpu_log, cpu_log_nowait, log_point_s
218
219	USE kinds
220
221	USE pegrid
222
223	IMPLICIT NONE
224
225	INTEGER(iwp) :: ii !<
226	INTEGER(iwp) :: iind !<
227	INTEGER(iwp) :: inew !<
228	INTEGER(iwp) :: jj !<
229	INTEGER(iwp) :: jind !<
230	INTEGER(iwp) :: jnew !<
231	INTEGER(iwp) :: ki !<
232	INTEGER(iwp) :: kk !<
233	INTEGER(iwp) :: knew !<
234	INTEGER(iwp) :: n !<
235	INTEGER(iwp) :: nblk !<
236	INTEGER(iwp) :: nnx_y !<
237	INTEGER(iwp) :: nny_z !<
238	INTEGER(iwp) :: nnz_t !<
239	INTEGER(iwp) :: nnz_x !<
240	INTEGER(iwp) :: nxl_y_bound !<
241	INTEGER(iwp) :: nxr_y_bound !<
242
243	INTEGER(iwp), DIMENSION(4) :: isave !<
244
245	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
246	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
247
248	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
249	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
250	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
251	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
252	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
253
254
255	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
256
257	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
258
259	!
260	!-- Two-dimensional Fourier Transformation in x- and y-direction.
261	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
262
263	!
264	!-- 1d-domain-decomposition along x:
265	!-- FFT along y and transposition y --> x
266	CALL ffty_tr_yx( ar, ar )
267
268	!
269	!-- FFT along x, solving the tridiagonal system and backward FFT
270	CALL fftx_tri_fftx( ar )
271
272	!
273	!-- Transposition x --> y and backward FFT along y
274	CALL tr_xy_ffty( ar, ar )
275
276	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
277
278	!
279	!-- 1d-domain-decomposition along y:
280	!-- FFT along x and transposition x --> y
281	CALL fftx_tr_xy( ar, ar )
282
283	!
284	!-- FFT along y, solving the tridiagonal system and backward FFT
285	CALL ffty_tri_ffty( ar )
286
287	!
288	!-- Transposition y --> x and backward FFT along x
289	CALL tr_yx_fftx( ar, ar )
290
291	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
292
293	!
294	!-- 2d-domain-decomposition or no decomposition (1 PE run)
295	!-- Transposition z --> x
296	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
297	CALL resort_for_zx( ar, ar_inv )
298	CALL transpose_zx( ar_inv, ar )
299	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
300
301	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
302	CALL fft_x( ar, 'forward' )
303	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
304
305	!
306	!-- Transposition x --> y
307	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
308	CALL resort_for_xy( ar, ar_inv )
309	CALL transpose_xy( ar_inv, ar )
310	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
311
312	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
313	CALL fft_y( ar, 'forward', ar_tr = ar, &
314	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
315	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
316	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
317
318	!
319	!-- Transposition y --> z
320	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
321	CALL resort_for_yz( ar, ar_inv )
322	CALL transpose_yz( ar_inv, ar )
323	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
324
325	!
326	!-- Solve the tridiagonal equation system along z
327	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
328	CALL tridia_substi( ar )
329	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
330
331	!
332	!-- Inverse Fourier Transformation
333	!-- Transposition z --> y
334	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
335	CALL transpose_zy( ar, ar_inv )
336	CALL resort_for_zy( ar_inv, ar )
337	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
338
339	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
340	CALL fft_y( ar, 'backward', ar_tr = ar, &
341	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
342	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
343	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
344
345	!
346	!-- Transposition y --> x
347	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
348	CALL transpose_yx( ar, ar_inv )
349	CALL resort_for_yx( ar_inv, ar )
350	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
351
352	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
353	CALL fft_x( ar, 'backward' )
354	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
355
356	!
357	!-- Transposition x --> z
358	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
359	CALL transpose_xz( ar, ar_inv )
360	CALL resort_for_xz( ar_inv, ar )
361	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
362
363	ELSE
364
365	!
366	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
367	!-- overlapping transposition / fft
368	!-- cputime logging must not use barriers, which would prevent overlapping
369	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
370	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
371	!
372	!-- Transposition z --> x + subsequent fft along x
373	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
374	CALL resort_for_zx( ar, f_inv )
375	!
376	!-- Save original indices and gridpoint counter
377	isave(1) = nz
378	isave(2) = nzb_x
379	isave(3) = nzt_x
380	isave(4) = sendrecvcount_zx
381	!
382	!-- Set new indices for transformation
383	nblk = nz / pdims(1)
384	nz = pdims(1)
385	nnz_x = 1
386	nzb_x = 1 + myidx * nnz_x
387	nzt_x = ( myidx + 1 ) * nnz_x
388	sendrecvcount_zx = nnx * nny * nnz_x
389
390	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
391	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
392
393	DO kk = 1, nblk
394
395	IF ( kk == 1 ) THEN
396	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
397	ELSE
398	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
399	ENDIF
400
401	DO knew = 1, nz
402	ki = kk + nblk * ( knew - 1 )
403	f_in(:,:,knew) = f_inv(:,:,ki)
404	ENDDO
405
406	CALL transpose_zx( f_in, ar1(:,:,:))
407	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
408
409	IF ( kk == 1 ) THEN
410	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
411	ELSE
412	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
413	ENDIF
414
415	n = isave(2) + kk - 1
416	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
417	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
418
419	ENDDO
420	!
421	!-- Restore original indices/counters
422	nz = isave(1)
423	nzb_x = isave(2)
424	nzt_x = isave(3)
425	sendrecvcount_zx = isave(4)
426
427	DEALLOCATE( ar1, f_in, f_inv )
428
429	!
430	!-- Transposition x --> y + subsequent fft along y
431	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
432	CALL resort_for_xy( f_out_z, f_inv )
433	!
434	!-- Save original indices and gridpoint counter
435	isave(1) = nx
436	isave(2) = nxl_y
437	isave(3) = nxr_y
438	isave(4) = sendrecvcount_xy
439	!
440	!-- Set new indices for transformation
441	nblk = ( ( nx+1 ) / pdims(2) ) - 1
442	nx = pdims(2)
443	nnx_y = 1
444	nxl_y = myidy * nnx_y
445	nxr_y = ( myidy + 1 ) * nnx_y - 1
446	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
447
448	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
449	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
450
451	DO ii = 0, nblk
452
453	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
454
455	DO inew = 0, nx-1
456	iind = ii + ( nblk + 1 ) * inew
457	f_in(:,:,inew) = f_inv(:,:,iind)
458	ENDDO
459
460	CALL transpose_xy( f_in, ar1(:,:,:) )
461
462	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
463
464	IF ( ii == 1 ) THEN
465	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
466	ELSE
467	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
468	ENDIF
469
470	nxl_y_bound = isave(2)
471	nxr_y_bound = isave(3)
472	n = isave(2) + ii
473	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
474	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
475	nxl_y_l = n, nxr_y_l = n )
476
477	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
478
479	ENDDO
480	!
481	!-- Restore original indices/counters
482	nx = isave(1)
483	nxl_y = isave(2)
484	nxr_y = isave(3)
485	sendrecvcount_xy = isave(4)
486
487	DEALLOCATE( ar1, f_in, f_inv )
488
489	!
490	!-- Transposition y --> z + subsequent tridia + resort for z --> y
491	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
492	CALL resort_for_yz( f_out_y, f_inv )
493	!
494	!-- Save original indices and gridpoint counter
495	isave(1) = ny
496	isave(2) = nys_z
497	isave(3) = nyn_z
498	isave(4) = sendrecvcount_yz
499	!
500	!-- Set new indices for transformation
501	nblk = ( ( ny+1 ) / pdims(1) ) - 1
502	ny = pdims(1)
503	nny_z = 1
504	nys_z = myidx * nny_z
505	nyn_z = ( myidx + 1 ) * nny_z - 1
506	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
507
508	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
509	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
510
511	DO jj = 0, nblk
512	!
513	!-- Forward Fourier Transformation
514	!-- Transposition y --> z
515	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
516
517	DO jnew = 0, ny-1
518	jind = jj + ( nblk + 1 ) * jnew
519	f_in(:,:,jnew) = f_inv(:,:,jind)
520	ENDDO
521
522	CALL transpose_yz( f_in, ar1(:,:,:) )
523
524	IF ( jj == nblk ) THEN
525	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
526	ELSE
527	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
528	ENDIF
529
530	!
531	!-- Solve the tridiagonal equation system along z
532	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
533
534	n = isave(2) + jj
535	CALL tridia_substi_overlap( ar1(:,:,:), n )
536
537	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
538
539	!
540	!-- Inverse Fourier Transformation
541	!-- Transposition z --> y
542	!-- Only one thread should call MPI routines, therefore forward and
543	!-- backward tranpose are in the same section
544	IF ( jj == 0 ) THEN
545	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
546	ELSE
547	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
548	ENDIF
549
550	CALL transpose_zy( ar1(:,:,:), f_in )
551
552	DO jnew = 0, ny-1
553	jind = jj + ( nblk + 1 ) * jnew
554	f_inv(:,:,jind) = f_in(:,:,jnew)
555	ENDDO
556
557	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
558
559	ENDDO
560	!
561	!-- Restore original indices/counters
562	ny = isave(1)
563	nys_z = isave(2)
564	nyn_z = isave(3)
565	sendrecvcount_yz = isave(4)
566
567	CALL resort_for_zy( f_inv, f_out_y )
568
569	DEALLOCATE( ar1, f_in, f_inv )
570
571	!
572	!-- fft along y backward + subsequent transposition y --> x
573	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
574	!
575	!-- Save original indices and gridpoint counter
576	isave(1) = nx
577	isave(2) = nxl_y
578	isave(3) = nxr_y
579	isave(4) = sendrecvcount_xy
580	!
581	!-- Set new indices for transformation
582	nblk = (( nx+1 ) / pdims(2) ) - 1
583	nx = pdims(2)
584	nnx_y = 1
585	nxl_y = myidy * nnx_y
586	nxr_y = ( myidy + 1 ) * nnx_y - 1
587	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
588
589	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
590	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
591
592	DO ii = 0, nblk
593
594	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
595
596	n = isave(2) + ii
597	nxl_y_bound = isave(2)
598	nxr_y_bound = isave(3)
599
600	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
601	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
602	nxl_y_l = n, nxr_y_l = n )
603
604	IF ( ii == nblk ) THEN
605	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
606	ELSE
607	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
608	ENDIF
609
610	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
611
612	CALL transpose_yx( ar1(:,:,:), f_in )
613
614	DO inew = 0, nx-1
615	iind = ii + (nblk+1) * inew
616	f_inv(:,:,iind) = f_in(:,:,inew)
617	ENDDO
618
619	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
620
621	ENDDO
622	!
623	!-- Restore original indices/counters
624	nx = isave(1)
625	nxl_y = isave(2)
626	nxr_y = isave(3)
627	sendrecvcount_xy = isave(4)
628
629	CALL resort_for_yx( f_inv, f_out_z )
630
631	DEALLOCATE( ar1, f_in, f_inv )
632
633	!
634	!-- fft along x backward + subsequent final transposition x --> z
635	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
636	!
637	!-- Save original indices and gridpoint counter
638	isave(1) = nz
639	isave(2) = nzb_x
640	isave(3) = nzt_x
641	isave(4) = sendrecvcount_zx
642	!
643	!-- Set new indices for transformation
644	nblk = nz / pdims(1)
645	nz = pdims(1)
646	nnz_x = 1
647	nzb_x = 1 + myidx * nnz_x
648	nzt_x = ( myidx + 1 ) * nnz_x
649	sendrecvcount_zx = nnx * nny * nnz_x
650
651	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
652	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
653
654	DO kk = 1, nblk
655
656	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
657
658	n = isave(2) + kk - 1
659	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
660
661	IF ( kk == nblk ) THEN
662	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
663	ELSE
664	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
665	ENDIF
666
667	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
668
669	CALL transpose_xz( ar1(:,:,:), f_in )
670
671	DO knew = 1, nz
672	ki = kk + nblk * (knew-1)
673	f_inv(:,:,ki) = f_in(:,:,knew)
674	ENDDO
675
676	IF ( kk == nblk ) THEN
677	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
678	ELSE
679	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
680	ENDIF
681
682	ENDDO
683	!
684	!-- Restore original indices/counters
685	nz = isave(1)
686	nzb_x = isave(2)
687	nzt_x = isave(3)
688	sendrecvcount_zx = isave(4)
689
690	CALL resort_for_xz( f_inv, ar )
691
692	DEALLOCATE( ar1, f_in, f_inv )
693
694	ENDIF
695
696	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
697
698	END SUBROUTINE poisfft
699
700
701	!------------------------------------------------------------------------------!
702	! Description:
703	! ------------
704	!> Fourier-transformation along y with subsequent transposition y --> x for
705	!> a 1d-decomposition along x.
706	!>
707	!> @attention The performance of this routine is much faster on the NEC-SX6,
708	!> if the first index of work_ffty_vec is odd. Otherwise
709	!> memory bank conflicts may occur (especially if the index is a
710	!> multiple of 128). That's why work_ffty_vec is dimensioned as
711	!> 0:ny+1.
712	!> Of course, this will not work if users are using an odd number
713	!> of gridpoints along y.
714	!------------------------------------------------------------------------------!
715	SUBROUTINE ffty_tr_yx( f_in, f_out )
716
717
718	USE control_parameters, &
719	ONLY: host
720
721	USE cpulog, &
722	ONLY: cpu_log, log_point_s
723
724	USE kinds
725
726	USE pegrid
727
728	IMPLICIT NONE
729
730	INTEGER(iwp) :: i !<
731	INTEGER(iwp) :: iend !<
732	INTEGER(iwp) :: iouter !<
733	INTEGER(iwp) :: ir !<
734	INTEGER(iwp) :: j !<
735	INTEGER(iwp) :: k !<
736
737	INTEGER(iwp), PARAMETER :: stridex = 4 !<
738
739	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
740	#if defined( __nec )
741	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
742	#endif
743	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
744	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
745	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
746
747	!
748	!-- Carry out the FFT along y, where all data are present due to the
749	!-- 1d-decomposition along x. Resort the data in a way that x becomes
750	!-- the first index.
751	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
752
753	IF ( host(1:3) == 'nec' ) THEN
754	#if defined( __nec )
755	!
756	!-- Code optimized for vector processors
757	!$OMP PARALLEL PRIVATE ( i, j, k )
758	!$OMP DO
759	DO i = nxl, nxr
760
761	DO j = 0, ny
762	DO k = 1, nz
763	work_ffty_vec(j,k,i) = f_in(k,j,i)
764	ENDDO
765	ENDDO
766
767	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
768
769	ENDDO
770
771	!$OMP DO
772	DO k = 1, nz
773	DO j = 0, ny
774	DO i = nxl, nxr
775	work(i,k,j) = work_ffty_vec(j,k,i)
776	ENDDO
777	ENDDO
778	ENDDO
779	!$OMP END PARALLEL
780	#endif
781
782	ELSE
783
784	!
785	!-- Cache optimized code.
786	!-- The i-(x-)direction is split into a strided outer loop and an inner
787	!-- loop for better cache performance
788	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
789	!$OMP DO
790	DO iouter = nxl, nxr, stridex
791
792	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
793
794	DO k = 1, nz
795
796	DO i = iouter, iend
797
798	ir = i-iouter+1 ! counter within a stride
799	DO j = 0, ny
800	work_ffty(j,ir) = f_in(k,j,i)
801	ENDDO
802	!
803	!-- FFT along y
804	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
805
806	ENDDO
807
808	!
809	!-- Resort
810	DO j = 0, ny
811	DO i = iouter, iend
812	work(i,k,j) = work_ffty(j,i-iouter+1)
813	ENDDO
814	ENDDO
815
816	ENDDO
817
818	ENDDO
819	!$OMP END PARALLEL
820
821	ENDIF
822	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
823
824	!
825	!-- Transpose array
826	#if defined( __parallel )
827	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
828	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
829	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
830	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
831	comm1dx, ierr )
832	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
833	#endif
834
835	END SUBROUTINE ffty_tr_yx
836
837
838	!------------------------------------------------------------------------------!
839	! Description:
840	! ------------
841	!> Transposition x --> y with a subsequent backward Fourier transformation for
842	!> a 1d-decomposition along x
843	!------------------------------------------------------------------------------!
844	SUBROUTINE tr_xy_ffty( f_in, f_out )
845
846
847	USE control_parameters, &
848	ONLY: host
849
850	USE cpulog, &
851	ONLY: cpu_log, log_point_s
852
853	USE kinds
854
855	USE pegrid
856
857	IMPLICIT NONE
858
859	INTEGER(iwp) :: i !<
860	INTEGER(iwp) :: iend !<
861	INTEGER(iwp) :: iouter !<
862	INTEGER(iwp) :: ir !<
863	INTEGER(iwp) :: j !<
864	INTEGER(iwp) :: k !<
865
866	INTEGER(iwp), PARAMETER :: stridex = 4 !<
867
868	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
869	#if defined( __nec )
870	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
871	#endif
872	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
873	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
874	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
875
876	!
877	!-- Transpose array
878	#if defined( __parallel )
879	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
880	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
881	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
882	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
883	comm1dx, ierr )
884	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
885	#endif
886
887	!
888	!-- Resort the data in a way that y becomes the first index and carry out the
889	!-- backward fft along y.
890	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
891
892	IF ( host(1:3) == 'nec' ) THEN
893	#if defined( __nec )
894	!
895	!-- Code optimized for vector processors
896	!$OMP PARALLEL PRIVATE ( i, j, k )
897	!$OMP DO
898	DO k = 1, nz
899	DO j = 0, ny
900	DO i = nxl, nxr
901	work_ffty_vec(j,k,i) = work(i,k,j)
902	ENDDO
903	ENDDO
904	ENDDO
905
906	!$OMP DO
907	DO i = nxl, nxr
908
909	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
910
911	DO j = 0, ny
912	DO k = 1, nz
913	f_out(k,j,i) = work_ffty_vec(j,k,i)
914	ENDDO
915	ENDDO
916
917	ENDDO
918	!$OMP END PARALLEL
919	#endif
920
921	ELSE
922
923	!
924	!-- Cache optimized code.
925	!-- The i-(x-)direction is split into a strided outer loop and an inner
926	!-- loop for better cache performance
927	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
928	!$OMP DO
929	DO iouter = nxl, nxr, stridex
930
931	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
932
933	DO k = 1, nz
934	!
935	!-- Resort
936	DO j = 0, ny
937	DO i = iouter, iend
938	work_ffty(j,i-iouter+1) = work(i,k,j)
939	ENDDO
940	ENDDO
941
942	DO i = iouter, iend
943
944	!
945	!-- FFT along y
946	ir = i-iouter+1 ! counter within a stride
947	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
948
949	DO j = 0, ny
950	f_out(k,j,i) = work_ffty(j,ir)
951	ENDDO
952	ENDDO
953
954	ENDDO
955
956	ENDDO
957	!$OMP END PARALLEL
958
959	ENDIF
960
961	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
962
963	END SUBROUTINE tr_xy_ffty
964
965
966	!------------------------------------------------------------------------------!
967	! Description:
968	! ------------
969	!> FFT along x, solution of the tridiagonal system and backward FFT for
970	!> a 1d-decomposition along x
971	!>
972	!> @warning this subroutine may still not work for hybrid parallelization
973	!> with OpenMP (for possible necessary changes see the original
974	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
975	!------------------------------------------------------------------------------!
976	SUBROUTINE fftx_tri_fftx( ar )
977
978
979	USE control_parameters, &
980	ONLY: host
981
982	USE cpulog, &
983	ONLY: cpu_log, log_point_s
984
985	USE grid_variables, &
986	ONLY: ddx2, ddy2
987
988	USE kinds
989
990	USE pegrid
991
992	IMPLICIT NONE
993
994	INTEGER(iwp) :: i !<
995	INTEGER(iwp) :: j !<
996	INTEGER(iwp) :: k !<
997	INTEGER(iwp) :: m !<
998	INTEGER(iwp) :: n !<
999	INTEGER(iwp) :: omp_get_thread_num !<
1000	INTEGER(iwp) :: tn !<
1001
1002	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
1003	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
1004	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
1005	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1006
1007
1008	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1009
1010	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1011
1012	tn = 0 ! Default thread number in case of one thread
1013	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1014	DO j = nys_x, nyn_x
1015
1016	!$ tn = omp_get_thread_num()
1017
1018	IF ( host(1:3) == 'nec' ) THEN
1019	!
1020	!-- Code optimized for vector processors
1021	DO k = 1, nz
1022
1023	m = 0
1024	DO n = 1, pdims(1)
1025	DO i = 1, nnx
1026	work_trix(m,k) = ar(i,k,j,n)
1027	m = m + 1
1028	ENDDO
1029	ENDDO
1030
1031	ENDDO
1032
1033	CALL fft_x_m( work_trix, 'forward' )
1034
1035	ELSE
1036	!
1037	!-- Cache optimized code
1038	DO k = 1, nz
1039
1040	m = 0
1041	DO n = 1, pdims(1)
1042	DO i = 1, nnx
1043	work_fftx(m) = ar(i,k,j,n)
1044	m = m + 1
1045	ENDDO
1046	ENDDO
1047
1048	CALL fft_x_1d( work_fftx, 'forward' )
1049
1050	DO i = 0, nx
1051	work_trix(i,k) = work_fftx(i)
1052	ENDDO
1053
1054	ENDDO
1055
1056	ENDIF
1057
1058	!
1059	!-- Solve the linear equation system
1060	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1061
1062	IF ( host(1:3) == 'nec' ) THEN
1063	!
1064	!-- Code optimized for vector processors
1065	CALL fft_x_m( work_trix, 'backward' )
1066
1067	DO k = 1, nz
1068
1069	m = 0
1070	DO n = 1, pdims(1)
1071	DO i = 1, nnx
1072	ar(i,k,j,n) = work_trix(m,k)
1073	m = m + 1
1074	ENDDO
1075	ENDDO
1076
1077	ENDDO
1078
1079	ELSE
1080	!
1081	!-- Cache optimized code
1082	DO k = 1, nz
1083
1084	DO i = 0, nx
1085	work_fftx(i) = work_trix(i,k)
1086	ENDDO
1087
1088	CALL fft_x_1d( work_fftx, 'backward' )
1089
1090	m = 0
1091	DO n = 1, pdims(1)
1092	DO i = 1, nnx
1093	ar(i,k,j,n) = work_fftx(m)
1094	m = m + 1
1095	ENDDO
1096	ENDDO
1097
1098	ENDDO
1099
1100	ENDIF
1101
1102	ENDDO
1103
1104	DEALLOCATE( tri )
1105
1106	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1107
1108	END SUBROUTINE fftx_tri_fftx
1109
1110
1111	!------------------------------------------------------------------------------!
1112	! Description:
1113	! ------------
1114	!> Fourier-transformation along x with subsequent transposition x --> y for
1115	!> a 1d-decomposition along y.
1116	!>
1117	!> @attention NEC-branch of this routine may significantly profit from
1118	!> further optimizations. So far, performance is much worse than
1119	!> for routine ffty_tr_yx (more than three times slower).
1120	!------------------------------------------------------------------------------!
1121	SUBROUTINE fftx_tr_xy( f_in, f_out )
1122
1123
1124	USE control_parameters, &
1125	ONLY: host
1126
1127	USE cpulog, &
1128	ONLY: cpu_log, log_point_s
1129
1130	USE kinds
1131
1132	USE pegrid
1133
1134	IMPLICIT NONE
1135
1136	INTEGER(iwp) :: i !<
1137	INTEGER(iwp) :: j !<
1138	INTEGER(iwp) :: k !<
1139
1140	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1141	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1142	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1143	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1144
1145	!
1146	!-- Carry out the FFT along x, where all data are present due to the
1147	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1148	!-- the first index.
1149	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1150
1151	IF ( host(1:3) == 'nec' ) THEN
1152	!
1153	!-- Code for vector processors
1154	!$OMP PARALLEL PRIVATE ( i, j, k )
1155	!$OMP DO
1156	DO i = 0, nx
1157
1158	DO j = nys, nyn
1159	DO k = 1, nz
1160	work_fftx(i,k,j) = f_in(k,j,i)
1161	ENDDO
1162	ENDDO
1163
1164	ENDDO
1165
1166	!$OMP DO
1167	DO j = nys, nyn
1168
1169	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1170
1171	DO k = 1, nz
1172	DO i = 0, nx
1173	work(j,k,i) = work_fftx(i,k,j)
1174	ENDDO
1175	ENDDO
1176
1177	ENDDO
1178	!$OMP END PARALLEL
1179
1180	ELSE
1181
1182	!
1183	!-- Cache optimized code (there might be still a potential for better
1184	!-- optimization).
1185	!$OMP PARALLEL PRIVATE (i,j,k)
1186	!$OMP DO
1187	DO i = 0, nx
1188
1189	DO j = nys, nyn
1190	DO k = 1, nz
1191	work_fftx(i,k,j) = f_in(k,j,i)
1192	ENDDO
1193	ENDDO
1194
1195	ENDDO
1196
1197	!$OMP DO
1198	DO j = nys, nyn
1199	DO k = 1, nz
1200
1201	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1202
1203	DO i = 0, nx
1204	work(j,k,i) = work_fftx(i,k,j)
1205	ENDDO
1206	ENDDO
1207
1208	ENDDO
1209	!$OMP END PARALLEL
1210
1211	ENDIF
1212	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1213
1214	!
1215	!-- Transpose array
1216	#if defined( __parallel )
1217	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1218	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1219	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1220	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1221	comm1dy, ierr )
1222	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1223	#endif
1224
1225	END SUBROUTINE fftx_tr_xy
1226
1227
1228	!------------------------------------------------------------------------------!
1229	! Description:
1230	! ------------
1231	!> Transposition y --> x with a subsequent backward Fourier transformation for
1232	!> a 1d-decomposition along x.
1233	!------------------------------------------------------------------------------!
1234	SUBROUTINE tr_yx_fftx( f_in, f_out )
1235
1236
1237	USE control_parameters, &
1238	ONLY: host
1239
1240	USE cpulog, &
1241	ONLY: cpu_log, log_point_s
1242
1243	USE kinds
1244
1245	USE pegrid
1246
1247	IMPLICIT NONE
1248
1249	INTEGER(iwp) :: i !<
1250	INTEGER(iwp) :: j !<
1251	INTEGER(iwp) :: k !<
1252
1253	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1254	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1255	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1256	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1257
1258	!
1259	!-- Transpose array
1260	#if defined( __parallel )
1261	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1262	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1263	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1264	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1265	comm1dy, ierr )
1266	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1267	#endif
1268
1269	!
1270	!-- Carry out the FFT along x, where all data are present due to the
1271	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1272	!-- the first index.
1273	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1274
1275	IF ( host(1:3) == 'nec' ) THEN
1276	!
1277	!-- Code optimized for vector processors
1278	!$OMP PARALLEL PRIVATE ( i, j, k )
1279	!$OMP DO
1280	DO j = nys, nyn
1281
1282	DO k = 1, nz
1283	DO i = 0, nx
1284	work_fftx(i,k,j) = work(j,k,i)
1285	ENDDO
1286	ENDDO
1287
1288	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1289
1290	ENDDO
1291
1292	!$OMP DO
1293	DO i = 0, nx
1294	DO j = nys, nyn
1295	DO k = 1, nz
1296	f_out(k,j,i) = work_fftx(i,k,j)
1297	ENDDO
1298	ENDDO
1299	ENDDO
1300	!$OMP END PARALLEL
1301
1302	ELSE
1303
1304	!
1305	!-- Cache optimized code (there might be still a potential for better
1306	!-- optimization).
1307	!$OMP PARALLEL PRIVATE (i,j,k)
1308	!$OMP DO
1309	DO j = nys, nyn
1310	DO k = 1, nz
1311
1312	DO i = 0, nx
1313	work_fftx(i,k,j) = work(j,k,i)
1314	ENDDO
1315
1316	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1317
1318	ENDDO
1319	ENDDO
1320
1321	!$OMP DO
1322	DO i = 0, nx
1323	DO j = nys, nyn
1324	DO k = 1, nz
1325	f_out(k,j,i) = work_fftx(i,k,j)
1326	ENDDO
1327	ENDDO
1328	ENDDO
1329	!$OMP END PARALLEL
1330
1331	ENDIF
1332	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1333
1334	END SUBROUTINE tr_yx_fftx
1335
1336
1337	!------------------------------------------------------------------------------!
1338	! Description:
1339	! ------------
1340	!> FFT along y, solution of the tridiagonal system and backward FFT for
1341	!> a 1d-decomposition along y.
1342	!>
1343	!> @warning this subroutine may still not work for hybrid parallelization
1344	!> with OpenMP (for possible necessary changes see the original
1345	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1346	!------------------------------------------------------------------------------!
1347	SUBROUTINE ffty_tri_ffty( ar )
1348
1349
1350	USE control_parameters, &
1351	ONLY: host
1352
1353	USE cpulog, &
1354	ONLY: cpu_log, log_point_s
1355
1356	USE grid_variables, &
1357	ONLY: ddx2, ddy2
1358
1359	USE kinds
1360
1361	USE pegrid
1362
1363	IMPLICIT NONE
1364
1365	INTEGER(iwp) :: i !<
1366	INTEGER(iwp) :: j !<
1367	INTEGER(iwp) :: k !<
1368	INTEGER(iwp) :: m !<
1369	INTEGER(iwp) :: n !<
1370	INTEGER(iwp) :: omp_get_thread_num !<
1371	INTEGER(iwp) :: tn !<
1372
1373	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1374	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1375	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1376	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1377
1378
1379	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1380
1381	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1382
1383	tn = 0 ! Default thread number in case of one thread
1384	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1385	DO i = nxl_y, nxr_y
1386
1387	!$ tn = omp_get_thread_num()
1388
1389	IF ( host(1:3) == 'nec' ) THEN
1390	!
1391	!-- Code optimized for vector processors
1392	DO k = 1, nz
1393
1394	m = 0
1395	DO n = 1, pdims(2)
1396	DO j = 1, nny
1397	work_triy(m,k) = ar(j,k,i,n)
1398	m = m + 1
1399	ENDDO
1400	ENDDO
1401
1402	ENDDO
1403
1404	CALL fft_y_m( work_triy, ny, 'forward' )
1405
1406	ELSE
1407	!
1408	!-- Cache optimized code
1409	DO k = 1, nz
1410
1411	m = 0
1412	DO n = 1, pdims(2)
1413	DO j = 1, nny
1414	work_ffty(m) = ar(j,k,i,n)
1415	m = m + 1
1416	ENDDO
1417	ENDDO
1418
1419	CALL fft_y_1d( work_ffty, 'forward' )
1420
1421	DO j = 0, ny
1422	work_triy(j,k) = work_ffty(j)
1423	ENDDO
1424
1425	ENDDO
1426
1427	ENDIF
1428
1429	!
1430	!-- Solve the linear equation system
1431	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1432
1433	IF ( host(1:3) == 'nec' ) THEN
1434	!
1435	!-- Code optimized for vector processors
1436	CALL fft_y_m( work_triy, ny, 'backward' )
1437
1438	DO k = 1, nz
1439
1440	m = 0
1441	DO n = 1, pdims(2)
1442	DO j = 1, nny
1443	ar(j,k,i,n) = work_triy(m,k)
1444	m = m + 1
1445	ENDDO
1446	ENDDO
1447
1448	ENDDO
1449
1450	ELSE
1451	!
1452	!-- Cache optimized code
1453	DO k = 1, nz
1454
1455	DO j = 0, ny
1456	work_ffty(j) = work_triy(j,k)
1457	ENDDO
1458
1459	CALL fft_y_1d( work_ffty, 'backward' )
1460
1461	m = 0
1462	DO n = 1, pdims(2)
1463	DO j = 1, nny
1464	ar(j,k,i,n) = work_ffty(m)
1465	m = m + 1
1466	ENDDO
1467	ENDDO
1468
1469	ENDDO
1470
1471	ENDIF
1472
1473	ENDDO
1474
1475	DEALLOCATE( tri )
1476
1477	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1478
1479	END SUBROUTINE ffty_tri_ffty
1480
1481	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |