Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 2118

Last change on this file since 2118 was 2118, checked in by raasch, 7 years ago
all OpenACC directives and related parts removed from the code
Property svn:keywords set to `Id`
File size: 44.8 KB

Line
1	!> @file poisfft_mod.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of PALM.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2017 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	! OpenACC directives and related code removed
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 2118 2017-01-17 16:38:49Z raasch $
27	!
28	! 2000 2016-08-20 18:09:15Z knoop
29	! Forced header and separation lines into 80 columns
30	!
31	! 1850 2016-04-08 13:29:27Z maronga
32	! Module renamed
33	!
34	!
35	! 1804 2016-04-05 16:30:18Z maronga
36	! Removed code for parameter file check (__check)
37	!
38	! 1682 2015-10-07 23:56:08Z knoop
39	! Code annotations made doxygen readable
40	!
41	! 1482 2014-10-18 12:34:45Z raasch
42	! use 2d-decomposition, if accelerator boards are used
43	!
44	! 1406 2014-05-16 13:47:01Z raasch
45	! bugfix for pgi 14.4: declare create moved after array declaration
46	!
47	! 1320 2014-03-20 08:40:49Z raasch
48	! ONLY-attribute added to USE-statements,
49	! kind-parameters added to all INTEGER and REAL declaration statements,
50	! kinds are defined in new module kinds,
51	! old module precision_kind is removed,
52	! revision history before 2012 removed,
53	! comment fields (!:) to be used for variable explanations added to
54	! all variable declaration statements
55	!
56	! 1318 2014-03-17 13:35:16Z raasch
57	! module interfaces removed
58	!
59	! 1306 2014-03-13 14:30:59Z raasch
60	! openmp sections removed from the overlap branch,
61	! second argument removed from parameter list
62	!
63	! 1216 2013-08-26 09:31:42Z raasch
64	! resorting of arrays moved to separate routines resort_for_...,
65	! one argument, used as temporary work array, removed from all transpose
66	! routines
67	! overlapping fft / transposition implemented
68	!
69	! 1212 2013-08-15 08:46:27Z raasch
70	! tridia routines moved to seperate module tridia_solver
71	!
72	! 1208 2013-08-13 06:41:49Z raasch
73	! acc-update clauses added for "ar" so that ffts other than cufft can also be
74	! used (although they are not ported and will give a poor performance)
75	!
76	! 1111 2013-03-08 23:54:10Z raasch
77	! further openACC porting of non-parallel (MPI) branch:
78	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
79	! no distinction between parallel/non-parallel in poisfft and tridia any more,
80	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
81	! (otherwise "invalid device function" is indicated during runtime),
82	! optimization of tridia routines: constant elements and coefficients of tri are
83	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
84	! to 2,
85	! poisfft_init is now called internally from poisfft, maketri is called from
86	! poisfft_init,
87	! ibc_p_b = 2 removed
88	!
89	! 1106 2013-03-04 05:31:38Z raasch
90	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
91	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
92	! fft_y_1d
93	!
94	! 1103 2013-02-20 02:15:53Z raasch
95	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
96	! sometimes cause segmentation faults with intel 12.1 compiler
97	!
98	! 1092 2013-02-02 11:24:22Z raasch
99	! unused variables removed
100	!
101	! 1036 2012-10-22 13:43:42Z raasch
102	! code put under GPL (PALM 3.9)
103	!
104	! 2012-09-21 07:03:55Z raasch
105	! FLOAT type conversion replaced by REAL
106	!
107	! 1003 2012-09-14 14:35:53Z raasch
108	! indices nxa, nya, etc. replaced by nx, ny, etc.
109	!
110	! 940 2012-07-09 14:31:00Z raasch
111	! special handling of tri-array as an argument in tridia_1dd routines switched
112	! off because it caused segmentation faults with intel 12.1 compiler
113	!
114	! 877 2012-04-03 11:21:44Z suehring
115	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
116	! pressure at the top of the model domain.
117	!
118	! 809 2012-01-30 13:32:58Z maronga
119	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
120	!
121	! 807 2012-01-25 11:53:51Z maronga
122	! New cpp directive "__check" implemented which is used by check_namelist_files
123	! (most of the code is unneeded by check_namelist_files).
124	!
125	! Revision 1.1 1997/07/24 11:24:14 raasch
126	! Initial revision
127	!
128	!
129	! Description:
130	! ------------
131	!> Solves the Poisson equation with a 2D spectral method
132	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
133	!>
134	!> Input:
135	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
136	!> starting from (1,nys,nxl)
137	!>
138	!> Output:
139	!> real ar contains the solution for perturbation pressure p
140	!------------------------------------------------------------------------------!
141	MODULE poisfft_mod
142
143
144	USE fft_xy, &
145	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
146
147	USE indices, &
148	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
149
150	USE transpose_indices, &
151	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
152	nzb_y, nzt_x, nzt_y
153
154	USE tridia_solver, &
155	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
156
157	IMPLICIT NONE
158
159	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
160
161	PRIVATE
162
163	PUBLIC poisfft, poisfft_init
164
165	INTERFACE poisfft
166	MODULE PROCEDURE poisfft
167	END INTERFACE poisfft
168
169	INTERFACE poisfft_init
170	MODULE PROCEDURE poisfft_init
171	END INTERFACE poisfft_init
172
173
174	CONTAINS
175
176	!------------------------------------------------------------------------------!
177	! Description:
178	! ------------
179	!> @todo Missing subroutine description.
180	!------------------------------------------------------------------------------!
181	SUBROUTINE poisfft_init
182
183	USE arrays_3d, &
184	ONLY: ddzu_pres, ddzw
185
186	USE kinds
187
188	IMPLICIT NONE
189
190	INTEGER(iwp) :: k !<
191
192
193	CALL fft_init
194
195	CALL tridia_init
196
197	poisfft_initialized = .TRUE.
198
199	END SUBROUTINE poisfft_init
200
201
202
203	!------------------------------------------------------------------------------!
204	! Description:
205	! ------------
206	!> Two-dimensional Fourier Transformation in x- and y-direction.
207	!------------------------------------------------------------------------------!
208	SUBROUTINE poisfft( ar )
209
210	USE control_parameters, &
211	ONLY: fft_method, transpose_compute_overlap
212
213	USE cpulog, &
214	ONLY: cpu_log, cpu_log_nowait, log_point_s
215
216	USE kinds
217
218	USE pegrid
219
220	IMPLICIT NONE
221
222	INTEGER(iwp) :: ii !<
223	INTEGER(iwp) :: iind !<
224	INTEGER(iwp) :: inew !<
225	INTEGER(iwp) :: jj !<
226	INTEGER(iwp) :: jind !<
227	INTEGER(iwp) :: jnew !<
228	INTEGER(iwp) :: ki !<
229	INTEGER(iwp) :: kk !<
230	INTEGER(iwp) :: knew !<
231	INTEGER(iwp) :: n !<
232	INTEGER(iwp) :: nblk !<
233	INTEGER(iwp) :: nnx_y !<
234	INTEGER(iwp) :: nny_z !<
235	INTEGER(iwp) :: nnz_t !<
236	INTEGER(iwp) :: nnz_x !<
237	INTEGER(iwp) :: nxl_y_bound !<
238	INTEGER(iwp) :: nxr_y_bound !<
239
240	INTEGER(iwp), DIMENSION(4) :: isave !<
241
242	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
243	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
244
245	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
246	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
247	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
248	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
249	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
250
251
252	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
253
254	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
255
256	!
257	!-- Two-dimensional Fourier Transformation in x- and y-direction.
258	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
259
260	!
261	!-- 1d-domain-decomposition along x:
262	!-- FFT along y and transposition y --> x
263	CALL ffty_tr_yx( ar, ar )
264
265	!
266	!-- FFT along x, solving the tridiagonal system and backward FFT
267	CALL fftx_tri_fftx( ar )
268
269	!
270	!-- Transposition x --> y and backward FFT along y
271	CALL tr_xy_ffty( ar, ar )
272
273	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
274
275	!
276	!-- 1d-domain-decomposition along y:
277	!-- FFT along x and transposition x --> y
278	CALL fftx_tr_xy( ar, ar )
279
280	!
281	!-- FFT along y, solving the tridiagonal system and backward FFT
282	CALL ffty_tri_ffty( ar )
283
284	!
285	!-- Transposition y --> x and backward FFT along x
286	CALL tr_yx_fftx( ar, ar )
287
288	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
289
290	!
291	!-- 2d-domain-decomposition or no decomposition (1 PE run)
292	!-- Transposition z --> x
293	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
294	CALL resort_for_zx( ar, ar_inv )
295	CALL transpose_zx( ar_inv, ar )
296	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
297
298	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
299	CALL fft_x( ar, 'forward' )
300	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
301
302	!
303	!-- Transposition x --> y
304	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
305	CALL resort_for_xy( ar, ar_inv )
306	CALL transpose_xy( ar_inv, ar )
307	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
308
309	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
310	CALL fft_y( ar, 'forward', ar_tr = ar, &
311	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
312	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
313	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
314
315	!
316	!-- Transposition y --> z
317	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
318	CALL resort_for_yz( ar, ar_inv )
319	CALL transpose_yz( ar_inv, ar )
320	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
321
322	!
323	!-- Solve the tridiagonal equation system along z
324	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
325	CALL tridia_substi( ar )
326	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
327
328	!
329	!-- Inverse Fourier Transformation
330	!-- Transposition z --> y
331	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
332	CALL transpose_zy( ar, ar_inv )
333	CALL resort_for_zy( ar_inv, ar )
334	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
335
336	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
337	CALL fft_y( ar, 'backward', ar_tr = ar, &
338	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
339	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
340	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
341
342	!
343	!-- Transposition y --> x
344	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
345	CALL transpose_yx( ar, ar_inv )
346	CALL resort_for_yx( ar_inv, ar )
347	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
348
349	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
350	CALL fft_x( ar, 'backward' )
351	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
352
353	!
354	!-- Transposition x --> z
355	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
356	CALL transpose_xz( ar, ar_inv )
357	CALL resort_for_xz( ar_inv, ar )
358	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
359
360	ELSE
361
362	!
363	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
364	!-- overlapping transposition / fft
365	!-- cputime logging must not use barriers, which would prevent overlapping
366	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
367	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
368	!
369	!-- Transposition z --> x + subsequent fft along x
370	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
371	CALL resort_for_zx( ar, f_inv )
372	!
373	!-- Save original indices and gridpoint counter
374	isave(1) = nz
375	isave(2) = nzb_x
376	isave(3) = nzt_x
377	isave(4) = sendrecvcount_zx
378	!
379	!-- Set new indices for transformation
380	nblk = nz / pdims(1)
381	nz = pdims(1)
382	nnz_x = 1
383	nzb_x = 1 + myidx * nnz_x
384	nzt_x = ( myidx + 1 ) * nnz_x
385	sendrecvcount_zx = nnx * nny * nnz_x
386
387	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
388	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
389
390	DO kk = 1, nblk
391
392	IF ( kk == 1 ) THEN
393	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
394	ELSE
395	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
396	ENDIF
397
398	DO knew = 1, nz
399	ki = kk + nblk * ( knew - 1 )
400	f_in(:,:,knew) = f_inv(:,:,ki)
401	ENDDO
402
403	CALL transpose_zx( f_in, ar1(:,:,:))
404	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
405
406	IF ( kk == 1 ) THEN
407	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
408	ELSE
409	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
410	ENDIF
411
412	n = isave(2) + kk - 1
413	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
414	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
415
416	ENDDO
417	!
418	!-- Restore original indices/counters
419	nz = isave(1)
420	nzb_x = isave(2)
421	nzt_x = isave(3)
422	sendrecvcount_zx = isave(4)
423
424	DEALLOCATE( ar1, f_in, f_inv )
425
426	!
427	!-- Transposition x --> y + subsequent fft along y
428	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
429	CALL resort_for_xy( f_out_z, f_inv )
430	!
431	!-- Save original indices and gridpoint counter
432	isave(1) = nx
433	isave(2) = nxl_y
434	isave(3) = nxr_y
435	isave(4) = sendrecvcount_xy
436	!
437	!-- Set new indices for transformation
438	nblk = ( ( nx+1 ) / pdims(2) ) - 1
439	nx = pdims(2)
440	nnx_y = 1
441	nxl_y = myidy * nnx_y
442	nxr_y = ( myidy + 1 ) * nnx_y - 1
443	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
444
445	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
446	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
447
448	DO ii = 0, nblk
449
450	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
451
452	DO inew = 0, nx-1
453	iind = ii + ( nblk + 1 ) * inew
454	f_in(:,:,inew) = f_inv(:,:,iind)
455	ENDDO
456
457	CALL transpose_xy( f_in, ar1(:,:,:) )
458
459	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
460
461	IF ( ii == 1 ) THEN
462	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
463	ELSE
464	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
465	ENDIF
466
467	nxl_y_bound = isave(2)
468	nxr_y_bound = isave(3)
469	n = isave(2) + ii
470	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
471	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
472	nxl_y_l = n, nxr_y_l = n )
473
474	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
475
476	ENDDO
477	!
478	!-- Restore original indices/counters
479	nx = isave(1)
480	nxl_y = isave(2)
481	nxr_y = isave(3)
482	sendrecvcount_xy = isave(4)
483
484	DEALLOCATE( ar1, f_in, f_inv )
485
486	!
487	!-- Transposition y --> z + subsequent tridia + resort for z --> y
488	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
489	CALL resort_for_yz( f_out_y, f_inv )
490	!
491	!-- Save original indices and gridpoint counter
492	isave(1) = ny
493	isave(2) = nys_z
494	isave(3) = nyn_z
495	isave(4) = sendrecvcount_yz
496	!
497	!-- Set new indices for transformation
498	nblk = ( ( ny+1 ) / pdims(1) ) - 1
499	ny = pdims(1)
500	nny_z = 1
501	nys_z = myidx * nny_z
502	nyn_z = ( myidx + 1 ) * nny_z - 1
503	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
504
505	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
506	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
507
508	DO jj = 0, nblk
509	!
510	!-- Forward Fourier Transformation
511	!-- Transposition y --> z
512	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
513
514	DO jnew = 0, ny-1
515	jind = jj + ( nblk + 1 ) * jnew
516	f_in(:,:,jnew) = f_inv(:,:,jind)
517	ENDDO
518
519	CALL transpose_yz( f_in, ar1(:,:,:) )
520
521	IF ( jj == nblk ) THEN
522	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
523	ELSE
524	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
525	ENDIF
526
527	!
528	!-- Solve the tridiagonal equation system along z
529	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
530
531	n = isave(2) + jj
532	CALL tridia_substi_overlap( ar1(:,:,:), n )
533
534	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
535
536	!
537	!-- Inverse Fourier Transformation
538	!-- Transposition z --> y
539	!-- Only one thread should call MPI routines, therefore forward and
540	!-- backward tranpose are in the same section
541	IF ( jj == 0 ) THEN
542	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
543	ELSE
544	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
545	ENDIF
546
547	CALL transpose_zy( ar1(:,:,:), f_in )
548
549	DO jnew = 0, ny-1
550	jind = jj + ( nblk + 1 ) * jnew
551	f_inv(:,:,jind) = f_in(:,:,jnew)
552	ENDDO
553
554	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
555
556	ENDDO
557	!
558	!-- Restore original indices/counters
559	ny = isave(1)
560	nys_z = isave(2)
561	nyn_z = isave(3)
562	sendrecvcount_yz = isave(4)
563
564	CALL resort_for_zy( f_inv, f_out_y )
565
566	DEALLOCATE( ar1, f_in, f_inv )
567
568	!
569	!-- fft along y backward + subsequent transposition y --> x
570	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
571	!
572	!-- Save original indices and gridpoint counter
573	isave(1) = nx
574	isave(2) = nxl_y
575	isave(3) = nxr_y
576	isave(4) = sendrecvcount_xy
577	!
578	!-- Set new indices for transformation
579	nblk = (( nx+1 ) / pdims(2) ) - 1
580	nx = pdims(2)
581	nnx_y = 1
582	nxl_y = myidy * nnx_y
583	nxr_y = ( myidy + 1 ) * nnx_y - 1
584	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
585
586	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
587	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
588
589	DO ii = 0, nblk
590
591	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
592
593	n = isave(2) + ii
594	nxl_y_bound = isave(2)
595	nxr_y_bound = isave(3)
596
597	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
598	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
599	nxl_y_l = n, nxr_y_l = n )
600
601	IF ( ii == nblk ) THEN
602	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
603	ELSE
604	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
605	ENDIF
606
607	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
608
609	CALL transpose_yx( ar1(:,:,:), f_in )
610
611	DO inew = 0, nx-1
612	iind = ii + (nblk+1) * inew
613	f_inv(:,:,iind) = f_in(:,:,inew)
614	ENDDO
615
616	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
617
618	ENDDO
619	!
620	!-- Restore original indices/counters
621	nx = isave(1)
622	nxl_y = isave(2)
623	nxr_y = isave(3)
624	sendrecvcount_xy = isave(4)
625
626	CALL resort_for_yx( f_inv, f_out_z )
627
628	DEALLOCATE( ar1, f_in, f_inv )
629
630	!
631	!-- fft along x backward + subsequent final transposition x --> z
632	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
633	!
634	!-- Save original indices and gridpoint counter
635	isave(1) = nz
636	isave(2) = nzb_x
637	isave(3) = nzt_x
638	isave(4) = sendrecvcount_zx
639	!
640	!-- Set new indices for transformation
641	nblk = nz / pdims(1)
642	nz = pdims(1)
643	nnz_x = 1
644	nzb_x = 1 + myidx * nnz_x
645	nzt_x = ( myidx + 1 ) * nnz_x
646	sendrecvcount_zx = nnx * nny * nnz_x
647
648	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
649	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
650
651	DO kk = 1, nblk
652
653	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
654
655	n = isave(2) + kk - 1
656	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
657
658	IF ( kk == nblk ) THEN
659	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
660	ELSE
661	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
662	ENDIF
663
664	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
665
666	CALL transpose_xz( ar1(:,:,:), f_in )
667
668	DO knew = 1, nz
669	ki = kk + nblk * (knew-1)
670	f_inv(:,:,ki) = f_in(:,:,knew)
671	ENDDO
672
673	IF ( kk == nblk ) THEN
674	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
675	ELSE
676	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
677	ENDIF
678
679	ENDDO
680	!
681	!-- Restore original indices/counters
682	nz = isave(1)
683	nzb_x = isave(2)
684	nzt_x = isave(3)
685	sendrecvcount_zx = isave(4)
686
687	CALL resort_for_xz( f_inv, ar )
688
689	DEALLOCATE( ar1, f_in, f_inv )
690
691	ENDIF
692
693	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
694
695	END SUBROUTINE poisfft
696
697
698	!------------------------------------------------------------------------------!
699	! Description:
700	! ------------
701	!> Fourier-transformation along y with subsequent transposition y --> x for
702	!> a 1d-decomposition along x.
703	!>
704	!> @attention The performance of this routine is much faster on the NEC-SX6,
705	!> if the first index of work_ffty_vec is odd. Otherwise
706	!> memory bank conflicts may occur (especially if the index is a
707	!> multiple of 128). That's why work_ffty_vec is dimensioned as
708	!> 0:ny+1.
709	!> Of course, this will not work if users are using an odd number
710	!> of gridpoints along y.
711	!------------------------------------------------------------------------------!
712	SUBROUTINE ffty_tr_yx( f_in, f_out )
713
714
715	USE control_parameters, &
716	ONLY: host
717
718	USE cpulog, &
719	ONLY: cpu_log, log_point_s
720
721	USE kinds
722
723	USE pegrid
724
725	IMPLICIT NONE
726
727	INTEGER(iwp) :: i !<
728	INTEGER(iwp) :: iend !<
729	INTEGER(iwp) :: iouter !<
730	INTEGER(iwp) :: ir !<
731	INTEGER(iwp) :: j !<
732	INTEGER(iwp) :: k !<
733
734	INTEGER(iwp), PARAMETER :: stridex = 4 !<
735
736	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
737	#if defined( __nec )
738	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
739	#endif
740	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
741	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
742	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
743
744	!
745	!-- Carry out the FFT along y, where all data are present due to the
746	!-- 1d-decomposition along x. Resort the data in a way that x becomes
747	!-- the first index.
748	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
749
750	IF ( host(1:3) == 'nec' ) THEN
751	#if defined( __nec )
752	!
753	!-- Code optimized for vector processors
754	!$OMP PARALLEL PRIVATE ( i, j, k )
755	!$OMP DO
756	DO i = nxl, nxr
757
758	DO j = 0, ny
759	DO k = 1, nz
760	work_ffty_vec(j,k,i) = f_in(k,j,i)
761	ENDDO
762	ENDDO
763
764	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
765
766	ENDDO
767
768	!$OMP DO
769	DO k = 1, nz
770	DO j = 0, ny
771	DO i = nxl, nxr
772	work(i,k,j) = work_ffty_vec(j,k,i)
773	ENDDO
774	ENDDO
775	ENDDO
776	!$OMP END PARALLEL
777	#endif
778
779	ELSE
780
781	!
782	!-- Cache optimized code.
783	!-- The i-(x-)direction is split into a strided outer loop and an inner
784	!-- loop for better cache performance
785	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
786	!$OMP DO
787	DO iouter = nxl, nxr, stridex
788
789	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
790
791	DO k = 1, nz
792
793	DO i = iouter, iend
794
795	ir = i-iouter+1 ! counter within a stride
796	DO j = 0, ny
797	work_ffty(j,ir) = f_in(k,j,i)
798	ENDDO
799	!
800	!-- FFT along y
801	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
802
803	ENDDO
804
805	!
806	!-- Resort
807	DO j = 0, ny
808	DO i = iouter, iend
809	work(i,k,j) = work_ffty(j,i-iouter+1)
810	ENDDO
811	ENDDO
812
813	ENDDO
814
815	ENDDO
816	!$OMP END PARALLEL
817
818	ENDIF
819	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
820
821	!
822	!-- Transpose array
823	#if defined( __parallel )
824	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
825	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
826	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
827	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
828	comm1dx, ierr )
829	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
830	#endif
831
832	END SUBROUTINE ffty_tr_yx
833
834
835	!------------------------------------------------------------------------------!
836	! Description:
837	! ------------
838	!> Transposition x --> y with a subsequent backward Fourier transformation for
839	!> a 1d-decomposition along x
840	!------------------------------------------------------------------------------!
841	SUBROUTINE tr_xy_ffty( f_in, f_out )
842
843
844	USE control_parameters, &
845	ONLY: host
846
847	USE cpulog, &
848	ONLY: cpu_log, log_point_s
849
850	USE kinds
851
852	USE pegrid
853
854	IMPLICIT NONE
855
856	INTEGER(iwp) :: i !<
857	INTEGER(iwp) :: iend !<
858	INTEGER(iwp) :: iouter !<
859	INTEGER(iwp) :: ir !<
860	INTEGER(iwp) :: j !<
861	INTEGER(iwp) :: k !<
862
863	INTEGER(iwp), PARAMETER :: stridex = 4 !<
864
865	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
866	#if defined( __nec )
867	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
868	#endif
869	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
870	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
871	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
872
873	!
874	!-- Transpose array
875	#if defined( __parallel )
876	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
877	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
878	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
879	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
880	comm1dx, ierr )
881	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
882	#endif
883
884	!
885	!-- Resort the data in a way that y becomes the first index and carry out the
886	!-- backward fft along y.
887	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
888
889	IF ( host(1:3) == 'nec' ) THEN
890	#if defined( __nec )
891	!
892	!-- Code optimized for vector processors
893	!$OMP PARALLEL PRIVATE ( i, j, k )
894	!$OMP DO
895	DO k = 1, nz
896	DO j = 0, ny
897	DO i = nxl, nxr
898	work_ffty_vec(j,k,i) = work(i,k,j)
899	ENDDO
900	ENDDO
901	ENDDO
902
903	!$OMP DO
904	DO i = nxl, nxr
905
906	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
907
908	DO j = 0, ny
909	DO k = 1, nz
910	f_out(k,j,i) = work_ffty_vec(j,k,i)
911	ENDDO
912	ENDDO
913
914	ENDDO
915	!$OMP END PARALLEL
916	#endif
917
918	ELSE
919
920	!
921	!-- Cache optimized code.
922	!-- The i-(x-)direction is split into a strided outer loop and an inner
923	!-- loop for better cache performance
924	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
925	!$OMP DO
926	DO iouter = nxl, nxr, stridex
927
928	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
929
930	DO k = 1, nz
931	!
932	!-- Resort
933	DO j = 0, ny
934	DO i = iouter, iend
935	work_ffty(j,i-iouter+1) = work(i,k,j)
936	ENDDO
937	ENDDO
938
939	DO i = iouter, iend
940
941	!
942	!-- FFT along y
943	ir = i-iouter+1 ! counter within a stride
944	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
945
946	DO j = 0, ny
947	f_out(k,j,i) = work_ffty(j,ir)
948	ENDDO
949	ENDDO
950
951	ENDDO
952
953	ENDDO
954	!$OMP END PARALLEL
955
956	ENDIF
957
958	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
959
960	END SUBROUTINE tr_xy_ffty
961
962
963	!------------------------------------------------------------------------------!
964	! Description:
965	! ------------
966	!> FFT along x, solution of the tridiagonal system and backward FFT for
967	!> a 1d-decomposition along x
968	!>
969	!> @warning this subroutine may still not work for hybrid parallelization
970	!> with OpenMP (for possible necessary changes see the original
971	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
972	!------------------------------------------------------------------------------!
973	SUBROUTINE fftx_tri_fftx( ar )
974
975
976	USE control_parameters, &
977	ONLY: host
978
979	USE cpulog, &
980	ONLY: cpu_log, log_point_s
981
982	USE grid_variables, &
983	ONLY: ddx2, ddy2
984
985	USE kinds
986
987	USE pegrid
988
989	IMPLICIT NONE
990
991	INTEGER(iwp) :: i !<
992	INTEGER(iwp) :: j !<
993	INTEGER(iwp) :: k !<
994	INTEGER(iwp) :: m !<
995	INTEGER(iwp) :: n !<
996	INTEGER(iwp) :: omp_get_thread_num !<
997	INTEGER(iwp) :: tn !<
998
999	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
1000	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
1001	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
1002	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1003
1004
1005	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1006
1007	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1008
1009	tn = 0 ! Default thread number in case of one thread
1010	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1011	DO j = nys_x, nyn_x
1012
1013	!$ tn = omp_get_thread_num()
1014
1015	IF ( host(1:3) == 'nec' ) THEN
1016	!
1017	!-- Code optimized for vector processors
1018	DO k = 1, nz
1019
1020	m = 0
1021	DO n = 1, pdims(1)
1022	DO i = 1, nnx
1023	work_trix(m,k) = ar(i,k,j,n)
1024	m = m + 1
1025	ENDDO
1026	ENDDO
1027
1028	ENDDO
1029
1030	CALL fft_x_m( work_trix, 'forward' )
1031
1032	ELSE
1033	!
1034	!-- Cache optimized code
1035	DO k = 1, nz
1036
1037	m = 0
1038	DO n = 1, pdims(1)
1039	DO i = 1, nnx
1040	work_fftx(m) = ar(i,k,j,n)
1041	m = m + 1
1042	ENDDO
1043	ENDDO
1044
1045	CALL fft_x_1d( work_fftx, 'forward' )
1046
1047	DO i = 0, nx
1048	work_trix(i,k) = work_fftx(i)
1049	ENDDO
1050
1051	ENDDO
1052
1053	ENDIF
1054
1055	!
1056	!-- Solve the linear equation system
1057	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1058
1059	IF ( host(1:3) == 'nec' ) THEN
1060	!
1061	!-- Code optimized for vector processors
1062	CALL fft_x_m( work_trix, 'backward' )
1063
1064	DO k = 1, nz
1065
1066	m = 0
1067	DO n = 1, pdims(1)
1068	DO i = 1, nnx
1069	ar(i,k,j,n) = work_trix(m,k)
1070	m = m + 1
1071	ENDDO
1072	ENDDO
1073
1074	ENDDO
1075
1076	ELSE
1077	!
1078	!-- Cache optimized code
1079	DO k = 1, nz
1080
1081	DO i = 0, nx
1082	work_fftx(i) = work_trix(i,k)
1083	ENDDO
1084
1085	CALL fft_x_1d( work_fftx, 'backward' )
1086
1087	m = 0
1088	DO n = 1, pdims(1)
1089	DO i = 1, nnx
1090	ar(i,k,j,n) = work_fftx(m)
1091	m = m + 1
1092	ENDDO
1093	ENDDO
1094
1095	ENDDO
1096
1097	ENDIF
1098
1099	ENDDO
1100
1101	DEALLOCATE( tri )
1102
1103	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1104
1105	END SUBROUTINE fftx_tri_fftx
1106
1107
1108	!------------------------------------------------------------------------------!
1109	! Description:
1110	! ------------
1111	!> Fourier-transformation along x with subsequent transposition x --> y for
1112	!> a 1d-decomposition along y.
1113	!>
1114	!> @attention NEC-branch of this routine may significantly profit from
1115	!> further optimizations. So far, performance is much worse than
1116	!> for routine ffty_tr_yx (more than three times slower).
1117	!------------------------------------------------------------------------------!
1118	SUBROUTINE fftx_tr_xy( f_in, f_out )
1119
1120
1121	USE control_parameters, &
1122	ONLY: host
1123
1124	USE cpulog, &
1125	ONLY: cpu_log, log_point_s
1126
1127	USE kinds
1128
1129	USE pegrid
1130
1131	IMPLICIT NONE
1132
1133	INTEGER(iwp) :: i !<
1134	INTEGER(iwp) :: j !<
1135	INTEGER(iwp) :: k !<
1136
1137	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1138	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1139	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1140	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1141
1142	!
1143	!-- Carry out the FFT along x, where all data are present due to the
1144	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1145	!-- the first index.
1146	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1147
1148	IF ( host(1:3) == 'nec' ) THEN
1149	!
1150	!-- Code for vector processors
1151	!$OMP PARALLEL PRIVATE ( i, j, k )
1152	!$OMP DO
1153	DO i = 0, nx
1154
1155	DO j = nys, nyn
1156	DO k = 1, nz
1157	work_fftx(i,k,j) = f_in(k,j,i)
1158	ENDDO
1159	ENDDO
1160
1161	ENDDO
1162
1163	!$OMP DO
1164	DO j = nys, nyn
1165
1166	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1167
1168	DO k = 1, nz
1169	DO i = 0, nx
1170	work(j,k,i) = work_fftx(i,k,j)
1171	ENDDO
1172	ENDDO
1173
1174	ENDDO
1175	!$OMP END PARALLEL
1176
1177	ELSE
1178
1179	!
1180	!-- Cache optimized code (there might be still a potential for better
1181	!-- optimization).
1182	!$OMP PARALLEL PRIVATE (i,j,k)
1183	!$OMP DO
1184	DO i = 0, nx
1185
1186	DO j = nys, nyn
1187	DO k = 1, nz
1188	work_fftx(i,k,j) = f_in(k,j,i)
1189	ENDDO
1190	ENDDO
1191
1192	ENDDO
1193
1194	!$OMP DO
1195	DO j = nys, nyn
1196	DO k = 1, nz
1197
1198	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1199
1200	DO i = 0, nx
1201	work(j,k,i) = work_fftx(i,k,j)
1202	ENDDO
1203	ENDDO
1204
1205	ENDDO
1206	!$OMP END PARALLEL
1207
1208	ENDIF
1209	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1210
1211	!
1212	!-- Transpose array
1213	#if defined( __parallel )
1214	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1215	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1216	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1217	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1218	comm1dy, ierr )
1219	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1220	#endif
1221
1222	END SUBROUTINE fftx_tr_xy
1223
1224
1225	!------------------------------------------------------------------------------!
1226	! Description:
1227	! ------------
1228	!> Transposition y --> x with a subsequent backward Fourier transformation for
1229	!> a 1d-decomposition along x.
1230	!------------------------------------------------------------------------------!
1231	SUBROUTINE tr_yx_fftx( f_in, f_out )
1232
1233
1234	USE control_parameters, &
1235	ONLY: host
1236
1237	USE cpulog, &
1238	ONLY: cpu_log, log_point_s
1239
1240	USE kinds
1241
1242	USE pegrid
1243
1244	IMPLICIT NONE
1245
1246	INTEGER(iwp) :: i !<
1247	INTEGER(iwp) :: j !<
1248	INTEGER(iwp) :: k !<
1249
1250	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1251	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1252	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1253	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1254
1255	!
1256	!-- Transpose array
1257	#if defined( __parallel )
1258	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1259	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1260	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1261	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1262	comm1dy, ierr )
1263	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1264	#endif
1265
1266	!
1267	!-- Carry out the FFT along x, where all data are present due to the
1268	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1269	!-- the first index.
1270	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1271
1272	IF ( host(1:3) == 'nec' ) THEN
1273	!
1274	!-- Code optimized for vector processors
1275	!$OMP PARALLEL PRIVATE ( i, j, k )
1276	!$OMP DO
1277	DO j = nys, nyn
1278
1279	DO k = 1, nz
1280	DO i = 0, nx
1281	work_fftx(i,k,j) = work(j,k,i)
1282	ENDDO
1283	ENDDO
1284
1285	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1286
1287	ENDDO
1288
1289	!$OMP DO
1290	DO i = 0, nx
1291	DO j = nys, nyn
1292	DO k = 1, nz
1293	f_out(k,j,i) = work_fftx(i,k,j)
1294	ENDDO
1295	ENDDO
1296	ENDDO
1297	!$OMP END PARALLEL
1298
1299	ELSE
1300
1301	!
1302	!-- Cache optimized code (there might be still a potential for better
1303	!-- optimization).
1304	!$OMP PARALLEL PRIVATE (i,j,k)
1305	!$OMP DO
1306	DO j = nys, nyn
1307	DO k = 1, nz
1308
1309	DO i = 0, nx
1310	work_fftx(i,k,j) = work(j,k,i)
1311	ENDDO
1312
1313	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1314
1315	ENDDO
1316	ENDDO
1317
1318	!$OMP DO
1319	DO i = 0, nx
1320	DO j = nys, nyn
1321	DO k = 1, nz
1322	f_out(k,j,i) = work_fftx(i,k,j)
1323	ENDDO
1324	ENDDO
1325	ENDDO
1326	!$OMP END PARALLEL
1327
1328	ENDIF
1329	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1330
1331	END SUBROUTINE tr_yx_fftx
1332
1333
1334	!------------------------------------------------------------------------------!
1335	! Description:
1336	! ------------
1337	!> FFT along y, solution of the tridiagonal system and backward FFT for
1338	!> a 1d-decomposition along y.
1339	!>
1340	!> @warning this subroutine may still not work for hybrid parallelization
1341	!> with OpenMP (for possible necessary changes see the original
1342	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1343	!------------------------------------------------------------------------------!
1344	SUBROUTINE ffty_tri_ffty( ar )
1345
1346
1347	USE control_parameters, &
1348	ONLY: host
1349
1350	USE cpulog, &
1351	ONLY: cpu_log, log_point_s
1352
1353	USE grid_variables, &
1354	ONLY: ddx2, ddy2
1355
1356	USE kinds
1357
1358	USE pegrid
1359
1360	IMPLICIT NONE
1361
1362	INTEGER(iwp) :: i !<
1363	INTEGER(iwp) :: j !<
1364	INTEGER(iwp) :: k !<
1365	INTEGER(iwp) :: m !<
1366	INTEGER(iwp) :: n !<
1367	INTEGER(iwp) :: omp_get_thread_num !<
1368	INTEGER(iwp) :: tn !<
1369
1370	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1371	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1372	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1373	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1374
1375
1376	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1377
1378	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1379
1380	tn = 0 ! Default thread number in case of one thread
1381	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1382	DO i = nxl_y, nxr_y
1383
1384	!$ tn = omp_get_thread_num()
1385
1386	IF ( host(1:3) == 'nec' ) THEN
1387	!
1388	!-- Code optimized for vector processors
1389	DO k = 1, nz
1390
1391	m = 0
1392	DO n = 1, pdims(2)
1393	DO j = 1, nny
1394	work_triy(m,k) = ar(j,k,i,n)
1395	m = m + 1
1396	ENDDO
1397	ENDDO
1398
1399	ENDDO
1400
1401	CALL fft_y_m( work_triy, ny, 'forward' )
1402
1403	ELSE
1404	!
1405	!-- Cache optimized code
1406	DO k = 1, nz
1407
1408	m = 0
1409	DO n = 1, pdims(2)
1410	DO j = 1, nny
1411	work_ffty(m) = ar(j,k,i,n)
1412	m = m + 1
1413	ENDDO
1414	ENDDO
1415
1416	CALL fft_y_1d( work_ffty, 'forward' )
1417
1418	DO j = 0, ny
1419	work_triy(j,k) = work_ffty(j)
1420	ENDDO
1421
1422	ENDDO
1423
1424	ENDIF
1425
1426	!
1427	!-- Solve the linear equation system
1428	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1429
1430	IF ( host(1:3) == 'nec' ) THEN
1431	!
1432	!-- Code optimized for vector processors
1433	CALL fft_y_m( work_triy, ny, 'backward' )
1434
1435	DO k = 1, nz
1436
1437	m = 0
1438	DO n = 1, pdims(2)
1439	DO j = 1, nny
1440	ar(j,k,i,n) = work_triy(m,k)
1441	m = m + 1
1442	ENDDO
1443	ENDDO
1444
1445	ENDDO
1446
1447	ELSE
1448	!
1449	!-- Cache optimized code
1450	DO k = 1, nz
1451
1452	DO j = 0, ny
1453	work_ffty(j) = work_triy(j,k)
1454	ENDDO
1455
1456	CALL fft_y_1d( work_ffty, 'backward' )
1457
1458	m = 0
1459	DO n = 1, pdims(2)
1460	DO j = 1, nny
1461	ar(j,k,i,n) = work_ffty(m)
1462	m = m + 1
1463	ENDDO
1464	ENDDO
1465
1466	ENDDO
1467
1468	ENDIF
1469
1470	ENDDO
1471
1472	DEALLOCATE( tri )
1473
1474	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1475
1476	END SUBROUTINE ffty_tri_ffty
1477
1478	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |