Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 3634

Last change on this file since 3634 was 3634, checked in by knoop, 5 years ago
OpenACC port for SPEC
Property svn:keywords set to `Id`
File size: 45.6 KB

Line
1	!> @file poisfft_mod.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2018 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 3634 2018-12-18 12:31:28Z knoop $
27	! OpenACC port for SPEC
28	!
29	! 3241 2018-09-12 15:02:00Z raasch
30	! unused variables removed,
31	! declarations of omp_get_thread_num now as omp-directive
32	!
33	! 2718 2018-01-02 08:49:38Z maronga
34	! Corrected "Former revisions" section
35	!
36	! 2696 2017-12-14 17:12:51Z kanani
37	! Change in file header (GPL part)
38	!
39	! 2300 2017-06-29 13:31:14Z raasch
40	! settings depending on host variable removed or replaced by loop_optimization
41	!
42	! 2119 2017-01-17 16:51:50Z raasch
43	!
44	! 2118 2017-01-17 16:38:49Z raasch
45	! OpenACC directives and related code removed
46	!
47	! 2000 2016-08-20 18:09:15Z knoop
48	! Forced header and separation lines into 80 columns
49	!
50	! 1850 2016-04-08 13:29:27Z maronga
51	! Module renamed
52	!
53	!
54	! 1804 2016-04-05 16:30:18Z maronga
55	! Removed code for parameter file check (__check)
56	!
57	! 1682 2015-10-07 23:56:08Z knoop
58	! Code annotations made doxygen readable
59	!
60	! 1482 2014-10-18 12:34:45Z raasch
61	! use 2d-decomposition, if accelerator boards are used
62	!
63	! 1406 2014-05-16 13:47:01Z raasch
64	! bugfix for pgi 14.4: declare create moved after array declaration
65	!
66	! 1320 2014-03-20 08:40:49Z raasch
67	! ONLY-attribute added to USE-statements,
68	! kind-parameters added to all INTEGER and REAL declaration statements,
69	! kinds are defined in new module kinds,
70	! old module precision_kind is removed,
71	! revision history before 2012 removed,
72	! comment fields (!:) to be used for variable explanations added to
73	! all variable declaration statements
74	!
75	! 1318 2014-03-17 13:35:16Z raasch
76	! module interfaces removed
77	!
78	! 1306 2014-03-13 14:30:59Z raasch
79	! openmp sections removed from the overlap branch,
80	! second argument removed from parameter list
81	!
82	! 1216 2013-08-26 09:31:42Z raasch
83	! resorting of arrays moved to separate routines resort_for_...,
84	! one argument, used as temporary work array, removed from all transpose
85	! routines
86	! overlapping fft / transposition implemented
87	!
88	! 1212 2013-08-15 08:46:27Z raasch
89	! tridia routines moved to seperate module tridia_solver
90	!
91	! 1208 2013-08-13 06:41:49Z raasch
92	! acc-update clauses added for "ar" so that ffts other than cufft can also be
93	! used (although they are not ported and will give a poor performance)
94	!
95	! 1111 2013-03-08 23:54:10Z raasch
96	! further openACC porting of non-parallel (MPI) branch:
97	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
98	! no distinction between parallel/non-parallel in poisfft and tridia any more,
99	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
100	! (otherwise "invalid device function" is indicated during runtime),
101	! optimization of tridia routines: constant elements and coefficients of tri are
102	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
103	! to 2,
104	! poisfft_init is now called internally from poisfft, maketri is called from
105	! poisfft_init,
106	! ibc_p_b = 2 removed
107	!
108	! 1106 2013-03-04 05:31:38Z raasch
109	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
110	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
111	! fft_y_1d
112	!
113	! 1103 2013-02-20 02:15:53Z raasch
114	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
115	! sometimes cause segmentation faults with intel 12.1 compiler
116	!
117	! 1092 2013-02-02 11:24:22Z raasch
118	! unused variables removed
119	!
120	! 1036 2012-10-22 13:43:42Z raasch
121	! code put under GPL (PALM 3.9)
122	!
123	! 2012-09-21 07:03:55Z raasch
124	! FLOAT type conversion replaced by REAL
125	!
126	! 1003 2012-09-14 14:35:53Z raasch
127	! indices nxa, nya, etc. replaced by nx, ny, etc.
128	!
129	! 940 2012-07-09 14:31:00Z raasch
130	! special handling of tri-array as an argument in tridia_1dd routines switched
131	! off because it caused segmentation faults with intel 12.1 compiler
132	!
133	! 877 2012-04-03 11:21:44Z suehring
134	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
135	! pressure at the top of the model domain.
136	!
137	! 809 2012-01-30 13:32:58Z maronga
138	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
139	!
140	! 807 2012-01-25 11:53:51Z maronga
141	! New cpp directive "__check" implemented which is used by check_namelist_files
142	! (most of the code is unneeded by check_namelist_files).
143	!
144	! Revision 1.1 1997/07/24 11:24:14 raasch
145	! Initial revision
146	!
147	!
148	! Description:
149	! ------------
150	!> Solves the Poisson equation with a 2D spectral method
151	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
152	!>
153	!> Input:
154	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
155	!> starting from (1,nys,nxl)
156	!>
157	!> Output:
158	!> real ar contains the solution for perturbation pressure p
159	!------------------------------------------------------------------------------!
160	MODULE poisfft_mod
161
162
163	USE fft_xy, &
164	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
165
166	USE indices, &
167	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
168
169	USE transpose_indices, &
170	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
171	nzb_y, nzt_x, nzt_y
172
173	USE tridia_solver, &
174	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
175
176	IMPLICIT NONE
177
178	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
179
180	PRIVATE
181
182	PUBLIC poisfft, poisfft_init
183
184	INTERFACE poisfft
185	MODULE PROCEDURE poisfft
186	END INTERFACE poisfft
187
188	INTERFACE poisfft_init
189	MODULE PROCEDURE poisfft_init
190	END INTERFACE poisfft_init
191
192
193	CONTAINS
194
195	!------------------------------------------------------------------------------!
196	! Description:
197	! ------------
198	!> Setup coefficients for FFT and the tridiagonal solver
199	!------------------------------------------------------------------------------!
200	SUBROUTINE poisfft_init
201
202	IMPLICIT NONE
203
204
205	CALL fft_init
206
207	CALL tridia_init
208
209	poisfft_initialized = .TRUE.
210
211	END SUBROUTINE poisfft_init
212
213
214
215	!------------------------------------------------------------------------------!
216	! Description:
217	! ------------
218	!> Two-dimensional Fourier Transformation in x- and y-direction.
219	!------------------------------------------------------------------------------!
220	SUBROUTINE poisfft( ar )
221
222	USE control_parameters, &
223	ONLY: transpose_compute_overlap
224
225	USE cpulog, &
226	ONLY: cpu_log, cpu_log_nowait, log_point_s
227
228	USE kinds
229
230	USE pegrid
231
232	IMPLICIT NONE
233
234	INTEGER(iwp) :: ii !<
235	INTEGER(iwp) :: iind !<
236	INTEGER(iwp) :: inew !<
237	INTEGER(iwp) :: jj !<
238	INTEGER(iwp) :: jind !<
239	INTEGER(iwp) :: jnew !<
240	INTEGER(iwp) :: ki !<
241	INTEGER(iwp) :: kk !<
242	INTEGER(iwp) :: knew !<
243	INTEGER(iwp) :: n !<
244	INTEGER(iwp) :: nblk !<
245	INTEGER(iwp) :: nnx_y !<
246	INTEGER(iwp) :: nny_z !<
247	INTEGER(iwp) :: nnz_x !<
248	INTEGER(iwp) :: nxl_y_bound !<
249	INTEGER(iwp) :: nxr_y_bound !<
250
251	INTEGER(iwp), DIMENSION(4) :: isave !<
252
253	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
254	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
255	!$ACC DECLARE CREATE(ar_inv)
256
257	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
258	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
259	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
260	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
261	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
262
263
264	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
265
266	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
267
268	#ifndef _OPENACC
269	!
270	!-- Two-dimensional Fourier Transformation in x- and y-direction.
271	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
272
273	!
274	!-- 1d-domain-decomposition along x:
275	!-- FFT along y and transposition y --> x
276	CALL ffty_tr_yx( ar, ar )
277
278	!
279	!-- FFT along x, solving the tridiagonal system and backward FFT
280	CALL fftx_tri_fftx( ar )
281
282	!
283	!-- Transposition x --> y and backward FFT along y
284	CALL tr_xy_ffty( ar, ar )
285
286	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
287
288	!
289	!-- 1d-domain-decomposition along y:
290	!-- FFT along x and transposition x --> y
291	CALL fftx_tr_xy( ar, ar )
292
293	!
294	!-- FFT along y, solving the tridiagonal system and backward FFT
295	CALL ffty_tri_ffty( ar )
296
297	!
298	!-- Transposition y --> x and backward FFT along x
299	CALL tr_yx_fftx( ar, ar )
300
301	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
302	#endif
303
304	!
305	!-- 2d-domain-decomposition or no decomposition (1 PE run)
306	!-- Transposition z --> x
307	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
308	CALL resort_for_zx( ar, ar_inv )
309	CALL transpose_zx( ar_inv, ar )
310	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
311
312	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
313	CALL fft_x( ar, 'forward' )
314	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
315
316	!
317	!-- Transposition x --> y
318	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
319	CALL resort_for_xy( ar, ar_inv )
320	CALL transpose_xy( ar_inv, ar )
321	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
322
323	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
324	CALL fft_y( ar, 'forward', ar_tr = ar, &
325	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
326	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
327	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
328
329	!
330	!-- Transposition y --> z
331	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
332	CALL resort_for_yz( ar, ar_inv )
333	CALL transpose_yz( ar_inv, ar )
334	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
335
336	!
337	!-- Solve the tridiagonal equation system along z
338	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
339	CALL tridia_substi( ar )
340	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
341
342	!
343	!-- Inverse Fourier Transformation
344	!-- Transposition z --> y
345	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
346	CALL transpose_zy( ar, ar_inv )
347	CALL resort_for_zy( ar_inv, ar )
348	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
349
350	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
351	CALL fft_y( ar, 'backward', ar_tr = ar, &
352	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
353	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
354	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
355
356	!
357	!-- Transposition y --> x
358	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
359	CALL transpose_yx( ar, ar_inv )
360	CALL resort_for_yx( ar_inv, ar )
361	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
362
363	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
364	CALL fft_x( ar, 'backward' )
365	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
366
367	!
368	!-- Transposition x --> z
369	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
370	CALL transpose_xz( ar, ar_inv )
371	CALL resort_for_xz( ar_inv, ar )
372	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
373
374	#ifndef _OPENACC
375	ELSE
376
377	!
378	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
379	!-- overlapping transposition / fft
380	!-- cputime logging must not use barriers, which would prevent overlapping
381	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
382	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
383	!
384	!-- Transposition z --> x + subsequent fft along x
385	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
386	CALL resort_for_zx( ar, f_inv )
387	!
388	!-- Save original indices and gridpoint counter
389	isave(1) = nz
390	isave(2) = nzb_x
391	isave(3) = nzt_x
392	isave(4) = sendrecvcount_zx
393	!
394	!-- Set new indices for transformation
395	nblk = nz / pdims(1)
396	nz = pdims(1)
397	nnz_x = 1
398	nzb_x = 1 + myidx * nnz_x
399	nzt_x = ( myidx + 1 ) * nnz_x
400	sendrecvcount_zx = nnx * nny * nnz_x
401
402	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
403	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
404
405	DO kk = 1, nblk
406
407	IF ( kk == 1 ) THEN
408	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
409	ELSE
410	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
411	ENDIF
412
413	DO knew = 1, nz
414	ki = kk + nblk * ( knew - 1 )
415	f_in(:,:,knew) = f_inv(:,:,ki)
416	ENDDO
417
418	CALL transpose_zx( f_in, ar1(:,:,:))
419	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
420
421	IF ( kk == 1 ) THEN
422	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
423	ELSE
424	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
425	ENDIF
426
427	n = isave(2) + kk - 1
428	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
429	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
430
431	ENDDO
432	!
433	!-- Restore original indices/counters
434	nz = isave(1)
435	nzb_x = isave(2)
436	nzt_x = isave(3)
437	sendrecvcount_zx = isave(4)
438
439	DEALLOCATE( ar1, f_in, f_inv )
440
441	!
442	!-- Transposition x --> y + subsequent fft along y
443	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
444	CALL resort_for_xy( f_out_z, f_inv )
445	!
446	!-- Save original indices and gridpoint counter
447	isave(1) = nx
448	isave(2) = nxl_y
449	isave(3) = nxr_y
450	isave(4) = sendrecvcount_xy
451	!
452	!-- Set new indices for transformation
453	nblk = ( ( nx+1 ) / pdims(2) ) - 1
454	nx = pdims(2)
455	nnx_y = 1
456	nxl_y = myidy * nnx_y
457	nxr_y = ( myidy + 1 ) * nnx_y - 1
458	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
459
460	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
461	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
462
463	DO ii = 0, nblk
464
465	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
466
467	DO inew = 0, nx-1
468	iind = ii + ( nblk + 1 ) * inew
469	f_in(:,:,inew) = f_inv(:,:,iind)
470	ENDDO
471
472	CALL transpose_xy( f_in, ar1(:,:,:) )
473
474	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
475
476	IF ( ii == 1 ) THEN
477	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
478	ELSE
479	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
480	ENDIF
481
482	nxl_y_bound = isave(2)
483	nxr_y_bound = isave(3)
484	n = isave(2) + ii
485	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
486	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
487	nxl_y_l = n, nxr_y_l = n )
488
489	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
490
491	ENDDO
492	!
493	!-- Restore original indices/counters
494	nx = isave(1)
495	nxl_y = isave(2)
496	nxr_y = isave(3)
497	sendrecvcount_xy = isave(4)
498
499	DEALLOCATE( ar1, f_in, f_inv )
500
501	!
502	!-- Transposition y --> z + subsequent tridia + resort for z --> y
503	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
504	CALL resort_for_yz( f_out_y, f_inv )
505	!
506	!-- Save original indices and gridpoint counter
507	isave(1) = ny
508	isave(2) = nys_z
509	isave(3) = nyn_z
510	isave(4) = sendrecvcount_yz
511	!
512	!-- Set new indices for transformation
513	nblk = ( ( ny+1 ) / pdims(1) ) - 1
514	ny = pdims(1)
515	nny_z = 1
516	nys_z = myidx * nny_z
517	nyn_z = ( myidx + 1 ) * nny_z - 1
518	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
519
520	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
521	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
522
523	DO jj = 0, nblk
524	!
525	!-- Forward Fourier Transformation
526	!-- Transposition y --> z
527	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
528
529	DO jnew = 0, ny-1
530	jind = jj + ( nblk + 1 ) * jnew
531	f_in(:,:,jnew) = f_inv(:,:,jind)
532	ENDDO
533
534	CALL transpose_yz( f_in, ar1(:,:,:) )
535
536	IF ( jj == nblk ) THEN
537	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
538	ELSE
539	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
540	ENDIF
541
542	!
543	!-- Solve the tridiagonal equation system along z
544	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
545
546	n = isave(2) + jj
547	CALL tridia_substi_overlap( ar1(:,:,:), n )
548
549	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
550
551	!
552	!-- Inverse Fourier Transformation
553	!-- Transposition z --> y
554	!-- Only one thread should call MPI routines, therefore forward and
555	!-- backward tranpose are in the same section
556	IF ( jj == 0 ) THEN
557	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
558	ELSE
559	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
560	ENDIF
561
562	CALL transpose_zy( ar1(:,:,:), f_in )
563
564	DO jnew = 0, ny-1
565	jind = jj + ( nblk + 1 ) * jnew
566	f_inv(:,:,jind) = f_in(:,:,jnew)
567	ENDDO
568
569	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
570
571	ENDDO
572	!
573	!-- Restore original indices/counters
574	ny = isave(1)
575	nys_z = isave(2)
576	nyn_z = isave(3)
577	sendrecvcount_yz = isave(4)
578
579	CALL resort_for_zy( f_inv, f_out_y )
580
581	DEALLOCATE( ar1, f_in, f_inv )
582
583	!
584	!-- fft along y backward + subsequent transposition y --> x
585	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
586	!
587	!-- Save original indices and gridpoint counter
588	isave(1) = nx
589	isave(2) = nxl_y
590	isave(3) = nxr_y
591	isave(4) = sendrecvcount_xy
592	!
593	!-- Set new indices for transformation
594	nblk = (( nx+1 ) / pdims(2) ) - 1
595	nx = pdims(2)
596	nnx_y = 1
597	nxl_y = myidy * nnx_y
598	nxr_y = ( myidy + 1 ) * nnx_y - 1
599	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
600
601	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
602	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
603
604	DO ii = 0, nblk
605
606	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
607
608	n = isave(2) + ii
609	nxl_y_bound = isave(2)
610	nxr_y_bound = isave(3)
611
612	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
613	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
614	nxl_y_l = n, nxr_y_l = n )
615
616	IF ( ii == nblk ) THEN
617	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
618	ELSE
619	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
620	ENDIF
621
622	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
623
624	CALL transpose_yx( ar1(:,:,:), f_in )
625
626	DO inew = 0, nx-1
627	iind = ii + (nblk+1) * inew
628	f_inv(:,:,iind) = f_in(:,:,inew)
629	ENDDO
630
631	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
632
633	ENDDO
634	!
635	!-- Restore original indices/counters
636	nx = isave(1)
637	nxl_y = isave(2)
638	nxr_y = isave(3)
639	sendrecvcount_xy = isave(4)
640
641	CALL resort_for_yx( f_inv, f_out_z )
642
643	DEALLOCATE( ar1, f_in, f_inv )
644
645	!
646	!-- fft along x backward + subsequent final transposition x --> z
647	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
648	!
649	!-- Save original indices and gridpoint counter
650	isave(1) = nz
651	isave(2) = nzb_x
652	isave(3) = nzt_x
653	isave(4) = sendrecvcount_zx
654	!
655	!-- Set new indices for transformation
656	nblk = nz / pdims(1)
657	nz = pdims(1)
658	nnz_x = 1
659	nzb_x = 1 + myidx * nnz_x
660	nzt_x = ( myidx + 1 ) * nnz_x
661	sendrecvcount_zx = nnx * nny * nnz_x
662
663	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
664	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
665
666	DO kk = 1, nblk
667
668	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
669
670	n = isave(2) + kk - 1
671	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
672
673	IF ( kk == nblk ) THEN
674	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
675	ELSE
676	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
677	ENDIF
678
679	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
680
681	CALL transpose_xz( ar1(:,:,:), f_in )
682
683	DO knew = 1, nz
684	ki = kk + nblk * (knew-1)
685	f_inv(:,:,ki) = f_in(:,:,knew)
686	ENDDO
687
688	IF ( kk == nblk ) THEN
689	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
690	ELSE
691	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
692	ENDIF
693
694	ENDDO
695	!
696	!-- Restore original indices/counters
697	nz = isave(1)
698	nzb_x = isave(2)
699	nzt_x = isave(3)
700	sendrecvcount_zx = isave(4)
701
702	CALL resort_for_xz( f_inv, ar )
703
704	DEALLOCATE( ar1, f_in, f_inv )
705
706	ENDIF
707	#endif
708
709	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
710
711	END SUBROUTINE poisfft
712
713
714	!------------------------------------------------------------------------------!
715	! Description:
716	! ------------
717	!> Fourier-transformation along y with subsequent transposition y --> x for
718	!> a 1d-decomposition along x.
719	!>
720	!> @attention The performance of this routine is much faster on the NEC-SX6,
721	!> if the first index of work_ffty_vec is odd. Otherwise
722	!> memory bank conflicts may occur (especially if the index is a
723	!> multiple of 128). That's why work_ffty_vec is dimensioned as
724	!> 0:ny+1.
725	!> Of course, this will not work if users are using an odd number
726	!> of gridpoints along y.
727	!------------------------------------------------------------------------------!
728	SUBROUTINE ffty_tr_yx( f_in, f_out )
729
730	USE control_parameters, &
731	ONLY: loop_optimization
732
733	USE cpulog, &
734	ONLY: cpu_log, log_point_s
735
736	USE kinds
737
738	USE pegrid
739
740	IMPLICIT NONE
741
742	INTEGER(iwp) :: i !<
743	INTEGER(iwp) :: iend !<
744	INTEGER(iwp) :: iouter !<
745	INTEGER(iwp) :: ir !<
746	INTEGER(iwp) :: j !<
747	INTEGER(iwp) :: k !<
748
749	INTEGER(iwp), PARAMETER :: stridex = 4 !<
750
751	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
752	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
753	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
754
755	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
756	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
757
758	!
759	!-- Carry out the FFT along y, where all data are present due to the
760	!-- 1d-decomposition along x. Resort the data in a way that x becomes
761	!-- the first index.
762	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
763
764	IF ( loop_optimization == 'vector' ) THEN
765
766	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
767	!
768	!-- Code optimized for vector processors
769	!$OMP PARALLEL PRIVATE ( i, j, k )
770	!$OMP DO
771	DO i = nxl, nxr
772
773	DO j = 0, ny
774	DO k = 1, nz
775	work_ffty_vec(j,k,i) = f_in(k,j,i)
776	ENDDO
777	ENDDO
778
779	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
780
781	ENDDO
782
783	!$OMP DO
784	DO k = 1, nz
785	DO j = 0, ny
786	DO i = nxl, nxr
787	work(i,k,j) = work_ffty_vec(j,k,i)
788	ENDDO
789	ENDDO
790	ENDDO
791	!$OMP END PARALLEL
792
793	DEALLOCATE( work_ffty_vec )
794
795	ELSE
796	!
797	!-- Cache optimized code.
798	ALLOCATE( work_ffty(0:ny,stridex) )
799	!
800	!-- The i-(x-)direction is split into a strided outer loop and an inner
801	!-- loop for better cache performance
802	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
803	!$OMP DO
804	DO iouter = nxl, nxr, stridex
805
806	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
807
808	DO k = 1, nz
809
810	DO i = iouter, iend
811
812	ir = i-iouter+1 ! counter within a stride
813	DO j = 0, ny
814	work_ffty(j,ir) = f_in(k,j,i)
815	ENDDO
816	!
817	!-- FFT along y
818	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
819
820	ENDDO
821
822	!
823	!-- Resort
824	DO j = 0, ny
825	DO i = iouter, iend
826	work(i,k,j) = work_ffty(j,i-iouter+1)
827	ENDDO
828	ENDDO
829
830	ENDDO
831
832	ENDDO
833	!$OMP END PARALLEL
834
835	DEALLOCATE( work_ffty )
836
837	ENDIF
838
839	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
840
841	!
842	!-- Transpose array
843	#if defined( __parallel )
844	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
845	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
846	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
847	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
848	comm1dx, ierr )
849	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
850	#endif
851
852	END SUBROUTINE ffty_tr_yx
853
854
855	!------------------------------------------------------------------------------!
856	! Description:
857	! ------------
858	!> Transposition x --> y with a subsequent backward Fourier transformation for
859	!> a 1d-decomposition along x
860	!------------------------------------------------------------------------------!
861	SUBROUTINE tr_xy_ffty( f_in, f_out )
862
863	USE control_parameters, &
864	ONLY: loop_optimization
865
866	USE cpulog, &
867	ONLY: cpu_log, log_point_s
868
869	USE kinds
870
871	USE pegrid
872
873	IMPLICIT NONE
874
875	INTEGER(iwp) :: i !<
876	INTEGER(iwp) :: iend !<
877	INTEGER(iwp) :: iouter !<
878	INTEGER(iwp) :: ir !<
879	INTEGER(iwp) :: j !<
880	INTEGER(iwp) :: k !<
881
882	INTEGER(iwp), PARAMETER :: stridex = 4 !<
883
884	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
885	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
886	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
887
888	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
889	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
890
891	!
892	!-- Transpose array
893	#if defined( __parallel )
894	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
895	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
896	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
897	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
898	comm1dx, ierr )
899	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
900	#endif
901
902	!
903	!-- Resort the data in a way that y becomes the first index and carry out the
904	!-- backward fft along y.
905	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
906
907	IF ( loop_optimization == 'vector' ) THEN
908
909	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
910	!
911	!-- Code optimized for vector processors
912	!$OMP PARALLEL PRIVATE ( i, j, k )
913	!$OMP DO
914	DO k = 1, nz
915	DO j = 0, ny
916	DO i = nxl, nxr
917	work_ffty_vec(j,k,i) = work(i,k,j)
918	ENDDO
919	ENDDO
920	ENDDO
921
922	!$OMP DO
923	DO i = nxl, nxr
924
925	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
926
927	DO j = 0, ny
928	DO k = 1, nz
929	f_out(k,j,i) = work_ffty_vec(j,k,i)
930	ENDDO
931	ENDDO
932
933	ENDDO
934	!$OMP END PARALLEL
935
936	DEALLOCATE( work_ffty_vec )
937
938	ELSE
939	!
940	!-- Cache optimized code.
941	ALLOCATE( work_ffty(0:ny,stridex) )
942	!
943	!-- The i-(x-)direction is split into a strided outer loop and an inner
944	!-- loop for better cache performance
945	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
946	!$OMP DO
947	DO iouter = nxl, nxr, stridex
948
949	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
950
951	DO k = 1, nz
952	!
953	!-- Resort
954	DO j = 0, ny
955	DO i = iouter, iend
956	work_ffty(j,i-iouter+1) = work(i,k,j)
957	ENDDO
958	ENDDO
959
960	DO i = iouter, iend
961
962	!
963	!-- FFT along y
964	ir = i-iouter+1 ! counter within a stride
965	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
966
967	DO j = 0, ny
968	f_out(k,j,i) = work_ffty(j,ir)
969	ENDDO
970	ENDDO
971
972	ENDDO
973
974	ENDDO
975	!$OMP END PARALLEL
976
977	DEALLOCATE( work_ffty )
978
979	ENDIF
980
981	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
982
983	END SUBROUTINE tr_xy_ffty
984
985
986	!------------------------------------------------------------------------------!
987	! Description:
988	! ------------
989	!> FFT along x, solution of the tridiagonal system and backward FFT for
990	!> a 1d-decomposition along x
991	!>
992	!> @warning this subroutine may still not work for hybrid parallelization
993	!> with OpenMP (for possible necessary changes see the original
994	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
995	!------------------------------------------------------------------------------!
996	SUBROUTINE fftx_tri_fftx( ar )
997
998	USE control_parameters, &
999	ONLY: loop_optimization
1000
1001	USE cpulog, &
1002	ONLY: cpu_log, log_point_s
1003
1004	USE grid_variables, &
1005	ONLY: ddx2, ddy2
1006
1007	USE kinds
1008
1009	USE pegrid
1010
1011	IMPLICIT NONE
1012
1013	INTEGER(iwp) :: i !<
1014	INTEGER(iwp) :: j !<
1015	INTEGER(iwp) :: k !<
1016	INTEGER(iwp) :: m !<
1017	INTEGER(iwp) :: n !<
1018	!$ INTEGER(iwp) :: omp_get_thread_num !<
1019	INTEGER(iwp) :: tn !<
1020
1021	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
1022	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
1023	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
1024	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1025
1026
1027	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1028
1029	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1030
1031	tn = 0 ! Default thread number in case of one thread
1032	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1033	DO j = nys_x, nyn_x
1034
1035	!$ tn = omp_get_thread_num()
1036
1037	IF ( loop_optimization == 'vector' ) THEN
1038	!
1039	!-- Code optimized for vector processors
1040	DO k = 1, nz
1041
1042	m = 0
1043	DO n = 1, pdims(1)
1044	DO i = 1, nnx
1045	work_trix(m,k) = ar(i,k,j,n)
1046	m = m + 1
1047	ENDDO
1048	ENDDO
1049
1050	ENDDO
1051
1052	CALL fft_x_m( work_trix, 'forward' )
1053
1054	ELSE
1055	!
1056	!-- Cache optimized code
1057	DO k = 1, nz
1058
1059	m = 0
1060	DO n = 1, pdims(1)
1061	DO i = 1, nnx
1062	work_fftx(m) = ar(i,k,j,n)
1063	m = m + 1
1064	ENDDO
1065	ENDDO
1066
1067	CALL fft_x_1d( work_fftx, 'forward' )
1068
1069	DO i = 0, nx
1070	work_trix(i,k) = work_fftx(i)
1071	ENDDO
1072
1073	ENDDO
1074
1075	ENDIF
1076
1077	!
1078	!-- Solve the linear equation system
1079	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1080
1081	IF ( loop_optimization == 'vector' ) THEN
1082	!
1083	!-- Code optimized for vector processors
1084	CALL fft_x_m( work_trix, 'backward' )
1085
1086	DO k = 1, nz
1087
1088	m = 0
1089	DO n = 1, pdims(1)
1090	DO i = 1, nnx
1091	ar(i,k,j,n) = work_trix(m,k)
1092	m = m + 1
1093	ENDDO
1094	ENDDO
1095
1096	ENDDO
1097
1098	ELSE
1099	!
1100	!-- Cache optimized code
1101	DO k = 1, nz
1102
1103	DO i = 0, nx
1104	work_fftx(i) = work_trix(i,k)
1105	ENDDO
1106
1107	CALL fft_x_1d( work_fftx, 'backward' )
1108
1109	m = 0
1110	DO n = 1, pdims(1)
1111	DO i = 1, nnx
1112	ar(i,k,j,n) = work_fftx(m)
1113	m = m + 1
1114	ENDDO
1115	ENDDO
1116
1117	ENDDO
1118
1119	ENDIF
1120
1121	ENDDO
1122
1123	DEALLOCATE( tri )
1124
1125	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1126
1127	END SUBROUTINE fftx_tri_fftx
1128
1129
1130	!------------------------------------------------------------------------------!
1131	! Description:
1132	! ------------
1133	!> Fourier-transformation along x with subsequent transposition x --> y for
1134	!> a 1d-decomposition along y.
1135	!>
1136	!> @attention NEC-branch of this routine may significantly profit from
1137	!> further optimizations. So far, performance is much worse than
1138	!> for routine ffty_tr_yx (more than three times slower).
1139	!------------------------------------------------------------------------------!
1140	SUBROUTINE fftx_tr_xy( f_in, f_out )
1141
1142
1143	USE control_parameters, &
1144	ONLY: loop_optimization
1145
1146	USE cpulog, &
1147	ONLY: cpu_log, log_point_s
1148
1149	USE kinds
1150
1151	USE pegrid
1152
1153	IMPLICIT NONE
1154
1155	INTEGER(iwp) :: i !<
1156	INTEGER(iwp) :: j !<
1157	INTEGER(iwp) :: k !<
1158
1159	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1160	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1161	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1162	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1163
1164	!
1165	!-- Carry out the FFT along x, where all data are present due to the
1166	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1167	!-- the first index.
1168	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1169
1170	IF ( loop_optimization == 'vector' ) THEN
1171	!
1172	!-- Code for vector processors
1173	!$OMP PARALLEL PRIVATE ( i, j, k )
1174	!$OMP DO
1175	DO i = 0, nx
1176
1177	DO j = nys, nyn
1178	DO k = 1, nz
1179	work_fftx(i,k,j) = f_in(k,j,i)
1180	ENDDO
1181	ENDDO
1182
1183	ENDDO
1184
1185	!$OMP DO
1186	DO j = nys, nyn
1187
1188	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1189
1190	DO k = 1, nz
1191	DO i = 0, nx
1192	work(j,k,i) = work_fftx(i,k,j)
1193	ENDDO
1194	ENDDO
1195
1196	ENDDO
1197	!$OMP END PARALLEL
1198
1199	ELSE
1200
1201	!
1202	!-- Cache optimized code (there might be still a potential for better
1203	!-- optimization).
1204	!$OMP PARALLEL PRIVATE (i,j,k)
1205	!$OMP DO
1206	DO i = 0, nx
1207
1208	DO j = nys, nyn
1209	DO k = 1, nz
1210	work_fftx(i,k,j) = f_in(k,j,i)
1211	ENDDO
1212	ENDDO
1213
1214	ENDDO
1215
1216	!$OMP DO
1217	DO j = nys, nyn
1218	DO k = 1, nz
1219
1220	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1221
1222	DO i = 0, nx
1223	work(j,k,i) = work_fftx(i,k,j)
1224	ENDDO
1225	ENDDO
1226
1227	ENDDO
1228	!$OMP END PARALLEL
1229
1230	ENDIF
1231	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1232
1233	!
1234	!-- Transpose array
1235	#if defined( __parallel )
1236	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1237	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1238	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1239	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1240	comm1dy, ierr )
1241	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1242	#endif
1243
1244	END SUBROUTINE fftx_tr_xy
1245
1246
1247	!------------------------------------------------------------------------------!
1248	! Description:
1249	! ------------
1250	!> Transposition y --> x with a subsequent backward Fourier transformation for
1251	!> a 1d-decomposition along x.
1252	!------------------------------------------------------------------------------!
1253	SUBROUTINE tr_yx_fftx( f_in, f_out )
1254
1255
1256	USE control_parameters, &
1257	ONLY: loop_optimization
1258
1259	USE cpulog, &
1260	ONLY: cpu_log, log_point_s
1261
1262	USE kinds
1263
1264	USE pegrid
1265
1266	IMPLICIT NONE
1267
1268	INTEGER(iwp) :: i !<
1269	INTEGER(iwp) :: j !<
1270	INTEGER(iwp) :: k !<
1271
1272	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1273	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1274	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1275	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1276
1277	!
1278	!-- Transpose array
1279	#if defined( __parallel )
1280	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1281	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1282	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1283	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1284	comm1dy, ierr )
1285	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1286	#endif
1287
1288	!
1289	!-- Carry out the FFT along x, where all data are present due to the
1290	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1291	!-- the first index.
1292	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1293
1294	IF ( loop_optimization == 'vector' ) THEN
1295	!
1296	!-- Code optimized for vector processors
1297	!$OMP PARALLEL PRIVATE ( i, j, k )
1298	!$OMP DO
1299	DO j = nys, nyn
1300
1301	DO k = 1, nz
1302	DO i = 0, nx
1303	work_fftx(i,k,j) = work(j,k,i)
1304	ENDDO
1305	ENDDO
1306
1307	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1308
1309	ENDDO
1310
1311	!$OMP DO
1312	DO i = 0, nx
1313	DO j = nys, nyn
1314	DO k = 1, nz
1315	f_out(k,j,i) = work_fftx(i,k,j)
1316	ENDDO
1317	ENDDO
1318	ENDDO
1319	!$OMP END PARALLEL
1320
1321	ELSE
1322
1323	!
1324	!-- Cache optimized code (there might be still a potential for better
1325	!-- optimization).
1326	!$OMP PARALLEL PRIVATE (i,j,k)
1327	!$OMP DO
1328	DO j = nys, nyn
1329	DO k = 1, nz
1330
1331	DO i = 0, nx
1332	work_fftx(i,k,j) = work(j,k,i)
1333	ENDDO
1334
1335	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1336
1337	ENDDO
1338	ENDDO
1339
1340	!$OMP DO
1341	DO i = 0, nx
1342	DO j = nys, nyn
1343	DO k = 1, nz
1344	f_out(k,j,i) = work_fftx(i,k,j)
1345	ENDDO
1346	ENDDO
1347	ENDDO
1348	!$OMP END PARALLEL
1349
1350	ENDIF
1351	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1352
1353	END SUBROUTINE tr_yx_fftx
1354
1355
1356	!------------------------------------------------------------------------------!
1357	! Description:
1358	! ------------
1359	!> FFT along y, solution of the tridiagonal system and backward FFT for
1360	!> a 1d-decomposition along y.
1361	!>
1362	!> @warning this subroutine may still not work for hybrid parallelization
1363	!> with OpenMP (for possible necessary changes see the original
1364	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1365	!------------------------------------------------------------------------------!
1366	SUBROUTINE ffty_tri_ffty( ar )
1367
1368
1369	USE control_parameters, &
1370	ONLY: loop_optimization
1371
1372	USE cpulog, &
1373	ONLY: cpu_log, log_point_s
1374
1375	USE grid_variables, &
1376	ONLY: ddx2, ddy2
1377
1378	USE kinds
1379
1380	USE pegrid
1381
1382	IMPLICIT NONE
1383
1384	INTEGER(iwp) :: i !<
1385	INTEGER(iwp) :: j !<
1386	INTEGER(iwp) :: k !<
1387	INTEGER(iwp) :: m !<
1388	INTEGER(iwp) :: n !<
1389	!$ INTEGER(iwp) :: omp_get_thread_num !<
1390	INTEGER(iwp) :: tn !<
1391
1392	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1393	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1394	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1395	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1396
1397
1398	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1399
1400	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1401
1402	tn = 0 ! Default thread number in case of one thread
1403	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1404	DO i = nxl_y, nxr_y
1405
1406	!$ tn = omp_get_thread_num()
1407
1408	IF ( loop_optimization == 'vector' ) THEN
1409	!
1410	!-- Code optimized for vector processors
1411	DO k = 1, nz
1412
1413	m = 0
1414	DO n = 1, pdims(2)
1415	DO j = 1, nny
1416	work_triy(m,k) = ar(j,k,i,n)
1417	m = m + 1
1418	ENDDO
1419	ENDDO
1420
1421	ENDDO
1422
1423	CALL fft_y_m( work_triy, ny, 'forward' )
1424
1425	ELSE
1426	!
1427	!-- Cache optimized code
1428	DO k = 1, nz
1429
1430	m = 0
1431	DO n = 1, pdims(2)
1432	DO j = 1, nny
1433	work_ffty(m) = ar(j,k,i,n)
1434	m = m + 1
1435	ENDDO
1436	ENDDO
1437
1438	CALL fft_y_1d( work_ffty, 'forward' )
1439
1440	DO j = 0, ny
1441	work_triy(j,k) = work_ffty(j)
1442	ENDDO
1443
1444	ENDDO
1445
1446	ENDIF
1447
1448	!
1449	!-- Solve the linear equation system
1450	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1451
1452	IF ( loop_optimization == 'vector' ) THEN
1453	!
1454	!-- Code optimized for vector processors
1455	CALL fft_y_m( work_triy, ny, 'backward' )
1456
1457	DO k = 1, nz
1458
1459	m = 0
1460	DO n = 1, pdims(2)
1461	DO j = 1, nny
1462	ar(j,k,i,n) = work_triy(m,k)
1463	m = m + 1
1464	ENDDO
1465	ENDDO
1466
1467	ENDDO
1468
1469	ELSE
1470	!
1471	!-- Cache optimized code
1472	DO k = 1, nz
1473
1474	DO j = 0, ny
1475	work_ffty(j) = work_triy(j,k)
1476	ENDDO
1477
1478	CALL fft_y_1d( work_ffty, 'backward' )
1479
1480	m = 0
1481	DO n = 1, pdims(2)
1482	DO j = 1, nny
1483	ar(j,k,i,n) = work_ffty(m)
1484	m = m + 1
1485	ENDDO
1486	ENDDO
1487
1488	ENDDO
1489
1490	ENDIF
1491
1492	ENDDO
1493
1494	DEALLOCATE( tri )
1495
1496	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1497
1498	END SUBROUTINE ffty_tri_ffty
1499
1500	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |