Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1777

Last change on this file since 1777 was 1683, checked in by knoop, 9 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.6 KB

Line
1	!> @file poisfft.f90
2	!--------------------------------------------------------------------------------!
3	! This file is part of PALM.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the terms
6	! of the GNU General Public License as published by the Free Software Foundation,
7	! either version 3 of the License, or (at your option) any later version.
8	!
9	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
10	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
11	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
12	!
13	! You should have received a copy of the GNU General Public License along with
14	! PALM. If not, see <http://www.gnu.org/licenses/>.
15	!
16	! Copyright 1997-2014 Leibniz Universitaet Hannover
17	!--------------------------------------------------------------------------------!
18	!
19	! Current revisions:
20	! -----------------
21	!
22	!
23	! Former revisions:
24	! -----------------
25	! $Id: poisfft.f90 1683 2015-10-07 23:57:51Z hoffmann $
26	!
27	! 1682 2015-10-07 23:56:08Z knoop
28	! Code annotations made doxygen readable
29	!
30	! 1482 2014-10-18 12:34:45Z raasch
31	! use 2d-decomposition, if accelerator boards are used
32	!
33	! 1406 2014-05-16 13:47:01Z raasch
34	! bugfix for pgi 14.4: declare create moved after array declaration
35	!
36	! 1320 2014-03-20 08:40:49Z raasch
37	! ONLY-attribute added to USE-statements,
38	! kind-parameters added to all INTEGER and REAL declaration statements,
39	! kinds are defined in new module kinds,
40	! old module precision_kind is removed,
41	! revision history before 2012 removed,
42	! comment fields (!:) to be used for variable explanations added to
43	! all variable declaration statements
44	!
45	! 1318 2014-03-17 13:35:16Z raasch
46	! module interfaces removed
47	!
48	! 1306 2014-03-13 14:30:59Z raasch
49	! openmp sections removed from the overlap branch,
50	! second argument removed from parameter list
51	!
52	! 1216 2013-08-26 09:31:42Z raasch
53	! resorting of arrays moved to separate routines resort_for_...,
54	! one argument, used as temporary work array, removed from all transpose
55	! routines
56	! overlapping fft / transposition implemented
57	!
58	! 1212 2013-08-15 08:46:27Z raasch
59	! tridia routines moved to seperate module tridia_solver
60	!
61	! 1208 2013-08-13 06:41:49Z raasch
62	! acc-update clauses added for "ar" so that ffts other than cufft can also be
63	! used (although they are not ported and will give a poor performance)
64	!
65	! 1111 2013-03-08 23:54:10Z raasch
66	! further openACC porting of non-parallel (MPI) branch:
67	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
68	! no distinction between parallel/non-parallel in poisfft and tridia any more,
69	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
70	! (otherwise "invalid device function" is indicated during runtime),
71	! optimization of tridia routines: constant elements and coefficients of tri are
72	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
73	! to 2,
74	! poisfft_init is now called internally from poisfft, maketri is called from
75	! poisfft_init,
76	! ibc_p_b = 2 removed
77	!
78	! 1106 2013-03-04 05:31:38Z raasch
79	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
80	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
81	! fft_y_1d
82	!
83	! 1103 2013-02-20 02:15:53Z raasch
84	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
85	! sometimes cause segmentation faults with intel 12.1 compiler
86	!
87	! 1092 2013-02-02 11:24:22Z raasch
88	! unused variables removed
89	!
90	! 1036 2012-10-22 13:43:42Z raasch
91	! code put under GPL (PALM 3.9)
92	!
93	! 2012-09-21 07:03:55Z raasch
94	! FLOAT type conversion replaced by REAL
95	!
96	! 1003 2012-09-14 14:35:53Z raasch
97	! indices nxa, nya, etc. replaced by nx, ny, etc.
98	!
99	! 940 2012-07-09 14:31:00Z raasch
100	! special handling of tri-array as an argument in tridia_1dd routines switched
101	! off because it caused segmentation faults with intel 12.1 compiler
102	!
103	! 877 2012-04-03 11:21:44Z suehring
104	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
105	! pressure at the top of the model domain.
106	!
107	! 809 2012-01-30 13:32:58Z maronga
108	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
109	!
110	! 807 2012-01-25 11:53:51Z maronga
111	! New cpp directive "__check" implemented which is used by check_namelist_files
112	! (most of the code is unneeded by check_namelist_files).
113	!
114	! Revision 1.1 1997/07/24 11:24:14 raasch
115	! Initial revision
116	!
117	!
118	! Description:
119	! ------------
120	!> Solves the Poisson equation with a 2D spectral method
121	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
122	!>
123	!> Input:
124	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
125	!> starting from (1,nys,nxl)
126	!>
127	!> Output:
128	!> real ar contains the solution for perturbation pressure p
129	!------------------------------------------------------------------------------!
130	MODULE poisfft_mod
131
132
133	USE fft_xy, &
134	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
135
136	USE indices, &
137	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
138
139	USE transpose_indices, &
140	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
141	nzb_y, nzt_x, nzt_y
142
143	USE tridia_solver, &
144	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
145
146	IMPLICIT NONE
147
148	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
149
150	PRIVATE
151
152	#if ! defined ( __check )
153	PUBLIC poisfft, poisfft_init
154
155	INTERFACE poisfft
156	MODULE PROCEDURE poisfft
157	END INTERFACE poisfft
158
159	INTERFACE poisfft_init
160	MODULE PROCEDURE poisfft_init
161	END INTERFACE poisfft_init
162	#else
163	PUBLIC poisfft_init
164
165	INTERFACE poisfft_init
166	MODULE PROCEDURE poisfft_init
167	END INTERFACE poisfft_init
168	#endif
169
170	CONTAINS
171
172	!------------------------------------------------------------------------------!
173	! Description:
174	! ------------
175	!> @todo Missing subroutine description.
176	!------------------------------------------------------------------------------!
177	SUBROUTINE poisfft_init
178
179	USE arrays_3d, &
180	ONLY: ddzu_pres, ddzw
181
182	USE kinds
183
184	IMPLICIT NONE
185
186	INTEGER(iwp) :: k !<
187
188
189	CALL fft_init
190
191	CALL tridia_init
192
193	poisfft_initialized = .TRUE.
194
195	END SUBROUTINE poisfft_init
196
197
198	#if ! defined ( __check )
199	!------------------------------------------------------------------------------!
200	! Description:
201	! ------------
202	!> Two-dimensional Fourier Transformation in x- and y-direction.
203	!------------------------------------------------------------------------------!
204	SUBROUTINE poisfft( ar )
205
206	USE control_parameters, &
207	ONLY: fft_method, transpose_compute_overlap
208
209	USE cpulog, &
210	ONLY: cpu_log, cpu_log_nowait, log_point_s
211
212	USE kinds
213
214	USE pegrid
215
216	IMPLICIT NONE
217
218	INTEGER(iwp) :: ii !<
219	INTEGER(iwp) :: iind !<
220	INTEGER(iwp) :: inew !<
221	INTEGER(iwp) :: jj !<
222	INTEGER(iwp) :: jind !<
223	INTEGER(iwp) :: jnew !<
224	INTEGER(iwp) :: ki !<
225	INTEGER(iwp) :: kk !<
226	INTEGER(iwp) :: knew !<
227	INTEGER(iwp) :: n !<
228	INTEGER(iwp) :: nblk !<
229	INTEGER(iwp) :: nnx_y !<
230	INTEGER(iwp) :: nny_z !<
231	INTEGER(iwp) :: nnz_t !<
232	INTEGER(iwp) :: nnz_x !<
233	INTEGER(iwp) :: nxl_y_bound !<
234	INTEGER(iwp) :: nxr_y_bound !<
235
236	INTEGER(iwp), DIMENSION(4) :: isave !<
237
238	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
239	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
240	!$acc declare create( ar_inv )
241
242	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
243	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
244	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
245	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
246	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
247
248
249	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
250
251	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
252
253	!
254	!-- Two-dimensional Fourier Transformation in x- and y-direction.
255	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
256	THEN
257
258	!
259	!-- 1d-domain-decomposition along x:
260	!-- FFT along y and transposition y --> x
261	CALL ffty_tr_yx( ar, ar )
262
263	!
264	!-- FFT along x, solving the tridiagonal system and backward FFT
265	CALL fftx_tri_fftx( ar )
266
267	!
268	!-- Transposition x --> y and backward FFT along y
269	CALL tr_xy_ffty( ar, ar )
270
271	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
272	THEN
273
274	!
275	!-- 1d-domain-decomposition along y:
276	!-- FFT along x and transposition x --> y
277	CALL fftx_tr_xy( ar, ar )
278
279	!
280	!-- FFT along y, solving the tridiagonal system and backward FFT
281	CALL ffty_tri_ffty( ar )
282
283	!
284	!-- Transposition y --> x and backward FFT along x
285	CALL tr_yx_fftx( ar, ar )
286
287	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
288
289	!
290	!-- 2d-domain-decomposition or no decomposition (1 PE run)
291	!-- Transposition z --> x
292	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
293	CALL resort_for_zx( ar, ar_inv )
294	CALL transpose_zx( ar_inv, ar )
295	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
296
297	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
298	IF ( fft_method /= 'system-specific' ) THEN
299	!$acc update host( ar )
300	ENDIF
301	CALL fft_x( ar, 'forward' )
302	IF ( fft_method /= 'system-specific' ) THEN
303	!$acc update device( ar )
304	ENDIF
305	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
306
307	!
308	!-- Transposition x --> y
309	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
310	CALL resort_for_xy( ar, ar_inv )
311	CALL transpose_xy( ar_inv, ar )
312	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
313
314	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
315	IF ( fft_method /= 'system-specific' ) THEN
316	!$acc update host( ar )
317	ENDIF
318	CALL fft_y( ar, 'forward', ar_tr = ar, &
319	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
320	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
321	IF ( fft_method /= 'system-specific' ) THEN
322	!$acc update device( ar )
323	ENDIF
324	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
325
326	!
327	!-- Transposition y --> z
328	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
329	CALL resort_for_yz( ar, ar_inv )
330	CALL transpose_yz( ar_inv, ar )
331	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
332
333	!
334	!-- Solve the tridiagonal equation system along z
335	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
336	CALL tridia_substi( ar )
337	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
338
339	!
340	!-- Inverse Fourier Transformation
341	!-- Transposition z --> y
342	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
343	CALL transpose_zy( ar, ar_inv )
344	CALL resort_for_zy( ar_inv, ar )
345	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
346
347	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
348	IF ( fft_method /= 'system-specific' ) THEN
349	!$acc update host( ar )
350	ENDIF
351	CALL fft_y( ar, 'backward', ar_tr = ar, &
352	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
353	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
354	IF ( fft_method /= 'system-specific' ) THEN
355	!$acc update device( ar )
356	ENDIF
357	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
358
359	!
360	!-- Transposition y --> x
361	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
362	CALL transpose_yx( ar, ar_inv )
363	CALL resort_for_yx( ar_inv, ar )
364	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
365
366	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
367	IF ( fft_method /= 'system-specific' ) THEN
368	!$acc update host( ar )
369	ENDIF
370	CALL fft_x( ar, 'backward' )
371	IF ( fft_method /= 'system-specific' ) THEN
372	!$acc update device( ar )
373	ENDIF
374	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
375
376	!
377	!-- Transposition x --> z
378	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
379	CALL transpose_xz( ar, ar_inv )
380	CALL resort_for_xz( ar_inv, ar )
381	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
382
383	ELSE
384
385	!
386	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
387	!-- overlapping transposition / fft
388	!-- cputime logging must not use barriers, which would prevent overlapping
389	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
390	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
391	!
392	!-- Transposition z --> x + subsequent fft along x
393	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
394	CALL resort_for_zx( ar, f_inv )
395	!
396	!-- Save original indices and gridpoint counter
397	isave(1) = nz
398	isave(2) = nzb_x
399	isave(3) = nzt_x
400	isave(4) = sendrecvcount_zx
401	!
402	!-- Set new indices for transformation
403	nblk = nz / pdims(1)
404	nz = pdims(1)
405	nnz_x = 1
406	nzb_x = 1 + myidx * nnz_x
407	nzt_x = ( myidx + 1 ) * nnz_x
408	sendrecvcount_zx = nnx * nny * nnz_x
409
410	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
411	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
412
413	DO kk = 1, nblk
414
415	IF ( kk == 1 ) THEN
416	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
417	ELSE
418	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
419	ENDIF
420
421	DO knew = 1, nz
422	ki = kk + nblk * ( knew - 1 )
423	f_in(:,:,knew) = f_inv(:,:,ki)
424	ENDDO
425
426	CALL transpose_zx( f_in, ar1(:,:,:))
427	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
428
429	IF ( kk == 1 ) THEN
430	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
431	ELSE
432	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
433	ENDIF
434
435	n = isave(2) + kk - 1
436	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
437	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
438
439	ENDDO
440	!
441	!-- Restore original indices/counters
442	nz = isave(1)
443	nzb_x = isave(2)
444	nzt_x = isave(3)
445	sendrecvcount_zx = isave(4)
446
447	DEALLOCATE( ar1, f_in, f_inv )
448
449	!
450	!-- Transposition x --> y + subsequent fft along y
451	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
452	CALL resort_for_xy( f_out_z, f_inv )
453	!
454	!-- Save original indices and gridpoint counter
455	isave(1) = nx
456	isave(2) = nxl_y
457	isave(3) = nxr_y
458	isave(4) = sendrecvcount_xy
459	!
460	!-- Set new indices for transformation
461	nblk = ( ( nx+1 ) / pdims(2) ) - 1
462	nx = pdims(2)
463	nnx_y = 1
464	nxl_y = myidy * nnx_y
465	nxr_y = ( myidy + 1 ) * nnx_y - 1
466	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
467
468	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
469	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
470
471	DO ii = 0, nblk
472
473	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
474
475	DO inew = 0, nx-1
476	iind = ii + ( nblk + 1 ) * inew
477	f_in(:,:,inew) = f_inv(:,:,iind)
478	ENDDO
479
480	CALL transpose_xy( f_in, ar1(:,:,:) )
481
482	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
483
484	IF ( ii == 1 ) THEN
485	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
486	ELSE
487	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
488	ENDIF
489
490	nxl_y_bound = isave(2)
491	nxr_y_bound = isave(3)
492	n = isave(2) + ii
493	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
494	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
495	nxl_y_l = n, nxr_y_l = n )
496
497	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
498
499	ENDDO
500	!
501	!-- Restore original indices/counters
502	nx = isave(1)
503	nxl_y = isave(2)
504	nxr_y = isave(3)
505	sendrecvcount_xy = isave(4)
506
507	DEALLOCATE( ar1, f_in, f_inv )
508
509	!
510	!-- Transposition y --> z + subsequent tridia + resort for z --> y
511	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
512	CALL resort_for_yz( f_out_y, f_inv )
513	!
514	!-- Save original indices and gridpoint counter
515	isave(1) = ny
516	isave(2) = nys_z
517	isave(3) = nyn_z
518	isave(4) = sendrecvcount_yz
519	!
520	!-- Set new indices for transformation
521	nblk = ( ( ny+1 ) / pdims(1) ) - 1
522	ny = pdims(1)
523	nny_z = 1
524	nys_z = myidx * nny_z
525	nyn_z = ( myidx + 1 ) * nny_z - 1
526	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
527
528	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
529	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
530
531	DO jj = 0, nblk
532	!
533	!-- Forward Fourier Transformation
534	!-- Transposition y --> z
535	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
536
537	DO jnew = 0, ny-1
538	jind = jj + ( nblk + 1 ) * jnew
539	f_in(:,:,jnew) = f_inv(:,:,jind)
540	ENDDO
541
542	CALL transpose_yz( f_in, ar1(:,:,:) )
543
544	IF ( jj == nblk ) THEN
545	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
546	ELSE
547	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
548	ENDIF
549
550	!
551	!-- Solve the tridiagonal equation system along z
552	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
553
554	n = isave(2) + jj
555	CALL tridia_substi_overlap( ar1(:,:,:), n )
556
557	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
558
559	!
560	!-- Inverse Fourier Transformation
561	!-- Transposition z --> y
562	!-- Only one thread should call MPI routines, therefore forward and
563	!-- backward tranpose are in the same section
564	IF ( jj == 0 ) THEN
565	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
566	ELSE
567	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
568	ENDIF
569
570	CALL transpose_zy( ar1(:,:,:), f_in )
571
572	DO jnew = 0, ny-1
573	jind = jj + ( nblk + 1 ) * jnew
574	f_inv(:,:,jind) = f_in(:,:,jnew)
575	ENDDO
576
577	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
578
579	ENDDO
580	!
581	!-- Restore original indices/counters
582	ny = isave(1)
583	nys_z = isave(2)
584	nyn_z = isave(3)
585	sendrecvcount_yz = isave(4)
586
587	CALL resort_for_zy( f_inv, f_out_y )
588
589	DEALLOCATE( ar1, f_in, f_inv )
590
591	!
592	!-- fft along y backward + subsequent transposition y --> x
593	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
594	!
595	!-- Save original indices and gridpoint counter
596	isave(1) = nx
597	isave(2) = nxl_y
598	isave(3) = nxr_y
599	isave(4) = sendrecvcount_xy
600	!
601	!-- Set new indices for transformation
602	nblk = (( nx+1 ) / pdims(2) ) - 1
603	nx = pdims(2)
604	nnx_y = 1
605	nxl_y = myidy * nnx_y
606	nxr_y = ( myidy + 1 ) * nnx_y - 1
607	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
608
609	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
610	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
611
612	DO ii = 0, nblk
613
614	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
615
616	n = isave(2) + ii
617	nxl_y_bound = isave(2)
618	nxr_y_bound = isave(3)
619
620	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
621	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
622	nxl_y_l = n, nxr_y_l = n )
623
624	IF ( ii == nblk ) THEN
625	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
626	ELSE
627	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
628	ENDIF
629
630	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
631
632	CALL transpose_yx( ar1(:,:,:), f_in )
633
634	DO inew = 0, nx-1
635	iind = ii + (nblk+1) * inew
636	f_inv(:,:,iind) = f_in(:,:,inew)
637	ENDDO
638
639	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
640
641	ENDDO
642	!
643	!-- Restore original indices/counters
644	nx = isave(1)
645	nxl_y = isave(2)
646	nxr_y = isave(3)
647	sendrecvcount_xy = isave(4)
648
649	CALL resort_for_yx( f_inv, f_out_z )
650
651	DEALLOCATE( ar1, f_in, f_inv )
652
653	!
654	!-- fft along x backward + subsequent final transposition x --> z
655	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
656	!
657	!-- Save original indices and gridpoint counter
658	isave(1) = nz
659	isave(2) = nzb_x
660	isave(3) = nzt_x
661	isave(4) = sendrecvcount_zx
662	!
663	!-- Set new indices for transformation
664	nblk = nz / pdims(1)
665	nz = pdims(1)
666	nnz_x = 1
667	nzb_x = 1 + myidx * nnz_x
668	nzt_x = ( myidx + 1 ) * nnz_x
669	sendrecvcount_zx = nnx * nny * nnz_x
670
671	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
672	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
673
674	DO kk = 1, nblk
675
676	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
677
678	n = isave(2) + kk - 1
679	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
680
681	IF ( kk == nblk ) THEN
682	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
683	ELSE
684	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
685	ENDIF
686
687	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
688
689	CALL transpose_xz( ar1(:,:,:), f_in )
690
691	DO knew = 1, nz
692	ki = kk + nblk * (knew-1)
693	f_inv(:,:,ki) = f_in(:,:,knew)
694	ENDDO
695
696	IF ( kk == nblk ) THEN
697	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
698	ELSE
699	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
700	ENDIF
701
702	ENDDO
703	!
704	!-- Restore original indices/counters
705	nz = isave(1)
706	nzb_x = isave(2)
707	nzt_x = isave(3)
708	sendrecvcount_zx = isave(4)
709
710	CALL resort_for_xz( f_inv, ar )
711
712	DEALLOCATE( ar1, f_in, f_inv )
713
714	ENDIF
715
716	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
717
718	END SUBROUTINE poisfft
719
720
721	!------------------------------------------------------------------------------!
722	! Description:
723	! ------------
724	!> Fourier-transformation along y with subsequent transposition y --> x for
725	!> a 1d-decomposition along x.
726	!>
727	!> @attention The performance of this routine is much faster on the NEC-SX6,
728	!> if the first index of work_ffty_vec is odd. Otherwise
729	!> memory bank conflicts may occur (especially if the index is a
730	!> multiple of 128). That's why work_ffty_vec is dimensioned as
731	!> 0:ny+1.
732	!> Of course, this will not work if users are using an odd number
733	!> of gridpoints along y.
734	!------------------------------------------------------------------------------!
735	SUBROUTINE ffty_tr_yx( f_in, f_out )
736
737
738	USE control_parameters, &
739	ONLY: host
740
741	USE cpulog, &
742	ONLY: cpu_log, log_point_s
743
744	USE kinds
745
746	USE pegrid
747
748	IMPLICIT NONE
749
750	INTEGER(iwp) :: i !<
751	INTEGER(iwp) :: iend !<
752	INTEGER(iwp) :: iouter !<
753	INTEGER(iwp) :: ir !<
754	INTEGER(iwp) :: j !<
755	INTEGER(iwp) :: k !<
756
757	INTEGER(iwp), PARAMETER :: stridex = 4 !<
758
759	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
760	#if defined( __nec )
761	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
762	#endif
763	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
764	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
765	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
766
767	!
768	!-- Carry out the FFT along y, where all data are present due to the
769	!-- 1d-decomposition along x. Resort the data in a way that x becomes
770	!-- the first index.
771	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
772
773	IF ( host(1:3) == 'nec' ) THEN
774	#if defined( __nec )
775	!
776	!-- Code optimized for vector processors
777	!$OMP PARALLEL PRIVATE ( i, j, k )
778	!$OMP DO
779	DO i = nxl, nxr
780
781	DO j = 0, ny
782	DO k = 1, nz
783	work_ffty_vec(j,k,i) = f_in(k,j,i)
784	ENDDO
785	ENDDO
786
787	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
788
789	ENDDO
790
791	!$OMP DO
792	DO k = 1, nz
793	DO j = 0, ny
794	DO i = nxl, nxr
795	work(i,k,j) = work_ffty_vec(j,k,i)
796	ENDDO
797	ENDDO
798	ENDDO
799	!$OMP END PARALLEL
800	#endif
801
802	ELSE
803
804	!
805	!-- Cache optimized code.
806	!-- The i-(x-)direction is split into a strided outer loop and an inner
807	!-- loop for better cache performance
808	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
809	!$OMP DO
810	DO iouter = nxl, nxr, stridex
811
812	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
813
814	DO k = 1, nz
815
816	DO i = iouter, iend
817
818	ir = i-iouter+1 ! counter within a stride
819	DO j = 0, ny
820	work_ffty(j,ir) = f_in(k,j,i)
821	ENDDO
822	!
823	!-- FFT along y
824	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
825
826	ENDDO
827
828	!
829	!-- Resort
830	DO j = 0, ny
831	DO i = iouter, iend
832	work(i,k,j) = work_ffty(j,i-iouter+1)
833	ENDDO
834	ENDDO
835
836	ENDDO
837
838	ENDDO
839	!$OMP END PARALLEL
840
841	ENDIF
842	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
843
844	!
845	!-- Transpose array
846	#if defined( __parallel )
847	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
848	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
849	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
850	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
851	comm1dx, ierr )
852	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
853	#endif
854
855	END SUBROUTINE ffty_tr_yx
856
857
858	!------------------------------------------------------------------------------!
859	! Description:
860	! ------------
861	!> Transposition x --> y with a subsequent backward Fourier transformation for
862	!> a 1d-decomposition along x
863	!------------------------------------------------------------------------------!
864	SUBROUTINE tr_xy_ffty( f_in, f_out )
865
866
867	USE control_parameters, &
868	ONLY: host
869
870	USE cpulog, &
871	ONLY: cpu_log, log_point_s
872
873	USE kinds
874
875	USE pegrid
876
877	IMPLICIT NONE
878
879	INTEGER(iwp) :: i !<
880	INTEGER(iwp) :: iend !<
881	INTEGER(iwp) :: iouter !<
882	INTEGER(iwp) :: ir !<
883	INTEGER(iwp) :: j !<
884	INTEGER(iwp) :: k !<
885
886	INTEGER(iwp), PARAMETER :: stridex = 4 !<
887
888	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !<
889	#if defined( __nec )
890	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !<
891	#endif
892	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
893	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
894	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
895
896	!
897	!-- Transpose array
898	#if defined( __parallel )
899	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
900	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
901	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
902	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
903	comm1dx, ierr )
904	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
905	#endif
906
907	!
908	!-- Resort the data in a way that y becomes the first index and carry out the
909	!-- backward fft along y.
910	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
911
912	IF ( host(1:3) == 'nec' ) THEN
913	#if defined( __nec )
914	!
915	!-- Code optimized for vector processors
916	!$OMP PARALLEL PRIVATE ( i, j, k )
917	!$OMP DO
918	DO k = 1, nz
919	DO j = 0, ny
920	DO i = nxl, nxr
921	work_ffty_vec(j,k,i) = work(i,k,j)
922	ENDDO
923	ENDDO
924	ENDDO
925
926	!$OMP DO
927	DO i = nxl, nxr
928
929	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
930
931	DO j = 0, ny
932	DO k = 1, nz
933	f_out(k,j,i) = work_ffty_vec(j,k,i)
934	ENDDO
935	ENDDO
936
937	ENDDO
938	!$OMP END PARALLEL
939	#endif
940
941	ELSE
942
943	!
944	!-- Cache optimized code.
945	!-- The i-(x-)direction is split into a strided outer loop and an inner
946	!-- loop for better cache performance
947	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
948	!$OMP DO
949	DO iouter = nxl, nxr, stridex
950
951	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
952
953	DO k = 1, nz
954	!
955	!-- Resort
956	DO j = 0, ny
957	DO i = iouter, iend
958	work_ffty(j,i-iouter+1) = work(i,k,j)
959	ENDDO
960	ENDDO
961
962	DO i = iouter, iend
963
964	!
965	!-- FFT along y
966	ir = i-iouter+1 ! counter within a stride
967	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
968
969	DO j = 0, ny
970	f_out(k,j,i) = work_ffty(j,ir)
971	ENDDO
972	ENDDO
973
974	ENDDO
975
976	ENDDO
977	!$OMP END PARALLEL
978
979	ENDIF
980
981	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
982
983	END SUBROUTINE tr_xy_ffty
984
985
986	!------------------------------------------------------------------------------!
987	! Description:
988	! ------------
989	!> FFT along x, solution of the tridiagonal system and backward FFT for
990	!> a 1d-decomposition along x
991	!>
992	!> @warning this subroutine may still not work for hybrid parallelization
993	!> with OpenMP (for possible necessary changes see the original
994	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
995	!------------------------------------------------------------------------------!
996	SUBROUTINE fftx_tri_fftx( ar )
997
998
999	USE control_parameters, &
1000	ONLY: host
1001
1002	USE cpulog, &
1003	ONLY: cpu_log, log_point_s
1004
1005	USE grid_variables, &
1006	ONLY: ddx2, ddy2
1007
1008	USE kinds
1009
1010	USE pegrid
1011
1012	IMPLICIT NONE
1013
1014	INTEGER(iwp) :: i !<
1015	INTEGER(iwp) :: j !<
1016	INTEGER(iwp) :: k !<
1017	INTEGER(iwp) :: m !<
1018	INTEGER(iwp) :: n !<
1019	INTEGER(iwp) :: omp_get_thread_num !<
1020	INTEGER(iwp) :: tn !<
1021
1022	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
1023	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
1024	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
1025	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1026
1027
1028	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1029
1030	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1031
1032	tn = 0 ! Default thread number in case of one thread
1033	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1034	DO j = nys_x, nyn_x
1035
1036	!$ tn = omp_get_thread_num()
1037
1038	IF ( host(1:3) == 'nec' ) THEN
1039	!
1040	!-- Code optimized for vector processors
1041	DO k = 1, nz
1042
1043	m = 0
1044	DO n = 1, pdims(1)
1045	DO i = 1, nnx
1046	work_trix(m,k) = ar(i,k,j,n)
1047	m = m + 1
1048	ENDDO
1049	ENDDO
1050
1051	ENDDO
1052
1053	CALL fft_x_m( work_trix, 'forward' )
1054
1055	ELSE
1056	!
1057	!-- Cache optimized code
1058	DO k = 1, nz
1059
1060	m = 0
1061	DO n = 1, pdims(1)
1062	DO i = 1, nnx
1063	work_fftx(m) = ar(i,k,j,n)
1064	m = m + 1
1065	ENDDO
1066	ENDDO
1067
1068	CALL fft_x_1d( work_fftx, 'forward' )
1069
1070	DO i = 0, nx
1071	work_trix(i,k) = work_fftx(i)
1072	ENDDO
1073
1074	ENDDO
1075
1076	ENDIF
1077
1078	!
1079	!-- Solve the linear equation system
1080	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1081
1082	IF ( host(1:3) == 'nec' ) THEN
1083	!
1084	!-- Code optimized for vector processors
1085	CALL fft_x_m( work_trix, 'backward' )
1086
1087	DO k = 1, nz
1088
1089	m = 0
1090	DO n = 1, pdims(1)
1091	DO i = 1, nnx
1092	ar(i,k,j,n) = work_trix(m,k)
1093	m = m + 1
1094	ENDDO
1095	ENDDO
1096
1097	ENDDO
1098
1099	ELSE
1100	!
1101	!-- Cache optimized code
1102	DO k = 1, nz
1103
1104	DO i = 0, nx
1105	work_fftx(i) = work_trix(i,k)
1106	ENDDO
1107
1108	CALL fft_x_1d( work_fftx, 'backward' )
1109
1110	m = 0
1111	DO n = 1, pdims(1)
1112	DO i = 1, nnx
1113	ar(i,k,j,n) = work_fftx(m)
1114	m = m + 1
1115	ENDDO
1116	ENDDO
1117
1118	ENDDO
1119
1120	ENDIF
1121
1122	ENDDO
1123
1124	DEALLOCATE( tri )
1125
1126	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1127
1128	END SUBROUTINE fftx_tri_fftx
1129
1130
1131	!------------------------------------------------------------------------------!
1132	! Description:
1133	! ------------
1134	!> Fourier-transformation along x with subsequent transposition x --> y for
1135	!> a 1d-decomposition along y.
1136	!>
1137	!> @attention NEC-branch of this routine may significantly profit from
1138	!> further optimizations. So far, performance is much worse than
1139	!> for routine ffty_tr_yx (more than three times slower).
1140	!------------------------------------------------------------------------------!
1141	SUBROUTINE fftx_tr_xy( f_in, f_out )
1142
1143
1144	USE control_parameters, &
1145	ONLY: host
1146
1147	USE cpulog, &
1148	ONLY: cpu_log, log_point_s
1149
1150	USE kinds
1151
1152	USE pegrid
1153
1154	IMPLICIT NONE
1155
1156	INTEGER(iwp) :: i !<
1157	INTEGER(iwp) :: j !<
1158	INTEGER(iwp) :: k !<
1159
1160	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1161	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1162	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1163	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1164
1165	!
1166	!-- Carry out the FFT along x, where all data are present due to the
1167	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1168	!-- the first index.
1169	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1170
1171	IF ( host(1:3) == 'nec' ) THEN
1172	!
1173	!-- Code for vector processors
1174	!$OMP PARALLEL PRIVATE ( i, j, k )
1175	!$OMP DO
1176	DO i = 0, nx
1177
1178	DO j = nys, nyn
1179	DO k = 1, nz
1180	work_fftx(i,k,j) = f_in(k,j,i)
1181	ENDDO
1182	ENDDO
1183
1184	ENDDO
1185
1186	!$OMP DO
1187	DO j = nys, nyn
1188
1189	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1190
1191	DO k = 1, nz
1192	DO i = 0, nx
1193	work(j,k,i) = work_fftx(i,k,j)
1194	ENDDO
1195	ENDDO
1196
1197	ENDDO
1198	!$OMP END PARALLEL
1199
1200	ELSE
1201
1202	!
1203	!-- Cache optimized code (there might be still a potential for better
1204	!-- optimization).
1205	!$OMP PARALLEL PRIVATE (i,j,k)
1206	!$OMP DO
1207	DO i = 0, nx
1208
1209	DO j = nys, nyn
1210	DO k = 1, nz
1211	work_fftx(i,k,j) = f_in(k,j,i)
1212	ENDDO
1213	ENDDO
1214
1215	ENDDO
1216
1217	!$OMP DO
1218	DO j = nys, nyn
1219	DO k = 1, nz
1220
1221	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1222
1223	DO i = 0, nx
1224	work(j,k,i) = work_fftx(i,k,j)
1225	ENDDO
1226	ENDDO
1227
1228	ENDDO
1229	!$OMP END PARALLEL
1230
1231	ENDIF
1232	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1233
1234	!
1235	!-- Transpose array
1236	#if defined( __parallel )
1237	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1238	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1239	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1240	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1241	comm1dy, ierr )
1242	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1243	#endif
1244
1245	END SUBROUTINE fftx_tr_xy
1246
1247
1248	!------------------------------------------------------------------------------!
1249	! Description:
1250	! ------------
1251	!> Transposition y --> x with a subsequent backward Fourier transformation for
1252	!> a 1d-decomposition along x.
1253	!------------------------------------------------------------------------------!
1254	SUBROUTINE tr_yx_fftx( f_in, f_out )
1255
1256
1257	USE control_parameters, &
1258	ONLY: host
1259
1260	USE cpulog, &
1261	ONLY: cpu_log, log_point_s
1262
1263	USE kinds
1264
1265	USE pegrid
1266
1267	IMPLICIT NONE
1268
1269	INTEGER(iwp) :: i !<
1270	INTEGER(iwp) :: j !<
1271	INTEGER(iwp) :: k !<
1272
1273	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1274	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1275	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1276	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1277
1278	!
1279	!-- Transpose array
1280	#if defined( __parallel )
1281	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1282	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1283	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1284	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1285	comm1dy, ierr )
1286	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1287	#endif
1288
1289	!
1290	!-- Carry out the FFT along x, where all data are present due to the
1291	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1292	!-- the first index.
1293	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1294
1295	IF ( host(1:3) == 'nec' ) THEN
1296	!
1297	!-- Code optimized for vector processors
1298	!$OMP PARALLEL PRIVATE ( i, j, k )
1299	!$OMP DO
1300	DO j = nys, nyn
1301
1302	DO k = 1, nz
1303	DO i = 0, nx
1304	work_fftx(i,k,j) = work(j,k,i)
1305	ENDDO
1306	ENDDO
1307
1308	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1309
1310	ENDDO
1311
1312	!$OMP DO
1313	DO i = 0, nx
1314	DO j = nys, nyn
1315	DO k = 1, nz
1316	f_out(k,j,i) = work_fftx(i,k,j)
1317	ENDDO
1318	ENDDO
1319	ENDDO
1320	!$OMP END PARALLEL
1321
1322	ELSE
1323
1324	!
1325	!-- Cache optimized code (there might be still a potential for better
1326	!-- optimization).
1327	!$OMP PARALLEL PRIVATE (i,j,k)
1328	!$OMP DO
1329	DO j = nys, nyn
1330	DO k = 1, nz
1331
1332	DO i = 0, nx
1333	work_fftx(i,k,j) = work(j,k,i)
1334	ENDDO
1335
1336	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1337
1338	ENDDO
1339	ENDDO
1340
1341	!$OMP DO
1342	DO i = 0, nx
1343	DO j = nys, nyn
1344	DO k = 1, nz
1345	f_out(k,j,i) = work_fftx(i,k,j)
1346	ENDDO
1347	ENDDO
1348	ENDDO
1349	!$OMP END PARALLEL
1350
1351	ENDIF
1352	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1353
1354	END SUBROUTINE tr_yx_fftx
1355
1356
1357	!------------------------------------------------------------------------------!
1358	! Description:
1359	! ------------
1360	!> FFT along y, solution of the tridiagonal system and backward FFT for
1361	!> a 1d-decomposition along y.
1362	!>
1363	!> @warning this subroutine may still not work for hybrid parallelization
1364	!> with OpenMP (for possible necessary changes see the original
1365	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1366	!------------------------------------------------------------------------------!
1367	SUBROUTINE ffty_tri_ffty( ar )
1368
1369
1370	USE control_parameters, &
1371	ONLY: host
1372
1373	USE cpulog, &
1374	ONLY: cpu_log, log_point_s
1375
1376	USE grid_variables, &
1377	ONLY: ddx2, ddy2
1378
1379	USE kinds
1380
1381	USE pegrid
1382
1383	IMPLICIT NONE
1384
1385	INTEGER(iwp) :: i !<
1386	INTEGER(iwp) :: j !<
1387	INTEGER(iwp) :: k !<
1388	INTEGER(iwp) :: m !<
1389	INTEGER(iwp) :: n !<
1390	INTEGER(iwp) :: omp_get_thread_num !<
1391	INTEGER(iwp) :: tn !<
1392
1393	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1394	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1395	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1396	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1397
1398
1399	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1400
1401	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1402
1403	tn = 0 ! Default thread number in case of one thread
1404	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1405	DO i = nxl_y, nxr_y
1406
1407	!$ tn = omp_get_thread_num()
1408
1409	IF ( host(1:3) == 'nec' ) THEN
1410	!
1411	!-- Code optimized for vector processors
1412	DO k = 1, nz
1413
1414	m = 0
1415	DO n = 1, pdims(2)
1416	DO j = 1, nny
1417	work_triy(m,k) = ar(j,k,i,n)
1418	m = m + 1
1419	ENDDO
1420	ENDDO
1421
1422	ENDDO
1423
1424	CALL fft_y_m( work_triy, ny, 'forward' )
1425
1426	ELSE
1427	!
1428	!-- Cache optimized code
1429	DO k = 1, nz
1430
1431	m = 0
1432	DO n = 1, pdims(2)
1433	DO j = 1, nny
1434	work_ffty(m) = ar(j,k,i,n)
1435	m = m + 1
1436	ENDDO
1437	ENDDO
1438
1439	CALL fft_y_1d( work_ffty, 'forward' )
1440
1441	DO j = 0, ny
1442	work_triy(j,k) = work_ffty(j)
1443	ENDDO
1444
1445	ENDDO
1446
1447	ENDIF
1448
1449	!
1450	!-- Solve the linear equation system
1451	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1452
1453	IF ( host(1:3) == 'nec' ) THEN
1454	!
1455	!-- Code optimized for vector processors
1456	CALL fft_y_m( work_triy, ny, 'backward' )
1457
1458	DO k = 1, nz
1459
1460	m = 0
1461	DO n = 1, pdims(2)
1462	DO j = 1, nny
1463	ar(j,k,i,n) = work_triy(m,k)
1464	m = m + 1
1465	ENDDO
1466	ENDDO
1467
1468	ENDDO
1469
1470	ELSE
1471	!
1472	!-- Cache optimized code
1473	DO k = 1, nz
1474
1475	DO j = 0, ny
1476	work_ffty(j) = work_triy(j,k)
1477	ENDDO
1478
1479	CALL fft_y_1d( work_ffty, 'backward' )
1480
1481	m = 0
1482	DO n = 1, pdims(2)
1483	DO j = 1, nny
1484	ar(j,k,i,n) = work_ffty(m)
1485	m = m + 1
1486	ENDDO
1487	ENDDO
1488
1489	ENDDO
1490
1491	ENDIF
1492
1493	ENDDO
1494
1495	DEALLOCATE( tri )
1496
1497	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1498
1499	END SUBROUTINE ffty_tri_ffty
1500
1501	#endif
1502
1503	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |