Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4180

Last change on this file since 4180 was 4180, checked in by scharf, 5 years ago
removed comments in 'Former revisions' section that are older than 01.01.2019
Property svn:keywords set to `Id`
File size: 41.8 KB

Line
1	!> @file poisfft_mod.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2019 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 4180 2019-08-21 14:37:54Z scharf $
27	! OpenACC port for SPEC
28	!
29	!
30	! Description:
31	! ------------
32	!> Solves the Poisson equation with a 2D spectral method
33	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
34	!>
35	!> Input:
36	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
37	!> starting from (1,nys,nxl)
38	!>
39	!> Output:
40	!> real ar contains the solution for perturbation pressure p
41	!------------------------------------------------------------------------------!
42	MODULE poisfft_mod
43
44
45	USE fft_xy, &
46	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
47
48	USE indices, &
49	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
50
51	USE transpose_indices, &
52	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
53	nzb_y, nzt_x, nzt_y
54
55	USE tridia_solver, &
56	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
57
58	IMPLICIT NONE
59
60	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
61
62	PRIVATE
63
64	PUBLIC poisfft, poisfft_init
65
66	INTERFACE poisfft
67	MODULE PROCEDURE poisfft
68	END INTERFACE poisfft
69
70	INTERFACE poisfft_init
71	MODULE PROCEDURE poisfft_init
72	END INTERFACE poisfft_init
73
74
75	CONTAINS
76
77	!------------------------------------------------------------------------------!
78	! Description:
79	! ------------
80	!> Setup coefficients for FFT and the tridiagonal solver
81	!------------------------------------------------------------------------------!
82	SUBROUTINE poisfft_init
83
84	IMPLICIT NONE
85
86
87	CALL fft_init
88
89	CALL tridia_init
90
91	poisfft_initialized = .TRUE.
92
93	END SUBROUTINE poisfft_init
94
95
96
97	!------------------------------------------------------------------------------!
98	! Description:
99	! ------------
100	!> Two-dimensional Fourier Transformation in x- and y-direction.
101	!------------------------------------------------------------------------------!
102	SUBROUTINE poisfft( ar )
103
104	USE control_parameters, &
105	ONLY: transpose_compute_overlap
106
107	USE cpulog, &
108	ONLY: cpu_log, cpu_log_nowait, log_point_s
109
110	USE kinds
111
112	USE pegrid
113
114	IMPLICIT NONE
115
116	INTEGER(iwp) :: ii !<
117	INTEGER(iwp) :: iind !<
118	INTEGER(iwp) :: inew !<
119	INTEGER(iwp) :: jj !<
120	INTEGER(iwp) :: jind !<
121	INTEGER(iwp) :: jnew !<
122	INTEGER(iwp) :: ki !<
123	INTEGER(iwp) :: kk !<
124	INTEGER(iwp) :: knew !<
125	INTEGER(iwp) :: n !<
126	INTEGER(iwp) :: nblk !<
127	INTEGER(iwp) :: nnx_y !<
128	INTEGER(iwp) :: nny_z !<
129	INTEGER(iwp) :: nnz_x !<
130	INTEGER(iwp) :: nxl_y_bound !<
131	INTEGER(iwp) :: nxr_y_bound !<
132
133	INTEGER(iwp), DIMENSION(4) :: isave !<
134
135	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
136	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
137
138	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
139	#if __acc_fft_device
140	!$ACC DECLARE CREATE(ar_inv)
141	#endif
142
143	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
144	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
145	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
146	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
147	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
148
149
150	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
151
152	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
153
154	#if !__acc_fft_device
155	!$ACC UPDATE HOST(ar)
156	#endif
157
158	#ifndef _OPENACC
159	!
160	!-- Two-dimensional Fourier Transformation in x- and y-direction.
161	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
162
163	!
164	!-- 1d-domain-decomposition along x:
165	!-- FFT along y and transposition y --> x
166	CALL ffty_tr_yx( ar, ar )
167
168	!
169	!-- FFT along x, solving the tridiagonal system and backward FFT
170	CALL fftx_tri_fftx( ar )
171
172	!
173	!-- Transposition x --> y and backward FFT along y
174	CALL tr_xy_ffty( ar, ar )
175
176	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
177
178	!
179	!-- 1d-domain-decomposition along y:
180	!-- FFT along x and transposition x --> y
181	CALL fftx_tr_xy( ar, ar )
182
183	!
184	!-- FFT along y, solving the tridiagonal system and backward FFT
185	CALL ffty_tri_ffty( ar )
186
187	!
188	!-- Transposition y --> x and backward FFT along x
189	CALL tr_yx_fftx( ar, ar )
190
191	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
192	#endif
193
194	!
195	!-- 2d-domain-decomposition or no decomposition (1 PE run)
196	!-- Transposition z --> x
197	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
198	CALL resort_for_zx( ar, ar_inv )
199	CALL transpose_zx( ar_inv, ar )
200	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
201
202	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
203	CALL fft_x( ar, 'forward' )
204	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
205
206	!
207	!-- Transposition x --> y
208	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
209	CALL resort_for_xy( ar, ar_inv )
210	CALL transpose_xy( ar_inv, ar )
211	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
212
213	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
214	CALL fft_y( ar, 'forward', ar_tr = ar, &
215	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
216	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
217	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
218
219	!
220	!-- Transposition y --> z
221	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
222	CALL resort_for_yz( ar, ar_inv )
223	CALL transpose_yz( ar_inv, ar )
224	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
225
226	!
227	!-- Solve the tridiagonal equation system along z
228	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
229	CALL tridia_substi( ar )
230	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
231
232	!
233	!-- Inverse Fourier Transformation
234	!-- Transposition z --> y
235	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
236	CALL transpose_zy( ar, ar_inv )
237	CALL resort_for_zy( ar_inv, ar )
238	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
239
240	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
241	CALL fft_y( ar, 'backward', ar_tr = ar, &
242	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
243	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
244	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
245
246	!
247	!-- Transposition y --> x
248	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
249	CALL transpose_yx( ar, ar_inv )
250	CALL resort_for_yx( ar_inv, ar )
251	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
252
253	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
254	CALL fft_x( ar, 'backward' )
255	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
256
257	!
258	!-- Transposition x --> z
259	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
260	CALL transpose_xz( ar, ar_inv )
261	CALL resort_for_xz( ar_inv, ar )
262	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
263
264	#ifndef _OPENACC
265	ELSE
266
267	!
268	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
269	!-- overlapping transposition / fft
270	!-- cputime logging must not use barriers, which would prevent overlapping
271	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
272	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
273	!
274	!-- Transposition z --> x + subsequent fft along x
275	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
276	CALL resort_for_zx( ar, f_inv )
277	!
278	!-- Save original indices and gridpoint counter
279	isave(1) = nz
280	isave(2) = nzb_x
281	isave(3) = nzt_x
282	isave(4) = sendrecvcount_zx
283	!
284	!-- Set new indices for transformation
285	nblk = nz / pdims(1)
286	nz = pdims(1)
287	nnz_x = 1
288	nzb_x = 1 + myidx * nnz_x
289	nzt_x = ( myidx + 1 ) * nnz_x
290	sendrecvcount_zx = nnx * nny * nnz_x
291
292	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
293	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
294
295	DO kk = 1, nblk
296
297	IF ( kk == 1 ) THEN
298	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
299	ELSE
300	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
301	ENDIF
302
303	DO knew = 1, nz
304	ki = kk + nblk * ( knew - 1 )
305	f_in(:,:,knew) = f_inv(:,:,ki)
306	ENDDO
307
308	CALL transpose_zx( f_in, ar1(:,:,:))
309	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
310
311	IF ( kk == 1 ) THEN
312	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
313	ELSE
314	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
315	ENDIF
316
317	n = isave(2) + kk - 1
318	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
319	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
320
321	ENDDO
322	!
323	!-- Restore original indices/counters
324	nz = isave(1)
325	nzb_x = isave(2)
326	nzt_x = isave(3)
327	sendrecvcount_zx = isave(4)
328
329	DEALLOCATE( ar1, f_in, f_inv )
330
331	!
332	!-- Transposition x --> y + subsequent fft along y
333	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
334	CALL resort_for_xy( f_out_z, f_inv )
335	!
336	!-- Save original indices and gridpoint counter
337	isave(1) = nx
338	isave(2) = nxl_y
339	isave(3) = nxr_y
340	isave(4) = sendrecvcount_xy
341	!
342	!-- Set new indices for transformation
343	nblk = ( ( nx+1 ) / pdims(2) ) - 1
344	nx = pdims(2)
345	nnx_y = 1
346	nxl_y = myidy * nnx_y
347	nxr_y = ( myidy + 1 ) * nnx_y - 1
348	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
349
350	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
351	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
352
353	DO ii = 0, nblk
354
355	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
356
357	DO inew = 0, nx-1
358	iind = ii + ( nblk + 1 ) * inew
359	f_in(:,:,inew) = f_inv(:,:,iind)
360	ENDDO
361
362	CALL transpose_xy( f_in, ar1(:,:,:) )
363
364	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
365
366	IF ( ii == 1 ) THEN
367	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
368	ELSE
369	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
370	ENDIF
371
372	nxl_y_bound = isave(2)
373	nxr_y_bound = isave(3)
374	n = isave(2) + ii
375	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
376	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
377	nxl_y_l = n, nxr_y_l = n )
378
379	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
380
381	ENDDO
382	!
383	!-- Restore original indices/counters
384	nx = isave(1)
385	nxl_y = isave(2)
386	nxr_y = isave(3)
387	sendrecvcount_xy = isave(4)
388
389	DEALLOCATE( ar1, f_in, f_inv )
390
391	!
392	!-- Transposition y --> z + subsequent tridia + resort for z --> y
393	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
394	CALL resort_for_yz( f_out_y, f_inv )
395	!
396	!-- Save original indices and gridpoint counter
397	isave(1) = ny
398	isave(2) = nys_z
399	isave(3) = nyn_z
400	isave(4) = sendrecvcount_yz
401	!
402	!-- Set new indices for transformation
403	nblk = ( ( ny+1 ) / pdims(1) ) - 1
404	ny = pdims(1)
405	nny_z = 1
406	nys_z = myidx * nny_z
407	nyn_z = ( myidx + 1 ) * nny_z - 1
408	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
409
410	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
411	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
412
413	DO jj = 0, nblk
414	!
415	!-- Forward Fourier Transformation
416	!-- Transposition y --> z
417	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
418
419	DO jnew = 0, ny-1
420	jind = jj + ( nblk + 1 ) * jnew
421	f_in(:,:,jnew) = f_inv(:,:,jind)
422	ENDDO
423
424	CALL transpose_yz( f_in, ar1(:,:,:) )
425
426	IF ( jj == nblk ) THEN
427	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
428	ELSE
429	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
430	ENDIF
431
432	!
433	!-- Solve the tridiagonal equation system along z
434	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
435
436	n = isave(2) + jj
437	CALL tridia_substi_overlap( ar1(:,:,:), n )
438
439	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
440
441	!
442	!-- Inverse Fourier Transformation
443	!-- Transposition z --> y
444	!-- Only one thread should call MPI routines, therefore forward and
445	!-- backward tranpose are in the same section
446	IF ( jj == 0 ) THEN
447	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
448	ELSE
449	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
450	ENDIF
451
452	CALL transpose_zy( ar1(:,:,:), f_in )
453
454	DO jnew = 0, ny-1
455	jind = jj + ( nblk + 1 ) * jnew
456	f_inv(:,:,jind) = f_in(:,:,jnew)
457	ENDDO
458
459	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
460
461	ENDDO
462	!
463	!-- Restore original indices/counters
464	ny = isave(1)
465	nys_z = isave(2)
466	nyn_z = isave(3)
467	sendrecvcount_yz = isave(4)
468
469	CALL resort_for_zy( f_inv, f_out_y )
470
471	DEALLOCATE( ar1, f_in, f_inv )
472
473	!
474	!-- fft along y backward + subsequent transposition y --> x
475	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
476	!
477	!-- Save original indices and gridpoint counter
478	isave(1) = nx
479	isave(2) = nxl_y
480	isave(3) = nxr_y
481	isave(4) = sendrecvcount_xy
482	!
483	!-- Set new indices for transformation
484	nblk = (( nx+1 ) / pdims(2) ) - 1
485	nx = pdims(2)
486	nnx_y = 1
487	nxl_y = myidy * nnx_y
488	nxr_y = ( myidy + 1 ) * nnx_y - 1
489	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
490
491	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
492	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
493
494	DO ii = 0, nblk
495
496	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
497
498	n = isave(2) + ii
499	nxl_y_bound = isave(2)
500	nxr_y_bound = isave(3)
501
502	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
503	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
504	nxl_y_l = n, nxr_y_l = n )
505
506	IF ( ii == nblk ) THEN
507	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
508	ELSE
509	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
510	ENDIF
511
512	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
513
514	CALL transpose_yx( ar1(:,:,:), f_in )
515
516	DO inew = 0, nx-1
517	iind = ii + (nblk+1) * inew
518	f_inv(:,:,iind) = f_in(:,:,inew)
519	ENDDO
520
521	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
522
523	ENDDO
524	!
525	!-- Restore original indices/counters
526	nx = isave(1)
527	nxl_y = isave(2)
528	nxr_y = isave(3)
529	sendrecvcount_xy = isave(4)
530
531	CALL resort_for_yx( f_inv, f_out_z )
532
533	DEALLOCATE( ar1, f_in, f_inv )
534
535	!
536	!-- fft along x backward + subsequent final transposition x --> z
537	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
538	!
539	!-- Save original indices and gridpoint counter
540	isave(1) = nz
541	isave(2) = nzb_x
542	isave(3) = nzt_x
543	isave(4) = sendrecvcount_zx
544	!
545	!-- Set new indices for transformation
546	nblk = nz / pdims(1)
547	nz = pdims(1)
548	nnz_x = 1
549	nzb_x = 1 + myidx * nnz_x
550	nzt_x = ( myidx + 1 ) * nnz_x
551	sendrecvcount_zx = nnx * nny * nnz_x
552
553	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
554	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
555
556	DO kk = 1, nblk
557
558	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
559
560	n = isave(2) + kk - 1
561	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
562
563	IF ( kk == nblk ) THEN
564	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
565	ELSE
566	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
567	ENDIF
568
569	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
570
571	CALL transpose_xz( ar1(:,:,:), f_in )
572
573	DO knew = 1, nz
574	ki = kk + nblk * (knew-1)
575	f_inv(:,:,ki) = f_in(:,:,knew)
576	ENDDO
577
578	IF ( kk == nblk ) THEN
579	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
580	ELSE
581	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
582	ENDIF
583
584	ENDDO
585	!
586	!-- Restore original indices/counters
587	nz = isave(1)
588	nzb_x = isave(2)
589	nzt_x = isave(3)
590	sendrecvcount_zx = isave(4)
591
592	CALL resort_for_xz( f_inv, ar )
593
594	DEALLOCATE( ar1, f_in, f_inv )
595
596	ENDIF
597	#endif
598
599	#if !__acc_fft_device
600	!$ACC UPDATE DEVICE(ar)
601	#endif
602
603	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
604
605	END SUBROUTINE poisfft
606
607
608	!------------------------------------------------------------------------------!
609	! Description:
610	! ------------
611	!> Fourier-transformation along y with subsequent transposition y --> x for
612	!> a 1d-decomposition along x.
613	!>
614	!> @attention The performance of this routine is much faster on the NEC-SX6,
615	!> if the first index of work_ffty_vec is odd. Otherwise
616	!> memory bank conflicts may occur (especially if the index is a
617	!> multiple of 128). That's why work_ffty_vec is dimensioned as
618	!> 0:ny+1.
619	!> Of course, this will not work if users are using an odd number
620	!> of gridpoints along y.
621	!------------------------------------------------------------------------------!
622	SUBROUTINE ffty_tr_yx( f_in, f_out )
623
624	USE control_parameters, &
625	ONLY: loop_optimization
626
627	USE cpulog, &
628	ONLY: cpu_log, log_point_s
629
630	USE kinds
631
632	USE pegrid
633
634	IMPLICIT NONE
635
636	INTEGER(iwp) :: i !<
637	INTEGER(iwp) :: iend !<
638	INTEGER(iwp) :: iouter !<
639	INTEGER(iwp) :: ir !<
640	INTEGER(iwp) :: j !<
641	INTEGER(iwp) :: k !<
642
643	INTEGER(iwp), PARAMETER :: stridex = 4 !<
644
645	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
646	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
647	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
648
649	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
650	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
651
652	!
653	!-- Carry out the FFT along y, where all data are present due to the
654	!-- 1d-decomposition along x. Resort the data in a way that x becomes
655	!-- the first index.
656	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
657
658	IF ( loop_optimization == 'vector' ) THEN
659
660	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
661	!
662	!-- Code optimized for vector processors
663	!$OMP PARALLEL PRIVATE ( i, j, k )
664	!$OMP DO
665	DO i = nxl, nxr
666
667	DO j = 0, ny
668	DO k = 1, nz
669	work_ffty_vec(j,k,i) = f_in(k,j,i)
670	ENDDO
671	ENDDO
672
673	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
674
675	ENDDO
676
677	!$OMP DO
678	DO k = 1, nz
679	DO j = 0, ny
680	DO i = nxl, nxr
681	work(i,k,j) = work_ffty_vec(j,k,i)
682	ENDDO
683	ENDDO
684	ENDDO
685	!$OMP END PARALLEL
686
687	DEALLOCATE( work_ffty_vec )
688
689	ELSE
690	!
691	!-- Cache optimized code.
692	ALLOCATE( work_ffty(0:ny,stridex) )
693	!
694	!-- The i-(x-)direction is split into a strided outer loop and an inner
695	!-- loop for better cache performance
696	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
697	!$OMP DO
698	DO iouter = nxl, nxr, stridex
699
700	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
701
702	DO k = 1, nz
703
704	DO i = iouter, iend
705
706	ir = i-iouter+1 ! counter within a stride
707	DO j = 0, ny
708	work_ffty(j,ir) = f_in(k,j,i)
709	ENDDO
710	!
711	!-- FFT along y
712	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
713
714	ENDDO
715
716	!
717	!-- Resort
718	DO j = 0, ny
719	DO i = iouter, iend
720	work(i,k,j) = work_ffty(j,i-iouter+1)
721	ENDDO
722	ENDDO
723
724	ENDDO
725
726	ENDDO
727	!$OMP END PARALLEL
728
729	DEALLOCATE( work_ffty )
730
731	ENDIF
732
733	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
734
735	!
736	!-- Transpose array
737	#if defined( __parallel )
738	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
739	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
740	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
741	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
742	comm1dx, ierr )
743	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
744	#endif
745
746	END SUBROUTINE ffty_tr_yx
747
748
749	!------------------------------------------------------------------------------!
750	! Description:
751	! ------------
752	!> Transposition x --> y with a subsequent backward Fourier transformation for
753	!> a 1d-decomposition along x
754	!------------------------------------------------------------------------------!
755	SUBROUTINE tr_xy_ffty( f_in, f_out )
756
757	USE control_parameters, &
758	ONLY: loop_optimization
759
760	USE cpulog, &
761	ONLY: cpu_log, log_point_s
762
763	USE kinds
764
765	USE pegrid
766
767	IMPLICIT NONE
768
769	INTEGER(iwp) :: i !<
770	INTEGER(iwp) :: iend !<
771	INTEGER(iwp) :: iouter !<
772	INTEGER(iwp) :: ir !<
773	INTEGER(iwp) :: j !<
774	INTEGER(iwp) :: k !<
775
776	INTEGER(iwp), PARAMETER :: stridex = 4 !<
777
778	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
779	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
780	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
781
782	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
783	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
784
785	!
786	!-- Transpose array
787	#if defined( __parallel )
788	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
789	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
790	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
791	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
792	comm1dx, ierr )
793	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
794	#endif
795
796	!
797	!-- Resort the data in a way that y becomes the first index and carry out the
798	!-- backward fft along y.
799	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
800
801	IF ( loop_optimization == 'vector' ) THEN
802
803	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
804	!
805	!-- Code optimized for vector processors
806	!$OMP PARALLEL PRIVATE ( i, j, k )
807	!$OMP DO
808	DO k = 1, nz
809	DO j = 0, ny
810	DO i = nxl, nxr
811	work_ffty_vec(j,k,i) = work(i,k,j)
812	ENDDO
813	ENDDO
814	ENDDO
815
816	!$OMP DO
817	DO i = nxl, nxr
818
819	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
820
821	DO j = 0, ny
822	DO k = 1, nz
823	f_out(k,j,i) = work_ffty_vec(j,k,i)
824	ENDDO
825	ENDDO
826
827	ENDDO
828	!$OMP END PARALLEL
829
830	DEALLOCATE( work_ffty_vec )
831
832	ELSE
833	!
834	!-- Cache optimized code.
835	ALLOCATE( work_ffty(0:ny,stridex) )
836	!
837	!-- The i-(x-)direction is split into a strided outer loop and an inner
838	!-- loop for better cache performance
839	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
840	!$OMP DO
841	DO iouter = nxl, nxr, stridex
842
843	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
844
845	DO k = 1, nz
846	!
847	!-- Resort
848	DO j = 0, ny
849	DO i = iouter, iend
850	work_ffty(j,i-iouter+1) = work(i,k,j)
851	ENDDO
852	ENDDO
853
854	DO i = iouter, iend
855
856	!
857	!-- FFT along y
858	ir = i-iouter+1 ! counter within a stride
859	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
860
861	DO j = 0, ny
862	f_out(k,j,i) = work_ffty(j,ir)
863	ENDDO
864	ENDDO
865
866	ENDDO
867
868	ENDDO
869	!$OMP END PARALLEL
870
871	DEALLOCATE( work_ffty )
872
873	ENDIF
874
875	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
876
877	END SUBROUTINE tr_xy_ffty
878
879
880	!------------------------------------------------------------------------------!
881	! Description:
882	! ------------
883	!> FFT along x, solution of the tridiagonal system and backward FFT for
884	!> a 1d-decomposition along x
885	!>
886	!> @warning this subroutine may still not work for hybrid parallelization
887	!> with OpenMP (for possible necessary changes see the original
888	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
889	!------------------------------------------------------------------------------!
890	SUBROUTINE fftx_tri_fftx( ar )
891
892	USE control_parameters, &
893	ONLY: loop_optimization
894
895	USE cpulog, &
896	ONLY: cpu_log, log_point_s
897
898	USE grid_variables, &
899	ONLY: ddx2, ddy2
900
901	USE kinds
902
903	USE pegrid
904
905	IMPLICIT NONE
906
907	INTEGER(iwp) :: i !<
908	INTEGER(iwp) :: j !<
909	INTEGER(iwp) :: k !<
910	INTEGER(iwp) :: m !<
911	INTEGER(iwp) :: n !<
912	!$ INTEGER(iwp) :: omp_get_thread_num !<
913	INTEGER(iwp) :: tn !<
914
915	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
916	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
917	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
918	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
919
920
921	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
922
923	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
924
925	tn = 0 ! Default thread number in case of one thread
926	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
927	DO j = nys_x, nyn_x
928
929	!$ tn = omp_get_thread_num()
930
931	IF ( loop_optimization == 'vector' ) THEN
932	!
933	!-- Code optimized for vector processors
934	DO k = 1, nz
935
936	m = 0
937	DO n = 1, pdims(1)
938	DO i = 1, nnx
939	work_trix(m,k) = ar(i,k,j,n)
940	m = m + 1
941	ENDDO
942	ENDDO
943
944	ENDDO
945
946	CALL fft_x_m( work_trix, 'forward' )
947
948	ELSE
949	!
950	!-- Cache optimized code
951	DO k = 1, nz
952
953	m = 0
954	DO n = 1, pdims(1)
955	DO i = 1, nnx
956	work_fftx(m) = ar(i,k,j,n)
957	m = m + 1
958	ENDDO
959	ENDDO
960
961	CALL fft_x_1d( work_fftx, 'forward' )
962
963	DO i = 0, nx
964	work_trix(i,k) = work_fftx(i)
965	ENDDO
966
967	ENDDO
968
969	ENDIF
970
971	!
972	!-- Solve the linear equation system
973	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
974
975	IF ( loop_optimization == 'vector' ) THEN
976	!
977	!-- Code optimized for vector processors
978	CALL fft_x_m( work_trix, 'backward' )
979
980	DO k = 1, nz
981
982	m = 0
983	DO n = 1, pdims(1)
984	DO i = 1, nnx
985	ar(i,k,j,n) = work_trix(m,k)
986	m = m + 1
987	ENDDO
988	ENDDO
989
990	ENDDO
991
992	ELSE
993	!
994	!-- Cache optimized code
995	DO k = 1, nz
996
997	DO i = 0, nx
998	work_fftx(i) = work_trix(i,k)
999	ENDDO
1000
1001	CALL fft_x_1d( work_fftx, 'backward' )
1002
1003	m = 0
1004	DO n = 1, pdims(1)
1005	DO i = 1, nnx
1006	ar(i,k,j,n) = work_fftx(m)
1007	m = m + 1
1008	ENDDO
1009	ENDDO
1010
1011	ENDDO
1012
1013	ENDIF
1014
1015	ENDDO
1016
1017	DEALLOCATE( tri )
1018
1019	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1020
1021	END SUBROUTINE fftx_tri_fftx
1022
1023
1024	!------------------------------------------------------------------------------!
1025	! Description:
1026	! ------------
1027	!> Fourier-transformation along x with subsequent transposition x --> y for
1028	!> a 1d-decomposition along y.
1029	!>
1030	!> @attention NEC-branch of this routine may significantly profit from
1031	!> further optimizations. So far, performance is much worse than
1032	!> for routine ffty_tr_yx (more than three times slower).
1033	!------------------------------------------------------------------------------!
1034	SUBROUTINE fftx_tr_xy( f_in, f_out )
1035
1036
1037	USE control_parameters, &
1038	ONLY: loop_optimization
1039
1040	USE cpulog, &
1041	ONLY: cpu_log, log_point_s
1042
1043	USE kinds
1044
1045	USE pegrid
1046
1047	IMPLICIT NONE
1048
1049	INTEGER(iwp) :: i !<
1050	INTEGER(iwp) :: j !<
1051	INTEGER(iwp) :: k !<
1052
1053	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1054	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1055	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1056	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1057
1058	!
1059	!-- Carry out the FFT along x, where all data are present due to the
1060	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1061	!-- the first index.
1062	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1063
1064	IF ( loop_optimization == 'vector' ) THEN
1065	!
1066	!-- Code for vector processors
1067	!$OMP PARALLEL PRIVATE ( i, j, k )
1068	!$OMP DO
1069	DO i = 0, nx
1070
1071	DO j = nys, nyn
1072	DO k = 1, nz
1073	work_fftx(i,k,j) = f_in(k,j,i)
1074	ENDDO
1075	ENDDO
1076
1077	ENDDO
1078
1079	!$OMP DO
1080	DO j = nys, nyn
1081
1082	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1083
1084	DO k = 1, nz
1085	DO i = 0, nx
1086	work(j,k,i) = work_fftx(i,k,j)
1087	ENDDO
1088	ENDDO
1089
1090	ENDDO
1091	!$OMP END PARALLEL
1092
1093	ELSE
1094
1095	!
1096	!-- Cache optimized code (there might be still a potential for better
1097	!-- optimization).
1098	!$OMP PARALLEL PRIVATE (i,j,k)
1099	!$OMP DO
1100	DO i = 0, nx
1101
1102	DO j = nys, nyn
1103	DO k = 1, nz
1104	work_fftx(i,k,j) = f_in(k,j,i)
1105	ENDDO
1106	ENDDO
1107
1108	ENDDO
1109
1110	!$OMP DO
1111	DO j = nys, nyn
1112	DO k = 1, nz
1113
1114	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1115
1116	DO i = 0, nx
1117	work(j,k,i) = work_fftx(i,k,j)
1118	ENDDO
1119	ENDDO
1120
1121	ENDDO
1122	!$OMP END PARALLEL
1123
1124	ENDIF
1125	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1126
1127	!
1128	!-- Transpose array
1129	#if defined( __parallel )
1130	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1131	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1132	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1133	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1134	comm1dy, ierr )
1135	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1136	#endif
1137
1138	END SUBROUTINE fftx_tr_xy
1139
1140
1141	!------------------------------------------------------------------------------!
1142	! Description:
1143	! ------------
1144	!> Transposition y --> x with a subsequent backward Fourier transformation for
1145	!> a 1d-decomposition along x.
1146	!------------------------------------------------------------------------------!
1147	SUBROUTINE tr_yx_fftx( f_in, f_out )
1148
1149
1150	USE control_parameters, &
1151	ONLY: loop_optimization
1152
1153	USE cpulog, &
1154	ONLY: cpu_log, log_point_s
1155
1156	USE kinds
1157
1158	USE pegrid
1159
1160	IMPLICIT NONE
1161
1162	INTEGER(iwp) :: i !<
1163	INTEGER(iwp) :: j !<
1164	INTEGER(iwp) :: k !<
1165
1166	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1167	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1168	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1169	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1170
1171	!
1172	!-- Transpose array
1173	#if defined( __parallel )
1174	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1175	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1176	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1177	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1178	comm1dy, ierr )
1179	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1180	#endif
1181
1182	!
1183	!-- Carry out the FFT along x, where all data are present due to the
1184	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1185	!-- the first index.
1186	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1187
1188	IF ( loop_optimization == 'vector' ) THEN
1189	!
1190	!-- Code optimized for vector processors
1191	!$OMP PARALLEL PRIVATE ( i, j, k )
1192	!$OMP DO
1193	DO j = nys, nyn
1194
1195	DO k = 1, nz
1196	DO i = 0, nx
1197	work_fftx(i,k,j) = work(j,k,i)
1198	ENDDO
1199	ENDDO
1200
1201	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1202
1203	ENDDO
1204
1205	!$OMP DO
1206	DO i = 0, nx
1207	DO j = nys, nyn
1208	DO k = 1, nz
1209	f_out(k,j,i) = work_fftx(i,k,j)
1210	ENDDO
1211	ENDDO
1212	ENDDO
1213	!$OMP END PARALLEL
1214
1215	ELSE
1216
1217	!
1218	!-- Cache optimized code (there might be still a potential for better
1219	!-- optimization).
1220	!$OMP PARALLEL PRIVATE (i,j,k)
1221	!$OMP DO
1222	DO j = nys, nyn
1223	DO k = 1, nz
1224
1225	DO i = 0, nx
1226	work_fftx(i,k,j) = work(j,k,i)
1227	ENDDO
1228
1229	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1230
1231	ENDDO
1232	ENDDO
1233
1234	!$OMP DO
1235	DO i = 0, nx
1236	DO j = nys, nyn
1237	DO k = 1, nz
1238	f_out(k,j,i) = work_fftx(i,k,j)
1239	ENDDO
1240	ENDDO
1241	ENDDO
1242	!$OMP END PARALLEL
1243
1244	ENDIF
1245	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1246
1247	END SUBROUTINE tr_yx_fftx
1248
1249
1250	!------------------------------------------------------------------------------!
1251	! Description:
1252	! ------------
1253	!> FFT along y, solution of the tridiagonal system and backward FFT for
1254	!> a 1d-decomposition along y.
1255	!>
1256	!> @warning this subroutine may still not work for hybrid parallelization
1257	!> with OpenMP (for possible necessary changes see the original
1258	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1259	!------------------------------------------------------------------------------!
1260	SUBROUTINE ffty_tri_ffty( ar )
1261
1262
1263	USE control_parameters, &
1264	ONLY: loop_optimization
1265
1266	USE cpulog, &
1267	ONLY: cpu_log, log_point_s
1268
1269	USE grid_variables, &
1270	ONLY: ddx2, ddy2
1271
1272	USE kinds
1273
1274	USE pegrid
1275
1276	IMPLICIT NONE
1277
1278	INTEGER(iwp) :: i !<
1279	INTEGER(iwp) :: j !<
1280	INTEGER(iwp) :: k !<
1281	INTEGER(iwp) :: m !<
1282	INTEGER(iwp) :: n !<
1283	!$ INTEGER(iwp) :: omp_get_thread_num !<
1284	INTEGER(iwp) :: tn !<
1285
1286	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1287	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1288	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1289	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1290
1291
1292	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1293
1294	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1295
1296	tn = 0 ! Default thread number in case of one thread
1297	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1298	DO i = nxl_y, nxr_y
1299
1300	!$ tn = omp_get_thread_num()
1301
1302	IF ( loop_optimization == 'vector' ) THEN
1303	!
1304	!-- Code optimized for vector processors
1305	DO k = 1, nz
1306
1307	m = 0
1308	DO n = 1, pdims(2)
1309	DO j = 1, nny
1310	work_triy(m,k) = ar(j,k,i,n)
1311	m = m + 1
1312	ENDDO
1313	ENDDO
1314
1315	ENDDO
1316
1317	CALL fft_y_m( work_triy, ny, 'forward' )
1318
1319	ELSE
1320	!
1321	!-- Cache optimized code
1322	DO k = 1, nz
1323
1324	m = 0
1325	DO n = 1, pdims(2)
1326	DO j = 1, nny
1327	work_ffty(m) = ar(j,k,i,n)
1328	m = m + 1
1329	ENDDO
1330	ENDDO
1331
1332	CALL fft_y_1d( work_ffty, 'forward' )
1333
1334	DO j = 0, ny
1335	work_triy(j,k) = work_ffty(j)
1336	ENDDO
1337
1338	ENDDO
1339
1340	ENDIF
1341
1342	!
1343	!-- Solve the linear equation system
1344	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1345
1346	IF ( loop_optimization == 'vector' ) THEN
1347	!
1348	!-- Code optimized for vector processors
1349	CALL fft_y_m( work_triy, ny, 'backward' )
1350
1351	DO k = 1, nz
1352
1353	m = 0
1354	DO n = 1, pdims(2)
1355	DO j = 1, nny
1356	ar(j,k,i,n) = work_triy(m,k)
1357	m = m + 1
1358	ENDDO
1359	ENDDO
1360
1361	ENDDO
1362
1363	ELSE
1364	!
1365	!-- Cache optimized code
1366	DO k = 1, nz
1367
1368	DO j = 0, ny
1369	work_ffty(j) = work_triy(j,k)
1370	ENDDO
1371
1372	CALL fft_y_1d( work_ffty, 'backward' )
1373
1374	m = 0
1375	DO n = 1, pdims(2)
1376	DO j = 1, nny
1377	ar(j,k,i,n) = work_ffty(m)
1378	m = m + 1
1379	ENDDO
1380	ENDDO
1381
1382	ENDDO
1383
1384	ENDIF
1385
1386	ENDDO
1387
1388	DEALLOCATE( tri )
1389
1390	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1391
1392	END SUBROUTINE ffty_tri_ffty
1393
1394	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |