Home

Context Navigation

source: palm/trunk/SOURCE/poisfft_mod.f90 @ 4283

Last change on this file since 4283 was 4182, checked in by scharf, 5 years ago
corrected "Former revisions" section minor formatting in "Former revisions" section added "Author" section
Property svn:keywords set to `Id`
File size: 41.9 KB

Line
1	!> @file poisfft_mod.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2019 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft_mod.f90 4182 2019-08-22 15:20:23Z resler $
27	! Corrected "Former revisions" section
28	!
29	! 3690 2019-01-22 22:56:42Z knoop
30	! OpenACC port for SPEC
31	!
32	! Revision 1.1 1997/07/24 11:24:14 raasch
33	! Initial revision
34	!
35	!
36	! Description:
37	! ------------
38	!> Solves the Poisson equation with a 2D spectral method
39	!> d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
40	!>
41	!> Input:
42	!> real ar contains (nnz,nny,nnx) elements of the velocity divergence,
43	!> starting from (1,nys,nxl)
44	!>
45	!> Output:
46	!> real ar contains the solution for perturbation pressure p
47	!------------------------------------------------------------------------------!
48	MODULE poisfft_mod
49
50
51	USE fft_xy, &
52	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
53
54	USE indices, &
55	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
56
57	USE transpose_indices, &
58	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
59	nzb_y, nzt_x, nzt_y
60
61	USE tridia_solver, &
62	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
63
64	IMPLICIT NONE
65
66	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
67
68	PRIVATE
69
70	PUBLIC poisfft, poisfft_init
71
72	INTERFACE poisfft
73	MODULE PROCEDURE poisfft
74	END INTERFACE poisfft
75
76	INTERFACE poisfft_init
77	MODULE PROCEDURE poisfft_init
78	END INTERFACE poisfft_init
79
80
81	CONTAINS
82
83	!------------------------------------------------------------------------------!
84	! Description:
85	! ------------
86	!> Setup coefficients for FFT and the tridiagonal solver
87	!------------------------------------------------------------------------------!
88	SUBROUTINE poisfft_init
89
90	IMPLICIT NONE
91
92
93	CALL fft_init
94
95	CALL tridia_init
96
97	poisfft_initialized = .TRUE.
98
99	END SUBROUTINE poisfft_init
100
101
102
103	!------------------------------------------------------------------------------!
104	! Description:
105	! ------------
106	!> Two-dimensional Fourier Transformation in x- and y-direction.
107	!------------------------------------------------------------------------------!
108	SUBROUTINE poisfft( ar )
109
110	USE control_parameters, &
111	ONLY: transpose_compute_overlap
112
113	USE cpulog, &
114	ONLY: cpu_log, cpu_log_nowait, log_point_s
115
116	USE kinds
117
118	USE pegrid
119
120	IMPLICIT NONE
121
122	INTEGER(iwp) :: ii !<
123	INTEGER(iwp) :: iind !<
124	INTEGER(iwp) :: inew !<
125	INTEGER(iwp) :: jj !<
126	INTEGER(iwp) :: jind !<
127	INTEGER(iwp) :: jnew !<
128	INTEGER(iwp) :: ki !<
129	INTEGER(iwp) :: kk !<
130	INTEGER(iwp) :: knew !<
131	INTEGER(iwp) :: n !<
132	INTEGER(iwp) :: nblk !<
133	INTEGER(iwp) :: nnx_y !<
134	INTEGER(iwp) :: nny_z !<
135	INTEGER(iwp) :: nnz_x !<
136	INTEGER(iwp) :: nxl_y_bound !<
137	INTEGER(iwp) :: nxr_y_bound !<
138
139	INTEGER(iwp), DIMENSION(4) :: isave !<
140
141	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !<
142	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !<
143
144	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
145	#if __acc_fft_device
146	!$ACC DECLARE CREATE(ar_inv)
147	#endif
148
149	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !<
150	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !<
151	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !<
152	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !<
153	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !<
154
155
156	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
157
158	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
159
160	#if !__acc_fft_device
161	!$ACC UPDATE HOST(ar)
162	#endif
163
164	#ifndef _OPENACC
165	!
166	!-- Two-dimensional Fourier Transformation in x- and y-direction.
167	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
168
169	!
170	!-- 1d-domain-decomposition along x:
171	!-- FFT along y and transposition y --> x
172	CALL ffty_tr_yx( ar, ar )
173
174	!
175	!-- FFT along x, solving the tridiagonal system and backward FFT
176	CALL fftx_tri_fftx( ar )
177
178	!
179	!-- Transposition x --> y and backward FFT along y
180	CALL tr_xy_ffty( ar, ar )
181
182	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
183
184	!
185	!-- 1d-domain-decomposition along y:
186	!-- FFT along x and transposition x --> y
187	CALL fftx_tr_xy( ar, ar )
188
189	!
190	!-- FFT along y, solving the tridiagonal system and backward FFT
191	CALL ffty_tri_ffty( ar )
192
193	!
194	!-- Transposition y --> x and backward FFT along x
195	CALL tr_yx_fftx( ar, ar )
196
197	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
198	#endif
199
200	!
201	!-- 2d-domain-decomposition or no decomposition (1 PE run)
202	!-- Transposition z --> x
203	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
204	CALL resort_for_zx( ar, ar_inv )
205	CALL transpose_zx( ar_inv, ar )
206	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
207
208	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
209	CALL fft_x( ar, 'forward' )
210	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
211
212	!
213	!-- Transposition x --> y
214	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
215	CALL resort_for_xy( ar, ar_inv )
216	CALL transpose_xy( ar_inv, ar )
217	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
218
219	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
220	CALL fft_y( ar, 'forward', ar_tr = ar, &
221	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
222	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
223	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
224
225	!
226	!-- Transposition y --> z
227	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
228	CALL resort_for_yz( ar, ar_inv )
229	CALL transpose_yz( ar_inv, ar )
230	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
231
232	!
233	!-- Solve the tridiagonal equation system along z
234	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
235	CALL tridia_substi( ar )
236	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
237
238	!
239	!-- Inverse Fourier Transformation
240	!-- Transposition z --> y
241	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
242	CALL transpose_zy( ar, ar_inv )
243	CALL resort_for_zy( ar_inv, ar )
244	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
245
246	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
247	CALL fft_y( ar, 'backward', ar_tr = ar, &
248	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
249	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
250	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
251
252	!
253	!-- Transposition y --> x
254	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
255	CALL transpose_yx( ar, ar_inv )
256	CALL resort_for_yx( ar_inv, ar )
257	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
258
259	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
260	CALL fft_x( ar, 'backward' )
261	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
262
263	!
264	!-- Transposition x --> z
265	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
266	CALL transpose_xz( ar, ar_inv )
267	CALL resort_for_xz( ar_inv, ar )
268	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
269
270	#ifndef _OPENACC
271	ELSE
272
273	!
274	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
275	!-- overlapping transposition / fft
276	!-- cputime logging must not use barriers, which would prevent overlapping
277	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
278	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
279	!
280	!-- Transposition z --> x + subsequent fft along x
281	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
282	CALL resort_for_zx( ar, f_inv )
283	!
284	!-- Save original indices and gridpoint counter
285	isave(1) = nz
286	isave(2) = nzb_x
287	isave(3) = nzt_x
288	isave(4) = sendrecvcount_zx
289	!
290	!-- Set new indices for transformation
291	nblk = nz / pdims(1)
292	nz = pdims(1)
293	nnz_x = 1
294	nzb_x = 1 + myidx * nnz_x
295	nzt_x = ( myidx + 1 ) * nnz_x
296	sendrecvcount_zx = nnx * nny * nnz_x
297
298	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
299	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
300
301	DO kk = 1, nblk
302
303	IF ( kk == 1 ) THEN
304	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
305	ELSE
306	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
307	ENDIF
308
309	DO knew = 1, nz
310	ki = kk + nblk * ( knew - 1 )
311	f_in(:,:,knew) = f_inv(:,:,ki)
312	ENDDO
313
314	CALL transpose_zx( f_in, ar1(:,:,:))
315	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
316
317	IF ( kk == 1 ) THEN
318	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
319	ELSE
320	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
321	ENDIF
322
323	n = isave(2) + kk - 1
324	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
325	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
326
327	ENDDO
328	!
329	!-- Restore original indices/counters
330	nz = isave(1)
331	nzb_x = isave(2)
332	nzt_x = isave(3)
333	sendrecvcount_zx = isave(4)
334
335	DEALLOCATE( ar1, f_in, f_inv )
336
337	!
338	!-- Transposition x --> y + subsequent fft along y
339	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
340	CALL resort_for_xy( f_out_z, f_inv )
341	!
342	!-- Save original indices and gridpoint counter
343	isave(1) = nx
344	isave(2) = nxl_y
345	isave(3) = nxr_y
346	isave(4) = sendrecvcount_xy
347	!
348	!-- Set new indices for transformation
349	nblk = ( ( nx+1 ) / pdims(2) ) - 1
350	nx = pdims(2)
351	nnx_y = 1
352	nxl_y = myidy * nnx_y
353	nxr_y = ( myidy + 1 ) * nnx_y - 1
354	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
355
356	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
357	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
358
359	DO ii = 0, nblk
360
361	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
362
363	DO inew = 0, nx-1
364	iind = ii + ( nblk + 1 ) * inew
365	f_in(:,:,inew) = f_inv(:,:,iind)
366	ENDDO
367
368	CALL transpose_xy( f_in, ar1(:,:,:) )
369
370	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
371
372	IF ( ii == 1 ) THEN
373	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
374	ELSE
375	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
376	ENDIF
377
378	nxl_y_bound = isave(2)
379	nxr_y_bound = isave(3)
380	n = isave(2) + ii
381	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
382	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
383	nxl_y_l = n, nxr_y_l = n )
384
385	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
386
387	ENDDO
388	!
389	!-- Restore original indices/counters
390	nx = isave(1)
391	nxl_y = isave(2)
392	nxr_y = isave(3)
393	sendrecvcount_xy = isave(4)
394
395	DEALLOCATE( ar1, f_in, f_inv )
396
397	!
398	!-- Transposition y --> z + subsequent tridia + resort for z --> y
399	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
400	CALL resort_for_yz( f_out_y, f_inv )
401	!
402	!-- Save original indices and gridpoint counter
403	isave(1) = ny
404	isave(2) = nys_z
405	isave(3) = nyn_z
406	isave(4) = sendrecvcount_yz
407	!
408	!-- Set new indices for transformation
409	nblk = ( ( ny+1 ) / pdims(1) ) - 1
410	ny = pdims(1)
411	nny_z = 1
412	nys_z = myidx * nny_z
413	nyn_z = ( myidx + 1 ) * nny_z - 1
414	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
415
416	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
417	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
418
419	DO jj = 0, nblk
420	!
421	!-- Forward Fourier Transformation
422	!-- Transposition y --> z
423	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
424
425	DO jnew = 0, ny-1
426	jind = jj + ( nblk + 1 ) * jnew
427	f_in(:,:,jnew) = f_inv(:,:,jind)
428	ENDDO
429
430	CALL transpose_yz( f_in, ar1(:,:,:) )
431
432	IF ( jj == nblk ) THEN
433	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
434	ELSE
435	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
436	ENDIF
437
438	!
439	!-- Solve the tridiagonal equation system along z
440	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
441
442	n = isave(2) + jj
443	CALL tridia_substi_overlap( ar1(:,:,:), n )
444
445	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
446
447	!
448	!-- Inverse Fourier Transformation
449	!-- Transposition z --> y
450	!-- Only one thread should call MPI routines, therefore forward and
451	!-- backward tranpose are in the same section
452	IF ( jj == 0 ) THEN
453	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
454	ELSE
455	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
456	ENDIF
457
458	CALL transpose_zy( ar1(:,:,:), f_in )
459
460	DO jnew = 0, ny-1
461	jind = jj + ( nblk + 1 ) * jnew
462	f_inv(:,:,jind) = f_in(:,:,jnew)
463	ENDDO
464
465	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
466
467	ENDDO
468	!
469	!-- Restore original indices/counters
470	ny = isave(1)
471	nys_z = isave(2)
472	nyn_z = isave(3)
473	sendrecvcount_yz = isave(4)
474
475	CALL resort_for_zy( f_inv, f_out_y )
476
477	DEALLOCATE( ar1, f_in, f_inv )
478
479	!
480	!-- fft along y backward + subsequent transposition y --> x
481	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
482	!
483	!-- Save original indices and gridpoint counter
484	isave(1) = nx
485	isave(2) = nxl_y
486	isave(3) = nxr_y
487	isave(4) = sendrecvcount_xy
488	!
489	!-- Set new indices for transformation
490	nblk = (( nx+1 ) / pdims(2) ) - 1
491	nx = pdims(2)
492	nnx_y = 1
493	nxl_y = myidy * nnx_y
494	nxr_y = ( myidy + 1 ) * nnx_y - 1
495	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
496
497	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
498	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
499
500	DO ii = 0, nblk
501
502	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
503
504	n = isave(2) + ii
505	nxl_y_bound = isave(2)
506	nxr_y_bound = isave(3)
507
508	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
509	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
510	nxl_y_l = n, nxr_y_l = n )
511
512	IF ( ii == nblk ) THEN
513	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
514	ELSE
515	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
516	ENDIF
517
518	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
519
520	CALL transpose_yx( ar1(:,:,:), f_in )
521
522	DO inew = 0, nx-1
523	iind = ii + (nblk+1) * inew
524	f_inv(:,:,iind) = f_in(:,:,inew)
525	ENDDO
526
527	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
528
529	ENDDO
530	!
531	!-- Restore original indices/counters
532	nx = isave(1)
533	nxl_y = isave(2)
534	nxr_y = isave(3)
535	sendrecvcount_xy = isave(4)
536
537	CALL resort_for_yx( f_inv, f_out_z )
538
539	DEALLOCATE( ar1, f_in, f_inv )
540
541	!
542	!-- fft along x backward + subsequent final transposition x --> z
543	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
544	!
545	!-- Save original indices and gridpoint counter
546	isave(1) = nz
547	isave(2) = nzb_x
548	isave(3) = nzt_x
549	isave(4) = sendrecvcount_zx
550	!
551	!-- Set new indices for transformation
552	nblk = nz / pdims(1)
553	nz = pdims(1)
554	nnz_x = 1
555	nzb_x = 1 + myidx * nnz_x
556	nzt_x = ( myidx + 1 ) * nnz_x
557	sendrecvcount_zx = nnx * nny * nnz_x
558
559	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
560	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
561
562	DO kk = 1, nblk
563
564	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
565
566	n = isave(2) + kk - 1
567	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
568
569	IF ( kk == nblk ) THEN
570	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
571	ELSE
572	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
573	ENDIF
574
575	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
576
577	CALL transpose_xz( ar1(:,:,:), f_in )
578
579	DO knew = 1, nz
580	ki = kk + nblk * (knew-1)
581	f_inv(:,:,ki) = f_in(:,:,knew)
582	ENDDO
583
584	IF ( kk == nblk ) THEN
585	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
586	ELSE
587	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
588	ENDIF
589
590	ENDDO
591	!
592	!-- Restore original indices/counters
593	nz = isave(1)
594	nzb_x = isave(2)
595	nzt_x = isave(3)
596	sendrecvcount_zx = isave(4)
597
598	CALL resort_for_xz( f_inv, ar )
599
600	DEALLOCATE( ar1, f_in, f_inv )
601
602	ENDIF
603	#endif
604
605	#if !__acc_fft_device
606	!$ACC UPDATE DEVICE(ar)
607	#endif
608
609	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
610
611	END SUBROUTINE poisfft
612
613
614	!------------------------------------------------------------------------------!
615	! Description:
616	! ------------
617	!> Fourier-transformation along y with subsequent transposition y --> x for
618	!> a 1d-decomposition along x.
619	!>
620	!> @attention The performance of this routine is much faster on the NEC-SX6,
621	!> if the first index of work_ffty_vec is odd. Otherwise
622	!> memory bank conflicts may occur (especially if the index is a
623	!> multiple of 128). That's why work_ffty_vec is dimensioned as
624	!> 0:ny+1.
625	!> Of course, this will not work if users are using an odd number
626	!> of gridpoints along y.
627	!------------------------------------------------------------------------------!
628	SUBROUTINE ffty_tr_yx( f_in, f_out )
629
630	USE control_parameters, &
631	ONLY: loop_optimization
632
633	USE cpulog, &
634	ONLY: cpu_log, log_point_s
635
636	USE kinds
637
638	USE pegrid
639
640	IMPLICIT NONE
641
642	INTEGER(iwp) :: i !<
643	INTEGER(iwp) :: iend !<
644	INTEGER(iwp) :: iouter !<
645	INTEGER(iwp) :: ir !<
646	INTEGER(iwp) :: j !<
647	INTEGER(iwp) :: k !<
648
649	INTEGER(iwp), PARAMETER :: stridex = 4 !<
650
651	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !<
652	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !<
653	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
654
655	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
656	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
657
658	!
659	!-- Carry out the FFT along y, where all data are present due to the
660	!-- 1d-decomposition along x. Resort the data in a way that x becomes
661	!-- the first index.
662	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
663
664	IF ( loop_optimization == 'vector' ) THEN
665
666	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
667	!
668	!-- Code optimized for vector processors
669	!$OMP PARALLEL PRIVATE ( i, j, k )
670	!$OMP DO
671	DO i = nxl, nxr
672
673	DO j = 0, ny
674	DO k = 1, nz
675	work_ffty_vec(j,k,i) = f_in(k,j,i)
676	ENDDO
677	ENDDO
678
679	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
680
681	ENDDO
682
683	!$OMP DO
684	DO k = 1, nz
685	DO j = 0, ny
686	DO i = nxl, nxr
687	work(i,k,j) = work_ffty_vec(j,k,i)
688	ENDDO
689	ENDDO
690	ENDDO
691	!$OMP END PARALLEL
692
693	DEALLOCATE( work_ffty_vec )
694
695	ELSE
696	!
697	!-- Cache optimized code.
698	ALLOCATE( work_ffty(0:ny,stridex) )
699	!
700	!-- The i-(x-)direction is split into a strided outer loop and an inner
701	!-- loop for better cache performance
702	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
703	!$OMP DO
704	DO iouter = nxl, nxr, stridex
705
706	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
707
708	DO k = 1, nz
709
710	DO i = iouter, iend
711
712	ir = i-iouter+1 ! counter within a stride
713	DO j = 0, ny
714	work_ffty(j,ir) = f_in(k,j,i)
715	ENDDO
716	!
717	!-- FFT along y
718	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
719
720	ENDDO
721
722	!
723	!-- Resort
724	DO j = 0, ny
725	DO i = iouter, iend
726	work(i,k,j) = work_ffty(j,i-iouter+1)
727	ENDDO
728	ENDDO
729
730	ENDDO
731
732	ENDDO
733	!$OMP END PARALLEL
734
735	DEALLOCATE( work_ffty )
736
737	ENDIF
738
739	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
740
741	!
742	!-- Transpose array
743	#if defined( __parallel )
744	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
745	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
746	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
747	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
748	comm1dx, ierr )
749	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
750	#endif
751
752	END SUBROUTINE ffty_tr_yx
753
754
755	!------------------------------------------------------------------------------!
756	! Description:
757	! ------------
758	!> Transposition x --> y with a subsequent backward Fourier transformation for
759	!> a 1d-decomposition along x
760	!------------------------------------------------------------------------------!
761	SUBROUTINE tr_xy_ffty( f_in, f_out )
762
763	USE control_parameters, &
764	ONLY: loop_optimization
765
766	USE cpulog, &
767	ONLY: cpu_log, log_point_s
768
769	USE kinds
770
771	USE pegrid
772
773	IMPLICIT NONE
774
775	INTEGER(iwp) :: i !<
776	INTEGER(iwp) :: iend !<
777	INTEGER(iwp) :: iouter !<
778	INTEGER(iwp) :: ir !<
779	INTEGER(iwp) :: j !<
780	INTEGER(iwp) :: k !<
781
782	INTEGER(iwp), PARAMETER :: stridex = 4 !<
783
784	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !<
785	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !<
786	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !<
787
788	REAL(wp), DIMENSION(:,:), ALLOCATABLE :: work_ffty !<
789	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: work_ffty_vec !<
790
791	!
792	!-- Transpose array
793	#if defined( __parallel )
794	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
795	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
796	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
797	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
798	comm1dx, ierr )
799	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
800	#endif
801
802	!
803	!-- Resort the data in a way that y becomes the first index and carry out the
804	!-- backward fft along y.
805	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
806
807	IF ( loop_optimization == 'vector' ) THEN
808
809	ALLOCATE( work_ffty_vec(0:ny+1,1:nz,nxl:nxr) )
810	!
811	!-- Code optimized for vector processors
812	!$OMP PARALLEL PRIVATE ( i, j, k )
813	!$OMP DO
814	DO k = 1, nz
815	DO j = 0, ny
816	DO i = nxl, nxr
817	work_ffty_vec(j,k,i) = work(i,k,j)
818	ENDDO
819	ENDDO
820	ENDDO
821
822	!$OMP DO
823	DO i = nxl, nxr
824
825	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
826
827	DO j = 0, ny
828	DO k = 1, nz
829	f_out(k,j,i) = work_ffty_vec(j,k,i)
830	ENDDO
831	ENDDO
832
833	ENDDO
834	!$OMP END PARALLEL
835
836	DEALLOCATE( work_ffty_vec )
837
838	ELSE
839	!
840	!-- Cache optimized code.
841	ALLOCATE( work_ffty(0:ny,stridex) )
842	!
843	!-- The i-(x-)direction is split into a strided outer loop and an inner
844	!-- loop for better cache performance
845	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
846	!$OMP DO
847	DO iouter = nxl, nxr, stridex
848
849	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
850
851	DO k = 1, nz
852	!
853	!-- Resort
854	DO j = 0, ny
855	DO i = iouter, iend
856	work_ffty(j,i-iouter+1) = work(i,k,j)
857	ENDDO
858	ENDDO
859
860	DO i = iouter, iend
861
862	!
863	!-- FFT along y
864	ir = i-iouter+1 ! counter within a stride
865	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
866
867	DO j = 0, ny
868	f_out(k,j,i) = work_ffty(j,ir)
869	ENDDO
870	ENDDO
871
872	ENDDO
873
874	ENDDO
875	!$OMP END PARALLEL
876
877	DEALLOCATE( work_ffty )
878
879	ENDIF
880
881	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
882
883	END SUBROUTINE tr_xy_ffty
884
885
886	!------------------------------------------------------------------------------!
887	! Description:
888	! ------------
889	!> FFT along x, solution of the tridiagonal system and backward FFT for
890	!> a 1d-decomposition along x
891	!>
892	!> @warning this subroutine may still not work for hybrid parallelization
893	!> with OpenMP (for possible necessary changes see the original
894	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
895	!------------------------------------------------------------------------------!
896	SUBROUTINE fftx_tri_fftx( ar )
897
898	USE control_parameters, &
899	ONLY: loop_optimization
900
901	USE cpulog, &
902	ONLY: cpu_log, log_point_s
903
904	USE grid_variables, &
905	ONLY: ddx2, ddy2
906
907	USE kinds
908
909	USE pegrid
910
911	IMPLICIT NONE
912
913	INTEGER(iwp) :: i !<
914	INTEGER(iwp) :: j !<
915	INTEGER(iwp) :: k !<
916	INTEGER(iwp) :: m !<
917	INTEGER(iwp) :: n !<
918	!$ INTEGER(iwp) :: omp_get_thread_num !<
919	INTEGER(iwp) :: tn !<
920
921	REAL(wp), DIMENSION(0:nx) :: work_fftx !<
922	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !<
923	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !<
924	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
925
926
927	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
928
929	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
930
931	tn = 0 ! Default thread number in case of one thread
932	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
933	DO j = nys_x, nyn_x
934
935	!$ tn = omp_get_thread_num()
936
937	IF ( loop_optimization == 'vector' ) THEN
938	!
939	!-- Code optimized for vector processors
940	DO k = 1, nz
941
942	m = 0
943	DO n = 1, pdims(1)
944	DO i = 1, nnx
945	work_trix(m,k) = ar(i,k,j,n)
946	m = m + 1
947	ENDDO
948	ENDDO
949
950	ENDDO
951
952	CALL fft_x_m( work_trix, 'forward' )
953
954	ELSE
955	!
956	!-- Cache optimized code
957	DO k = 1, nz
958
959	m = 0
960	DO n = 1, pdims(1)
961	DO i = 1, nnx
962	work_fftx(m) = ar(i,k,j,n)
963	m = m + 1
964	ENDDO
965	ENDDO
966
967	CALL fft_x_1d( work_fftx, 'forward' )
968
969	DO i = 0, nx
970	work_trix(i,k) = work_fftx(i)
971	ENDDO
972
973	ENDDO
974
975	ENDIF
976
977	!
978	!-- Solve the linear equation system
979	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
980
981	IF ( loop_optimization == 'vector' ) THEN
982	!
983	!-- Code optimized for vector processors
984	CALL fft_x_m( work_trix, 'backward' )
985
986	DO k = 1, nz
987
988	m = 0
989	DO n = 1, pdims(1)
990	DO i = 1, nnx
991	ar(i,k,j,n) = work_trix(m,k)
992	m = m + 1
993	ENDDO
994	ENDDO
995
996	ENDDO
997
998	ELSE
999	!
1000	!-- Cache optimized code
1001	DO k = 1, nz
1002
1003	DO i = 0, nx
1004	work_fftx(i) = work_trix(i,k)
1005	ENDDO
1006
1007	CALL fft_x_1d( work_fftx, 'backward' )
1008
1009	m = 0
1010	DO n = 1, pdims(1)
1011	DO i = 1, nnx
1012	ar(i,k,j,n) = work_fftx(m)
1013	m = m + 1
1014	ENDDO
1015	ENDDO
1016
1017	ENDDO
1018
1019	ENDIF
1020
1021	ENDDO
1022
1023	DEALLOCATE( tri )
1024
1025	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1026
1027	END SUBROUTINE fftx_tri_fftx
1028
1029
1030	!------------------------------------------------------------------------------!
1031	! Description:
1032	! ------------
1033	!> Fourier-transformation along x with subsequent transposition x --> y for
1034	!> a 1d-decomposition along y.
1035	!>
1036	!> @attention NEC-branch of this routine may significantly profit from
1037	!> further optimizations. So far, performance is much worse than
1038	!> for routine ffty_tr_yx (more than three times slower).
1039	!------------------------------------------------------------------------------!
1040	SUBROUTINE fftx_tr_xy( f_in, f_out )
1041
1042
1043	USE control_parameters, &
1044	ONLY: loop_optimization
1045
1046	USE cpulog, &
1047	ONLY: cpu_log, log_point_s
1048
1049	USE kinds
1050
1051	USE pegrid
1052
1053	IMPLICIT NONE
1054
1055	INTEGER(iwp) :: i !<
1056	INTEGER(iwp) :: j !<
1057	INTEGER(iwp) :: k !<
1058
1059	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1060	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !<
1061	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !<
1062	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1063
1064	!
1065	!-- Carry out the FFT along x, where all data are present due to the
1066	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1067	!-- the first index.
1068	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1069
1070	IF ( loop_optimization == 'vector' ) THEN
1071	!
1072	!-- Code for vector processors
1073	!$OMP PARALLEL PRIVATE ( i, j, k )
1074	!$OMP DO
1075	DO i = 0, nx
1076
1077	DO j = nys, nyn
1078	DO k = 1, nz
1079	work_fftx(i,k,j) = f_in(k,j,i)
1080	ENDDO
1081	ENDDO
1082
1083	ENDDO
1084
1085	!$OMP DO
1086	DO j = nys, nyn
1087
1088	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1089
1090	DO k = 1, nz
1091	DO i = 0, nx
1092	work(j,k,i) = work_fftx(i,k,j)
1093	ENDDO
1094	ENDDO
1095
1096	ENDDO
1097	!$OMP END PARALLEL
1098
1099	ELSE
1100
1101	!
1102	!-- Cache optimized code (there might be still a potential for better
1103	!-- optimization).
1104	!$OMP PARALLEL PRIVATE (i,j,k)
1105	!$OMP DO
1106	DO i = 0, nx
1107
1108	DO j = nys, nyn
1109	DO k = 1, nz
1110	work_fftx(i,k,j) = f_in(k,j,i)
1111	ENDDO
1112	ENDDO
1113
1114	ENDDO
1115
1116	!$OMP DO
1117	DO j = nys, nyn
1118	DO k = 1, nz
1119
1120	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1121
1122	DO i = 0, nx
1123	work(j,k,i) = work_fftx(i,k,j)
1124	ENDDO
1125	ENDDO
1126
1127	ENDDO
1128	!$OMP END PARALLEL
1129
1130	ENDIF
1131	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1132
1133	!
1134	!-- Transpose array
1135	#if defined( __parallel )
1136	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1137	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1138	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1139	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1140	comm1dy, ierr )
1141	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1142	#endif
1143
1144	END SUBROUTINE fftx_tr_xy
1145
1146
1147	!------------------------------------------------------------------------------!
1148	! Description:
1149	! ------------
1150	!> Transposition y --> x with a subsequent backward Fourier transformation for
1151	!> a 1d-decomposition along x.
1152	!------------------------------------------------------------------------------!
1153	SUBROUTINE tr_yx_fftx( f_in, f_out )
1154
1155
1156	USE control_parameters, &
1157	ONLY: loop_optimization
1158
1159	USE cpulog, &
1160	ONLY: cpu_log, log_point_s
1161
1162	USE kinds
1163
1164	USE pegrid
1165
1166	IMPLICIT NONE
1167
1168	INTEGER(iwp) :: i !<
1169	INTEGER(iwp) :: j !<
1170	INTEGER(iwp) :: k !<
1171
1172	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !<
1173	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !<
1174	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !<
1175	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !<
1176
1177	!
1178	!-- Transpose array
1179	#if defined( __parallel )
1180	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1181	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1182	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1183	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1184	comm1dy, ierr )
1185	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1186	#endif
1187
1188	!
1189	!-- Carry out the FFT along x, where all data are present due to the
1190	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1191	!-- the first index.
1192	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1193
1194	IF ( loop_optimization == 'vector' ) THEN
1195	!
1196	!-- Code optimized for vector processors
1197	!$OMP PARALLEL PRIVATE ( i, j, k )
1198	!$OMP DO
1199	DO j = nys, nyn
1200
1201	DO k = 1, nz
1202	DO i = 0, nx
1203	work_fftx(i,k,j) = work(j,k,i)
1204	ENDDO
1205	ENDDO
1206
1207	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1208
1209	ENDDO
1210
1211	!$OMP DO
1212	DO i = 0, nx
1213	DO j = nys, nyn
1214	DO k = 1, nz
1215	f_out(k,j,i) = work_fftx(i,k,j)
1216	ENDDO
1217	ENDDO
1218	ENDDO
1219	!$OMP END PARALLEL
1220
1221	ELSE
1222
1223	!
1224	!-- Cache optimized code (there might be still a potential for better
1225	!-- optimization).
1226	!$OMP PARALLEL PRIVATE (i,j,k)
1227	!$OMP DO
1228	DO j = nys, nyn
1229	DO k = 1, nz
1230
1231	DO i = 0, nx
1232	work_fftx(i,k,j) = work(j,k,i)
1233	ENDDO
1234
1235	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1236
1237	ENDDO
1238	ENDDO
1239
1240	!$OMP DO
1241	DO i = 0, nx
1242	DO j = nys, nyn
1243	DO k = 1, nz
1244	f_out(k,j,i) = work_fftx(i,k,j)
1245	ENDDO
1246	ENDDO
1247	ENDDO
1248	!$OMP END PARALLEL
1249
1250	ENDIF
1251	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1252
1253	END SUBROUTINE tr_yx_fftx
1254
1255
1256	!------------------------------------------------------------------------------!
1257	! Description:
1258	! ------------
1259	!> FFT along y, solution of the tridiagonal system and backward FFT for
1260	!> a 1d-decomposition along y.
1261	!>
1262	!> @warning this subroutine may still not work for hybrid parallelization
1263	!> with OpenMP (for possible necessary changes see the original
1264	!> routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1265	!------------------------------------------------------------------------------!
1266	SUBROUTINE ffty_tri_ffty( ar )
1267
1268
1269	USE control_parameters, &
1270	ONLY: loop_optimization
1271
1272	USE cpulog, &
1273	ONLY: cpu_log, log_point_s
1274
1275	USE grid_variables, &
1276	ONLY: ddx2, ddy2
1277
1278	USE kinds
1279
1280	USE pegrid
1281
1282	IMPLICIT NONE
1283
1284	INTEGER(iwp) :: i !<
1285	INTEGER(iwp) :: j !<
1286	INTEGER(iwp) :: k !<
1287	INTEGER(iwp) :: m !<
1288	INTEGER(iwp) :: n !<
1289	!$ INTEGER(iwp) :: omp_get_thread_num !<
1290	INTEGER(iwp) :: tn !<
1291
1292	REAL(wp), DIMENSION(0:ny) :: work_ffty !<
1293	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !<
1294	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !<
1295	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !<
1296
1297
1298	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1299
1300	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1301
1302	tn = 0 ! Default thread number in case of one thread
1303	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1304	DO i = nxl_y, nxr_y
1305
1306	!$ tn = omp_get_thread_num()
1307
1308	IF ( loop_optimization == 'vector' ) THEN
1309	!
1310	!-- Code optimized for vector processors
1311	DO k = 1, nz
1312
1313	m = 0
1314	DO n = 1, pdims(2)
1315	DO j = 1, nny
1316	work_triy(m,k) = ar(j,k,i,n)
1317	m = m + 1
1318	ENDDO
1319	ENDDO
1320
1321	ENDDO
1322
1323	CALL fft_y_m( work_triy, ny, 'forward' )
1324
1325	ELSE
1326	!
1327	!-- Cache optimized code
1328	DO k = 1, nz
1329
1330	m = 0
1331	DO n = 1, pdims(2)
1332	DO j = 1, nny
1333	work_ffty(m) = ar(j,k,i,n)
1334	m = m + 1
1335	ENDDO
1336	ENDDO
1337
1338	CALL fft_y_1d( work_ffty, 'forward' )
1339
1340	DO j = 0, ny
1341	work_triy(j,k) = work_ffty(j)
1342	ENDDO
1343
1344	ENDDO
1345
1346	ENDIF
1347
1348	!
1349	!-- Solve the linear equation system
1350	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1351
1352	IF ( loop_optimization == 'vector' ) THEN
1353	!
1354	!-- Code optimized for vector processors
1355	CALL fft_y_m( work_triy, ny, 'backward' )
1356
1357	DO k = 1, nz
1358
1359	m = 0
1360	DO n = 1, pdims(2)
1361	DO j = 1, nny
1362	ar(j,k,i,n) = work_triy(m,k)
1363	m = m + 1
1364	ENDDO
1365	ENDDO
1366
1367	ENDDO
1368
1369	ELSE
1370	!
1371	!-- Cache optimized code
1372	DO k = 1, nz
1373
1374	DO j = 0, ny
1375	work_ffty(j) = work_triy(j,k)
1376	ENDDO
1377
1378	CALL fft_y_1d( work_ffty, 'backward' )
1379
1380	m = 0
1381	DO n = 1, pdims(2)
1382	DO j = 1, nny
1383	ar(j,k,i,n) = work_ffty(m)
1384	m = m + 1
1385	ENDDO
1386	ENDDO
1387
1388	ENDDO
1389
1390	ENDIF
1391
1392	ENDDO
1393
1394	DEALLOCATE( tri )
1395
1396	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1397
1398	END SUBROUTINE ffty_tri_ffty
1399
1400	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |