Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1310

Last change on this file since 1310 was 1310, checked in by raasch, 10 years ago
update of GPL copyright
Property svn:keywords set to `Id`
File size: 42.1 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2014 Leibniz Universitaet Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1310 2014-03-14 08:01:56Z raasch $
27	!
28	! 1306 2014-03-13 14:30:59Z raasch
29	! openmp sections removed from the overlap branch,
30	! second argument removed from parameter list
31	!
32	! 1216 2013-08-26 09:31:42Z raasch
33	! resorting of arrays moved to separate routines resort_for_...,
34	! one argument, used as temporary work array, removed from all transpose
35	! routines
36	! overlapping fft / transposition implemented
37	!
38	! 1212 2013-08-15 08:46:27Z raasch
39	! tridia routines moved to seperate module tridia_solver
40	!
41	! 1208 2013-08-13 06:41:49Z raasch
42	! acc-update clauses added for "ar" so that ffts other than cufft can also be
43	! used (although they are not ported and will give a poor performance)
44	!
45	! 1111 2013-03-08 23:54:10Z raasch
46	! further openACC porting of non-parallel (MPI) branch:
47	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
48	! no distinction between parallel/non-parallel in poisfft and tridia any more,
49	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
50	! (otherwise "invalid device function" is indicated during runtime),
51	! optimization of tridia routines: constant elements and coefficients of tri are
52	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
53	! to 2,
54	! poisfft_init is now called internally from poisfft, maketri is called from
55	! poisfft_init,
56	! ibc_p_b = 2 removed
57	!
58	! 1106 2013-03-04 05:31:38Z raasch
59	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
60	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
61	! fft_y_1d
62	!
63	! 1103 2013-02-20 02:15:53Z raasch
64	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
65	! sometimes cause segmentation faults with intel 12.1 compiler
66	!
67	! 1092 2013-02-02 11:24:22Z raasch
68	! unused variables removed
69	!
70	! 1036 2012-10-22 13:43:42Z raasch
71	! code put under GPL (PALM 3.9)
72	!
73	! 2012-09-21 07:03:55Z raasch
74	! FLOAT type conversion replaced by REAL
75	!
76	! 1003 2012-09-14 14:35:53Z raasch
77	! indices nxa, nya, etc. replaced by nx, ny, etc.
78	!
79	! 940 2012-07-09 14:31:00Z raasch
80	! special handling of tri-array as an argument in tridia_1dd routines switched
81	! off because it caused segmentation faults with intel 12.1 compiler
82	!
83	! 877 2012-04-03 11:21:44Z suehring
84	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
85	! pressure at the top of the model domain.
86	!
87	! 809 2012-01-30 13:32:58Z maronga
88	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
89	!
90	! 807 2012-01-25 11:53:51Z maronga
91	! New cpp directive "__check" implemented which is used by check_namelist_files
92	! (most of the code is unneeded by check_namelist_files).
93	!
94	! 763 2011-10-06 09:32:09Z suehring
95	! Comment added concerning the last change.
96	!
97	! 761 2011-10-05 17:58:52Z suehring
98	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
99	! pressure at the top of the model domain.
100	!
101	! 696 2011-03-18 07:03:49Z raasch
102	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
103	!
104	! 683 2011-02-09 14:25:15Z raasch
105	! openMP parallelization for 2d-domain-decomposition
106	!
107	! 667 2010-12-23 12:06:00Z suehring/gryschka
108	! ddzu replaced by ddzu_pres due to changes in zu(0)
109	!
110	! 622 2010-12-10 08:08:13Z raasch
111	! optional barriers included in order to speed up collective operations
112	!
113	! 377 2009-09-04 11:09:00Z raasch
114	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
115	!
116	! 164 2008-05-15 08:46:15Z raasch
117	! Arguments removed from transpose routines
118	!
119	! 128 2007-10-26 13:11:14Z raasch
120	! Bugfix: wavenumber calculation for even nx in routines maketri
121	!
122	! 85 2007-05-11 09:35:14Z raasch
123	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
124	!
125	! 76 2007-03-29 00:58:32Z raasch
126	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
127	! the bottom and the top.
128	!
129	! RCS Log replace by Id keyword, revision history cleaned up
130	!
131	! Revision 1.24 2006/08/04 15:00:24 raasch
132	! Default setting of the thread number tn in case of not using OpenMP
133	!
134	! Revision 1.23 2006/02/23 12:48:38 raasch
135	! Additional compiler directive in routine tridia_1dd for preventing loop
136	! exchange on NEC-SX6
137	!
138	! Revision 1.20 2004/04/30 12:38:09 raasch
139	! Parts of former poisfft_hybrid moved to this subroutine,
140	! former subroutine changed to a module, renaming of FFT-subroutines and
141	! -module, FFTs completely substituted by calls of fft_x and fft_y,
142	! NAG fft used in the non-parallel case completely removed, l in maketri
143	! is now a 1d-array, variables passed by modules instead of using parameter
144	! lists, enlarged transposition arrays introduced
145	!
146	! Revision 1.1 1997/07/24 11:24:14 raasch
147	! Initial revision
148	!
149	!
150	! Description:
151	! ------------
152	! Original version by Stephan Siano (pois3d), as of July 23, 1996
153	! Adapted for 2D-domain-decomposition by Siegfried Raasch, July 3, 1997
154	!
155	! Solves the Poisson equation with a 2D spectral method
156	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
157	!
158	! Input:
159	! real ar contains (nnz,nny,nnx) elements of the velocity divergence,
160	! starting from (1,nys,nxl)
161	!
162	! Output:
163	! real ar contains the solution for perturbation pressure p
164	!------------------------------------------------------------------------------!
165
166	USE fft_xy
167	USE indices
168	USE transpose_indices
169	USE tridia_solver
170
171	IMPLICIT NONE
172
173	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
174
175	PRIVATE
176
177	#if ! defined ( __check )
178	PUBLIC poisfft, poisfft_init
179
180	INTERFACE poisfft
181	MODULE PROCEDURE poisfft
182	END INTERFACE poisfft
183
184	INTERFACE poisfft_init
185	MODULE PROCEDURE poisfft_init
186	END INTERFACE poisfft_init
187	#else
188	PUBLIC poisfft_init
189
190	INTERFACE poisfft_init
191	MODULE PROCEDURE poisfft_init
192	END INTERFACE poisfft_init
193	#endif
194
195	CONTAINS
196
197	SUBROUTINE poisfft_init
198
199	USE arrays_3d, ONLY: ddzu_pres, ddzw
200
201	IMPLICIT NONE
202
203	INTEGER :: k
204
205
206	CALL fft_init
207
208	CALL tridia_init
209
210	poisfft_initialized = .TRUE.
211
212	END SUBROUTINE poisfft_init
213
214
215	#if ! defined ( __check )
216	SUBROUTINE poisfft( ar )
217
218	USE control_parameters, ONLY : fft_method, transpose_compute_overlap
219	USE cpulog
220	USE interfaces
221	USE pegrid
222
223	IMPLICIT NONE
224
225	INTEGER :: ii, iind, inew, jj, jind, jnew, ki, kk, knew, n, nblk, &
226	nnx_y, nny_z, nnz_t, nnz_x, nxl_y_bound, nxr_y_bound
227	INTEGER, DIMENSION(4) :: isave
228
229	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
230	!$acc declare create( ar_inv )
231	REAL, DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv
232
233	REAL, DIMENSION(:,:,:), ALLOCATABLE :: ar1, f_in, f_inv, f_out_y, &
234	f_out_z
235
236
237	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
238
239	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
240
241	!
242	!-- Two-dimensional Fourier Transformation in x- and y-direction.
243	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
244
245	!
246	!-- 1d-domain-decomposition along x:
247	!-- FFT along y and transposition y --> x
248	CALL ffty_tr_yx( ar, ar )
249
250	!
251	!-- FFT along x, solving the tridiagonal system and backward FFT
252	CALL fftx_tri_fftx( ar )
253
254	!
255	!-- Transposition x --> y and backward FFT along y
256	CALL tr_xy_ffty( ar, ar )
257
258	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
259
260	!
261	!-- 1d-domain-decomposition along y:
262	!-- FFT along x and transposition x --> y
263	CALL fftx_tr_xy( ar, ar )
264
265	!
266	!-- FFT along y, solving the tridiagonal system and backward FFT
267	CALL ffty_tri_ffty( ar )
268
269	!
270	!-- Transposition y --> x and backward FFT along x
271	CALL tr_yx_fftx( ar, ar )
272
273	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
274
275	!
276	!-- 2d-domain-decomposition or no decomposition (1 PE run)
277	!-- Transposition z --> x
278	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
279	CALL resort_for_zx( ar, ar_inv )
280	CALL transpose_zx( ar_inv, ar )
281	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
282
283	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
284	IF ( fft_method /= 'system-specific' ) THEN
285	!$acc update host( ar )
286	ENDIF
287	CALL fft_x( ar, 'forward' )
288	IF ( fft_method /= 'system-specific' ) THEN
289	!$acc update device( ar )
290	ENDIF
291	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
292
293	!
294	!-- Transposition x --> y
295	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
296	CALL resort_for_xy( ar, ar_inv )
297	CALL transpose_xy( ar_inv, ar )
298	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
299
300	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
301	IF ( fft_method /= 'system-specific' ) THEN
302	!$acc update host( ar )
303	ENDIF
304	CALL fft_y( ar, 'forward', ar_tr = ar, &
305	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
306	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
307	IF ( fft_method /= 'system-specific' ) THEN
308	!$acc update device( ar )
309	ENDIF
310	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
311
312	!
313	!-- Transposition y --> z
314	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
315	CALL resort_for_yz( ar, ar_inv )
316	CALL transpose_yz( ar_inv, ar )
317	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
318
319	!
320	!-- Solve the tridiagonal equation system along z
321	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
322	CALL tridia_substi( ar )
323	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
324
325	!
326	!-- Inverse Fourier Transformation
327	!-- Transposition z --> y
328	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
329	CALL transpose_zy( ar, ar_inv )
330	CALL resort_for_zy( ar_inv, ar )
331	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
332
333	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
334	IF ( fft_method /= 'system-specific' ) THEN
335	!$acc update host( ar )
336	ENDIF
337	CALL fft_y( ar, 'backward', ar_tr = ar, &
338	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
339	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
340	IF ( fft_method /= 'system-specific' ) THEN
341	!$acc update device( ar )
342	ENDIF
343	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
344
345	!
346	!-- Transposition y --> x
347	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
348	CALL transpose_yx( ar, ar_inv )
349	CALL resort_for_yx( ar_inv, ar )
350	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
351
352	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
353	IF ( fft_method /= 'system-specific' ) THEN
354	!$acc update host( ar )
355	ENDIF
356	CALL fft_x( ar, 'backward' )
357	IF ( fft_method /= 'system-specific' ) THEN
358	!$acc update device( ar )
359	ENDIF
360	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
361
362	!
363	!-- Transposition x --> z
364	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
365	CALL transpose_xz( ar, ar_inv )
366	CALL resort_for_xz( ar_inv, ar )
367	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
368
369	ELSE
370
371	!
372	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
373	!-- overlapping transposition / fft
374	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
375	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
376	!
377	!-- Transposition z --> x + subsequent fft along x
378	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
379	CALL resort_for_zx( ar, f_inv )
380	!
381	!-- Save original indices and gridpoint counter
382	isave(1) = nz
383	isave(2) = nzb_x
384	isave(3) = nzt_x
385	isave(4) = sendrecvcount_zx
386	!
387	!-- Set new indices for transformation
388	nblk = nz / pdims(1)
389	nz = pdims(1)
390	nnz_x = 1
391	nzb_x = 1 + myidx * nnz_x
392	nzt_x = ( myidx + 1 ) * nnz_x
393	sendrecvcount_zx = nnx * nny * nnz_x
394
395	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
396	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
397
398	DO kk = 1, nblk
399
400	IF ( kk == 1 ) THEN
401	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
402	ELSE
403	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
404	ENDIF
405
406	DO knew = 1, nz
407	ki = kk + nblk * ( knew - 1 )
408	f_in(:,:,knew) = f_inv(:,:,ki)
409	ENDDO
410
411	CALL transpose_zx( f_in, ar1(:,:,:))
412	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
413
414	IF ( kk == 1 ) THEN
415	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
416	ELSE
417	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
418	ENDIF
419
420	n = isave(2) + kk - 1
421	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
422	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
423
424	ENDDO
425	!
426	!-- Restore original indices/counters
427	nz = isave(1)
428	nzb_x = isave(2)
429	nzt_x = isave(3)
430	sendrecvcount_zx = isave(4)
431
432	DEALLOCATE( ar1, f_in, f_inv )
433
434	!
435	!-- Transposition x --> y + subsequent fft along y
436	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
437	CALL resort_for_xy( f_out_z, f_inv )
438	!
439	!-- Save original indices and gridpoint counter
440	isave(1) = nx
441	isave(2) = nxl_y
442	isave(3) = nxr_y
443	isave(4) = sendrecvcount_xy
444	!
445	!-- Set new indices for transformation
446	nblk = ( ( nx+1 ) / pdims(2) ) - 1
447	nx = pdims(2)
448	nnx_y = 1
449	nxl_y = myidy * nnx_y
450	nxr_y = ( myidy + 1 ) * nnx_y - 1
451	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
452
453	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
454	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
455
456	DO ii = 0, nblk
457
458	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
459
460	DO inew = 0, nx-1
461	iind = ii + ( nblk + 1 ) * inew
462	f_in(:,:,inew) = f_inv(:,:,iind)
463	ENDDO
464
465	CALL transpose_xy( f_in, ar1(:,:,:) )
466
467	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
468
469	IF ( ii == 1 ) THEN
470	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
471	ELSE
472	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
473	ENDIF
474
475	nxl_y_bound = isave(2)
476	nxr_y_bound = isave(3)
477	n = isave(2) + ii
478	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
479	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
480	nxl_y_l = n, nxr_y_l = n )
481
482	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
483
484	ENDDO
485	!
486	!-- Restore original indices/counters
487	nx = isave(1)
488	nxl_y = isave(2)
489	nxr_y = isave(3)
490	sendrecvcount_xy = isave(4)
491
492	DEALLOCATE( ar1, f_in, f_inv )
493
494	!
495	!-- Transposition y --> z + subsequent tridia + resort for z --> y
496	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
497	CALL resort_for_yz( f_out_y, f_inv )
498	!
499	!-- Save original indices and gridpoint counter
500	isave(1) = ny
501	isave(2) = nys_z
502	isave(3) = nyn_z
503	isave(4) = sendrecvcount_yz
504	!
505	!-- Set new indices for transformation
506	nblk = ( ( ny+1 ) / pdims(1) ) - 1
507	ny = pdims(1)
508	nny_z = 1
509	nys_z = myidx * nny_z
510	nyn_z = ( myidx + 1 ) * nny_z - 1
511	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
512
513	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
514	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
515
516	DO jj = 0, nblk
517	!
518	!-- Forward Fourier Transformation
519	!-- Transposition y --> z
520	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
521
522	DO jnew = 0, ny-1
523	jind = jj + ( nblk + 1 ) * jnew
524	f_in(:,:,jnew) = f_inv(:,:,jind)
525	ENDDO
526
527	CALL transpose_yz( f_in, ar1(:,:,:) )
528
529	IF ( jj == nblk ) THEN
530	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
531	ELSE
532	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
533	ENDIF
534
535	!
536	!-- Solve the tridiagonal equation system along z
537	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
538
539	n = isave(2) + jj
540	CALL tridia_substi_overlap( ar1(:,:,:), n )
541
542	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
543
544	!
545	!-- Inverse Fourier Transformation
546	!-- Transposition z --> y
547	!-- Only one thread should call MPI routines, therefore forward and
548	!-- backward tranpose are in the same section
549	IF ( jj == 0 ) THEN
550	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
551	ELSE
552	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
553	ENDIF
554
555	CALL transpose_zy( ar1(:,:,:), f_in )
556
557	DO jnew = 0, ny-1
558	jind = jj + ( nblk + 1 ) * jnew
559	f_inv(:,:,jind) = f_in(:,:,jnew)
560	ENDDO
561
562	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
563
564	ENDDO
565	!
566	!-- Restore original indices/counters
567	ny = isave(1)
568	nys_z = isave(2)
569	nyn_z = isave(3)
570	sendrecvcount_yz = isave(4)
571
572	CALL resort_for_zy( f_inv, f_out_y )
573
574	DEALLOCATE( ar1, f_in, f_inv )
575
576	!
577	!-- fft along y backward + subsequent transposition y --> x
578	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
579	!
580	!-- Save original indices and gridpoint counter
581	isave(1) = nx
582	isave(2) = nxl_y
583	isave(3) = nxr_y
584	isave(4) = sendrecvcount_xy
585	!
586	!-- Set new indices for transformation
587	nblk = (( nx+1 ) / pdims(2) ) - 1
588	nx = pdims(2)
589	nnx_y = 1
590	nxl_y = myidy * nnx_y
591	nxr_y = ( myidy + 1 ) * nnx_y - 1
592	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
593
594	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
595	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
596
597	DO ii = 0, nblk
598
599	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
600
601	n = isave(2) + ii
602	nxl_y_bound = isave(2)
603	nxr_y_bound = isave(3)
604
605	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
606	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
607	nxl_y_l = n, nxr_y_l = n )
608
609	IF ( ii == nblk ) THEN
610	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
611	ELSE
612	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
613	ENDIF
614
615	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
616
617	CALL transpose_yx( ar1(:,:,:), f_in )
618
619	DO inew = 0, nx-1
620	iind = ii + (nblk+1) * inew
621	f_inv(:,:,iind) = f_in(:,:,inew)
622	ENDDO
623
624	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
625
626	ENDDO
627	!
628	!-- Restore original indices/counters
629	nx = isave(1)
630	nxl_y = isave(2)
631	nxr_y = isave(3)
632	sendrecvcount_xy = isave(4)
633
634	CALL resort_for_yx( f_inv, f_out_z )
635
636	DEALLOCATE( ar1, f_in, f_inv )
637
638	!
639	!-- fft along x backward + subsequent final transposition x --> z
640	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
641	!
642	!-- Save original indices and gridpoint counter
643	isave(1) = nz
644	isave(2) = nzb_x
645	isave(3) = nzt_x
646	isave(4) = sendrecvcount_zx
647	!
648	!-- Set new indices for transformation
649	nblk = nz / pdims(1)
650	nz = pdims(1)
651	nnz_x = 1
652	nzb_x = 1 + myidx * nnz_x
653	nzt_x = ( myidx + 1 ) * nnz_x
654	sendrecvcount_zx = nnx * nny * nnz_x
655
656	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
657	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
658
659	DO kk = 1, nblk
660
661	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
662
663	n = isave(2) + kk - 1
664	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
665
666	IF ( kk == nblk ) THEN
667	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
668	ELSE
669	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
670	ENDIF
671
672	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
673
674	CALL transpose_xz( ar1(:,:,:), f_in )
675
676	DO knew = 1, nz
677	ki = kk + nblk * (knew-1)
678	f_inv(:,:,ki) = f_in(:,:,knew)
679	ENDDO
680
681	IF ( kk == nblk ) THEN
682	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
683	ELSE
684	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
685	ENDIF
686
687	ENDDO
688	!
689	!-- Restore original indices/counters
690	nz = isave(1)
691	nzb_x = isave(2)
692	nzt_x = isave(3)
693	sendrecvcount_zx = isave(4)
694
695	CALL resort_for_xz( f_inv, ar )
696
697	DEALLOCATE( ar1, f_in, f_inv )
698
699	ENDIF
700
701	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
702
703	END SUBROUTINE poisfft
704
705
706
707	SUBROUTINE ffty_tr_yx( f_in, f_out )
708
709	!------------------------------------------------------------------------------!
710	! Fourier-transformation along y with subsequent transposition y --> x for
711	! a 1d-decomposition along x
712	!
713	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
714	! if the first index of work_ffty_vec is odd. Otherwise
715	! memory bank conflicts may occur (especially if the index is a
716	! multiple of 128). That's why work_ffty_vec is dimensioned as
717	! 0:ny+1.
718	! Of course, this will not work if users are using an odd number
719	! of gridpoints along y.
720	!------------------------------------------------------------------------------!
721
722	USE control_parameters
723	USE cpulog
724	USE indices
725	USE interfaces
726	USE pegrid
727	USE transpose_indices
728
729	IMPLICIT NONE
730
731	INTEGER :: i, iend, iouter, ir, j, k
732	INTEGER, PARAMETER :: stridex = 4
733
734	REAL, DIMENSION(0:ny,stridex) :: work_ffty
735	#if defined( __nec )
736	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
737	#endif
738	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
739	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
740	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
741
742	!
743	!-- Carry out the FFT along y, where all data are present due to the
744	!-- 1d-decomposition along x. Resort the data in a way that x becomes
745	!-- the first index.
746	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
747
748	IF ( host(1:3) == 'nec' ) THEN
749	#if defined( __nec )
750	!
751	!-- Code optimized for vector processors
752	!$OMP PARALLEL PRIVATE ( i, j, k )
753	!$OMP DO
754	DO i = nxl, nxr
755
756	DO j = 0, ny
757	DO k = 1, nz
758	work_ffty_vec(j,k,i) = f_in(k,j,i)
759	ENDDO
760	ENDDO
761
762	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
763
764	ENDDO
765
766	!$OMP DO
767	DO k = 1, nz
768	DO j = 0, ny
769	DO i = nxl, nxr
770	work(i,k,j) = work_ffty_vec(j,k,i)
771	ENDDO
772	ENDDO
773	ENDDO
774	!$OMP END PARALLEL
775	#endif
776
777	ELSE
778
779	!
780	!-- Cache optimized code.
781	!-- The i-(x-)direction is split into a strided outer loop and an inner
782	!-- loop for better cache performance
783	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
784	!$OMP DO
785	DO iouter = nxl, nxr, stridex
786
787	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
788
789	DO k = 1, nz
790
791	DO i = iouter, iend
792
793	ir = i-iouter+1 ! counter within a stride
794	DO j = 0, ny
795	work_ffty(j,ir) = f_in(k,j,i)
796	ENDDO
797	!
798	!-- FFT along y
799	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
800
801	ENDDO
802
803	!
804	!-- Resort
805	DO j = 0, ny
806	DO i = iouter, iend
807	work(i,k,j) = work_ffty(j,i-iouter+1)
808	ENDDO
809	ENDDO
810
811	ENDDO
812
813	ENDDO
814	!$OMP END PARALLEL
815
816	ENDIF
817	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
818
819	!
820	!-- Transpose array
821	#if defined( __parallel )
822	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
823	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
824	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
825	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
826	comm1dx, ierr )
827	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
828	#endif
829
830	END SUBROUTINE ffty_tr_yx
831
832
833	SUBROUTINE tr_xy_ffty( f_in, f_out )
834
835	!------------------------------------------------------------------------------!
836	! Transposition x --> y with a subsequent backward Fourier transformation for
837	! a 1d-decomposition along x
838	!------------------------------------------------------------------------------!
839
840	USE control_parameters
841	USE cpulog
842	USE indices
843	USE interfaces
844	USE pegrid
845	USE transpose_indices
846
847	IMPLICIT NONE
848
849	INTEGER :: i, iend, iouter, ir, j, k
850	INTEGER, PARAMETER :: stridex = 4
851
852	REAL, DIMENSION(0:ny,stridex) :: work_ffty
853	#if defined( __nec )
854	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
855	#endif
856	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
857	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
858	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
859
860	!
861	!-- Transpose array
862	#if defined( __parallel )
863	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
864	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
865	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
866	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
867	comm1dx, ierr )
868	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
869	#endif
870
871	!
872	!-- Resort the data in a way that y becomes the first index and carry out the
873	!-- backward fft along y.
874	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
875
876	IF ( host(1:3) == 'nec' ) THEN
877	#if defined( __nec )
878	!
879	!-- Code optimized for vector processors
880	!$OMP PARALLEL PRIVATE ( i, j, k )
881	!$OMP DO
882	DO k = 1, nz
883	DO j = 0, ny
884	DO i = nxl, nxr
885	work_ffty_vec(j,k,i) = work(i,k,j)
886	ENDDO
887	ENDDO
888	ENDDO
889
890	!$OMP DO
891	DO i = nxl, nxr
892
893	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
894
895	DO j = 0, ny
896	DO k = 1, nz
897	f_out(k,j,i) = work_ffty_vec(j,k,i)
898	ENDDO
899	ENDDO
900
901	ENDDO
902	!$OMP END PARALLEL
903	#endif
904
905	ELSE
906
907	!
908	!-- Cache optimized code.
909	!-- The i-(x-)direction is split into a strided outer loop and an inner
910	!-- loop for better cache performance
911	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
912	!$OMP DO
913	DO iouter = nxl, nxr, stridex
914
915	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
916
917	DO k = 1, nz
918	!
919	!-- Resort
920	DO j = 0, ny
921	DO i = iouter, iend
922	work_ffty(j,i-iouter+1) = work(i,k,j)
923	ENDDO
924	ENDDO
925
926	DO i = iouter, iend
927
928	!
929	!-- FFT along y
930	ir = i-iouter+1 ! counter within a stride
931	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
932
933	DO j = 0, ny
934	f_out(k,j,i) = work_ffty(j,ir)
935	ENDDO
936	ENDDO
937
938	ENDDO
939
940	ENDDO
941	!$OMP END PARALLEL
942
943	ENDIF
944
945	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
946
947	END SUBROUTINE tr_xy_ffty
948
949
950	SUBROUTINE fftx_tri_fftx( ar )
951
952	!------------------------------------------------------------------------------!
953	! FFT along x, solution of the tridiagonal system and backward FFT for
954	! a 1d-decomposition along x
955	!
956	! WARNING: this subroutine may still not work for hybrid parallelization
957	! with OpenMP (for possible necessary changes see the original
958	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
959	!------------------------------------------------------------------------------!
960
961	USE control_parameters
962	USE cpulog
963	USE grid_variables
964	USE indices
965	USE interfaces
966	USE pegrid
967	USE transpose_indices
968
969	IMPLICIT NONE
970
971	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
972
973	REAL, DIMENSION(0:nx) :: work_fftx
974	REAL, DIMENSION(0:nx,1:nz) :: work_trix
975	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
976	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
977
978
979	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
980
981	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
982
983	tn = 0 ! Default thread number in case of one thread
984	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
985	DO j = nys_x, nyn_x
986
987	!$ tn = omp_get_thread_num()
988
989	IF ( host(1:3) == 'nec' ) THEN
990	!
991	!-- Code optimized for vector processors
992	DO k = 1, nz
993
994	m = 0
995	DO n = 1, pdims(1)
996	DO i = 1, nnx
997	work_trix(m,k) = ar(i,k,j,n)
998	m = m + 1
999	ENDDO
1000	ENDDO
1001
1002	ENDDO
1003
1004	CALL fft_x_m( work_trix, 'forward' )
1005
1006	ELSE
1007	!
1008	!-- Cache optimized code
1009	DO k = 1, nz
1010
1011	m = 0
1012	DO n = 1, pdims(1)
1013	DO i = 1, nnx
1014	work_fftx(m) = ar(i,k,j,n)
1015	m = m + 1
1016	ENDDO
1017	ENDDO
1018
1019	CALL fft_x_1d( work_fftx, 'forward' )
1020
1021	DO i = 0, nx
1022	work_trix(i,k) = work_fftx(i)
1023	ENDDO
1024
1025	ENDDO
1026
1027	ENDIF
1028
1029	!
1030	!-- Solve the linear equation system
1031	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1032
1033	IF ( host(1:3) == 'nec' ) THEN
1034	!
1035	!-- Code optimized for vector processors
1036	CALL fft_x_m( work_trix, 'backward' )
1037
1038	DO k = 1, nz
1039
1040	m = 0
1041	DO n = 1, pdims(1)
1042	DO i = 1, nnx
1043	ar(i,k,j,n) = work_trix(m,k)
1044	m = m + 1
1045	ENDDO
1046	ENDDO
1047
1048	ENDDO
1049
1050	ELSE
1051	!
1052	!-- Cache optimized code
1053	DO k = 1, nz
1054
1055	DO i = 0, nx
1056	work_fftx(i) = work_trix(i,k)
1057	ENDDO
1058
1059	CALL fft_x_1d( work_fftx, 'backward' )
1060
1061	m = 0
1062	DO n = 1, pdims(1)
1063	DO i = 1, nnx
1064	ar(i,k,j,n) = work_fftx(m)
1065	m = m + 1
1066	ENDDO
1067	ENDDO
1068
1069	ENDDO
1070
1071	ENDIF
1072
1073	ENDDO
1074
1075	DEALLOCATE( tri )
1076
1077	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1078
1079	END SUBROUTINE fftx_tri_fftx
1080
1081
1082	SUBROUTINE fftx_tr_xy( f_in, f_out )
1083
1084	!------------------------------------------------------------------------------!
1085	! Fourier-transformation along x with subsequent transposition x --> y for
1086	! a 1d-decomposition along y
1087	!
1088	! ATTENTION: The NEC-branch of this routine may significantly profit from
1089	! further optimizations. So far, performance is much worse than
1090	! for routine ffty_tr_yx (more than three times slower).
1091	!------------------------------------------------------------------------------!
1092
1093	USE control_parameters
1094	USE cpulog
1095	USE indices
1096	USE interfaces
1097	USE pegrid
1098	USE transpose_indices
1099
1100	IMPLICIT NONE
1101
1102	INTEGER :: i, j, k
1103
1104	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1105	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
1106	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
1107	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1108
1109	!
1110	!-- Carry out the FFT along x, where all data are present due to the
1111	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1112	!-- the first index.
1113	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1114
1115	IF ( host(1:3) == 'nec' ) THEN
1116	!
1117	!-- Code for vector processors
1118	!$OMP PARALLEL PRIVATE ( i, j, k )
1119	!$OMP DO
1120	DO i = 0, nx
1121
1122	DO j = nys, nyn
1123	DO k = 1, nz
1124	work_fftx(i,k,j) = f_in(k,j,i)
1125	ENDDO
1126	ENDDO
1127
1128	ENDDO
1129
1130	!$OMP DO
1131	DO j = nys, nyn
1132
1133	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1134
1135	DO k = 1, nz
1136	DO i = 0, nx
1137	work(j,k,i) = work_fftx(i,k,j)
1138	ENDDO
1139	ENDDO
1140
1141	ENDDO
1142	!$OMP END PARALLEL
1143
1144	ELSE
1145
1146	!
1147	!-- Cache optimized code (there might be still a potential for better
1148	!-- optimization).
1149	!$OMP PARALLEL PRIVATE (i,j,k)
1150	!$OMP DO
1151	DO i = 0, nx
1152
1153	DO j = nys, nyn
1154	DO k = 1, nz
1155	work_fftx(i,k,j) = f_in(k,j,i)
1156	ENDDO
1157	ENDDO
1158
1159	ENDDO
1160
1161	!$OMP DO
1162	DO j = nys, nyn
1163	DO k = 1, nz
1164
1165	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1166
1167	DO i = 0, nx
1168	work(j,k,i) = work_fftx(i,k,j)
1169	ENDDO
1170	ENDDO
1171
1172	ENDDO
1173	!$OMP END PARALLEL
1174
1175	ENDIF
1176	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1177
1178	!
1179	!-- Transpose array
1180	#if defined( __parallel )
1181	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1182	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1183	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1184	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1185	comm1dy, ierr )
1186	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1187	#endif
1188
1189	END SUBROUTINE fftx_tr_xy
1190
1191
1192	SUBROUTINE tr_yx_fftx( f_in, f_out )
1193
1194	!------------------------------------------------------------------------------!
1195	! Transposition y --> x with a subsequent backward Fourier transformation for
1196	! a 1d-decomposition along x
1197	!------------------------------------------------------------------------------!
1198
1199	USE control_parameters
1200	USE cpulog
1201	USE indices
1202	USE interfaces
1203	USE pegrid
1204	USE transpose_indices
1205
1206	IMPLICIT NONE
1207
1208	INTEGER :: i, j, k
1209
1210	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1211	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
1212	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
1213	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1214
1215	!
1216	!-- Transpose array
1217	#if defined( __parallel )
1218	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1219	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1220	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1221	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1222	comm1dy, ierr )
1223	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1224	#endif
1225
1226	!
1227	!-- Carry out the FFT along x, where all data are present due to the
1228	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1229	!-- the first index.
1230	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1231
1232	IF ( host(1:3) == 'nec' ) THEN
1233	!
1234	!-- Code optimized for vector processors
1235	!$OMP PARALLEL PRIVATE ( i, j, k )
1236	!$OMP DO
1237	DO j = nys, nyn
1238
1239	DO k = 1, nz
1240	DO i = 0, nx
1241	work_fftx(i,k,j) = work(j,k,i)
1242	ENDDO
1243	ENDDO
1244
1245	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1246
1247	ENDDO
1248
1249	!$OMP DO
1250	DO i = 0, nx
1251	DO j = nys, nyn
1252	DO k = 1, nz
1253	f_out(k,j,i) = work_fftx(i,k,j)
1254	ENDDO
1255	ENDDO
1256	ENDDO
1257	!$OMP END PARALLEL
1258
1259	ELSE
1260
1261	!
1262	!-- Cache optimized code (there might be still a potential for better
1263	!-- optimization).
1264	!$OMP PARALLEL PRIVATE (i,j,k)
1265	!$OMP DO
1266	DO j = nys, nyn
1267	DO k = 1, nz
1268
1269	DO i = 0, nx
1270	work_fftx(i,k,j) = work(j,k,i)
1271	ENDDO
1272
1273	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1274
1275	ENDDO
1276	ENDDO
1277
1278	!$OMP DO
1279	DO i = 0, nx
1280	DO j = nys, nyn
1281	DO k = 1, nz
1282	f_out(k,j,i) = work_fftx(i,k,j)
1283	ENDDO
1284	ENDDO
1285	ENDDO
1286	!$OMP END PARALLEL
1287
1288	ENDIF
1289	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1290
1291	END SUBROUTINE tr_yx_fftx
1292
1293
1294	SUBROUTINE ffty_tri_ffty( ar )
1295
1296	!------------------------------------------------------------------------------!
1297	! FFT along y, solution of the tridiagonal system and backward FFT for
1298	! a 1d-decomposition along y
1299	!
1300	! WARNING: this subroutine may still not work for hybrid parallelization
1301	! with OpenMP (for possible necessary changes see the original
1302	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1303	!------------------------------------------------------------------------------!
1304
1305	USE control_parameters
1306	USE cpulog
1307	USE grid_variables
1308	USE indices
1309	USE interfaces
1310	USE pegrid
1311	USE transpose_indices
1312
1313	IMPLICIT NONE
1314
1315	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1316
1317	REAL, DIMENSION(0:ny) :: work_ffty
1318	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1319	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
1320	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1321
1322
1323	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1324
1325	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1326
1327	tn = 0 ! Default thread number in case of one thread
1328	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1329	DO i = nxl_y, nxr_y
1330
1331	!$ tn = omp_get_thread_num()
1332
1333	IF ( host(1:3) == 'nec' ) THEN
1334	!
1335	!-- Code optimized for vector processors
1336	DO k = 1, nz
1337
1338	m = 0
1339	DO n = 1, pdims(2)
1340	DO j = 1, nny
1341	work_triy(m,k) = ar(j,k,i,n)
1342	m = m + 1
1343	ENDDO
1344	ENDDO
1345
1346	ENDDO
1347
1348	CALL fft_y_m( work_triy, ny, 'forward' )
1349
1350	ELSE
1351	!
1352	!-- Cache optimized code
1353	DO k = 1, nz
1354
1355	m = 0
1356	DO n = 1, pdims(2)
1357	DO j = 1, nny
1358	work_ffty(m) = ar(j,k,i,n)
1359	m = m + 1
1360	ENDDO
1361	ENDDO
1362
1363	CALL fft_y_1d( work_ffty, 'forward' )
1364
1365	DO j = 0, ny
1366	work_triy(j,k) = work_ffty(j)
1367	ENDDO
1368
1369	ENDDO
1370
1371	ENDIF
1372
1373	!
1374	!-- Solve the linear equation system
1375	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1376
1377	IF ( host(1:3) == 'nec' ) THEN
1378	!
1379	!-- Code optimized for vector processors
1380	CALL fft_y_m( work_triy, ny, 'backward' )
1381
1382	DO k = 1, nz
1383
1384	m = 0
1385	DO n = 1, pdims(2)
1386	DO j = 1, nny
1387	ar(j,k,i,n) = work_triy(m,k)
1388	m = m + 1
1389	ENDDO
1390	ENDDO
1391
1392	ENDDO
1393
1394	ELSE
1395	!
1396	!-- Cache optimized code
1397	DO k = 1, nz
1398
1399	DO j = 0, ny
1400	work_ffty(j) = work_triy(j,k)
1401	ENDDO
1402
1403	CALL fft_y_1d( work_ffty, 'backward' )
1404
1405	m = 0
1406	DO n = 1, pdims(2)
1407	DO j = 1, nny
1408	ar(j,k,i,n) = work_ffty(m)
1409	m = m + 1
1410	ENDDO
1411	ENDDO
1412
1413	ENDDO
1414
1415	ENDIF
1416
1417	ENDDO
1418
1419	DEALLOCATE( tri )
1420
1421	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1422
1423	END SUBROUTINE ffty_tri_ffty
1424
1425	#endif
1426
1427	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |