Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1306

Last change on this file since 1306 was 1306, checked in by raasch, 10 years ago
openmp sections removed from fft/alltoall overlapping second argument removed from routine poisfft module for declaring kind definitions added
Property svn:keywords set to `Id`
File size: 42.0 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2014 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	! openmp sections removed from the overlap branch,
23	! second argument removed from parameter list
24	!
25	! Former revisions:
26	! -----------------
27	! $Id: poisfft.f90 1306 2014-03-13 14:30:59Z raasch $
28	!
29	! 1216 2013-08-26 09:31:42Z raasch
30	! resorting of arrays moved to separate routines resort_for_...,
31	! one argument, used as temporary work array, removed from all transpose
32	! routines
33	! overlapping fft / transposition implemented
34	!
35	! 1212 2013-08-15 08:46:27Z raasch
36	! tridia routines moved to seperate module tridia_solver
37	!
38	! 1208 2013-08-13 06:41:49Z raasch
39	! acc-update clauses added for "ar" so that ffts other than cufft can also be
40	! used (although they are not ported and will give a poor performance)
41	!
42	! 1111 2013-03-08 23:54:10Z raasch
43	! further openACC porting of non-parallel (MPI) branch:
44	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
45	! no distinction between parallel/non-parallel in poisfft and tridia any more,
46	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
47	! (otherwise "invalid device function" is indicated during runtime),
48	! optimization of tridia routines: constant elements and coefficients of tri are
49	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
50	! to 2,
51	! poisfft_init is now called internally from poisfft, maketri is called from
52	! poisfft_init,
53	! ibc_p_b = 2 removed
54	!
55	! 1106 2013-03-04 05:31:38Z raasch
56	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
57	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
58	! fft_y_1d
59	!
60	! 1103 2013-02-20 02:15:53Z raasch
61	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
62	! sometimes cause segmentation faults with intel 12.1 compiler
63	!
64	! 1092 2013-02-02 11:24:22Z raasch
65	! unused variables removed
66	!
67	! 1036 2012-10-22 13:43:42Z raasch
68	! code put under GPL (PALM 3.9)
69	!
70	! 2012-09-21 07:03:55Z raasch
71	! FLOAT type conversion replaced by REAL
72	!
73	! 1003 2012-09-14 14:35:53Z raasch
74	! indices nxa, nya, etc. replaced by nx, ny, etc.
75	!
76	! 940 2012-07-09 14:31:00Z raasch
77	! special handling of tri-array as an argument in tridia_1dd routines switched
78	! off because it caused segmentation faults with intel 12.1 compiler
79	!
80	! 877 2012-04-03 11:21:44Z suehring
81	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
82	! pressure at the top of the model domain.
83	!
84	! 809 2012-01-30 13:32:58Z maronga
85	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
86	!
87	! 807 2012-01-25 11:53:51Z maronga
88	! New cpp directive "__check" implemented which is used by check_namelist_files
89	! (most of the code is unneeded by check_namelist_files).
90	!
91	! 763 2011-10-06 09:32:09Z suehring
92	! Comment added concerning the last change.
93	!
94	! 761 2011-10-05 17:58:52Z suehring
95	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
96	! pressure at the top of the model domain.
97	!
98	! 696 2011-03-18 07:03:49Z raasch
99	! work_fftx removed from PRIVATE clauses in fftx_tr_xy and tr_yx_fftx
100	!
101	! 683 2011-02-09 14:25:15Z raasch
102	! openMP parallelization for 2d-domain-decomposition
103	!
104	! 667 2010-12-23 12:06:00Z suehring/gryschka
105	! ddzu replaced by ddzu_pres due to changes in zu(0)
106	!
107	! 622 2010-12-10 08:08:13Z raasch
108	! optional barriers included in order to speed up collective operations
109	!
110	! 377 2009-09-04 11:09:00Z raasch
111	! __lcmuk changed to __lc to avoid problems with Intel compiler on sgi-ice
112	!
113	! 164 2008-05-15 08:46:15Z raasch
114	! Arguments removed from transpose routines
115	!
116	! 128 2007-10-26 13:11:14Z raasch
117	! Bugfix: wavenumber calculation for even nx in routines maketri
118	!
119	! 85 2007-05-11 09:35:14Z raasch
120	! Bugfix: work_fft*_vec removed from some PRIVATE-declarations
121	!
122	! 76 2007-03-29 00:58:32Z raasch
123	! Tridiagonal coefficients adjusted for Neumann boundary conditions both at
124	! the bottom and the top.
125	!
126	! RCS Log replace by Id keyword, revision history cleaned up
127	!
128	! Revision 1.24 2006/08/04 15:00:24 raasch
129	! Default setting of the thread number tn in case of not using OpenMP
130	!
131	! Revision 1.23 2006/02/23 12:48:38 raasch
132	! Additional compiler directive in routine tridia_1dd for preventing loop
133	! exchange on NEC-SX6
134	!
135	! Revision 1.20 2004/04/30 12:38:09 raasch
136	! Parts of former poisfft_hybrid moved to this subroutine,
137	! former subroutine changed to a module, renaming of FFT-subroutines and
138	! -module, FFTs completely substituted by calls of fft_x and fft_y,
139	! NAG fft used in the non-parallel case completely removed, l in maketri
140	! is now a 1d-array, variables passed by modules instead of using parameter
141	! lists, enlarged transposition arrays introduced
142	!
143	! Revision 1.1 1997/07/24 11:24:14 raasch
144	! Initial revision
145	!
146	!
147	! Description:
148	! ------------
149	! Original version by Stephan Siano (pois3d), as of July 23, 1996
150	! Adapted for 2D-domain-decomposition by Siegfried Raasch, July 3, 1997
151	!
152	! Solves the Poisson equation with a 2D spectral method
153	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
154	!
155	! Input:
156	! real ar contains (nnz,nny,nnx) elements of the velocity divergence,
157	! starting from (1,nys,nxl)
158	!
159	! Output:
160	! real ar contains the solution for perturbation pressure p
161	!------------------------------------------------------------------------------!
162
163	USE fft_xy
164	USE indices
165	USE transpose_indices
166	USE tridia_solver
167
168	IMPLICIT NONE
169
170	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
171
172	PRIVATE
173
174	#if ! defined ( __check )
175	PUBLIC poisfft, poisfft_init
176
177	INTERFACE poisfft
178	MODULE PROCEDURE poisfft
179	END INTERFACE poisfft
180
181	INTERFACE poisfft_init
182	MODULE PROCEDURE poisfft_init
183	END INTERFACE poisfft_init
184	#else
185	PUBLIC poisfft_init
186
187	INTERFACE poisfft_init
188	MODULE PROCEDURE poisfft_init
189	END INTERFACE poisfft_init
190	#endif
191
192	CONTAINS
193
194	SUBROUTINE poisfft_init
195
196	USE arrays_3d, ONLY: ddzu_pres, ddzw
197
198	IMPLICIT NONE
199
200	INTEGER :: k
201
202
203	CALL fft_init
204
205	CALL tridia_init
206
207	poisfft_initialized = .TRUE.
208
209	END SUBROUTINE poisfft_init
210
211
212	#if ! defined ( __check )
213	SUBROUTINE poisfft( ar )
214
215	USE control_parameters, ONLY : fft_method, transpose_compute_overlap
216	USE cpulog
217	USE interfaces
218	USE pegrid
219
220	IMPLICIT NONE
221
222	INTEGER :: ii, iind, inew, jj, jind, jnew, ki, kk, knew, n, nblk, &
223	nnx_y, nny_z, nnz_t, nnz_x, nxl_y_bound, nxr_y_bound
224	INTEGER, DIMENSION(4) :: isave
225
226	REAL, DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar
227	!$acc declare create( ar_inv )
228	REAL, DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv
229
230	REAL, DIMENSION(:,:,:), ALLOCATABLE :: ar1, f_in, f_inv, f_out_y, &
231	f_out_z
232
233
234	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
235
236	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
237
238	!
239	!-- Two-dimensional Fourier Transformation in x- and y-direction.
240	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
241
242	!
243	!-- 1d-domain-decomposition along x:
244	!-- FFT along y and transposition y --> x
245	CALL ffty_tr_yx( ar, ar )
246
247	!
248	!-- FFT along x, solving the tridiagonal system and backward FFT
249	CALL fftx_tri_fftx( ar )
250
251	!
252	!-- Transposition x --> y and backward FFT along y
253	CALL tr_xy_ffty( ar, ar )
254
255	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
256
257	!
258	!-- 1d-domain-decomposition along y:
259	!-- FFT along x and transposition x --> y
260	CALL fftx_tr_xy( ar, ar )
261
262	!
263	!-- FFT along y, solving the tridiagonal system and backward FFT
264	CALL ffty_tri_ffty( ar )
265
266	!
267	!-- Transposition y --> x and backward FFT along x
268	CALL tr_yx_fftx( ar, ar )
269
270	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
271
272	!
273	!-- 2d-domain-decomposition or no decomposition (1 PE run)
274	!-- Transposition z --> x
275	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
276	CALL resort_for_zx( ar, ar_inv )
277	CALL transpose_zx( ar_inv, ar )
278	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
279
280	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
281	IF ( fft_method /= 'system-specific' ) THEN
282	!$acc update host( ar )
283	ENDIF
284	CALL fft_x( ar, 'forward' )
285	IF ( fft_method /= 'system-specific' ) THEN
286	!$acc update device( ar )
287	ENDIF
288	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
289
290	!
291	!-- Transposition x --> y
292	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
293	CALL resort_for_xy( ar, ar_inv )
294	CALL transpose_xy( ar_inv, ar )
295	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
296
297	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
298	IF ( fft_method /= 'system-specific' ) THEN
299	!$acc update host( ar )
300	ENDIF
301	CALL fft_y( ar, 'forward', ar_tr = ar, &
302	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
303	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
304	IF ( fft_method /= 'system-specific' ) THEN
305	!$acc update device( ar )
306	ENDIF
307	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
308
309	!
310	!-- Transposition y --> z
311	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
312	CALL resort_for_yz( ar, ar_inv )
313	CALL transpose_yz( ar_inv, ar )
314	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
315
316	!
317	!-- Solve the tridiagonal equation system along z
318	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
319	CALL tridia_substi( ar )
320	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
321
322	!
323	!-- Inverse Fourier Transformation
324	!-- Transposition z --> y
325	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
326	CALL transpose_zy( ar, ar_inv )
327	CALL resort_for_zy( ar_inv, ar )
328	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
329
330	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
331	IF ( fft_method /= 'system-specific' ) THEN
332	!$acc update host( ar )
333	ENDIF
334	CALL fft_y( ar, 'backward', ar_tr = ar, &
335	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
336	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
337	IF ( fft_method /= 'system-specific' ) THEN
338	!$acc update device( ar )
339	ENDIF
340	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
341
342	!
343	!-- Transposition y --> x
344	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
345	CALL transpose_yx( ar, ar_inv )
346	CALL resort_for_yx( ar_inv, ar )
347	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
348
349	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
350	IF ( fft_method /= 'system-specific' ) THEN
351	!$acc update host( ar )
352	ENDIF
353	CALL fft_x( ar, 'backward' )
354	IF ( fft_method /= 'system-specific' ) THEN
355	!$acc update device( ar )
356	ENDIF
357	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
358
359	!
360	!-- Transposition x --> z
361	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
362	CALL transpose_xz( ar, ar_inv )
363	CALL resort_for_xz( ar_inv, ar )
364	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
365
366	ELSE
367
368	!
369	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
370	!-- overlapping transposition / fft
371	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
372	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
373	!
374	!-- Transposition z --> x + subsequent fft along x
375	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
376	CALL resort_for_zx( ar, f_inv )
377	!
378	!-- Save original indices and gridpoint counter
379	isave(1) = nz
380	isave(2) = nzb_x
381	isave(3) = nzt_x
382	isave(4) = sendrecvcount_zx
383	!
384	!-- Set new indices for transformation
385	nblk = nz / pdims(1)
386	nz = pdims(1)
387	nnz_x = 1
388	nzb_x = 1 + myidx * nnz_x
389	nzt_x = ( myidx + 1 ) * nnz_x
390	sendrecvcount_zx = nnx * nny * nnz_x
391
392	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
393	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
394
395	DO kk = 1, nblk
396
397	IF ( kk == 1 ) THEN
398	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
399	ELSE
400	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
401	ENDIF
402
403	DO knew = 1, nz
404	ki = kk + nblk * ( knew - 1 )
405	f_in(:,:,knew) = f_inv(:,:,ki)
406	ENDDO
407
408	CALL transpose_zx( f_in, ar1(:,:,:))
409	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
410
411	IF ( kk == 1 ) THEN
412	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
413	ELSE
414	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
415	ENDIF
416
417	n = isave(2) + kk - 1
418	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
419	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
420
421	ENDDO
422	!
423	!-- Restore original indices/counters
424	nz = isave(1)
425	nzb_x = isave(2)
426	nzt_x = isave(3)
427	sendrecvcount_zx = isave(4)
428
429	DEALLOCATE( ar1, f_in, f_inv )
430
431	!
432	!-- Transposition x --> y + subsequent fft along y
433	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
434	CALL resort_for_xy( f_out_z, f_inv )
435	!
436	!-- Save original indices and gridpoint counter
437	isave(1) = nx
438	isave(2) = nxl_y
439	isave(3) = nxr_y
440	isave(4) = sendrecvcount_xy
441	!
442	!-- Set new indices for transformation
443	nblk = ( ( nx+1 ) / pdims(2) ) - 1
444	nx = pdims(2)
445	nnx_y = 1
446	nxl_y = myidy * nnx_y
447	nxr_y = ( myidy + 1 ) * nnx_y - 1
448	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
449
450	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
451	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
452
453	DO ii = 0, nblk
454
455	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
456
457	DO inew = 0, nx-1
458	iind = ii + ( nblk + 1 ) * inew
459	f_in(:,:,inew) = f_inv(:,:,iind)
460	ENDDO
461
462	CALL transpose_xy( f_in, ar1(:,:,:) )
463
464	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
465
466	IF ( ii == 1 ) THEN
467	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
468	ELSE
469	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
470	ENDIF
471
472	nxl_y_bound = isave(2)
473	nxr_y_bound = isave(3)
474	n = isave(2) + ii
475	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
476	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
477	nxl_y_l = n, nxr_y_l = n )
478
479	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
480
481	ENDDO
482	!
483	!-- Restore original indices/counters
484	nx = isave(1)
485	nxl_y = isave(2)
486	nxr_y = isave(3)
487	sendrecvcount_xy = isave(4)
488
489	DEALLOCATE( ar1, f_in, f_inv )
490
491	!
492	!-- Transposition y --> z + subsequent tridia + resort for z --> y
493	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
494	CALL resort_for_yz( f_out_y, f_inv )
495	!
496	!-- Save original indices and gridpoint counter
497	isave(1) = ny
498	isave(2) = nys_z
499	isave(3) = nyn_z
500	isave(4) = sendrecvcount_yz
501	!
502	!-- Set new indices for transformation
503	nblk = ( ( ny+1 ) / pdims(1) ) - 1
504	ny = pdims(1)
505	nny_z = 1
506	nys_z = myidx * nny_z
507	nyn_z = ( myidx + 1 ) * nny_z - 1
508	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
509
510	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
511	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
512
513	DO jj = 0, nblk
514	!
515	!-- Forward Fourier Transformation
516	!-- Transposition y --> z
517	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
518
519	DO jnew = 0, ny-1
520	jind = jj + ( nblk + 1 ) * jnew
521	f_in(:,:,jnew) = f_inv(:,:,jind)
522	ENDDO
523
524	CALL transpose_yz( f_in, ar1(:,:,:) )
525
526	IF ( jj == nblk ) THEN
527	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
528	ELSE
529	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
530	ENDIF
531
532	!
533	!-- Solve the tridiagonal equation system along z
534	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
535
536	n = isave(2) + jj
537	CALL tridia_substi_overlap( ar1(:,:,:), n )
538
539	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
540
541	!
542	!-- Inverse Fourier Transformation
543	!-- Transposition z --> y
544	!-- Only one thread should call MPI routines, therefore forward and
545	!-- backward tranpose are in the same section
546	IF ( jj == 0 ) THEN
547	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
548	ELSE
549	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
550	ENDIF
551
552	CALL transpose_zy( ar1(:,:,:), f_in )
553
554	DO jnew = 0, ny-1
555	jind = jj + ( nblk + 1 ) * jnew
556	f_inv(:,:,jind) = f_in(:,:,jnew)
557	ENDDO
558
559	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
560
561	ENDDO
562	!
563	!-- Restore original indices/counters
564	ny = isave(1)
565	nys_z = isave(2)
566	nyn_z = isave(3)
567	sendrecvcount_yz = isave(4)
568
569	CALL resort_for_zy( f_inv, f_out_y )
570
571	DEALLOCATE( ar1, f_in, f_inv )
572
573	!
574	!-- fft along y backward + subsequent transposition y --> x
575	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
576	!
577	!-- Save original indices and gridpoint counter
578	isave(1) = nx
579	isave(2) = nxl_y
580	isave(3) = nxr_y
581	isave(4) = sendrecvcount_xy
582	!
583	!-- Set new indices for transformation
584	nblk = (( nx+1 ) / pdims(2) ) - 1
585	nx = pdims(2)
586	nnx_y = 1
587	nxl_y = myidy * nnx_y
588	nxr_y = ( myidy + 1 ) * nnx_y - 1
589	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
590
591	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
592	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
593
594	DO ii = 0, nblk
595
596	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
597
598	n = isave(2) + ii
599	nxl_y_bound = isave(2)
600	nxr_y_bound = isave(3)
601
602	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
603	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
604	nxl_y_l = n, nxr_y_l = n )
605
606	IF ( ii == nblk ) THEN
607	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
608	ELSE
609	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
610	ENDIF
611
612	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
613
614	CALL transpose_yx( ar1(:,:,:), f_in )
615
616	DO inew = 0, nx-1
617	iind = ii + (nblk+1) * inew
618	f_inv(:,:,iind) = f_in(:,:,inew)
619	ENDDO
620
621	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
622
623	ENDDO
624	!
625	!-- Restore original indices/counters
626	nx = isave(1)
627	nxl_y = isave(2)
628	nxr_y = isave(3)
629	sendrecvcount_xy = isave(4)
630
631	CALL resort_for_yx( f_inv, f_out_z )
632
633	DEALLOCATE( ar1, f_in, f_inv )
634
635	!
636	!-- fft along x backward + subsequent final transposition x --> z
637	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
638	!
639	!-- Save original indices and gridpoint counter
640	isave(1) = nz
641	isave(2) = nzb_x
642	isave(3) = nzt_x
643	isave(4) = sendrecvcount_zx
644	!
645	!-- Set new indices for transformation
646	nblk = nz / pdims(1)
647	nz = pdims(1)
648	nnz_x = 1
649	nzb_x = 1 + myidx * nnz_x
650	nzt_x = ( myidx + 1 ) * nnz_x
651	sendrecvcount_zx = nnx * nny * nnz_x
652
653	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
654	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
655
656	DO kk = 1, nblk
657
658	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
659
660	n = isave(2) + kk - 1
661	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
662
663	IF ( kk == nblk ) THEN
664	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
665	ELSE
666	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
667	ENDIF
668
669	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
670
671	CALL transpose_xz( ar1(:,:,:), f_in )
672
673	DO knew = 1, nz
674	ki = kk + nblk * (knew-1)
675	f_inv(:,:,ki) = f_in(:,:,knew)
676	ENDDO
677
678	IF ( kk == nblk ) THEN
679	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
680	ELSE
681	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
682	ENDIF
683
684	ENDDO
685	!
686	!-- Restore original indices/counters
687	nz = isave(1)
688	nzb_x = isave(2)
689	nzt_x = isave(3)
690	sendrecvcount_zx = isave(4)
691
692	CALL resort_for_xz( f_inv, ar )
693
694	DEALLOCATE( ar1, f_in, f_inv )
695
696	ENDIF
697
698	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
699
700	END SUBROUTINE poisfft
701
702
703
704	SUBROUTINE ffty_tr_yx( f_in, f_out )
705
706	!------------------------------------------------------------------------------!
707	! Fourier-transformation along y with subsequent transposition y --> x for
708	! a 1d-decomposition along x
709	!
710	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
711	! if the first index of work_ffty_vec is odd. Otherwise
712	! memory bank conflicts may occur (especially if the index is a
713	! multiple of 128). That's why work_ffty_vec is dimensioned as
714	! 0:ny+1.
715	! Of course, this will not work if users are using an odd number
716	! of gridpoints along y.
717	!------------------------------------------------------------------------------!
718
719	USE control_parameters
720	USE cpulog
721	USE indices
722	USE interfaces
723	USE pegrid
724	USE transpose_indices
725
726	IMPLICIT NONE
727
728	INTEGER :: i, iend, iouter, ir, j, k
729	INTEGER, PARAMETER :: stridex = 4
730
731	REAL, DIMENSION(0:ny,stridex) :: work_ffty
732	#if defined( __nec )
733	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
734	#endif
735	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in
736	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out
737	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
738
739	!
740	!-- Carry out the FFT along y, where all data are present due to the
741	!-- 1d-decomposition along x. Resort the data in a way that x becomes
742	!-- the first index.
743	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
744
745	IF ( host(1:3) == 'nec' ) THEN
746	#if defined( __nec )
747	!
748	!-- Code optimized for vector processors
749	!$OMP PARALLEL PRIVATE ( i, j, k )
750	!$OMP DO
751	DO i = nxl, nxr
752
753	DO j = 0, ny
754	DO k = 1, nz
755	work_ffty_vec(j,k,i) = f_in(k,j,i)
756	ENDDO
757	ENDDO
758
759	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
760
761	ENDDO
762
763	!$OMP DO
764	DO k = 1, nz
765	DO j = 0, ny
766	DO i = nxl, nxr
767	work(i,k,j) = work_ffty_vec(j,k,i)
768	ENDDO
769	ENDDO
770	ENDDO
771	!$OMP END PARALLEL
772	#endif
773
774	ELSE
775
776	!
777	!-- Cache optimized code.
778	!-- The i-(x-)direction is split into a strided outer loop and an inner
779	!-- loop for better cache performance
780	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
781	!$OMP DO
782	DO iouter = nxl, nxr, stridex
783
784	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
785
786	DO k = 1, nz
787
788	DO i = iouter, iend
789
790	ir = i-iouter+1 ! counter within a stride
791	DO j = 0, ny
792	work_ffty(j,ir) = f_in(k,j,i)
793	ENDDO
794	!
795	!-- FFT along y
796	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
797
798	ENDDO
799
800	!
801	!-- Resort
802	DO j = 0, ny
803	DO i = iouter, iend
804	work(i,k,j) = work_ffty(j,i-iouter+1)
805	ENDDO
806	ENDDO
807
808	ENDDO
809
810	ENDDO
811	!$OMP END PARALLEL
812
813	ENDIF
814	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
815
816	!
817	!-- Transpose array
818	#if defined( __parallel )
819	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
820	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
821	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
822	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
823	comm1dx, ierr )
824	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
825	#endif
826
827	END SUBROUTINE ffty_tr_yx
828
829
830	SUBROUTINE tr_xy_ffty( f_in, f_out )
831
832	!------------------------------------------------------------------------------!
833	! Transposition x --> y with a subsequent backward Fourier transformation for
834	! a 1d-decomposition along x
835	!------------------------------------------------------------------------------!
836
837	USE control_parameters
838	USE cpulog
839	USE indices
840	USE interfaces
841	USE pegrid
842	USE transpose_indices
843
844	IMPLICIT NONE
845
846	INTEGER :: i, iend, iouter, ir, j, k
847	INTEGER, PARAMETER :: stridex = 4
848
849	REAL, DIMENSION(0:ny,stridex) :: work_ffty
850	#if defined( __nec )
851	REAL, DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec
852	#endif
853	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in
854	REAL, DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out
855	REAL, DIMENSION(nxl:nxr,1:nz,0:ny) :: work
856
857	!
858	!-- Transpose array
859	#if defined( __parallel )
860	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
861	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
862	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
863	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
864	comm1dx, ierr )
865	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
866	#endif
867
868	!
869	!-- Resort the data in a way that y becomes the first index and carry out the
870	!-- backward fft along y.
871	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
872
873	IF ( host(1:3) == 'nec' ) THEN
874	#if defined( __nec )
875	!
876	!-- Code optimized for vector processors
877	!$OMP PARALLEL PRIVATE ( i, j, k )
878	!$OMP DO
879	DO k = 1, nz
880	DO j = 0, ny
881	DO i = nxl, nxr
882	work_ffty_vec(j,k,i) = work(i,k,j)
883	ENDDO
884	ENDDO
885	ENDDO
886
887	!$OMP DO
888	DO i = nxl, nxr
889
890	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
891
892	DO j = 0, ny
893	DO k = 1, nz
894	f_out(k,j,i) = work_ffty_vec(j,k,i)
895	ENDDO
896	ENDDO
897
898	ENDDO
899	!$OMP END PARALLEL
900	#endif
901
902	ELSE
903
904	!
905	!-- Cache optimized code.
906	!-- The i-(x-)direction is split into a strided outer loop and an inner
907	!-- loop for better cache performance
908	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
909	!$OMP DO
910	DO iouter = nxl, nxr, stridex
911
912	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
913
914	DO k = 1, nz
915	!
916	!-- Resort
917	DO j = 0, ny
918	DO i = iouter, iend
919	work_ffty(j,i-iouter+1) = work(i,k,j)
920	ENDDO
921	ENDDO
922
923	DO i = iouter, iend
924
925	!
926	!-- FFT along y
927	ir = i-iouter+1 ! counter within a stride
928	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
929
930	DO j = 0, ny
931	f_out(k,j,i) = work_ffty(j,ir)
932	ENDDO
933	ENDDO
934
935	ENDDO
936
937	ENDDO
938	!$OMP END PARALLEL
939
940	ENDIF
941
942	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
943
944	END SUBROUTINE tr_xy_ffty
945
946
947	SUBROUTINE fftx_tri_fftx( ar )
948
949	!------------------------------------------------------------------------------!
950	! FFT along x, solution of the tridiagonal system and backward FFT for
951	! a 1d-decomposition along x
952	!
953	! WARNING: this subroutine may still not work for hybrid parallelization
954	! with OpenMP (for possible necessary changes see the original
955	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
956	!------------------------------------------------------------------------------!
957
958	USE control_parameters
959	USE cpulog
960	USE grid_variables
961	USE indices
962	USE interfaces
963	USE pegrid
964	USE transpose_indices
965
966	IMPLICIT NONE
967
968	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
969
970	REAL, DIMENSION(0:nx) :: work_fftx
971	REAL, DIMENSION(0:nx,1:nz) :: work_trix
972	REAL, DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar
973	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
974
975
976	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
977
978	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
979
980	tn = 0 ! Default thread number in case of one thread
981	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
982	DO j = nys_x, nyn_x
983
984	!$ tn = omp_get_thread_num()
985
986	IF ( host(1:3) == 'nec' ) THEN
987	!
988	!-- Code optimized for vector processors
989	DO k = 1, nz
990
991	m = 0
992	DO n = 1, pdims(1)
993	DO i = 1, nnx
994	work_trix(m,k) = ar(i,k,j,n)
995	m = m + 1
996	ENDDO
997	ENDDO
998
999	ENDDO
1000
1001	CALL fft_x_m( work_trix, 'forward' )
1002
1003	ELSE
1004	!
1005	!-- Cache optimized code
1006	DO k = 1, nz
1007
1008	m = 0
1009	DO n = 1, pdims(1)
1010	DO i = 1, nnx
1011	work_fftx(m) = ar(i,k,j,n)
1012	m = m + 1
1013	ENDDO
1014	ENDDO
1015
1016	CALL fft_x_1d( work_fftx, 'forward' )
1017
1018	DO i = 0, nx
1019	work_trix(i,k) = work_fftx(i)
1020	ENDDO
1021
1022	ENDDO
1023
1024	ENDIF
1025
1026	!
1027	!-- Solve the linear equation system
1028	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1029
1030	IF ( host(1:3) == 'nec' ) THEN
1031	!
1032	!-- Code optimized for vector processors
1033	CALL fft_x_m( work_trix, 'backward' )
1034
1035	DO k = 1, nz
1036
1037	m = 0
1038	DO n = 1, pdims(1)
1039	DO i = 1, nnx
1040	ar(i,k,j,n) = work_trix(m,k)
1041	m = m + 1
1042	ENDDO
1043	ENDDO
1044
1045	ENDDO
1046
1047	ELSE
1048	!
1049	!-- Cache optimized code
1050	DO k = 1, nz
1051
1052	DO i = 0, nx
1053	work_fftx(i) = work_trix(i,k)
1054	ENDDO
1055
1056	CALL fft_x_1d( work_fftx, 'backward' )
1057
1058	m = 0
1059	DO n = 1, pdims(1)
1060	DO i = 1, nnx
1061	ar(i,k,j,n) = work_fftx(m)
1062	m = m + 1
1063	ENDDO
1064	ENDDO
1065
1066	ENDDO
1067
1068	ENDIF
1069
1070	ENDDO
1071
1072	DEALLOCATE( tri )
1073
1074	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1075
1076	END SUBROUTINE fftx_tri_fftx
1077
1078
1079	SUBROUTINE fftx_tr_xy( f_in, f_out )
1080
1081	!------------------------------------------------------------------------------!
1082	! Fourier-transformation along x with subsequent transposition x --> y for
1083	! a 1d-decomposition along y
1084	!
1085	! ATTENTION: The NEC-branch of this routine may significantly profit from
1086	! further optimizations. So far, performance is much worse than
1087	! for routine ffty_tr_yx (more than three times slower).
1088	!------------------------------------------------------------------------------!
1089
1090	USE control_parameters
1091	USE cpulog
1092	USE indices
1093	USE interfaces
1094	USE pegrid
1095	USE transpose_indices
1096
1097	IMPLICIT NONE
1098
1099	INTEGER :: i, j, k
1100
1101	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1102	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_in
1103	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out
1104	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1105
1106	!
1107	!-- Carry out the FFT along x, where all data are present due to the
1108	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1109	!-- the first index.
1110	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1111
1112	IF ( host(1:3) == 'nec' ) THEN
1113	!
1114	!-- Code for vector processors
1115	!$OMP PARALLEL PRIVATE ( i, j, k )
1116	!$OMP DO
1117	DO i = 0, nx
1118
1119	DO j = nys, nyn
1120	DO k = 1, nz
1121	work_fftx(i,k,j) = f_in(k,j,i)
1122	ENDDO
1123	ENDDO
1124
1125	ENDDO
1126
1127	!$OMP DO
1128	DO j = nys, nyn
1129
1130	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1131
1132	DO k = 1, nz
1133	DO i = 0, nx
1134	work(j,k,i) = work_fftx(i,k,j)
1135	ENDDO
1136	ENDDO
1137
1138	ENDDO
1139	!$OMP END PARALLEL
1140
1141	ELSE
1142
1143	!
1144	!-- Cache optimized code (there might be still a potential for better
1145	!-- optimization).
1146	!$OMP PARALLEL PRIVATE (i,j,k)
1147	!$OMP DO
1148	DO i = 0, nx
1149
1150	DO j = nys, nyn
1151	DO k = 1, nz
1152	work_fftx(i,k,j) = f_in(k,j,i)
1153	ENDDO
1154	ENDDO
1155
1156	ENDDO
1157
1158	!$OMP DO
1159	DO j = nys, nyn
1160	DO k = 1, nz
1161
1162	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1163
1164	DO i = 0, nx
1165	work(j,k,i) = work_fftx(i,k,j)
1166	ENDDO
1167	ENDDO
1168
1169	ENDDO
1170	!$OMP END PARALLEL
1171
1172	ENDIF
1173	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1174
1175	!
1176	!-- Transpose array
1177	#if defined( __parallel )
1178	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1179	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1180	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1181	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1182	comm1dy, ierr )
1183	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1184	#endif
1185
1186	END SUBROUTINE fftx_tr_xy
1187
1188
1189	SUBROUTINE tr_yx_fftx( f_in, f_out )
1190
1191	!------------------------------------------------------------------------------!
1192	! Transposition y --> x with a subsequent backward Fourier transformation for
1193	! a 1d-decomposition along x
1194	!------------------------------------------------------------------------------!
1195
1196	USE control_parameters
1197	USE cpulog
1198	USE indices
1199	USE interfaces
1200	USE pegrid
1201	USE transpose_indices
1202
1203	IMPLICIT NONE
1204
1205	INTEGER :: i, j, k
1206
1207	REAL, DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx
1208	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in
1209	REAL, DIMENSION(1:nz,nys:nyn,0:nx) :: f_out
1210	REAL, DIMENSION(nys:nyn,1:nz,0:nx) :: work
1211
1212	!
1213	!-- Transpose array
1214	#if defined( __parallel )
1215	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1216	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1217	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1218	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1219	comm1dy, ierr )
1220	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1221	#endif
1222
1223	!
1224	!-- Carry out the FFT along x, where all data are present due to the
1225	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1226	!-- the first index.
1227	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1228
1229	IF ( host(1:3) == 'nec' ) THEN
1230	!
1231	!-- Code optimized for vector processors
1232	!$OMP PARALLEL PRIVATE ( i, j, k )
1233	!$OMP DO
1234	DO j = nys, nyn
1235
1236	DO k = 1, nz
1237	DO i = 0, nx
1238	work_fftx(i,k,j) = work(j,k,i)
1239	ENDDO
1240	ENDDO
1241
1242	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1243
1244	ENDDO
1245
1246	!$OMP DO
1247	DO i = 0, nx
1248	DO j = nys, nyn
1249	DO k = 1, nz
1250	f_out(k,j,i) = work_fftx(i,k,j)
1251	ENDDO
1252	ENDDO
1253	ENDDO
1254	!$OMP END PARALLEL
1255
1256	ELSE
1257
1258	!
1259	!-- Cache optimized code (there might be still a potential for better
1260	!-- optimization).
1261	!$OMP PARALLEL PRIVATE (i,j,k)
1262	!$OMP DO
1263	DO j = nys, nyn
1264	DO k = 1, nz
1265
1266	DO i = 0, nx
1267	work_fftx(i,k,j) = work(j,k,i)
1268	ENDDO
1269
1270	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1271
1272	ENDDO
1273	ENDDO
1274
1275	!$OMP DO
1276	DO i = 0, nx
1277	DO j = nys, nyn
1278	DO k = 1, nz
1279	f_out(k,j,i) = work_fftx(i,k,j)
1280	ENDDO
1281	ENDDO
1282	ENDDO
1283	!$OMP END PARALLEL
1284
1285	ENDIF
1286	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1287
1288	END SUBROUTINE tr_yx_fftx
1289
1290
1291	SUBROUTINE ffty_tri_ffty( ar )
1292
1293	!------------------------------------------------------------------------------!
1294	! FFT along y, solution of the tridiagonal system and backward FFT for
1295	! a 1d-decomposition along y
1296	!
1297	! WARNING: this subroutine may still not work for hybrid parallelization
1298	! with OpenMP (for possible necessary changes see the original
1299	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1300	!------------------------------------------------------------------------------!
1301
1302	USE control_parameters
1303	USE cpulog
1304	USE grid_variables
1305	USE indices
1306	USE interfaces
1307	USE pegrid
1308	USE transpose_indices
1309
1310	IMPLICIT NONE
1311
1312	INTEGER :: i, j, k, m, n, omp_get_thread_num, tn
1313
1314	REAL, DIMENSION(0:ny) :: work_ffty
1315	REAL, DIMENSION(0:ny,1:nz) :: work_triy
1316	REAL, DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar
1317	REAL, DIMENSION(:,:,:,:), ALLOCATABLE :: tri
1318
1319
1320	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1321
1322	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1323
1324	tn = 0 ! Default thread number in case of one thread
1325	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1326	DO i = nxl_y, nxr_y
1327
1328	!$ tn = omp_get_thread_num()
1329
1330	IF ( host(1:3) == 'nec' ) THEN
1331	!
1332	!-- Code optimized for vector processors
1333	DO k = 1, nz
1334
1335	m = 0
1336	DO n = 1, pdims(2)
1337	DO j = 1, nny
1338	work_triy(m,k) = ar(j,k,i,n)
1339	m = m + 1
1340	ENDDO
1341	ENDDO
1342
1343	ENDDO
1344
1345	CALL fft_y_m( work_triy, ny, 'forward' )
1346
1347	ELSE
1348	!
1349	!-- Cache optimized code
1350	DO k = 1, nz
1351
1352	m = 0
1353	DO n = 1, pdims(2)
1354	DO j = 1, nny
1355	work_ffty(m) = ar(j,k,i,n)
1356	m = m + 1
1357	ENDDO
1358	ENDDO
1359
1360	CALL fft_y_1d( work_ffty, 'forward' )
1361
1362	DO j = 0, ny
1363	work_triy(j,k) = work_ffty(j)
1364	ENDDO
1365
1366	ENDDO
1367
1368	ENDIF
1369
1370	!
1371	!-- Solve the linear equation system
1372	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1373
1374	IF ( host(1:3) == 'nec' ) THEN
1375	!
1376	!-- Code optimized for vector processors
1377	CALL fft_y_m( work_triy, ny, 'backward' )
1378
1379	DO k = 1, nz
1380
1381	m = 0
1382	DO n = 1, pdims(2)
1383	DO j = 1, nny
1384	ar(j,k,i,n) = work_triy(m,k)
1385	m = m + 1
1386	ENDDO
1387	ENDDO
1388
1389	ENDDO
1390
1391	ELSE
1392	!
1393	!-- Cache optimized code
1394	DO k = 1, nz
1395
1396	DO j = 0, ny
1397	work_ffty(j) = work_triy(j,k)
1398	ENDDO
1399
1400	CALL fft_y_1d( work_ffty, 'backward' )
1401
1402	m = 0
1403	DO n = 1, pdims(2)
1404	DO j = 1, nny
1405	ar(j,k,i,n) = work_ffty(m)
1406	m = m + 1
1407	ENDDO
1408	ENDDO
1409
1410	ENDDO
1411
1412	ENDIF
1413
1414	ENDDO
1415
1416	DEALLOCATE( tri )
1417
1418	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1419
1420	END SUBROUTINE ffty_tri_ffty
1421
1422	#endif
1423
1424	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |