Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1524

Last change on this file since 1524 was 1483, checked in by raasch, 10 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 45.0 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2014 Leibniz Universitaet Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1483 2014-10-18 12:51:13Z keck $
27	!
28	! 1482 2014-10-18 12:34:45Z raasch
29	! use 2d-decomposition, if accelerator boards are used
30	!
31	! 1406 2014-05-16 13:47:01Z raasch
32	! bugfix for pgi 14.4: declare create moved after array declaration
33	!
34	! 1320 2014-03-20 08:40:49Z raasch
35	! ONLY-attribute added to USE-statements,
36	! kind-parameters added to all INTEGER and REAL declaration statements,
37	! kinds are defined in new module kinds,
38	! old module precision_kind is removed,
39	! revision history before 2012 removed,
40	! comment fields (!:) to be used for variable explanations added to
41	! all variable declaration statements
42	!
43	! 1318 2014-03-17 13:35:16Z raasch
44	! module interfaces removed
45	!
46	! 1306 2014-03-13 14:30:59Z raasch
47	! openmp sections removed from the overlap branch,
48	! second argument removed from parameter list
49	!
50	! 1216 2013-08-26 09:31:42Z raasch
51	! resorting of arrays moved to separate routines resort_for_...,
52	! one argument, used as temporary work array, removed from all transpose
53	! routines
54	! overlapping fft / transposition implemented
55	!
56	! 1212 2013-08-15 08:46:27Z raasch
57	! tridia routines moved to seperate module tridia_solver
58	!
59	! 1208 2013-08-13 06:41:49Z raasch
60	! acc-update clauses added for "ar" so that ffts other than cufft can also be
61	! used (although they are not ported and will give a poor performance)
62	!
63	! 1111 2013-03-08 23:54:10Z raasch
64	! further openACC porting of non-parallel (MPI) branch:
65	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
66	! no distinction between parallel/non-parallel in poisfft and tridia any more,
67	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
68	! (otherwise "invalid device function" is indicated during runtime),
69	! optimization of tridia routines: constant elements and coefficients of tri are
70	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
71	! to 2,
72	! poisfft_init is now called internally from poisfft, maketri is called from
73	! poisfft_init,
74	! ibc_p_b = 2 removed
75	!
76	! 1106 2013-03-04 05:31:38Z raasch
77	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
78	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
79	! fft_y_1d
80	!
81	! 1103 2013-02-20 02:15:53Z raasch
82	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
83	! sometimes cause segmentation faults with intel 12.1 compiler
84	!
85	! 1092 2013-02-02 11:24:22Z raasch
86	! unused variables removed
87	!
88	! 1036 2012-10-22 13:43:42Z raasch
89	! code put under GPL (PALM 3.9)
90	!
91	! 2012-09-21 07:03:55Z raasch
92	! FLOAT type conversion replaced by REAL
93	!
94	! 1003 2012-09-14 14:35:53Z raasch
95	! indices nxa, nya, etc. replaced by nx, ny, etc.
96	!
97	! 940 2012-07-09 14:31:00Z raasch
98	! special handling of tri-array as an argument in tridia_1dd routines switched
99	! off because it caused segmentation faults with intel 12.1 compiler
100	!
101	! 877 2012-04-03 11:21:44Z suehring
102	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
103	! pressure at the top of the model domain.
104	!
105	! 809 2012-01-30 13:32:58Z maronga
106	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
107	!
108	! 807 2012-01-25 11:53:51Z maronga
109	! New cpp directive "__check" implemented which is used by check_namelist_files
110	! (most of the code is unneeded by check_namelist_files).
111	!
112	! Revision 1.1 1997/07/24 11:24:14 raasch
113	! Initial revision
114	!
115	!
116	! Description:
117	! ------------
118	! Original version by Stephan Siano (pois3d), as of July 23, 1996
119	! Adapted for 2D-domain-decomposition by Siegfried Raasch, July 3, 1997
120	!
121	! Solves the Poisson equation with a 2D spectral method
122	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
123	!
124	! Input:
125	! real ar contains (nnz,nny,nnx) elements of the velocity divergence,
126	! starting from (1,nys,nxl)
127	!
128	! Output:
129	! real ar contains the solution for perturbation pressure p
130	!------------------------------------------------------------------------------!
131
132	USE fft_xy, &
133	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
134
135	USE indices, &
136	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
137
138	USE transpose_indices, &
139	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
140	nzb_y, nzt_x, nzt_y
141
142	USE tridia_solver, &
143	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
144
145	IMPLICIT NONE
146
147	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
148
149	PRIVATE
150
151	#if ! defined ( __check )
152	PUBLIC poisfft, poisfft_init
153
154	INTERFACE poisfft
155	MODULE PROCEDURE poisfft
156	END INTERFACE poisfft
157
158	INTERFACE poisfft_init
159	MODULE PROCEDURE poisfft_init
160	END INTERFACE poisfft_init
161	#else
162	PUBLIC poisfft_init
163
164	INTERFACE poisfft_init
165	MODULE PROCEDURE poisfft_init
166	END INTERFACE poisfft_init
167	#endif
168
169	CONTAINS
170
171	SUBROUTINE poisfft_init
172
173	USE arrays_3d, &
174	ONLY: ddzu_pres, ddzw
175
176	USE kinds
177
178	IMPLICIT NONE
179
180	INTEGER(iwp) :: k !:
181
182
183	CALL fft_init
184
185	CALL tridia_init
186
187	poisfft_initialized = .TRUE.
188
189	END SUBROUTINE poisfft_init
190
191
192	#if ! defined ( __check )
193	SUBROUTINE poisfft( ar )
194
195	USE control_parameters, &
196	ONLY: fft_method, transpose_compute_overlap
197
198	USE cpulog, &
199	ONLY: cpu_log, cpu_log_nowait, log_point_s
200
201	USE kinds
202
203	USE pegrid
204
205	IMPLICIT NONE
206
207	INTEGER(iwp) :: ii !:
208	INTEGER(iwp) :: iind !:
209	INTEGER(iwp) :: inew !:
210	INTEGER(iwp) :: jj !:
211	INTEGER(iwp) :: jind !:
212	INTEGER(iwp) :: jnew !:
213	INTEGER(iwp) :: ki !:
214	INTEGER(iwp) :: kk !:
215	INTEGER(iwp) :: knew !:
216	INTEGER(iwp) :: n !:
217	INTEGER(iwp) :: nblk !:
218	INTEGER(iwp) :: nnx_y !:
219	INTEGER(iwp) :: nny_z !:
220	INTEGER(iwp) :: nnz_t !:
221	INTEGER(iwp) :: nnz_x !:
222	INTEGER(iwp) :: nxl_y_bound !:
223	INTEGER(iwp) :: nxr_y_bound !:
224
225	INTEGER(iwp), DIMENSION(4) :: isave !:
226
227	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !:
228	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !:
229	!$acc declare create( ar_inv )
230
231	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !:
232	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !:
233	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !:
234	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !:
235	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !:
236
237
238	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
239
240	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
241
242	!
243	!-- Two-dimensional Fourier Transformation in x- and y-direction.
244	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
245	THEN
246
247	!
248	!-- 1d-domain-decomposition along x:
249	!-- FFT along y and transposition y --> x
250	CALL ffty_tr_yx( ar, ar )
251
252	!
253	!-- FFT along x, solving the tridiagonal system and backward FFT
254	CALL fftx_tri_fftx( ar )
255
256	!
257	!-- Transposition x --> y and backward FFT along y
258	CALL tr_xy_ffty( ar, ar )
259
260	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
261	THEN
262
263	!
264	!-- 1d-domain-decomposition along y:
265	!-- FFT along x and transposition x --> y
266	CALL fftx_tr_xy( ar, ar )
267
268	!
269	!-- FFT along y, solving the tridiagonal system and backward FFT
270	CALL ffty_tri_ffty( ar )
271
272	!
273	!-- Transposition y --> x and backward FFT along x
274	CALL tr_yx_fftx( ar, ar )
275
276	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
277
278	!
279	!-- 2d-domain-decomposition or no decomposition (1 PE run)
280	!-- Transposition z --> x
281	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
282	CALL resort_for_zx( ar, ar_inv )
283	CALL transpose_zx( ar_inv, ar )
284	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
285
286	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
287	IF ( fft_method /= 'system-specific' ) THEN
288	!$acc update host( ar )
289	ENDIF
290	CALL fft_x( ar, 'forward' )
291	IF ( fft_method /= 'system-specific' ) THEN
292	!$acc update device( ar )
293	ENDIF
294	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
295
296	!
297	!-- Transposition x --> y
298	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
299	CALL resort_for_xy( ar, ar_inv )
300	CALL transpose_xy( ar_inv, ar )
301	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
302
303	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
304	IF ( fft_method /= 'system-specific' ) THEN
305	!$acc update host( ar )
306	ENDIF
307	CALL fft_y( ar, 'forward', ar_tr = ar, &
308	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
309	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
310	IF ( fft_method /= 'system-specific' ) THEN
311	!$acc update device( ar )
312	ENDIF
313	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
314
315	!
316	!-- Transposition y --> z
317	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
318	CALL resort_for_yz( ar, ar_inv )
319	CALL transpose_yz( ar_inv, ar )
320	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
321
322	!
323	!-- Solve the tridiagonal equation system along z
324	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
325	CALL tridia_substi( ar )
326	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
327
328	!
329	!-- Inverse Fourier Transformation
330	!-- Transposition z --> y
331	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
332	CALL transpose_zy( ar, ar_inv )
333	CALL resort_for_zy( ar_inv, ar )
334	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
335
336	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
337	IF ( fft_method /= 'system-specific' ) THEN
338	!$acc update host( ar )
339	ENDIF
340	CALL fft_y( ar, 'backward', ar_tr = ar, &
341	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
342	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
343	IF ( fft_method /= 'system-specific' ) THEN
344	!$acc update device( ar )
345	ENDIF
346	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
347
348	!
349	!-- Transposition y --> x
350	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
351	CALL transpose_yx( ar, ar_inv )
352	CALL resort_for_yx( ar_inv, ar )
353	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
354
355	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
356	IF ( fft_method /= 'system-specific' ) THEN
357	!$acc update host( ar )
358	ENDIF
359	CALL fft_x( ar, 'backward' )
360	IF ( fft_method /= 'system-specific' ) THEN
361	!$acc update device( ar )
362	ENDIF
363	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
364
365	!
366	!-- Transposition x --> z
367	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
368	CALL transpose_xz( ar, ar_inv )
369	CALL resort_for_xz( ar_inv, ar )
370	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
371
372	ELSE
373
374	!
375	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
376	!-- overlapping transposition / fft
377	!-- cputime logging must not use barriers, which would prevent overlapping
378	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
379	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
380	!
381	!-- Transposition z --> x + subsequent fft along x
382	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
383	CALL resort_for_zx( ar, f_inv )
384	!
385	!-- Save original indices and gridpoint counter
386	isave(1) = nz
387	isave(2) = nzb_x
388	isave(3) = nzt_x
389	isave(4) = sendrecvcount_zx
390	!
391	!-- Set new indices for transformation
392	nblk = nz / pdims(1)
393	nz = pdims(1)
394	nnz_x = 1
395	nzb_x = 1 + myidx * nnz_x
396	nzt_x = ( myidx + 1 ) * nnz_x
397	sendrecvcount_zx = nnx * nny * nnz_x
398
399	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
400	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
401
402	DO kk = 1, nblk
403
404	IF ( kk == 1 ) THEN
405	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
406	ELSE
407	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
408	ENDIF
409
410	DO knew = 1, nz
411	ki = kk + nblk * ( knew - 1 )
412	f_in(:,:,knew) = f_inv(:,:,ki)
413	ENDDO
414
415	CALL transpose_zx( f_in, ar1(:,:,:))
416	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
417
418	IF ( kk == 1 ) THEN
419	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
420	ELSE
421	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
422	ENDIF
423
424	n = isave(2) + kk - 1
425	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
426	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
427
428	ENDDO
429	!
430	!-- Restore original indices/counters
431	nz = isave(1)
432	nzb_x = isave(2)
433	nzt_x = isave(3)
434	sendrecvcount_zx = isave(4)
435
436	DEALLOCATE( ar1, f_in, f_inv )
437
438	!
439	!-- Transposition x --> y + subsequent fft along y
440	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
441	CALL resort_for_xy( f_out_z, f_inv )
442	!
443	!-- Save original indices and gridpoint counter
444	isave(1) = nx
445	isave(2) = nxl_y
446	isave(3) = nxr_y
447	isave(4) = sendrecvcount_xy
448	!
449	!-- Set new indices for transformation
450	nblk = ( ( nx+1 ) / pdims(2) ) - 1
451	nx = pdims(2)
452	nnx_y = 1
453	nxl_y = myidy * nnx_y
454	nxr_y = ( myidy + 1 ) * nnx_y - 1
455	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
456
457	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
458	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
459
460	DO ii = 0, nblk
461
462	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
463
464	DO inew = 0, nx-1
465	iind = ii + ( nblk + 1 ) * inew
466	f_in(:,:,inew) = f_inv(:,:,iind)
467	ENDDO
468
469	CALL transpose_xy( f_in, ar1(:,:,:) )
470
471	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
472
473	IF ( ii == 1 ) THEN
474	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
475	ELSE
476	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
477	ENDIF
478
479	nxl_y_bound = isave(2)
480	nxr_y_bound = isave(3)
481	n = isave(2) + ii
482	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
483	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
484	nxl_y_l = n, nxr_y_l = n )
485
486	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
487
488	ENDDO
489	!
490	!-- Restore original indices/counters
491	nx = isave(1)
492	nxl_y = isave(2)
493	nxr_y = isave(3)
494	sendrecvcount_xy = isave(4)
495
496	DEALLOCATE( ar1, f_in, f_inv )
497
498	!
499	!-- Transposition y --> z + subsequent tridia + resort for z --> y
500	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
501	CALL resort_for_yz( f_out_y, f_inv )
502	!
503	!-- Save original indices and gridpoint counter
504	isave(1) = ny
505	isave(2) = nys_z
506	isave(3) = nyn_z
507	isave(4) = sendrecvcount_yz
508	!
509	!-- Set new indices for transformation
510	nblk = ( ( ny+1 ) / pdims(1) ) - 1
511	ny = pdims(1)
512	nny_z = 1
513	nys_z = myidx * nny_z
514	nyn_z = ( myidx + 1 ) * nny_z - 1
515	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
516
517	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
518	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
519
520	DO jj = 0, nblk
521	!
522	!-- Forward Fourier Transformation
523	!-- Transposition y --> z
524	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
525
526	DO jnew = 0, ny-1
527	jind = jj + ( nblk + 1 ) * jnew
528	f_in(:,:,jnew) = f_inv(:,:,jind)
529	ENDDO
530
531	CALL transpose_yz( f_in, ar1(:,:,:) )
532
533	IF ( jj == nblk ) THEN
534	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
535	ELSE
536	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
537	ENDIF
538
539	!
540	!-- Solve the tridiagonal equation system along z
541	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
542
543	n = isave(2) + jj
544	CALL tridia_substi_overlap( ar1(:,:,:), n )
545
546	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
547
548	!
549	!-- Inverse Fourier Transformation
550	!-- Transposition z --> y
551	!-- Only one thread should call MPI routines, therefore forward and
552	!-- backward tranpose are in the same section
553	IF ( jj == 0 ) THEN
554	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
555	ELSE
556	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
557	ENDIF
558
559	CALL transpose_zy( ar1(:,:,:), f_in )
560
561	DO jnew = 0, ny-1
562	jind = jj + ( nblk + 1 ) * jnew
563	f_inv(:,:,jind) = f_in(:,:,jnew)
564	ENDDO
565
566	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
567
568	ENDDO
569	!
570	!-- Restore original indices/counters
571	ny = isave(1)
572	nys_z = isave(2)
573	nyn_z = isave(3)
574	sendrecvcount_yz = isave(4)
575
576	CALL resort_for_zy( f_inv, f_out_y )
577
578	DEALLOCATE( ar1, f_in, f_inv )
579
580	!
581	!-- fft along y backward + subsequent transposition y --> x
582	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
583	!
584	!-- Save original indices and gridpoint counter
585	isave(1) = nx
586	isave(2) = nxl_y
587	isave(3) = nxr_y
588	isave(4) = sendrecvcount_xy
589	!
590	!-- Set new indices for transformation
591	nblk = (( nx+1 ) / pdims(2) ) - 1
592	nx = pdims(2)
593	nnx_y = 1
594	nxl_y = myidy * nnx_y
595	nxr_y = ( myidy + 1 ) * nnx_y - 1
596	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
597
598	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
599	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
600
601	DO ii = 0, nblk
602
603	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
604
605	n = isave(2) + ii
606	nxl_y_bound = isave(2)
607	nxr_y_bound = isave(3)
608
609	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
610	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
611	nxl_y_l = n, nxr_y_l = n )
612
613	IF ( ii == nblk ) THEN
614	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
615	ELSE
616	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
617	ENDIF
618
619	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
620
621	CALL transpose_yx( ar1(:,:,:), f_in )
622
623	DO inew = 0, nx-1
624	iind = ii + (nblk+1) * inew
625	f_inv(:,:,iind) = f_in(:,:,inew)
626	ENDDO
627
628	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
629
630	ENDDO
631	!
632	!-- Restore original indices/counters
633	nx = isave(1)
634	nxl_y = isave(2)
635	nxr_y = isave(3)
636	sendrecvcount_xy = isave(4)
637
638	CALL resort_for_yx( f_inv, f_out_z )
639
640	DEALLOCATE( ar1, f_in, f_inv )
641
642	!
643	!-- fft along x backward + subsequent final transposition x --> z
644	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
645	!
646	!-- Save original indices and gridpoint counter
647	isave(1) = nz
648	isave(2) = nzb_x
649	isave(3) = nzt_x
650	isave(4) = sendrecvcount_zx
651	!
652	!-- Set new indices for transformation
653	nblk = nz / pdims(1)
654	nz = pdims(1)
655	nnz_x = 1
656	nzb_x = 1 + myidx * nnz_x
657	nzt_x = ( myidx + 1 ) * nnz_x
658	sendrecvcount_zx = nnx * nny * nnz_x
659
660	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
661	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
662
663	DO kk = 1, nblk
664
665	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
666
667	n = isave(2) + kk - 1
668	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
669
670	IF ( kk == nblk ) THEN
671	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
672	ELSE
673	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
674	ENDIF
675
676	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
677
678	CALL transpose_xz( ar1(:,:,:), f_in )
679
680	DO knew = 1, nz
681	ki = kk + nblk * (knew-1)
682	f_inv(:,:,ki) = f_in(:,:,knew)
683	ENDDO
684
685	IF ( kk == nblk ) THEN
686	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
687	ELSE
688	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
689	ENDIF
690
691	ENDDO
692	!
693	!-- Restore original indices/counters
694	nz = isave(1)
695	nzb_x = isave(2)
696	nzt_x = isave(3)
697	sendrecvcount_zx = isave(4)
698
699	CALL resort_for_xz( f_inv, ar )
700
701	DEALLOCATE( ar1, f_in, f_inv )
702
703	ENDIF
704
705	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
706
707	END SUBROUTINE poisfft
708
709
710
711	SUBROUTINE ffty_tr_yx( f_in, f_out )
712
713	!------------------------------------------------------------------------------!
714	! Fourier-transformation along y with subsequent transposition y --> x for
715	! a 1d-decomposition along x
716	!
717	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
718	! if the first index of work_ffty_vec is odd. Otherwise
719	! memory bank conflicts may occur (especially if the index is a
720	! multiple of 128). That's why work_ffty_vec is dimensioned as
721	! 0:ny+1.
722	! Of course, this will not work if users are using an odd number
723	! of gridpoints along y.
724	!------------------------------------------------------------------------------!
725
726	USE control_parameters, &
727	ONLY: host
728
729	USE cpulog, &
730	ONLY: cpu_log, log_point_s
731
732	USE kinds
733
734	USE pegrid
735
736	IMPLICIT NONE
737
738	INTEGER(iwp) :: i !:
739	INTEGER(iwp) :: iend !:
740	INTEGER(iwp) :: iouter !:
741	INTEGER(iwp) :: ir !:
742	INTEGER(iwp) :: j !:
743	INTEGER(iwp) :: k !:
744
745	INTEGER(iwp), PARAMETER :: stridex = 4 !:
746
747	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
748	#if defined( __nec )
749	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
750	#endif
751	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !:
752	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !:
753	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
754
755	!
756	!-- Carry out the FFT along y, where all data are present due to the
757	!-- 1d-decomposition along x. Resort the data in a way that x becomes
758	!-- the first index.
759	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
760
761	IF ( host(1:3) == 'nec' ) THEN
762	#if defined( __nec )
763	!
764	!-- Code optimized for vector processors
765	!$OMP PARALLEL PRIVATE ( i, j, k )
766	!$OMP DO
767	DO i = nxl, nxr
768
769	DO j = 0, ny
770	DO k = 1, nz
771	work_ffty_vec(j,k,i) = f_in(k,j,i)
772	ENDDO
773	ENDDO
774
775	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
776
777	ENDDO
778
779	!$OMP DO
780	DO k = 1, nz
781	DO j = 0, ny
782	DO i = nxl, nxr
783	work(i,k,j) = work_ffty_vec(j,k,i)
784	ENDDO
785	ENDDO
786	ENDDO
787	!$OMP END PARALLEL
788	#endif
789
790	ELSE
791
792	!
793	!-- Cache optimized code.
794	!-- The i-(x-)direction is split into a strided outer loop and an inner
795	!-- loop for better cache performance
796	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
797	!$OMP DO
798	DO iouter = nxl, nxr, stridex
799
800	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
801
802	DO k = 1, nz
803
804	DO i = iouter, iend
805
806	ir = i-iouter+1 ! counter within a stride
807	DO j = 0, ny
808	work_ffty(j,ir) = f_in(k,j,i)
809	ENDDO
810	!
811	!-- FFT along y
812	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
813
814	ENDDO
815
816	!
817	!-- Resort
818	DO j = 0, ny
819	DO i = iouter, iend
820	work(i,k,j) = work_ffty(j,i-iouter+1)
821	ENDDO
822	ENDDO
823
824	ENDDO
825
826	ENDDO
827	!$OMP END PARALLEL
828
829	ENDIF
830	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
831
832	!
833	!-- Transpose array
834	#if defined( __parallel )
835	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
836	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
837	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
838	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
839	comm1dx, ierr )
840	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
841	#endif
842
843	END SUBROUTINE ffty_tr_yx
844
845
846	SUBROUTINE tr_xy_ffty( f_in, f_out )
847
848	!------------------------------------------------------------------------------!
849	! Transposition x --> y with a subsequent backward Fourier transformation for
850	! a 1d-decomposition along x
851	!------------------------------------------------------------------------------!
852
853	USE control_parameters, &
854	ONLY: host
855
856	USE cpulog, &
857	ONLY: cpu_log, log_point_s
858
859	USE kinds
860
861	USE pegrid
862
863	IMPLICIT NONE
864
865	INTEGER(iwp) :: i !:
866	INTEGER(iwp) :: iend !:
867	INTEGER(iwp) :: iouter !:
868	INTEGER(iwp) :: ir !:
869	INTEGER(iwp) :: j !:
870	INTEGER(iwp) :: k !:
871
872	INTEGER(iwp), PARAMETER :: stridex = 4 !:
873
874	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
875	#if defined( __nec )
876	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
877	#endif
878	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !:
879	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !:
880	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
881
882	!
883	!-- Transpose array
884	#if defined( __parallel )
885	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
886	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
887	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
888	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
889	comm1dx, ierr )
890	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
891	#endif
892
893	!
894	!-- Resort the data in a way that y becomes the first index and carry out the
895	!-- backward fft along y.
896	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
897
898	IF ( host(1:3) == 'nec' ) THEN
899	#if defined( __nec )
900	!
901	!-- Code optimized for vector processors
902	!$OMP PARALLEL PRIVATE ( i, j, k )
903	!$OMP DO
904	DO k = 1, nz
905	DO j = 0, ny
906	DO i = nxl, nxr
907	work_ffty_vec(j,k,i) = work(i,k,j)
908	ENDDO
909	ENDDO
910	ENDDO
911
912	!$OMP DO
913	DO i = nxl, nxr
914
915	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
916
917	DO j = 0, ny
918	DO k = 1, nz
919	f_out(k,j,i) = work_ffty_vec(j,k,i)
920	ENDDO
921	ENDDO
922
923	ENDDO
924	!$OMP END PARALLEL
925	#endif
926
927	ELSE
928
929	!
930	!-- Cache optimized code.
931	!-- The i-(x-)direction is split into a strided outer loop and an inner
932	!-- loop for better cache performance
933	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
934	!$OMP DO
935	DO iouter = nxl, nxr, stridex
936
937	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
938
939	DO k = 1, nz
940	!
941	!-- Resort
942	DO j = 0, ny
943	DO i = iouter, iend
944	work_ffty(j,i-iouter+1) = work(i,k,j)
945	ENDDO
946	ENDDO
947
948	DO i = iouter, iend
949
950	!
951	!-- FFT along y
952	ir = i-iouter+1 ! counter within a stride
953	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
954
955	DO j = 0, ny
956	f_out(k,j,i) = work_ffty(j,ir)
957	ENDDO
958	ENDDO
959
960	ENDDO
961
962	ENDDO
963	!$OMP END PARALLEL
964
965	ENDIF
966
967	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
968
969	END SUBROUTINE tr_xy_ffty
970
971
972	SUBROUTINE fftx_tri_fftx( ar )
973
974	!------------------------------------------------------------------------------!
975	! FFT along x, solution of the tridiagonal system and backward FFT for
976	! a 1d-decomposition along x
977	!
978	! WARNING: this subroutine may still not work for hybrid parallelization
979	! with OpenMP (for possible necessary changes see the original
980	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
981	!------------------------------------------------------------------------------!
982
983	USE control_parameters, &
984	ONLY: host
985
986	USE cpulog, &
987	ONLY: cpu_log, log_point_s
988
989	USE grid_variables, &
990	ONLY: ddx2, ddy2
991
992	USE kinds
993
994	USE pegrid
995
996	IMPLICIT NONE
997
998	INTEGER(iwp) :: i !:
999	INTEGER(iwp) :: j !:
1000	INTEGER(iwp) :: k !:
1001	INTEGER(iwp) :: m !:
1002	INTEGER(iwp) :: n !:
1003	INTEGER(iwp) :: omp_get_thread_num !:
1004	INTEGER(iwp) :: tn !:
1005
1006	REAL(wp), DIMENSION(0:nx) :: work_fftx !:
1007	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !:
1008	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !:
1009	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
1010
1011
1012	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1013
1014	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1015
1016	tn = 0 ! Default thread number in case of one thread
1017	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1018	DO j = nys_x, nyn_x
1019
1020	!$ tn = omp_get_thread_num()
1021
1022	IF ( host(1:3) == 'nec' ) THEN
1023	!
1024	!-- Code optimized for vector processors
1025	DO k = 1, nz
1026
1027	m = 0
1028	DO n = 1, pdims(1)
1029	DO i = 1, nnx
1030	work_trix(m,k) = ar(i,k,j,n)
1031	m = m + 1
1032	ENDDO
1033	ENDDO
1034
1035	ENDDO
1036
1037	CALL fft_x_m( work_trix, 'forward' )
1038
1039	ELSE
1040	!
1041	!-- Cache optimized code
1042	DO k = 1, nz
1043
1044	m = 0
1045	DO n = 1, pdims(1)
1046	DO i = 1, nnx
1047	work_fftx(m) = ar(i,k,j,n)
1048	m = m + 1
1049	ENDDO
1050	ENDDO
1051
1052	CALL fft_x_1d( work_fftx, 'forward' )
1053
1054	DO i = 0, nx
1055	work_trix(i,k) = work_fftx(i)
1056	ENDDO
1057
1058	ENDDO
1059
1060	ENDIF
1061
1062	!
1063	!-- Solve the linear equation system
1064	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1065
1066	IF ( host(1:3) == 'nec' ) THEN
1067	!
1068	!-- Code optimized for vector processors
1069	CALL fft_x_m( work_trix, 'backward' )
1070
1071	DO k = 1, nz
1072
1073	m = 0
1074	DO n = 1, pdims(1)
1075	DO i = 1, nnx
1076	ar(i,k,j,n) = work_trix(m,k)
1077	m = m + 1
1078	ENDDO
1079	ENDDO
1080
1081	ENDDO
1082
1083	ELSE
1084	!
1085	!-- Cache optimized code
1086	DO k = 1, nz
1087
1088	DO i = 0, nx
1089	work_fftx(i) = work_trix(i,k)
1090	ENDDO
1091
1092	CALL fft_x_1d( work_fftx, 'backward' )
1093
1094	m = 0
1095	DO n = 1, pdims(1)
1096	DO i = 1, nnx
1097	ar(i,k,j,n) = work_fftx(m)
1098	m = m + 1
1099	ENDDO
1100	ENDDO
1101
1102	ENDDO
1103
1104	ENDIF
1105
1106	ENDDO
1107
1108	DEALLOCATE( tri )
1109
1110	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1111
1112	END SUBROUTINE fftx_tri_fftx
1113
1114
1115	SUBROUTINE fftx_tr_xy( f_in, f_out )
1116
1117	!------------------------------------------------------------------------------!
1118	! Fourier-transformation along x with subsequent transposition x --> y for
1119	! a 1d-decomposition along y
1120	!
1121	! ATTENTION: The NEC-branch of this routine may significantly profit from
1122	! further optimizations. So far, performance is much worse than
1123	! for routine ffty_tr_yx (more than three times slower).
1124	!------------------------------------------------------------------------------!
1125
1126	USE control_parameters, &
1127	ONLY: host
1128
1129	USE cpulog, &
1130	ONLY: cpu_log, log_point_s
1131
1132	USE kinds
1133
1134	USE pegrid
1135
1136	IMPLICIT NONE
1137
1138	INTEGER(iwp) :: i !:
1139	INTEGER(iwp) :: j !:
1140	INTEGER(iwp) :: k !:
1141
1142	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
1143	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !:
1144	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !:
1145	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
1146
1147	!
1148	!-- Carry out the FFT along x, where all data are present due to the
1149	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1150	!-- the first index.
1151	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1152
1153	IF ( host(1:3) == 'nec' ) THEN
1154	!
1155	!-- Code for vector processors
1156	!$OMP PARALLEL PRIVATE ( i, j, k )
1157	!$OMP DO
1158	DO i = 0, nx
1159
1160	DO j = nys, nyn
1161	DO k = 1, nz
1162	work_fftx(i,k,j) = f_in(k,j,i)
1163	ENDDO
1164	ENDDO
1165
1166	ENDDO
1167
1168	!$OMP DO
1169	DO j = nys, nyn
1170
1171	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1172
1173	DO k = 1, nz
1174	DO i = 0, nx
1175	work(j,k,i) = work_fftx(i,k,j)
1176	ENDDO
1177	ENDDO
1178
1179	ENDDO
1180	!$OMP END PARALLEL
1181
1182	ELSE
1183
1184	!
1185	!-- Cache optimized code (there might be still a potential for better
1186	!-- optimization).
1187	!$OMP PARALLEL PRIVATE (i,j,k)
1188	!$OMP DO
1189	DO i = 0, nx
1190
1191	DO j = nys, nyn
1192	DO k = 1, nz
1193	work_fftx(i,k,j) = f_in(k,j,i)
1194	ENDDO
1195	ENDDO
1196
1197	ENDDO
1198
1199	!$OMP DO
1200	DO j = nys, nyn
1201	DO k = 1, nz
1202
1203	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1204
1205	DO i = 0, nx
1206	work(j,k,i) = work_fftx(i,k,j)
1207	ENDDO
1208	ENDDO
1209
1210	ENDDO
1211	!$OMP END PARALLEL
1212
1213	ENDIF
1214	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1215
1216	!
1217	!-- Transpose array
1218	#if defined( __parallel )
1219	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1220	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1221	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1222	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1223	comm1dy, ierr )
1224	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1225	#endif
1226
1227	END SUBROUTINE fftx_tr_xy
1228
1229
1230	SUBROUTINE tr_yx_fftx( f_in, f_out )
1231
1232	!------------------------------------------------------------------------------!
1233	! Transposition y --> x with a subsequent backward Fourier transformation for
1234	! a 1d-decomposition along x
1235	!------------------------------------------------------------------------------!
1236
1237	USE control_parameters, &
1238	ONLY: host
1239
1240	USE cpulog, &
1241	ONLY: cpu_log, log_point_s
1242
1243	USE kinds
1244
1245	USE pegrid
1246
1247	IMPLICIT NONE
1248
1249	INTEGER(iwp) :: i !:
1250	INTEGER(iwp) :: j !:
1251	INTEGER(iwp) :: k !:
1252
1253	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
1254	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !:
1255	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !:
1256	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
1257
1258	!
1259	!-- Transpose array
1260	#if defined( __parallel )
1261	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1262	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1263	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1264	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1265	comm1dy, ierr )
1266	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1267	#endif
1268
1269	!
1270	!-- Carry out the FFT along x, where all data are present due to the
1271	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1272	!-- the first index.
1273	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1274
1275	IF ( host(1:3) == 'nec' ) THEN
1276	!
1277	!-- Code optimized for vector processors
1278	!$OMP PARALLEL PRIVATE ( i, j, k )
1279	!$OMP DO
1280	DO j = nys, nyn
1281
1282	DO k = 1, nz
1283	DO i = 0, nx
1284	work_fftx(i,k,j) = work(j,k,i)
1285	ENDDO
1286	ENDDO
1287
1288	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1289
1290	ENDDO
1291
1292	!$OMP DO
1293	DO i = 0, nx
1294	DO j = nys, nyn
1295	DO k = 1, nz
1296	f_out(k,j,i) = work_fftx(i,k,j)
1297	ENDDO
1298	ENDDO
1299	ENDDO
1300	!$OMP END PARALLEL
1301
1302	ELSE
1303
1304	!
1305	!-- Cache optimized code (there might be still a potential for better
1306	!-- optimization).
1307	!$OMP PARALLEL PRIVATE (i,j,k)
1308	!$OMP DO
1309	DO j = nys, nyn
1310	DO k = 1, nz
1311
1312	DO i = 0, nx
1313	work_fftx(i,k,j) = work(j,k,i)
1314	ENDDO
1315
1316	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1317
1318	ENDDO
1319	ENDDO
1320
1321	!$OMP DO
1322	DO i = 0, nx
1323	DO j = nys, nyn
1324	DO k = 1, nz
1325	f_out(k,j,i) = work_fftx(i,k,j)
1326	ENDDO
1327	ENDDO
1328	ENDDO
1329	!$OMP END PARALLEL
1330
1331	ENDIF
1332	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1333
1334	END SUBROUTINE tr_yx_fftx
1335
1336
1337	SUBROUTINE ffty_tri_ffty( ar )
1338
1339	!------------------------------------------------------------------------------!
1340	! FFT along y, solution of the tridiagonal system and backward FFT for
1341	! a 1d-decomposition along y
1342	!
1343	! WARNING: this subroutine may still not work for hybrid parallelization
1344	! with OpenMP (for possible necessary changes see the original
1345	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1346	!------------------------------------------------------------------------------!
1347
1348	USE control_parameters, &
1349	ONLY: host
1350
1351	USE cpulog, &
1352	ONLY: cpu_log, log_point_s
1353
1354	USE grid_variables, &
1355	ONLY: ddx2, ddy2
1356
1357	USE kinds
1358
1359	USE pegrid
1360
1361	IMPLICIT NONE
1362
1363	INTEGER(iwp) :: i !:
1364	INTEGER(iwp) :: j !:
1365	INTEGER(iwp) :: k !:
1366	INTEGER(iwp) :: m !:
1367	INTEGER(iwp) :: n !:
1368	INTEGER(iwp) :: omp_get_thread_num !:
1369	INTEGER(iwp) :: tn !:
1370
1371	REAL(wp), DIMENSION(0:ny) :: work_ffty !:
1372	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !:
1373	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !:
1374	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
1375
1376
1377	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1378
1379	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1380
1381	tn = 0 ! Default thread number in case of one thread
1382	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1383	DO i = nxl_y, nxr_y
1384
1385	!$ tn = omp_get_thread_num()
1386
1387	IF ( host(1:3) == 'nec' ) THEN
1388	!
1389	!-- Code optimized for vector processors
1390	DO k = 1, nz
1391
1392	m = 0
1393	DO n = 1, pdims(2)
1394	DO j = 1, nny
1395	work_triy(m,k) = ar(j,k,i,n)
1396	m = m + 1
1397	ENDDO
1398	ENDDO
1399
1400	ENDDO
1401
1402	CALL fft_y_m( work_triy, ny, 'forward' )
1403
1404	ELSE
1405	!
1406	!-- Cache optimized code
1407	DO k = 1, nz
1408
1409	m = 0
1410	DO n = 1, pdims(2)
1411	DO j = 1, nny
1412	work_ffty(m) = ar(j,k,i,n)
1413	m = m + 1
1414	ENDDO
1415	ENDDO
1416
1417	CALL fft_y_1d( work_ffty, 'forward' )
1418
1419	DO j = 0, ny
1420	work_triy(j,k) = work_ffty(j)
1421	ENDDO
1422
1423	ENDDO
1424
1425	ENDIF
1426
1427	!
1428	!-- Solve the linear equation system
1429	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1430
1431	IF ( host(1:3) == 'nec' ) THEN
1432	!
1433	!-- Code optimized for vector processors
1434	CALL fft_y_m( work_triy, ny, 'backward' )
1435
1436	DO k = 1, nz
1437
1438	m = 0
1439	DO n = 1, pdims(2)
1440	DO j = 1, nny
1441	ar(j,k,i,n) = work_triy(m,k)
1442	m = m + 1
1443	ENDDO
1444	ENDDO
1445
1446	ENDDO
1447
1448	ELSE
1449	!
1450	!-- Cache optimized code
1451	DO k = 1, nz
1452
1453	DO j = 0, ny
1454	work_ffty(j) = work_triy(j,k)
1455	ENDDO
1456
1457	CALL fft_y_1d( work_ffty, 'backward' )
1458
1459	m = 0
1460	DO n = 1, pdims(2)
1461	DO j = 1, nny
1462	ar(j,k,i,n) = work_ffty(m)
1463	m = m + 1
1464	ENDDO
1465	ENDDO
1466
1467	ENDDO
1468
1469	ENDIF
1470
1471	ENDDO
1472
1473	DEALLOCATE( tri )
1474
1475	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1476
1477	END SUBROUTINE ffty_tri_ffty
1478
1479	#endif
1480
1481	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |