Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1444

Last change on this file since 1444 was 1407, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 44.9 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2014 Leibniz Universitaet Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1407 2014-05-16 14:06:08Z letzel $
27	!
28	! 1406 2014-05-16 13:47:01Z raasch
29	! bugfix for pgi 14.4: declare create moved after array declaration
30	!
31	! 1320 2014-03-20 08:40:49Z raasch
32	! ONLY-attribute added to USE-statements,
33	! kind-parameters added to all INTEGER and REAL declaration statements,
34	! kinds are defined in new module kinds,
35	! old module precision_kind is removed,
36	! revision history before 2012 removed,
37	! comment fields (!:) to be used for variable explanations added to
38	! all variable declaration statements
39	!
40	! 1318 2014-03-17 13:35:16Z raasch
41	! module interfaces removed
42	!
43	! 1306 2014-03-13 14:30:59Z raasch
44	! openmp sections removed from the overlap branch,
45	! second argument removed from parameter list
46	!
47	! 1216 2013-08-26 09:31:42Z raasch
48	! resorting of arrays moved to separate routines resort_for_...,
49	! one argument, used as temporary work array, removed from all transpose
50	! routines
51	! overlapping fft / transposition implemented
52	!
53	! 1212 2013-08-15 08:46:27Z raasch
54	! tridia routines moved to seperate module tridia_solver
55	!
56	! 1208 2013-08-13 06:41:49Z raasch
57	! acc-update clauses added for "ar" so that ffts other than cufft can also be
58	! used (although they are not ported and will give a poor performance)
59	!
60	! 1111 2013-03-08 23:54:10Z raasch
61	! further openACC porting of non-parallel (MPI) branch:
62	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
63	! no distinction between parallel/non-parallel in poisfft and tridia any more,
64	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
65	! (otherwise "invalid device function" is indicated during runtime),
66	! optimization of tridia routines: constant elements and coefficients of tri are
67	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
68	! to 2,
69	! poisfft_init is now called internally from poisfft, maketri is called from
70	! poisfft_init,
71	! ibc_p_b = 2 removed
72	!
73	! 1106 2013-03-04 05:31:38Z raasch
74	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
75	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
76	! fft_y_1d
77	!
78	! 1103 2013-02-20 02:15:53Z raasch
79	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
80	! sometimes cause segmentation faults with intel 12.1 compiler
81	!
82	! 1092 2013-02-02 11:24:22Z raasch
83	! unused variables removed
84	!
85	! 1036 2012-10-22 13:43:42Z raasch
86	! code put under GPL (PALM 3.9)
87	!
88	! 2012-09-21 07:03:55Z raasch
89	! FLOAT type conversion replaced by REAL
90	!
91	! 1003 2012-09-14 14:35:53Z raasch
92	! indices nxa, nya, etc. replaced by nx, ny, etc.
93	!
94	! 940 2012-07-09 14:31:00Z raasch
95	! special handling of tri-array as an argument in tridia_1dd routines switched
96	! off because it caused segmentation faults with intel 12.1 compiler
97	!
98	! 877 2012-04-03 11:21:44Z suehring
99	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
100	! pressure at the top of the model domain.
101	!
102	! 809 2012-01-30 13:32:58Z maronga
103	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
104	!
105	! 807 2012-01-25 11:53:51Z maronga
106	! New cpp directive "__check" implemented which is used by check_namelist_files
107	! (most of the code is unneeded by check_namelist_files).
108	!
109	! Revision 1.1 1997/07/24 11:24:14 raasch
110	! Initial revision
111	!
112	!
113	! Description:
114	! ------------
115	! Original version by Stephan Siano (pois3d), as of July 23, 1996
116	! Adapted for 2D-domain-decomposition by Siegfried Raasch, July 3, 1997
117	!
118	! Solves the Poisson equation with a 2D spectral method
119	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
120	!
121	! Input:
122	! real ar contains (nnz,nny,nnx) elements of the velocity divergence,
123	! starting from (1,nys,nxl)
124	!
125	! Output:
126	! real ar contains the solution for perturbation pressure p
127	!------------------------------------------------------------------------------!
128
129	USE fft_xy, &
130	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
131
132	USE indices, &
133	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
134
135	USE transpose_indices, &
136	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
137	nzb_y, nzt_x, nzt_y
138
139	USE tridia_solver, &
140	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
141
142	IMPLICIT NONE
143
144	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
145
146	PRIVATE
147
148	#if ! defined ( __check )
149	PUBLIC poisfft, poisfft_init
150
151	INTERFACE poisfft
152	MODULE PROCEDURE poisfft
153	END INTERFACE poisfft
154
155	INTERFACE poisfft_init
156	MODULE PROCEDURE poisfft_init
157	END INTERFACE poisfft_init
158	#else
159	PUBLIC poisfft_init
160
161	INTERFACE poisfft_init
162	MODULE PROCEDURE poisfft_init
163	END INTERFACE poisfft_init
164	#endif
165
166	CONTAINS
167
168	SUBROUTINE poisfft_init
169
170	USE arrays_3d, &
171	ONLY: ddzu_pres, ddzw
172
173	USE kinds
174
175	IMPLICIT NONE
176
177	INTEGER(iwp) :: k !:
178
179
180	CALL fft_init
181
182	CALL tridia_init
183
184	poisfft_initialized = .TRUE.
185
186	END SUBROUTINE poisfft_init
187
188
189	#if ! defined ( __check )
190	SUBROUTINE poisfft( ar )
191
192	USE control_parameters, &
193	ONLY: fft_method, transpose_compute_overlap
194
195	USE cpulog, &
196	ONLY: cpu_log, cpu_log_nowait, log_point_s
197
198	USE kinds
199
200	USE pegrid
201
202	IMPLICIT NONE
203
204	INTEGER(iwp) :: ii !:
205	INTEGER(iwp) :: iind !:
206	INTEGER(iwp) :: inew !:
207	INTEGER(iwp) :: jj !:
208	INTEGER(iwp) :: jind !:
209	INTEGER(iwp) :: jnew !:
210	INTEGER(iwp) :: ki !:
211	INTEGER(iwp) :: kk !:
212	INTEGER(iwp) :: knew !:
213	INTEGER(iwp) :: n !:
214	INTEGER(iwp) :: nblk !:
215	INTEGER(iwp) :: nnx_y !:
216	INTEGER(iwp) :: nny_z !:
217	INTEGER(iwp) :: nnz_t !:
218	INTEGER(iwp) :: nnz_x !:
219	INTEGER(iwp) :: nxl_y_bound !:
220	INTEGER(iwp) :: nxr_y_bound !:
221
222	INTEGER(iwp), DIMENSION(4) :: isave !:
223
224	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !:
225	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !:
226	!$acc declare create( ar_inv )
227
228	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !:
229	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !:
230	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !:
231	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !:
232	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !:
233
234
235	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
236
237	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
238
239	!
240	!-- Two-dimensional Fourier Transformation in x- and y-direction.
241	IF ( pdims(2) == 1 .AND. pdims(1) > 1 ) THEN
242
243	!
244	!-- 1d-domain-decomposition along x:
245	!-- FFT along y and transposition y --> x
246	CALL ffty_tr_yx( ar, ar )
247
248	!
249	!-- FFT along x, solving the tridiagonal system and backward FFT
250	CALL fftx_tri_fftx( ar )
251
252	!
253	!-- Transposition x --> y and backward FFT along y
254	CALL tr_xy_ffty( ar, ar )
255
256	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 ) THEN
257
258	!
259	!-- 1d-domain-decomposition along y:
260	!-- FFT along x and transposition x --> y
261	CALL fftx_tr_xy( ar, ar )
262
263	!
264	!-- FFT along y, solving the tridiagonal system and backward FFT
265	CALL ffty_tri_ffty( ar )
266
267	!
268	!-- Transposition y --> x and backward FFT along x
269	CALL tr_yx_fftx( ar, ar )
270
271	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
272
273	!
274	!-- 2d-domain-decomposition or no decomposition (1 PE run)
275	!-- Transposition z --> x
276	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
277	CALL resort_for_zx( ar, ar_inv )
278	CALL transpose_zx( ar_inv, ar )
279	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
280
281	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
282	IF ( fft_method /= 'system-specific' ) THEN
283	!$acc update host( ar )
284	ENDIF
285	CALL fft_x( ar, 'forward' )
286	IF ( fft_method /= 'system-specific' ) THEN
287	!$acc update device( ar )
288	ENDIF
289	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
290
291	!
292	!-- Transposition x --> y
293	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
294	CALL resort_for_xy( ar, ar_inv )
295	CALL transpose_xy( ar_inv, ar )
296	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
297
298	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
299	IF ( fft_method /= 'system-specific' ) THEN
300	!$acc update host( ar )
301	ENDIF
302	CALL fft_y( ar, 'forward', ar_tr = ar, &
303	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
304	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
305	IF ( fft_method /= 'system-specific' ) THEN
306	!$acc update device( ar )
307	ENDIF
308	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
309
310	!
311	!-- Transposition y --> z
312	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
313	CALL resort_for_yz( ar, ar_inv )
314	CALL transpose_yz( ar_inv, ar )
315	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
316
317	!
318	!-- Solve the tridiagonal equation system along z
319	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
320	CALL tridia_substi( ar )
321	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
322
323	!
324	!-- Inverse Fourier Transformation
325	!-- Transposition z --> y
326	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
327	CALL transpose_zy( ar, ar_inv )
328	CALL resort_for_zy( ar_inv, ar )
329	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
330
331	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
332	IF ( fft_method /= 'system-specific' ) THEN
333	!$acc update host( ar )
334	ENDIF
335	CALL fft_y( ar, 'backward', ar_tr = ar, &
336	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
337	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
338	IF ( fft_method /= 'system-specific' ) THEN
339	!$acc update device( ar )
340	ENDIF
341	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
342
343	!
344	!-- Transposition y --> x
345	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
346	CALL transpose_yx( ar, ar_inv )
347	CALL resort_for_yx( ar_inv, ar )
348	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
349
350	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
351	IF ( fft_method /= 'system-specific' ) THEN
352	!$acc update host( ar )
353	ENDIF
354	CALL fft_x( ar, 'backward' )
355	IF ( fft_method /= 'system-specific' ) THEN
356	!$acc update device( ar )
357	ENDIF
358	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
359
360	!
361	!-- Transposition x --> z
362	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
363	CALL transpose_xz( ar, ar_inv )
364	CALL resort_for_xz( ar_inv, ar )
365	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
366
367	ELSE
368
369	!
370	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
371	!-- overlapping transposition / fft
372	!-- cputime logging must not use barriers, which would prevent overlapping
373	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
374	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
375	!
376	!-- Transposition z --> x + subsequent fft along x
377	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
378	CALL resort_for_zx( ar, f_inv )
379	!
380	!-- Save original indices and gridpoint counter
381	isave(1) = nz
382	isave(2) = nzb_x
383	isave(3) = nzt_x
384	isave(4) = sendrecvcount_zx
385	!
386	!-- Set new indices for transformation
387	nblk = nz / pdims(1)
388	nz = pdims(1)
389	nnz_x = 1
390	nzb_x = 1 + myidx * nnz_x
391	nzt_x = ( myidx + 1 ) * nnz_x
392	sendrecvcount_zx = nnx * nny * nnz_x
393
394	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
395	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
396
397	DO kk = 1, nblk
398
399	IF ( kk == 1 ) THEN
400	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
401	ELSE
402	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
403	ENDIF
404
405	DO knew = 1, nz
406	ki = kk + nblk * ( knew - 1 )
407	f_in(:,:,knew) = f_inv(:,:,ki)
408	ENDDO
409
410	CALL transpose_zx( f_in, ar1(:,:,:))
411	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
412
413	IF ( kk == 1 ) THEN
414	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
415	ELSE
416	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
417	ENDIF
418
419	n = isave(2) + kk - 1
420	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
421	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
422
423	ENDDO
424	!
425	!-- Restore original indices/counters
426	nz = isave(1)
427	nzb_x = isave(2)
428	nzt_x = isave(3)
429	sendrecvcount_zx = isave(4)
430
431	DEALLOCATE( ar1, f_in, f_inv )
432
433	!
434	!-- Transposition x --> y + subsequent fft along y
435	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
436	CALL resort_for_xy( f_out_z, f_inv )
437	!
438	!-- Save original indices and gridpoint counter
439	isave(1) = nx
440	isave(2) = nxl_y
441	isave(3) = nxr_y
442	isave(4) = sendrecvcount_xy
443	!
444	!-- Set new indices for transformation
445	nblk = ( ( nx+1 ) / pdims(2) ) - 1
446	nx = pdims(2)
447	nnx_y = 1
448	nxl_y = myidy * nnx_y
449	nxr_y = ( myidy + 1 ) * nnx_y - 1
450	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
451
452	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
453	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
454
455	DO ii = 0, nblk
456
457	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
458
459	DO inew = 0, nx-1
460	iind = ii + ( nblk + 1 ) * inew
461	f_in(:,:,inew) = f_inv(:,:,iind)
462	ENDDO
463
464	CALL transpose_xy( f_in, ar1(:,:,:) )
465
466	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
467
468	IF ( ii == 1 ) THEN
469	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
470	ELSE
471	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
472	ENDIF
473
474	nxl_y_bound = isave(2)
475	nxr_y_bound = isave(3)
476	n = isave(2) + ii
477	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
478	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
479	nxl_y_l = n, nxr_y_l = n )
480
481	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
482
483	ENDDO
484	!
485	!-- Restore original indices/counters
486	nx = isave(1)
487	nxl_y = isave(2)
488	nxr_y = isave(3)
489	sendrecvcount_xy = isave(4)
490
491	DEALLOCATE( ar1, f_in, f_inv )
492
493	!
494	!-- Transposition y --> z + subsequent tridia + resort for z --> y
495	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
496	CALL resort_for_yz( f_out_y, f_inv )
497	!
498	!-- Save original indices and gridpoint counter
499	isave(1) = ny
500	isave(2) = nys_z
501	isave(3) = nyn_z
502	isave(4) = sendrecvcount_yz
503	!
504	!-- Set new indices for transformation
505	nblk = ( ( ny+1 ) / pdims(1) ) - 1
506	ny = pdims(1)
507	nny_z = 1
508	nys_z = myidx * nny_z
509	nyn_z = ( myidx + 1 ) * nny_z - 1
510	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
511
512	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
513	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
514
515	DO jj = 0, nblk
516	!
517	!-- Forward Fourier Transformation
518	!-- Transposition y --> z
519	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
520
521	DO jnew = 0, ny-1
522	jind = jj + ( nblk + 1 ) * jnew
523	f_in(:,:,jnew) = f_inv(:,:,jind)
524	ENDDO
525
526	CALL transpose_yz( f_in, ar1(:,:,:) )
527
528	IF ( jj == nblk ) THEN
529	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
530	ELSE
531	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
532	ENDIF
533
534	!
535	!-- Solve the tridiagonal equation system along z
536	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
537
538	n = isave(2) + jj
539	CALL tridia_substi_overlap( ar1(:,:,:), n )
540
541	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
542
543	!
544	!-- Inverse Fourier Transformation
545	!-- Transposition z --> y
546	!-- Only one thread should call MPI routines, therefore forward and
547	!-- backward tranpose are in the same section
548	IF ( jj == 0 ) THEN
549	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
550	ELSE
551	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
552	ENDIF
553
554	CALL transpose_zy( ar1(:,:,:), f_in )
555
556	DO jnew = 0, ny-1
557	jind = jj + ( nblk + 1 ) * jnew
558	f_inv(:,:,jind) = f_in(:,:,jnew)
559	ENDDO
560
561	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
562
563	ENDDO
564	!
565	!-- Restore original indices/counters
566	ny = isave(1)
567	nys_z = isave(2)
568	nyn_z = isave(3)
569	sendrecvcount_yz = isave(4)
570
571	CALL resort_for_zy( f_inv, f_out_y )
572
573	DEALLOCATE( ar1, f_in, f_inv )
574
575	!
576	!-- fft along y backward + subsequent transposition y --> x
577	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
578	!
579	!-- Save original indices and gridpoint counter
580	isave(1) = nx
581	isave(2) = nxl_y
582	isave(3) = nxr_y
583	isave(4) = sendrecvcount_xy
584	!
585	!-- Set new indices for transformation
586	nblk = (( nx+1 ) / pdims(2) ) - 1
587	nx = pdims(2)
588	nnx_y = 1
589	nxl_y = myidy * nnx_y
590	nxr_y = ( myidy + 1 ) * nnx_y - 1
591	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
592
593	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
594	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
595
596	DO ii = 0, nblk
597
598	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
599
600	n = isave(2) + ii
601	nxl_y_bound = isave(2)
602	nxr_y_bound = isave(3)
603
604	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
605	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
606	nxl_y_l = n, nxr_y_l = n )
607
608	IF ( ii == nblk ) THEN
609	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
610	ELSE
611	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
612	ENDIF
613
614	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
615
616	CALL transpose_yx( ar1(:,:,:), f_in )
617
618	DO inew = 0, nx-1
619	iind = ii + (nblk+1) * inew
620	f_inv(:,:,iind) = f_in(:,:,inew)
621	ENDDO
622
623	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
624
625	ENDDO
626	!
627	!-- Restore original indices/counters
628	nx = isave(1)
629	nxl_y = isave(2)
630	nxr_y = isave(3)
631	sendrecvcount_xy = isave(4)
632
633	CALL resort_for_yx( f_inv, f_out_z )
634
635	DEALLOCATE( ar1, f_in, f_inv )
636
637	!
638	!-- fft along x backward + subsequent final transposition x --> z
639	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
640	!
641	!-- Save original indices and gridpoint counter
642	isave(1) = nz
643	isave(2) = nzb_x
644	isave(3) = nzt_x
645	isave(4) = sendrecvcount_zx
646	!
647	!-- Set new indices for transformation
648	nblk = nz / pdims(1)
649	nz = pdims(1)
650	nnz_x = 1
651	nzb_x = 1 + myidx * nnz_x
652	nzt_x = ( myidx + 1 ) * nnz_x
653	sendrecvcount_zx = nnx * nny * nnz_x
654
655	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
656	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
657
658	DO kk = 1, nblk
659
660	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
661
662	n = isave(2) + kk - 1
663	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
664
665	IF ( kk == nblk ) THEN
666	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
667	ELSE
668	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
669	ENDIF
670
671	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
672
673	CALL transpose_xz( ar1(:,:,:), f_in )
674
675	DO knew = 1, nz
676	ki = kk + nblk * (knew-1)
677	f_inv(:,:,ki) = f_in(:,:,knew)
678	ENDDO
679
680	IF ( kk == nblk ) THEN
681	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
682	ELSE
683	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
684	ENDIF
685
686	ENDDO
687	!
688	!-- Restore original indices/counters
689	nz = isave(1)
690	nzb_x = isave(2)
691	nzt_x = isave(3)
692	sendrecvcount_zx = isave(4)
693
694	CALL resort_for_xz( f_inv, ar )
695
696	DEALLOCATE( ar1, f_in, f_inv )
697
698	ENDIF
699
700	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
701
702	END SUBROUTINE poisfft
703
704
705
706	SUBROUTINE ffty_tr_yx( f_in, f_out )
707
708	!------------------------------------------------------------------------------!
709	! Fourier-transformation along y with subsequent transposition y --> x for
710	! a 1d-decomposition along x
711	!
712	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
713	! if the first index of work_ffty_vec is odd. Otherwise
714	! memory bank conflicts may occur (especially if the index is a
715	! multiple of 128). That's why work_ffty_vec is dimensioned as
716	! 0:ny+1.
717	! Of course, this will not work if users are using an odd number
718	! of gridpoints along y.
719	!------------------------------------------------------------------------------!
720
721	USE control_parameters, &
722	ONLY: host
723
724	USE cpulog, &
725	ONLY: cpu_log, log_point_s
726
727	USE kinds
728
729	USE pegrid
730
731	IMPLICIT NONE
732
733	INTEGER(iwp) :: i !:
734	INTEGER(iwp) :: iend !:
735	INTEGER(iwp) :: iouter !:
736	INTEGER(iwp) :: ir !:
737	INTEGER(iwp) :: j !:
738	INTEGER(iwp) :: k !:
739
740	INTEGER(iwp), PARAMETER :: stridex = 4 !:
741
742	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
743	#if defined( __nec )
744	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
745	#endif
746	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !:
747	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !:
748	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
749
750	!
751	!-- Carry out the FFT along y, where all data are present due to the
752	!-- 1d-decomposition along x. Resort the data in a way that x becomes
753	!-- the first index.
754	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
755
756	IF ( host(1:3) == 'nec' ) THEN
757	#if defined( __nec )
758	!
759	!-- Code optimized for vector processors
760	!$OMP PARALLEL PRIVATE ( i, j, k )
761	!$OMP DO
762	DO i = nxl, nxr
763
764	DO j = 0, ny
765	DO k = 1, nz
766	work_ffty_vec(j,k,i) = f_in(k,j,i)
767	ENDDO
768	ENDDO
769
770	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
771
772	ENDDO
773
774	!$OMP DO
775	DO k = 1, nz
776	DO j = 0, ny
777	DO i = nxl, nxr
778	work(i,k,j) = work_ffty_vec(j,k,i)
779	ENDDO
780	ENDDO
781	ENDDO
782	!$OMP END PARALLEL
783	#endif
784
785	ELSE
786
787	!
788	!-- Cache optimized code.
789	!-- The i-(x-)direction is split into a strided outer loop and an inner
790	!-- loop for better cache performance
791	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
792	!$OMP DO
793	DO iouter = nxl, nxr, stridex
794
795	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
796
797	DO k = 1, nz
798
799	DO i = iouter, iend
800
801	ir = i-iouter+1 ! counter within a stride
802	DO j = 0, ny
803	work_ffty(j,ir) = f_in(k,j,i)
804	ENDDO
805	!
806	!-- FFT along y
807	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
808
809	ENDDO
810
811	!
812	!-- Resort
813	DO j = 0, ny
814	DO i = iouter, iend
815	work(i,k,j) = work_ffty(j,i-iouter+1)
816	ENDDO
817	ENDDO
818
819	ENDDO
820
821	ENDDO
822	!$OMP END PARALLEL
823
824	ENDIF
825	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
826
827	!
828	!-- Transpose array
829	#if defined( __parallel )
830	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
831	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
832	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
833	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
834	comm1dx, ierr )
835	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
836	#endif
837
838	END SUBROUTINE ffty_tr_yx
839
840
841	SUBROUTINE tr_xy_ffty( f_in, f_out )
842
843	!------------------------------------------------------------------------------!
844	! Transposition x --> y with a subsequent backward Fourier transformation for
845	! a 1d-decomposition along x
846	!------------------------------------------------------------------------------!
847
848	USE control_parameters, &
849	ONLY: host
850
851	USE cpulog, &
852	ONLY: cpu_log, log_point_s
853
854	USE kinds
855
856	USE pegrid
857
858	IMPLICIT NONE
859
860	INTEGER(iwp) :: i !:
861	INTEGER(iwp) :: iend !:
862	INTEGER(iwp) :: iouter !:
863	INTEGER(iwp) :: ir !:
864	INTEGER(iwp) :: j !:
865	INTEGER(iwp) :: k !:
866
867	INTEGER(iwp), PARAMETER :: stridex = 4 !:
868
869	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
870	#if defined( __nec )
871	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
872	#endif
873	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !:
874	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !:
875	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
876
877	!
878	!-- Transpose array
879	#if defined( __parallel )
880	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
881	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
882	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
883	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
884	comm1dx, ierr )
885	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
886	#endif
887
888	!
889	!-- Resort the data in a way that y becomes the first index and carry out the
890	!-- backward fft along y.
891	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
892
893	IF ( host(1:3) == 'nec' ) THEN
894	#if defined( __nec )
895	!
896	!-- Code optimized for vector processors
897	!$OMP PARALLEL PRIVATE ( i, j, k )
898	!$OMP DO
899	DO k = 1, nz
900	DO j = 0, ny
901	DO i = nxl, nxr
902	work_ffty_vec(j,k,i) = work(i,k,j)
903	ENDDO
904	ENDDO
905	ENDDO
906
907	!$OMP DO
908	DO i = nxl, nxr
909
910	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
911
912	DO j = 0, ny
913	DO k = 1, nz
914	f_out(k,j,i) = work_ffty_vec(j,k,i)
915	ENDDO
916	ENDDO
917
918	ENDDO
919	!$OMP END PARALLEL
920	#endif
921
922	ELSE
923
924	!
925	!-- Cache optimized code.
926	!-- The i-(x-)direction is split into a strided outer loop and an inner
927	!-- loop for better cache performance
928	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
929	!$OMP DO
930	DO iouter = nxl, nxr, stridex
931
932	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
933
934	DO k = 1, nz
935	!
936	!-- Resort
937	DO j = 0, ny
938	DO i = iouter, iend
939	work_ffty(j,i-iouter+1) = work(i,k,j)
940	ENDDO
941	ENDDO
942
943	DO i = iouter, iend
944
945	!
946	!-- FFT along y
947	ir = i-iouter+1 ! counter within a stride
948	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
949
950	DO j = 0, ny
951	f_out(k,j,i) = work_ffty(j,ir)
952	ENDDO
953	ENDDO
954
955	ENDDO
956
957	ENDDO
958	!$OMP END PARALLEL
959
960	ENDIF
961
962	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
963
964	END SUBROUTINE tr_xy_ffty
965
966
967	SUBROUTINE fftx_tri_fftx( ar )
968
969	!------------------------------------------------------------------------------!
970	! FFT along x, solution of the tridiagonal system and backward FFT for
971	! a 1d-decomposition along x
972	!
973	! WARNING: this subroutine may still not work for hybrid parallelization
974	! with OpenMP (for possible necessary changes see the original
975	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
976	!------------------------------------------------------------------------------!
977
978	USE control_parameters, &
979	ONLY: host
980
981	USE cpulog, &
982	ONLY: cpu_log, log_point_s
983
984	USE grid_variables, &
985	ONLY: ddx2, ddy2
986
987	USE kinds
988
989	USE pegrid
990
991	IMPLICIT NONE
992
993	INTEGER(iwp) :: i !:
994	INTEGER(iwp) :: j !:
995	INTEGER(iwp) :: k !:
996	INTEGER(iwp) :: m !:
997	INTEGER(iwp) :: n !:
998	INTEGER(iwp) :: omp_get_thread_num !:
999	INTEGER(iwp) :: tn !:
1000
1001	REAL(wp), DIMENSION(0:nx) :: work_fftx !:
1002	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !:
1003	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !:
1004	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
1005
1006
1007	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1008
1009	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1010
1011	tn = 0 ! Default thread number in case of one thread
1012	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1013	DO j = nys_x, nyn_x
1014
1015	!$ tn = omp_get_thread_num()
1016
1017	IF ( host(1:3) == 'nec' ) THEN
1018	!
1019	!-- Code optimized for vector processors
1020	DO k = 1, nz
1021
1022	m = 0
1023	DO n = 1, pdims(1)
1024	DO i = 1, nnx
1025	work_trix(m,k) = ar(i,k,j,n)
1026	m = m + 1
1027	ENDDO
1028	ENDDO
1029
1030	ENDDO
1031
1032	CALL fft_x_m( work_trix, 'forward' )
1033
1034	ELSE
1035	!
1036	!-- Cache optimized code
1037	DO k = 1, nz
1038
1039	m = 0
1040	DO n = 1, pdims(1)
1041	DO i = 1, nnx
1042	work_fftx(m) = ar(i,k,j,n)
1043	m = m + 1
1044	ENDDO
1045	ENDDO
1046
1047	CALL fft_x_1d( work_fftx, 'forward' )
1048
1049	DO i = 0, nx
1050	work_trix(i,k) = work_fftx(i)
1051	ENDDO
1052
1053	ENDDO
1054
1055	ENDIF
1056
1057	!
1058	!-- Solve the linear equation system
1059	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1060
1061	IF ( host(1:3) == 'nec' ) THEN
1062	!
1063	!-- Code optimized for vector processors
1064	CALL fft_x_m( work_trix, 'backward' )
1065
1066	DO k = 1, nz
1067
1068	m = 0
1069	DO n = 1, pdims(1)
1070	DO i = 1, nnx
1071	ar(i,k,j,n) = work_trix(m,k)
1072	m = m + 1
1073	ENDDO
1074	ENDDO
1075
1076	ENDDO
1077
1078	ELSE
1079	!
1080	!-- Cache optimized code
1081	DO k = 1, nz
1082
1083	DO i = 0, nx
1084	work_fftx(i) = work_trix(i,k)
1085	ENDDO
1086
1087	CALL fft_x_1d( work_fftx, 'backward' )
1088
1089	m = 0
1090	DO n = 1, pdims(1)
1091	DO i = 1, nnx
1092	ar(i,k,j,n) = work_fftx(m)
1093	m = m + 1
1094	ENDDO
1095	ENDDO
1096
1097	ENDDO
1098
1099	ENDIF
1100
1101	ENDDO
1102
1103	DEALLOCATE( tri )
1104
1105	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1106
1107	END SUBROUTINE fftx_tri_fftx
1108
1109
1110	SUBROUTINE fftx_tr_xy( f_in, f_out )
1111
1112	!------------------------------------------------------------------------------!
1113	! Fourier-transformation along x with subsequent transposition x --> y for
1114	! a 1d-decomposition along y
1115	!
1116	! ATTENTION: The NEC-branch of this routine may significantly profit from
1117	! further optimizations. So far, performance is much worse than
1118	! for routine ffty_tr_yx (more than three times slower).
1119	!------------------------------------------------------------------------------!
1120
1121	USE control_parameters, &
1122	ONLY: host
1123
1124	USE cpulog, &
1125	ONLY: cpu_log, log_point_s
1126
1127	USE kinds
1128
1129	USE pegrid
1130
1131	IMPLICIT NONE
1132
1133	INTEGER(iwp) :: i !:
1134	INTEGER(iwp) :: j !:
1135	INTEGER(iwp) :: k !:
1136
1137	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
1138	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !:
1139	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !:
1140	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
1141
1142	!
1143	!-- Carry out the FFT along x, where all data are present due to the
1144	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1145	!-- the first index.
1146	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1147
1148	IF ( host(1:3) == 'nec' ) THEN
1149	!
1150	!-- Code for vector processors
1151	!$OMP PARALLEL PRIVATE ( i, j, k )
1152	!$OMP DO
1153	DO i = 0, nx
1154
1155	DO j = nys, nyn
1156	DO k = 1, nz
1157	work_fftx(i,k,j) = f_in(k,j,i)
1158	ENDDO
1159	ENDDO
1160
1161	ENDDO
1162
1163	!$OMP DO
1164	DO j = nys, nyn
1165
1166	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1167
1168	DO k = 1, nz
1169	DO i = 0, nx
1170	work(j,k,i) = work_fftx(i,k,j)
1171	ENDDO
1172	ENDDO
1173
1174	ENDDO
1175	!$OMP END PARALLEL
1176
1177	ELSE
1178
1179	!
1180	!-- Cache optimized code (there might be still a potential for better
1181	!-- optimization).
1182	!$OMP PARALLEL PRIVATE (i,j,k)
1183	!$OMP DO
1184	DO i = 0, nx
1185
1186	DO j = nys, nyn
1187	DO k = 1, nz
1188	work_fftx(i,k,j) = f_in(k,j,i)
1189	ENDDO
1190	ENDDO
1191
1192	ENDDO
1193
1194	!$OMP DO
1195	DO j = nys, nyn
1196	DO k = 1, nz
1197
1198	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1199
1200	DO i = 0, nx
1201	work(j,k,i) = work_fftx(i,k,j)
1202	ENDDO
1203	ENDDO
1204
1205	ENDDO
1206	!$OMP END PARALLEL
1207
1208	ENDIF
1209	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1210
1211	!
1212	!-- Transpose array
1213	#if defined( __parallel )
1214	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1215	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1216	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1217	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1218	comm1dy, ierr )
1219	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1220	#endif
1221
1222	END SUBROUTINE fftx_tr_xy
1223
1224
1225	SUBROUTINE tr_yx_fftx( f_in, f_out )
1226
1227	!------------------------------------------------------------------------------!
1228	! Transposition y --> x with a subsequent backward Fourier transformation for
1229	! a 1d-decomposition along x
1230	!------------------------------------------------------------------------------!
1231
1232	USE control_parameters, &
1233	ONLY: host
1234
1235	USE cpulog, &
1236	ONLY: cpu_log, log_point_s
1237
1238	USE kinds
1239
1240	USE pegrid
1241
1242	IMPLICIT NONE
1243
1244	INTEGER(iwp) :: i !:
1245	INTEGER(iwp) :: j !:
1246	INTEGER(iwp) :: k !:
1247
1248	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
1249	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !:
1250	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !:
1251	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
1252
1253	!
1254	!-- Transpose array
1255	#if defined( __parallel )
1256	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1257	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1258	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1259	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1260	comm1dy, ierr )
1261	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1262	#endif
1263
1264	!
1265	!-- Carry out the FFT along x, where all data are present due to the
1266	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1267	!-- the first index.
1268	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1269
1270	IF ( host(1:3) == 'nec' ) THEN
1271	!
1272	!-- Code optimized for vector processors
1273	!$OMP PARALLEL PRIVATE ( i, j, k )
1274	!$OMP DO
1275	DO j = nys, nyn
1276
1277	DO k = 1, nz
1278	DO i = 0, nx
1279	work_fftx(i,k,j) = work(j,k,i)
1280	ENDDO
1281	ENDDO
1282
1283	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1284
1285	ENDDO
1286
1287	!$OMP DO
1288	DO i = 0, nx
1289	DO j = nys, nyn
1290	DO k = 1, nz
1291	f_out(k,j,i) = work_fftx(i,k,j)
1292	ENDDO
1293	ENDDO
1294	ENDDO
1295	!$OMP END PARALLEL
1296
1297	ELSE
1298
1299	!
1300	!-- Cache optimized code (there might be still a potential for better
1301	!-- optimization).
1302	!$OMP PARALLEL PRIVATE (i,j,k)
1303	!$OMP DO
1304	DO j = nys, nyn
1305	DO k = 1, nz
1306
1307	DO i = 0, nx
1308	work_fftx(i,k,j) = work(j,k,i)
1309	ENDDO
1310
1311	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1312
1313	ENDDO
1314	ENDDO
1315
1316	!$OMP DO
1317	DO i = 0, nx
1318	DO j = nys, nyn
1319	DO k = 1, nz
1320	f_out(k,j,i) = work_fftx(i,k,j)
1321	ENDDO
1322	ENDDO
1323	ENDDO
1324	!$OMP END PARALLEL
1325
1326	ENDIF
1327	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1328
1329	END SUBROUTINE tr_yx_fftx
1330
1331
1332	SUBROUTINE ffty_tri_ffty( ar )
1333
1334	!------------------------------------------------------------------------------!
1335	! FFT along y, solution of the tridiagonal system and backward FFT for
1336	! a 1d-decomposition along y
1337	!
1338	! WARNING: this subroutine may still not work for hybrid parallelization
1339	! with OpenMP (for possible necessary changes see the original
1340	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1341	!------------------------------------------------------------------------------!
1342
1343	USE control_parameters, &
1344	ONLY: host
1345
1346	USE cpulog, &
1347	ONLY: cpu_log, log_point_s
1348
1349	USE grid_variables, &
1350	ONLY: ddx2, ddy2
1351
1352	USE kinds
1353
1354	USE pegrid
1355
1356	IMPLICIT NONE
1357
1358	INTEGER(iwp) :: i !:
1359	INTEGER(iwp) :: j !:
1360	INTEGER(iwp) :: k !:
1361	INTEGER(iwp) :: m !:
1362	INTEGER(iwp) :: n !:
1363	INTEGER(iwp) :: omp_get_thread_num !:
1364	INTEGER(iwp) :: tn !:
1365
1366	REAL(wp), DIMENSION(0:ny) :: work_ffty !:
1367	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !:
1368	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !:
1369	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
1370
1371
1372	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1373
1374	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1375
1376	tn = 0 ! Default thread number in case of one thread
1377	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1378	DO i = nxl_y, nxr_y
1379
1380	!$ tn = omp_get_thread_num()
1381
1382	IF ( host(1:3) == 'nec' ) THEN
1383	!
1384	!-- Code optimized for vector processors
1385	DO k = 1, nz
1386
1387	m = 0
1388	DO n = 1, pdims(2)
1389	DO j = 1, nny
1390	work_triy(m,k) = ar(j,k,i,n)
1391	m = m + 1
1392	ENDDO
1393	ENDDO
1394
1395	ENDDO
1396
1397	CALL fft_y_m( work_triy, ny, 'forward' )
1398
1399	ELSE
1400	!
1401	!-- Cache optimized code
1402	DO k = 1, nz
1403
1404	m = 0
1405	DO n = 1, pdims(2)
1406	DO j = 1, nny
1407	work_ffty(m) = ar(j,k,i,n)
1408	m = m + 1
1409	ENDDO
1410	ENDDO
1411
1412	CALL fft_y_1d( work_ffty, 'forward' )
1413
1414	DO j = 0, ny
1415	work_triy(j,k) = work_ffty(j)
1416	ENDDO
1417
1418	ENDDO
1419
1420	ENDIF
1421
1422	!
1423	!-- Solve the linear equation system
1424	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1425
1426	IF ( host(1:3) == 'nec' ) THEN
1427	!
1428	!-- Code optimized for vector processors
1429	CALL fft_y_m( work_triy, ny, 'backward' )
1430
1431	DO k = 1, nz
1432
1433	m = 0
1434	DO n = 1, pdims(2)
1435	DO j = 1, nny
1436	ar(j,k,i,n) = work_triy(m,k)
1437	m = m + 1
1438	ENDDO
1439	ENDDO
1440
1441	ENDDO
1442
1443	ELSE
1444	!
1445	!-- Cache optimized code
1446	DO k = 1, nz
1447
1448	DO j = 0, ny
1449	work_ffty(j) = work_triy(j,k)
1450	ENDDO
1451
1452	CALL fft_y_1d( work_ffty, 'backward' )
1453
1454	m = 0
1455	DO n = 1, pdims(2)
1456	DO j = 1, nny
1457	ar(j,k,i,n) = work_ffty(m)
1458	m = m + 1
1459	ENDDO
1460	ENDDO
1461
1462	ENDDO
1463
1464	ENDIF
1465
1466	ENDDO
1467
1468	DEALLOCATE( tri )
1469
1470	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1471
1472	END SUBROUTINE ffty_tri_ffty
1473
1474	#endif
1475
1476	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |