Home

Context Navigation

source: palm/trunk/SOURCE/poisfft.f90 @ 1482

Last change on this file since 1482 was 1482, checked in by raasch, 10 years ago
adjustments for using CUDA-aware MPI
Property svn:keywords set to `Id`
File size: 45.0 KB

Line
1	MODULE poisfft_mod
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2014 Leibniz Universitaet Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	! use 2d-decomposition, if accelerator boards are used
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: poisfft.f90 1482 2014-10-18 12:34:45Z raasch $
27	!
28	! 1406 2014-05-16 13:47:01Z raasch
29	! bugfix for pgi 14.4: declare create moved after array declaration
30	!
31	! 1320 2014-03-20 08:40:49Z raasch
32	! ONLY-attribute added to USE-statements,
33	! kind-parameters added to all INTEGER and REAL declaration statements,
34	! kinds are defined in new module kinds,
35	! old module precision_kind is removed,
36	! revision history before 2012 removed,
37	! comment fields (!:) to be used for variable explanations added to
38	! all variable declaration statements
39	!
40	! 1318 2014-03-17 13:35:16Z raasch
41	! module interfaces removed
42	!
43	! 1306 2014-03-13 14:30:59Z raasch
44	! openmp sections removed from the overlap branch,
45	! second argument removed from parameter list
46	!
47	! 1216 2013-08-26 09:31:42Z raasch
48	! resorting of arrays moved to separate routines resort_for_...,
49	! one argument, used as temporary work array, removed from all transpose
50	! routines
51	! overlapping fft / transposition implemented
52	!
53	! 1212 2013-08-15 08:46:27Z raasch
54	! tridia routines moved to seperate module tridia_solver
55	!
56	! 1208 2013-08-13 06:41:49Z raasch
57	! acc-update clauses added for "ar" so that ffts other than cufft can also be
58	! used (although they are not ported and will give a poor performance)
59	!
60	! 1111 2013-03-08 23:54:10Z raasch
61	! further openACC porting of non-parallel (MPI) branch:
62	! tridiagonal routines split into extermal subroutines (instead using CONTAINS),
63	! no distinction between parallel/non-parallel in poisfft and tridia any more,
64	! tridia routines moved to end of file because of probable bug in PGI compiler 12.5
65	! (otherwise "invalid device function" is indicated during runtime),
66	! optimization of tridia routines: constant elements and coefficients of tri are
67	! stored in seperate arrays ddzuw and tric, last dimension of tri reduced from 5
68	! to 2,
69	! poisfft_init is now called internally from poisfft, maketri is called from
70	! poisfft_init,
71	! ibc_p_b = 2 removed
72	!
73	! 1106 2013-03-04 05:31:38Z raasch
74	! routines fftx, ffty, fftxp, fftyp removed, calls replaced by fft_x, fft_y,
75	! in the 1D-decomposition routines fft_x, ffty are replaced by fft_x_1d,
76	! fft_y_1d
77	!
78	! 1103 2013-02-20 02:15:53Z raasch
79	! tri, ar, and ar1 arguments in tridia-routines (2d) are removed because they
80	! sometimes cause segmentation faults with intel 12.1 compiler
81	!
82	! 1092 2013-02-02 11:24:22Z raasch
83	! unused variables removed
84	!
85	! 1036 2012-10-22 13:43:42Z raasch
86	! code put under GPL (PALM 3.9)
87	!
88	! 2012-09-21 07:03:55Z raasch
89	! FLOAT type conversion replaced by REAL
90	!
91	! 1003 2012-09-14 14:35:53Z raasch
92	! indices nxa, nya, etc. replaced by nx, ny, etc.
93	!
94	! 940 2012-07-09 14:31:00Z raasch
95	! special handling of tri-array as an argument in tridia_1dd routines switched
96	! off because it caused segmentation faults with intel 12.1 compiler
97	!
98	! 877 2012-04-03 11:21:44Z suehring
99	! Bugfix: Avoid divisions by zero in case of using a 'neumann' bc for the
100	! pressure at the top of the model domain.
101	!
102	! 809 2012-01-30 13:32:58Z maronga
103	! Bugfix: replaced .AND. and .NOT. with && and ! in the preprocessor directives
104	!
105	! 807 2012-01-25 11:53:51Z maronga
106	! New cpp directive "__check" implemented which is used by check_namelist_files
107	! (most of the code is unneeded by check_namelist_files).
108	!
109	! Revision 1.1 1997/07/24 11:24:14 raasch
110	! Initial revision
111	!
112	!
113	! Description:
114	! ------------
115	! Original version by Stephan Siano (pois3d), as of July 23, 1996
116	! Adapted for 2D-domain-decomposition by Siegfried Raasch, July 3, 1997
117	!
118	! Solves the Poisson equation with a 2D spectral method
119	! d^2 p / dx^2 + d^2 p / dy^2 + d^2 p / dz^2 = s
120	!
121	! Input:
122	! real ar contains (nnz,nny,nnx) elements of the velocity divergence,
123	! starting from (1,nys,nxl)
124	!
125	! Output:
126	! real ar contains the solution for perturbation pressure p
127	!------------------------------------------------------------------------------!
128
129	USE fft_xy, &
130	ONLY: fft_init, fft_y, fft_y_1d, fft_y_m, fft_x, fft_x_1d, fft_x_m
131
132	USE indices, &
133	ONLY: nnx, nny, nx, nxl, nxr, ny, nys, nyn, nz
134
135	USE transpose_indices, &
136	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nys_x, nys_z, nyn_x, nyn_z, nzb_x, &
137	nzb_y, nzt_x, nzt_y
138
139	USE tridia_solver, &
140	ONLY: tridia_1dd, tridia_init, tridia_substi, tridia_substi_overlap
141
142	IMPLICIT NONE
143
144	LOGICAL, SAVE :: poisfft_initialized = .FALSE.
145
146	PRIVATE
147
148	#if ! defined ( __check )
149	PUBLIC poisfft, poisfft_init
150
151	INTERFACE poisfft
152	MODULE PROCEDURE poisfft
153	END INTERFACE poisfft
154
155	INTERFACE poisfft_init
156	MODULE PROCEDURE poisfft_init
157	END INTERFACE poisfft_init
158	#else
159	PUBLIC poisfft_init
160
161	INTERFACE poisfft_init
162	MODULE PROCEDURE poisfft_init
163	END INTERFACE poisfft_init
164	#endif
165
166	CONTAINS
167
168	SUBROUTINE poisfft_init
169
170	USE arrays_3d, &
171	ONLY: ddzu_pres, ddzw
172
173	USE kinds
174
175	IMPLICIT NONE
176
177	INTEGER(iwp) :: k !:
178
179
180	CALL fft_init
181
182	CALL tridia_init
183
184	poisfft_initialized = .TRUE.
185
186	END SUBROUTINE poisfft_init
187
188
189	#if ! defined ( __check )
190	SUBROUTINE poisfft( ar )
191
192	USE control_parameters, &
193	ONLY: fft_method, transpose_compute_overlap
194
195	USE cpulog, &
196	ONLY: cpu_log, cpu_log_nowait, log_point_s
197
198	USE kinds
199
200	USE pegrid
201
202	IMPLICIT NONE
203
204	INTEGER(iwp) :: ii !:
205	INTEGER(iwp) :: iind !:
206	INTEGER(iwp) :: inew !:
207	INTEGER(iwp) :: jj !:
208	INTEGER(iwp) :: jind !:
209	INTEGER(iwp) :: jnew !:
210	INTEGER(iwp) :: ki !:
211	INTEGER(iwp) :: kk !:
212	INTEGER(iwp) :: knew !:
213	INTEGER(iwp) :: n !:
214	INTEGER(iwp) :: nblk !:
215	INTEGER(iwp) :: nnx_y !:
216	INTEGER(iwp) :: nny_z !:
217	INTEGER(iwp) :: nnz_t !:
218	INTEGER(iwp) :: nnz_x !:
219	INTEGER(iwp) :: nxl_y_bound !:
220	INTEGER(iwp) :: nxr_y_bound !:
221
222	INTEGER(iwp), DIMENSION(4) :: isave !:
223
224	REAL(wp), DIMENSION(1:nz,nys:nyn,nxl:nxr) :: ar !:
225	REAL(wp), DIMENSION(nys:nyn,nxl:nxr,1:nz) :: ar_inv !:
226	!$acc declare create( ar_inv )
227
228	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: ar1 !:
229	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_in !:
230	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_inv !:
231	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_y !:
232	REAL(wp), DIMENSION(:,:,:), ALLOCATABLE :: f_out_z !:
233
234
235	CALL cpu_log( log_point_s(3), 'poisfft', 'start' )
236
237	IF ( .NOT. poisfft_initialized ) CALL poisfft_init
238
239	!
240	!-- Two-dimensional Fourier Transformation in x- and y-direction.
241	IF ( pdims(2) == 1 .AND. pdims(1) > 1 .AND. num_acc_per_node == 0 ) &
242	THEN
243
244	!
245	!-- 1d-domain-decomposition along x:
246	!-- FFT along y and transposition y --> x
247	CALL ffty_tr_yx( ar, ar )
248
249	!
250	!-- FFT along x, solving the tridiagonal system and backward FFT
251	CALL fftx_tri_fftx( ar )
252
253	!
254	!-- Transposition x --> y and backward FFT along y
255	CALL tr_xy_ffty( ar, ar )
256
257	ELSEIF ( pdims(1) == 1 .AND. pdims(2) > 1 .AND. num_acc_per_node == 0 ) &
258	THEN
259
260	!
261	!-- 1d-domain-decomposition along y:
262	!-- FFT along x and transposition x --> y
263	CALL fftx_tr_xy( ar, ar )
264
265	!
266	!-- FFT along y, solving the tridiagonal system and backward FFT
267	CALL ffty_tri_ffty( ar )
268
269	!
270	!-- Transposition y --> x and backward FFT along x
271	CALL tr_yx_fftx( ar, ar )
272
273	ELSEIF ( .NOT. transpose_compute_overlap ) THEN
274
275	!
276	!-- 2d-domain-decomposition or no decomposition (1 PE run)
277	!-- Transposition z --> x
278	CALL cpu_log( log_point_s(5), 'transpo forward', 'start' )
279	CALL resort_for_zx( ar, ar_inv )
280	CALL transpose_zx( ar_inv, ar )
281	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
282
283	CALL cpu_log( log_point_s(4), 'fft_x', 'start' )
284	IF ( fft_method /= 'system-specific' ) THEN
285	!$acc update host( ar )
286	ENDIF
287	CALL fft_x( ar, 'forward' )
288	IF ( fft_method /= 'system-specific' ) THEN
289	!$acc update device( ar )
290	ENDIF
291	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
292
293	!
294	!-- Transposition x --> y
295	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
296	CALL resort_for_xy( ar, ar_inv )
297	CALL transpose_xy( ar_inv, ar )
298	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
299
300	CALL cpu_log( log_point_s(7), 'fft_y', 'start' )
301	IF ( fft_method /= 'system-specific' ) THEN
302	!$acc update host( ar )
303	ENDIF
304	CALL fft_y( ar, 'forward', ar_tr = ar, &
305	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
306	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
307	IF ( fft_method /= 'system-specific' ) THEN
308	!$acc update device( ar )
309	ENDIF
310	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
311
312	!
313	!-- Transposition y --> z
314	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue' )
315	CALL resort_for_yz( ar, ar_inv )
316	CALL transpose_yz( ar_inv, ar )
317	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
318
319	!
320	!-- Solve the tridiagonal equation system along z
321	CALL cpu_log( log_point_s(6), 'tridia', 'start' )
322	CALL tridia_substi( ar )
323	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
324
325	!
326	!-- Inverse Fourier Transformation
327	!-- Transposition z --> y
328	CALL cpu_log( log_point_s(8), 'transpo invers', 'start' )
329	CALL transpose_zy( ar, ar_inv )
330	CALL resort_for_zy( ar_inv, ar )
331	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
332
333	CALL cpu_log( log_point_s(7), 'fft_y', 'continue' )
334	IF ( fft_method /= 'system-specific' ) THEN
335	!$acc update host( ar )
336	ENDIF
337	CALL fft_y( ar, 'backward', ar_tr = ar, &
338	nxl_y_bound = nxl_y, nxr_y_bound = nxr_y, &
339	nxl_y_l = nxl_y, nxr_y_l = nxr_y )
340	IF ( fft_method /= 'system-specific' ) THEN
341	!$acc update device( ar )
342	ENDIF
343	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
344
345	!
346	!-- Transposition y --> x
347	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
348	CALL transpose_yx( ar, ar_inv )
349	CALL resort_for_yx( ar_inv, ar )
350	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
351
352	CALL cpu_log( log_point_s(4), 'fft_x', 'continue' )
353	IF ( fft_method /= 'system-specific' ) THEN
354	!$acc update host( ar )
355	ENDIF
356	CALL fft_x( ar, 'backward' )
357	IF ( fft_method /= 'system-specific' ) THEN
358	!$acc update device( ar )
359	ENDIF
360	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
361
362	!
363	!-- Transposition x --> z
364	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue' )
365	CALL transpose_xz( ar, ar_inv )
366	CALL resort_for_xz( ar_inv, ar )
367	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
368
369	ELSE
370
371	!
372	!-- 2d-domain-decomposition or no decomposition (1 PE run) with
373	!-- overlapping transposition / fft
374	!-- cputime logging must not use barriers, which would prevent overlapping
375	ALLOCATE( f_out_y(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), &
376	f_out_z(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
377	!
378	!-- Transposition z --> x + subsequent fft along x
379	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
380	CALL resort_for_zx( ar, f_inv )
381	!
382	!-- Save original indices and gridpoint counter
383	isave(1) = nz
384	isave(2) = nzb_x
385	isave(3) = nzt_x
386	isave(4) = sendrecvcount_zx
387	!
388	!-- Set new indices for transformation
389	nblk = nz / pdims(1)
390	nz = pdims(1)
391	nnz_x = 1
392	nzb_x = 1 + myidx * nnz_x
393	nzt_x = ( myidx + 1 ) * nnz_x
394	sendrecvcount_zx = nnx * nny * nnz_x
395
396	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
397	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
398
399	DO kk = 1, nblk
400
401	IF ( kk == 1 ) THEN
402	CALL cpu_log( log_point_s(5), 'transpo forward', 'start', cpu_log_nowait )
403	ELSE
404	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
405	ENDIF
406
407	DO knew = 1, nz
408	ki = kk + nblk * ( knew - 1 )
409	f_in(:,:,knew) = f_inv(:,:,ki)
410	ENDDO
411
412	CALL transpose_zx( f_in, ar1(:,:,:))
413	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
414
415	IF ( kk == 1 ) THEN
416	CALL cpu_log( log_point_s(4), 'fft_x', 'start', cpu_log_nowait )
417	ELSE
418	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
419	ENDIF
420
421	n = isave(2) + kk - 1
422	CALL fft_x( ar1(:,:,:), 'forward', ar_2d = f_out_z(:,:,n))
423	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
424
425	ENDDO
426	!
427	!-- Restore original indices/counters
428	nz = isave(1)
429	nzb_x = isave(2)
430	nzt_x = isave(3)
431	sendrecvcount_zx = isave(4)
432
433	DEALLOCATE( ar1, f_in, f_inv )
434
435	!
436	!-- Transposition x --> y + subsequent fft along y
437	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
438	CALL resort_for_xy( f_out_z, f_inv )
439	!
440	!-- Save original indices and gridpoint counter
441	isave(1) = nx
442	isave(2) = nxl_y
443	isave(3) = nxr_y
444	isave(4) = sendrecvcount_xy
445	!
446	!-- Set new indices for transformation
447	nblk = ( ( nx+1 ) / pdims(2) ) - 1
448	nx = pdims(2)
449	nnx_y = 1
450	nxl_y = myidy * nnx_y
451	nxr_y = ( myidy + 1 ) * nnx_y - 1
452	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
453
454	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
455	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
456
457	DO ii = 0, nblk
458
459	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
460
461	DO inew = 0, nx-1
462	iind = ii + ( nblk + 1 ) * inew
463	f_in(:,:,inew) = f_inv(:,:,iind)
464	ENDDO
465
466	CALL transpose_xy( f_in, ar1(:,:,:) )
467
468	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
469
470	IF ( ii == 1 ) THEN
471	CALL cpu_log( log_point_s(7), 'fft_y', 'start', cpu_log_nowait )
472	ELSE
473	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
474	ENDIF
475
476	nxl_y_bound = isave(2)
477	nxr_y_bound = isave(3)
478	n = isave(2) + ii
479	CALL fft_y( ar1(:,:,:), 'forward', ar_tr = f_out_y, &
480	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
481	nxl_y_l = n, nxr_y_l = n )
482
483	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
484
485	ENDDO
486	!
487	!-- Restore original indices/counters
488	nx = isave(1)
489	nxl_y = isave(2)
490	nxr_y = isave(3)
491	sendrecvcount_xy = isave(4)
492
493	DEALLOCATE( ar1, f_in, f_inv )
494
495	!
496	!-- Transposition y --> z + subsequent tridia + resort for z --> y
497	ALLOCATE( f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
498	CALL resort_for_yz( f_out_y, f_inv )
499	!
500	!-- Save original indices and gridpoint counter
501	isave(1) = ny
502	isave(2) = nys_z
503	isave(3) = nyn_z
504	isave(4) = sendrecvcount_yz
505	!
506	!-- Set new indices for transformation
507	nblk = ( ( ny+1 ) / pdims(1) ) - 1
508	ny = pdims(1)
509	nny_z = 1
510	nys_z = myidx * nny_z
511	nyn_z = ( myidx + 1 ) * nny_z - 1
512	sendrecvcount_yz = ( nxr_y-nxl_y+1 ) * nny_z * ( nzt_y-nzb_y+1 )
513
514	ALLOCATE( ar1(nxl_z:nxr_z,nys_z:nyn_z,1:nz) )
515	ALLOCATE( f_in(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) )
516
517	DO jj = 0, nblk
518	!
519	!-- Forward Fourier Transformation
520	!-- Transposition y --> z
521	CALL cpu_log( log_point_s(5), 'transpo forward', 'continue', cpu_log_nowait )
522
523	DO jnew = 0, ny-1
524	jind = jj + ( nblk + 1 ) * jnew
525	f_in(:,:,jnew) = f_inv(:,:,jind)
526	ENDDO
527
528	CALL transpose_yz( f_in, ar1(:,:,:) )
529
530	IF ( jj == nblk ) THEN
531	CALL cpu_log( log_point_s(5), 'transpo forward', 'stop' )
532	ELSE
533	CALL cpu_log( log_point_s(5), 'transpo forward', 'pause' )
534	ENDIF
535
536	!
537	!-- Solve the tridiagonal equation system along z
538	CALL cpu_log( log_point_s(6), 'tridia', 'start', cpu_log_nowait )
539
540	n = isave(2) + jj
541	CALL tridia_substi_overlap( ar1(:,:,:), n )
542
543	CALL cpu_log( log_point_s(6), 'tridia', 'stop' )
544
545	!
546	!-- Inverse Fourier Transformation
547	!-- Transposition z --> y
548	!-- Only one thread should call MPI routines, therefore forward and
549	!-- backward tranpose are in the same section
550	IF ( jj == 0 ) THEN
551	CALL cpu_log( log_point_s(8), 'transpo invers', 'start', cpu_log_nowait )
552	ELSE
553	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
554	ENDIF
555
556	CALL transpose_zy( ar1(:,:,:), f_in )
557
558	DO jnew = 0, ny-1
559	jind = jj + ( nblk + 1 ) * jnew
560	f_inv(:,:,jind) = f_in(:,:,jnew)
561	ENDDO
562
563	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
564
565	ENDDO
566	!
567	!-- Restore original indices/counters
568	ny = isave(1)
569	nys_z = isave(2)
570	nyn_z = isave(3)
571	sendrecvcount_yz = isave(4)
572
573	CALL resort_for_zy( f_inv, f_out_y )
574
575	DEALLOCATE( ar1, f_in, f_inv )
576
577	!
578	!-- fft along y backward + subsequent transposition y --> x
579	ALLOCATE( f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
580	!
581	!-- Save original indices and gridpoint counter
582	isave(1) = nx
583	isave(2) = nxl_y
584	isave(3) = nxr_y
585	isave(4) = sendrecvcount_xy
586	!
587	!-- Set new indices for transformation
588	nblk = (( nx+1 ) / pdims(2) ) - 1
589	nx = pdims(2)
590	nnx_y = 1
591	nxl_y = myidy * nnx_y
592	nxr_y = ( myidy + 1 ) * nnx_y - 1
593	sendrecvcount_xy = nnx_y * ( nyn_x-nys_x+1 ) * ( nzt_x-nzb_x+1 )
594
595	ALLOCATE( ar1(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) )
596	ALLOCATE( f_in(nys_x:nyn_x,nzb_x:nzt_x,0:nx) )
597
598	DO ii = 0, nblk
599
600	CALL cpu_log( log_point_s(7), 'fft_y', 'continue', cpu_log_nowait )
601
602	n = isave(2) + ii
603	nxl_y_bound = isave(2)
604	nxr_y_bound = isave(3)
605
606	CALL fft_y( ar1(:,:,:), 'backward', ar_tr = f_out_y, &
607	nxl_y_bound = nxl_y_bound, nxr_y_bound = nxr_y_bound, &
608	nxl_y_l = n, nxr_y_l = n )
609
610	IF ( ii == nblk ) THEN
611	CALL cpu_log( log_point_s(7), 'fft_y', 'stop' )
612	ELSE
613	CALL cpu_log( log_point_s(7), 'fft_y', 'pause' )
614	ENDIF
615
616	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
617
618	CALL transpose_yx( ar1(:,:,:), f_in )
619
620	DO inew = 0, nx-1
621	iind = ii + (nblk+1) * inew
622	f_inv(:,:,iind) = f_in(:,:,inew)
623	ENDDO
624
625	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
626
627	ENDDO
628	!
629	!-- Restore original indices/counters
630	nx = isave(1)
631	nxl_y = isave(2)
632	nxr_y = isave(3)
633	sendrecvcount_xy = isave(4)
634
635	CALL resort_for_yx( f_inv, f_out_z )
636
637	DEALLOCATE( ar1, f_in, f_inv )
638
639	!
640	!-- fft along x backward + subsequent final transposition x --> z
641	ALLOCATE( f_inv(nys:nyn,nxl:nxr,1:nz) )
642	!
643	!-- Save original indices and gridpoint counter
644	isave(1) = nz
645	isave(2) = nzb_x
646	isave(3) = nzt_x
647	isave(4) = sendrecvcount_zx
648	!
649	!-- Set new indices for transformation
650	nblk = nz / pdims(1)
651	nz = pdims(1)
652	nnz_x = 1
653	nzb_x = 1 + myidx * nnz_x
654	nzt_x = ( myidx + 1 ) * nnz_x
655	sendrecvcount_zx = nnx * nny * nnz_x
656
657	ALLOCATE( ar1(0:nx,nys_x:nyn_x,nzb_x:nzt_x) )
658	ALLOCATE( f_in(nys:nyn,nxl:nxr,1:nz) )
659
660	DO kk = 1, nblk
661
662	CALL cpu_log( log_point_s(4), 'fft_x', 'continue', cpu_log_nowait )
663
664	n = isave(2) + kk - 1
665	CALL fft_x( ar1(:,:,:), 'backward', f_out_z(:,:,n))
666
667	IF ( kk == nblk ) THEN
668	CALL cpu_log( log_point_s(4), 'fft_x', 'stop' )
669	ELSE
670	CALL cpu_log( log_point_s(4), 'fft_x', 'pause' )
671	ENDIF
672
673	CALL cpu_log( log_point_s(8), 'transpo invers', 'continue', cpu_log_nowait )
674
675	CALL transpose_xz( ar1(:,:,:), f_in )
676
677	DO knew = 1, nz
678	ki = kk + nblk * (knew-1)
679	f_inv(:,:,ki) = f_in(:,:,knew)
680	ENDDO
681
682	IF ( kk == nblk ) THEN
683	CALL cpu_log( log_point_s(8), 'transpo invers', 'stop' )
684	ELSE
685	CALL cpu_log( log_point_s(8), 'transpo invers', 'pause' )
686	ENDIF
687
688	ENDDO
689	!
690	!-- Restore original indices/counters
691	nz = isave(1)
692	nzb_x = isave(2)
693	nzt_x = isave(3)
694	sendrecvcount_zx = isave(4)
695
696	CALL resort_for_xz( f_inv, ar )
697
698	DEALLOCATE( ar1, f_in, f_inv )
699
700	ENDIF
701
702	CALL cpu_log( log_point_s(3), 'poisfft', 'stop' )
703
704	END SUBROUTINE poisfft
705
706
707
708	SUBROUTINE ffty_tr_yx( f_in, f_out )
709
710	!------------------------------------------------------------------------------!
711	! Fourier-transformation along y with subsequent transposition y --> x for
712	! a 1d-decomposition along x
713	!
714	! ATTENTION: The performance of this routine is much faster on the NEC-SX6,
715	! if the first index of work_ffty_vec is odd. Otherwise
716	! memory bank conflicts may occur (especially if the index is a
717	! multiple of 128). That's why work_ffty_vec is dimensioned as
718	! 0:ny+1.
719	! Of course, this will not work if users are using an odd number
720	! of gridpoints along y.
721	!------------------------------------------------------------------------------!
722
723	USE control_parameters, &
724	ONLY: host
725
726	USE cpulog, &
727	ONLY: cpu_log, log_point_s
728
729	USE kinds
730
731	USE pegrid
732
733	IMPLICIT NONE
734
735	INTEGER(iwp) :: i !:
736	INTEGER(iwp) :: iend !:
737	INTEGER(iwp) :: iouter !:
738	INTEGER(iwp) :: ir !:
739	INTEGER(iwp) :: j !:
740	INTEGER(iwp) :: k !:
741
742	INTEGER(iwp), PARAMETER :: stridex = 4 !:
743
744	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
745	#if defined( __nec )
746	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
747	#endif
748	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_in !:
749	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_out !:
750	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
751
752	!
753	!-- Carry out the FFT along y, where all data are present due to the
754	!-- 1d-decomposition along x. Resort the data in a way that x becomes
755	!-- the first index.
756	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'start' )
757
758	IF ( host(1:3) == 'nec' ) THEN
759	#if defined( __nec )
760	!
761	!-- Code optimized for vector processors
762	!$OMP PARALLEL PRIVATE ( i, j, k )
763	!$OMP DO
764	DO i = nxl, nxr
765
766	DO j = 0, ny
767	DO k = 1, nz
768	work_ffty_vec(j,k,i) = f_in(k,j,i)
769	ENDDO
770	ENDDO
771
772	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'forward' )
773
774	ENDDO
775
776	!$OMP DO
777	DO k = 1, nz
778	DO j = 0, ny
779	DO i = nxl, nxr
780	work(i,k,j) = work_ffty_vec(j,k,i)
781	ENDDO
782	ENDDO
783	ENDDO
784	!$OMP END PARALLEL
785	#endif
786
787	ELSE
788
789	!
790	!-- Cache optimized code.
791	!-- The i-(x-)direction is split into a strided outer loop and an inner
792	!-- loop for better cache performance
793	!$OMP PARALLEL PRIVATE (i,iend,iouter,ir,j,k,work_ffty)
794	!$OMP DO
795	DO iouter = nxl, nxr, stridex
796
797	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
798
799	DO k = 1, nz
800
801	DO i = iouter, iend
802
803	ir = i-iouter+1 ! counter within a stride
804	DO j = 0, ny
805	work_ffty(j,ir) = f_in(k,j,i)
806	ENDDO
807	!
808	!-- FFT along y
809	CALL fft_y_1d( work_ffty(:,ir), 'forward' )
810
811	ENDDO
812
813	!
814	!-- Resort
815	DO j = 0, ny
816	DO i = iouter, iend
817	work(i,k,j) = work_ffty(j,i-iouter+1)
818	ENDDO
819	ENDDO
820
821	ENDDO
822
823	ENDDO
824	!$OMP END PARALLEL
825
826	ENDIF
827	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'pause' )
828
829	!
830	!-- Transpose array
831	#if defined( __parallel )
832	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
833	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
834	CALL MPI_ALLTOALL( work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
835	f_out(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
836	comm1dx, ierr )
837	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
838	#endif
839
840	END SUBROUTINE ffty_tr_yx
841
842
843	SUBROUTINE tr_xy_ffty( f_in, f_out )
844
845	!------------------------------------------------------------------------------!
846	! Transposition x --> y with a subsequent backward Fourier transformation for
847	! a 1d-decomposition along x
848	!------------------------------------------------------------------------------!
849
850	USE control_parameters, &
851	ONLY: host
852
853	USE cpulog, &
854	ONLY: cpu_log, log_point_s
855
856	USE kinds
857
858	USE pegrid
859
860	IMPLICIT NONE
861
862	INTEGER(iwp) :: i !:
863	INTEGER(iwp) :: iend !:
864	INTEGER(iwp) :: iouter !:
865	INTEGER(iwp) :: ir !:
866	INTEGER(iwp) :: j !:
867	INTEGER(iwp) :: k !:
868
869	INTEGER(iwp), PARAMETER :: stridex = 4 !:
870
871	REAL(wp), DIMENSION(0:ny,stridex) :: work_ffty !:
872	#if defined( __nec )
873	REAL(wp), DIMENSION(0:ny+1,1:nz,nxl:nxr) :: work_ffty_vec !:
874	#endif
875	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: f_in !:
876	REAL(wp), DIMENSION(1:nz,0:ny,nxl:nxr) :: f_out !:
877	REAL(wp), DIMENSION(nxl:nxr,1:nz,0:ny) :: work !:
878
879	!
880	!-- Transpose array
881	#if defined( __parallel )
882	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
883	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
884	CALL MPI_ALLTOALL( f_in(1,1,nys_x,1), sendrecvcount_xy, MPI_REAL, &
885	work(nxl,1,0), sendrecvcount_xy, MPI_REAL, &
886	comm1dx, ierr )
887	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
888	#endif
889
890	!
891	!-- Resort the data in a way that y becomes the first index and carry out the
892	!-- backward fft along y.
893	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'continue' )
894
895	IF ( host(1:3) == 'nec' ) THEN
896	#if defined( __nec )
897	!
898	!-- Code optimized for vector processors
899	!$OMP PARALLEL PRIVATE ( i, j, k )
900	!$OMP DO
901	DO k = 1, nz
902	DO j = 0, ny
903	DO i = nxl, nxr
904	work_ffty_vec(j,k,i) = work(i,k,j)
905	ENDDO
906	ENDDO
907	ENDDO
908
909	!$OMP DO
910	DO i = nxl, nxr
911
912	CALL fft_y_m( work_ffty_vec(:,:,i), ny+1, 'backward' )
913
914	DO j = 0, ny
915	DO k = 1, nz
916	f_out(k,j,i) = work_ffty_vec(j,k,i)
917	ENDDO
918	ENDDO
919
920	ENDDO
921	!$OMP END PARALLEL
922	#endif
923
924	ELSE
925
926	!
927	!-- Cache optimized code.
928	!-- The i-(x-)direction is split into a strided outer loop and an inner
929	!-- loop for better cache performance
930	!$OMP PARALLEL PRIVATE ( i, iend, iouter, ir, j, k, work_ffty )
931	!$OMP DO
932	DO iouter = nxl, nxr, stridex
933
934	iend = MIN( iouter+stridex-1, nxr ) ! Upper bound for inner i loop
935
936	DO k = 1, nz
937	!
938	!-- Resort
939	DO j = 0, ny
940	DO i = iouter, iend
941	work_ffty(j,i-iouter+1) = work(i,k,j)
942	ENDDO
943	ENDDO
944
945	DO i = iouter, iend
946
947	!
948	!-- FFT along y
949	ir = i-iouter+1 ! counter within a stride
950	CALL fft_y_1d( work_ffty(:,ir), 'backward' )
951
952	DO j = 0, ny
953	f_out(k,j,i) = work_ffty(j,ir)
954	ENDDO
955	ENDDO
956
957	ENDDO
958
959	ENDDO
960	!$OMP END PARALLEL
961
962	ENDIF
963
964	CALL cpu_log( log_point_s(7), 'fft_y_1d', 'stop' )
965
966	END SUBROUTINE tr_xy_ffty
967
968
969	SUBROUTINE fftx_tri_fftx( ar )
970
971	!------------------------------------------------------------------------------!
972	! FFT along x, solution of the tridiagonal system and backward FFT for
973	! a 1d-decomposition along x
974	!
975	! WARNING: this subroutine may still not work for hybrid parallelization
976	! with OpenMP (for possible necessary changes see the original
977	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
978	!------------------------------------------------------------------------------!
979
980	USE control_parameters, &
981	ONLY: host
982
983	USE cpulog, &
984	ONLY: cpu_log, log_point_s
985
986	USE grid_variables, &
987	ONLY: ddx2, ddy2
988
989	USE kinds
990
991	USE pegrid
992
993	IMPLICIT NONE
994
995	INTEGER(iwp) :: i !:
996	INTEGER(iwp) :: j !:
997	INTEGER(iwp) :: k !:
998	INTEGER(iwp) :: m !:
999	INTEGER(iwp) :: n !:
1000	INTEGER(iwp) :: omp_get_thread_num !:
1001	INTEGER(iwp) :: tn !:
1002
1003	REAL(wp), DIMENSION(0:nx) :: work_fftx !:
1004	REAL(wp), DIMENSION(0:nx,1:nz) :: work_trix !:
1005	REAL(wp), DIMENSION(nnx,1:nz,nys_x:nyn_x,pdims(1)) :: ar !:
1006	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
1007
1008
1009	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'start' )
1010
1011	ALLOCATE( tri(5,0:nx,0:nz-1,0:threads_per_task-1) )
1012
1013	tn = 0 ! Default thread number in case of one thread
1014	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_fftx, work_trix )
1015	DO j = nys_x, nyn_x
1016
1017	!$ tn = omp_get_thread_num()
1018
1019	IF ( host(1:3) == 'nec' ) THEN
1020	!
1021	!-- Code optimized for vector processors
1022	DO k = 1, nz
1023
1024	m = 0
1025	DO n = 1, pdims(1)
1026	DO i = 1, nnx
1027	work_trix(m,k) = ar(i,k,j,n)
1028	m = m + 1
1029	ENDDO
1030	ENDDO
1031
1032	ENDDO
1033
1034	CALL fft_x_m( work_trix, 'forward' )
1035
1036	ELSE
1037	!
1038	!-- Cache optimized code
1039	DO k = 1, nz
1040
1041	m = 0
1042	DO n = 1, pdims(1)
1043	DO i = 1, nnx
1044	work_fftx(m) = ar(i,k,j,n)
1045	m = m + 1
1046	ENDDO
1047	ENDDO
1048
1049	CALL fft_x_1d( work_fftx, 'forward' )
1050
1051	DO i = 0, nx
1052	work_trix(i,k) = work_fftx(i)
1053	ENDDO
1054
1055	ENDDO
1056
1057	ENDIF
1058
1059	!
1060	!-- Solve the linear equation system
1061	CALL tridia_1dd( ddx2, ddy2, nx, ny, j, work_trix, tri(:,:,:,tn) )
1062
1063	IF ( host(1:3) == 'nec' ) THEN
1064	!
1065	!-- Code optimized for vector processors
1066	CALL fft_x_m( work_trix, 'backward' )
1067
1068	DO k = 1, nz
1069
1070	m = 0
1071	DO n = 1, pdims(1)
1072	DO i = 1, nnx
1073	ar(i,k,j,n) = work_trix(m,k)
1074	m = m + 1
1075	ENDDO
1076	ENDDO
1077
1078	ENDDO
1079
1080	ELSE
1081	!
1082	!-- Cache optimized code
1083	DO k = 1, nz
1084
1085	DO i = 0, nx
1086	work_fftx(i) = work_trix(i,k)
1087	ENDDO
1088
1089	CALL fft_x_1d( work_fftx, 'backward' )
1090
1091	m = 0
1092	DO n = 1, pdims(1)
1093	DO i = 1, nnx
1094	ar(i,k,j,n) = work_fftx(m)
1095	m = m + 1
1096	ENDDO
1097	ENDDO
1098
1099	ENDDO
1100
1101	ENDIF
1102
1103	ENDDO
1104
1105	DEALLOCATE( tri )
1106
1107	CALL cpu_log( log_point_s(33), 'fft_x_1d + tridia', 'stop' )
1108
1109	END SUBROUTINE fftx_tri_fftx
1110
1111
1112	SUBROUTINE fftx_tr_xy( f_in, f_out )
1113
1114	!------------------------------------------------------------------------------!
1115	! Fourier-transformation along x with subsequent transposition x --> y for
1116	! a 1d-decomposition along y
1117	!
1118	! ATTENTION: The NEC-branch of this routine may significantly profit from
1119	! further optimizations. So far, performance is much worse than
1120	! for routine ffty_tr_yx (more than three times slower).
1121	!------------------------------------------------------------------------------!
1122
1123	USE control_parameters, &
1124	ONLY: host
1125
1126	USE cpulog, &
1127	ONLY: cpu_log, log_point_s
1128
1129	USE kinds
1130
1131	USE pegrid
1132
1133	IMPLICIT NONE
1134
1135	INTEGER(iwp) :: i !:
1136	INTEGER(iwp) :: j !:
1137	INTEGER(iwp) :: k !:
1138
1139	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
1140	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_in !:
1141	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_out !:
1142	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
1143
1144	!
1145	!-- Carry out the FFT along x, where all data are present due to the
1146	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1147	!-- the first index.
1148	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'start' )
1149
1150	IF ( host(1:3) == 'nec' ) THEN
1151	!
1152	!-- Code for vector processors
1153	!$OMP PARALLEL PRIVATE ( i, j, k )
1154	!$OMP DO
1155	DO i = 0, nx
1156
1157	DO j = nys, nyn
1158	DO k = 1, nz
1159	work_fftx(i,k,j) = f_in(k,j,i)
1160	ENDDO
1161	ENDDO
1162
1163	ENDDO
1164
1165	!$OMP DO
1166	DO j = nys, nyn
1167
1168	CALL fft_x_m( work_fftx(:,:,j), 'forward' )
1169
1170	DO k = 1, nz
1171	DO i = 0, nx
1172	work(j,k,i) = work_fftx(i,k,j)
1173	ENDDO
1174	ENDDO
1175
1176	ENDDO
1177	!$OMP END PARALLEL
1178
1179	ELSE
1180
1181	!
1182	!-- Cache optimized code (there might be still a potential for better
1183	!-- optimization).
1184	!$OMP PARALLEL PRIVATE (i,j,k)
1185	!$OMP DO
1186	DO i = 0, nx
1187
1188	DO j = nys, nyn
1189	DO k = 1, nz
1190	work_fftx(i,k,j) = f_in(k,j,i)
1191	ENDDO
1192	ENDDO
1193
1194	ENDDO
1195
1196	!$OMP DO
1197	DO j = nys, nyn
1198	DO k = 1, nz
1199
1200	CALL fft_x_1d( work_fftx(0:nx,k,j), 'forward' )
1201
1202	DO i = 0, nx
1203	work(j,k,i) = work_fftx(i,k,j)
1204	ENDDO
1205	ENDDO
1206
1207	ENDDO
1208	!$OMP END PARALLEL
1209
1210	ENDIF
1211	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'pause' )
1212
1213	!
1214	!-- Transpose array
1215	#if defined( __parallel )
1216	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1217	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1218	CALL MPI_ALLTOALL( work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1219	f_out(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1220	comm1dy, ierr )
1221	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1222	#endif
1223
1224	END SUBROUTINE fftx_tr_xy
1225
1226
1227	SUBROUTINE tr_yx_fftx( f_in, f_out )
1228
1229	!------------------------------------------------------------------------------!
1230	! Transposition y --> x with a subsequent backward Fourier transformation for
1231	! a 1d-decomposition along x
1232	!------------------------------------------------------------------------------!
1233
1234	USE control_parameters, &
1235	ONLY: host
1236
1237	USE cpulog, &
1238	ONLY: cpu_log, log_point_s
1239
1240	USE kinds
1241
1242	USE pegrid
1243
1244	IMPLICIT NONE
1245
1246	INTEGER(iwp) :: i !:
1247	INTEGER(iwp) :: j !:
1248	INTEGER(iwp) :: k !:
1249
1250	REAL(wp), DIMENSION(0:nx,1:nz,nys:nyn) :: work_fftx !:
1251	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: f_in !:
1252	REAL(wp), DIMENSION(1:nz,nys:nyn,0:nx) :: f_out !:
1253	REAL(wp), DIMENSION(nys:nyn,1:nz,0:nx) :: work !:
1254
1255	!
1256	!-- Transpose array
1257	#if defined( __parallel )
1258	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1259	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1260	CALL MPI_ALLTOALL( f_in(1,1,nxl_y,1), sendrecvcount_xy, MPI_REAL, &
1261	work(nys,1,0), sendrecvcount_xy, MPI_REAL, &
1262	comm1dy, ierr )
1263	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1264	#endif
1265
1266	!
1267	!-- Carry out the FFT along x, where all data are present due to the
1268	!-- 1d-decomposition along y. Resort the data in a way that y becomes
1269	!-- the first index.
1270	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'continue' )
1271
1272	IF ( host(1:3) == 'nec' ) THEN
1273	!
1274	!-- Code optimized for vector processors
1275	!$OMP PARALLEL PRIVATE ( i, j, k )
1276	!$OMP DO
1277	DO j = nys, nyn
1278
1279	DO k = 1, nz
1280	DO i = 0, nx
1281	work_fftx(i,k,j) = work(j,k,i)
1282	ENDDO
1283	ENDDO
1284
1285	CALL fft_x_m( work_fftx(:,:,j), 'backward' )
1286
1287	ENDDO
1288
1289	!$OMP DO
1290	DO i = 0, nx
1291	DO j = nys, nyn
1292	DO k = 1, nz
1293	f_out(k,j,i) = work_fftx(i,k,j)
1294	ENDDO
1295	ENDDO
1296	ENDDO
1297	!$OMP END PARALLEL
1298
1299	ELSE
1300
1301	!
1302	!-- Cache optimized code (there might be still a potential for better
1303	!-- optimization).
1304	!$OMP PARALLEL PRIVATE (i,j,k)
1305	!$OMP DO
1306	DO j = nys, nyn
1307	DO k = 1, nz
1308
1309	DO i = 0, nx
1310	work_fftx(i,k,j) = work(j,k,i)
1311	ENDDO
1312
1313	CALL fft_x_1d( work_fftx(0:nx,k,j), 'backward' )
1314
1315	ENDDO
1316	ENDDO
1317
1318	!$OMP DO
1319	DO i = 0, nx
1320	DO j = nys, nyn
1321	DO k = 1, nz
1322	f_out(k,j,i) = work_fftx(i,k,j)
1323	ENDDO
1324	ENDDO
1325	ENDDO
1326	!$OMP END PARALLEL
1327
1328	ENDIF
1329	CALL cpu_log( log_point_s(4), 'fft_x_1d', 'stop' )
1330
1331	END SUBROUTINE tr_yx_fftx
1332
1333
1334	SUBROUTINE ffty_tri_ffty( ar )
1335
1336	!------------------------------------------------------------------------------!
1337	! FFT along y, solution of the tridiagonal system and backward FFT for
1338	! a 1d-decomposition along y
1339	!
1340	! WARNING: this subroutine may still not work for hybrid parallelization
1341	! with OpenMP (for possible necessary changes see the original
1342	! routine poisfft_hybrid, developed by Klaus Ketelsen, May 2002)
1343	!------------------------------------------------------------------------------!
1344
1345	USE control_parameters, &
1346	ONLY: host
1347
1348	USE cpulog, &
1349	ONLY: cpu_log, log_point_s
1350
1351	USE grid_variables, &
1352	ONLY: ddx2, ddy2
1353
1354	USE kinds
1355
1356	USE pegrid
1357
1358	IMPLICIT NONE
1359
1360	INTEGER(iwp) :: i !:
1361	INTEGER(iwp) :: j !:
1362	INTEGER(iwp) :: k !:
1363	INTEGER(iwp) :: m !:
1364	INTEGER(iwp) :: n !:
1365	INTEGER(iwp) :: omp_get_thread_num !:
1366	INTEGER(iwp) :: tn !:
1367
1368	REAL(wp), DIMENSION(0:ny) :: work_ffty !:
1369	REAL(wp), DIMENSION(0:ny,1:nz) :: work_triy !:
1370	REAL(wp), DIMENSION(nny,1:nz,nxl_y:nxr_y,pdims(2)) :: ar !:
1371	REAL(wp), DIMENSION(:,:,:,:), ALLOCATABLE :: tri !:
1372
1373
1374	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'start' )
1375
1376	ALLOCATE( tri(5,0:ny,0:nz-1,0:threads_per_task-1) )
1377
1378	tn = 0 ! Default thread number in case of one thread
1379	!$OMP PARALLEL DO PRIVATE ( i, j, k, m, n, tn, work_ffty, work_triy )
1380	DO i = nxl_y, nxr_y
1381
1382	!$ tn = omp_get_thread_num()
1383
1384	IF ( host(1:3) == 'nec' ) THEN
1385	!
1386	!-- Code optimized for vector processors
1387	DO k = 1, nz
1388
1389	m = 0
1390	DO n = 1, pdims(2)
1391	DO j = 1, nny
1392	work_triy(m,k) = ar(j,k,i,n)
1393	m = m + 1
1394	ENDDO
1395	ENDDO
1396
1397	ENDDO
1398
1399	CALL fft_y_m( work_triy, ny, 'forward' )
1400
1401	ELSE
1402	!
1403	!-- Cache optimized code
1404	DO k = 1, nz
1405
1406	m = 0
1407	DO n = 1, pdims(2)
1408	DO j = 1, nny
1409	work_ffty(m) = ar(j,k,i,n)
1410	m = m + 1
1411	ENDDO
1412	ENDDO
1413
1414	CALL fft_y_1d( work_ffty, 'forward' )
1415
1416	DO j = 0, ny
1417	work_triy(j,k) = work_ffty(j)
1418	ENDDO
1419
1420	ENDDO
1421
1422	ENDIF
1423
1424	!
1425	!-- Solve the linear equation system
1426	CALL tridia_1dd( ddy2, ddx2, ny, nx, i, work_triy, tri(:,:,:,tn) )
1427
1428	IF ( host(1:3) == 'nec' ) THEN
1429	!
1430	!-- Code optimized for vector processors
1431	CALL fft_y_m( work_triy, ny, 'backward' )
1432
1433	DO k = 1, nz
1434
1435	m = 0
1436	DO n = 1, pdims(2)
1437	DO j = 1, nny
1438	ar(j,k,i,n) = work_triy(m,k)
1439	m = m + 1
1440	ENDDO
1441	ENDDO
1442
1443	ENDDO
1444
1445	ELSE
1446	!
1447	!-- Cache optimized code
1448	DO k = 1, nz
1449
1450	DO j = 0, ny
1451	work_ffty(j) = work_triy(j,k)
1452	ENDDO
1453
1454	CALL fft_y_1d( work_ffty, 'backward' )
1455
1456	m = 0
1457	DO n = 1, pdims(2)
1458	DO j = 1, nny
1459	ar(j,k,i,n) = work_ffty(m)
1460	m = m + 1
1461	ENDDO
1462	ENDDO
1463
1464	ENDDO
1465
1466	ENDIF
1467
1468	ENDDO
1469
1470	DEALLOCATE( tri )
1471
1472	CALL cpu_log( log_point_s(39), 'fft_y_1d + tridia', 'stop' )
1473
1474	END SUBROUTINE ffty_tri_ffty
1475
1476	#endif
1477
1478	END MODULE poisfft_mod

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |