Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 4427

Last change on this file since 4427 was 4415, checked in by raasch, 5 years ago
bugfix for misplaced/missing preprocessor directives
Property svn:keywords set to `Id`
File size: 34.2 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2020 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 4415 2020-02-20 10:30:33Z maronga $
27	! bugfix for misplaced preprocessor directive
28	!
29	! 4370 2020-01-10 14:00:44Z raasch
30	! vector array renamed
31	!
32	! 4366 2020-01-09 08:12:43Z raasch
33	! modifications for NEC vectorization
34	!
35	! 4360 2020-01-07 11:25:50Z suehring
36	! Added missing OpenMP directives
37	!
38	! 4182 2019-08-22 15:20:23Z scharf
39	! Corrected "Former revisions" section
40	!
41	! 4171 2019-08-19 17:44:09Z gronemeier
42	! loop reordering for performance optimization
43	!
44	! 3832 2019-03-28 13:16:58Z raasch
45	! loop reordering for performance optimization
46	!
47	! 3694 2019-01-23 17:01:49Z knoop
48	! OpenACC port for SPEC
49	!
50	! Revision 1.1 1997/07/24 11:25:18 raasch
51	! Initial revision
52	!
53	!
54	! Description:
55	! ------------
56	!> Resorting data for the transposition from x to y. The transposition itself
57	!> is carried out in transpose_xy
58	!------------------------------------------------------------------------------!
59
60	#define __acc_fft_device ( defined( _OPENACC ) && ( defined ( __cuda_fft ) ) )
61
62	SUBROUTINE resort_for_xy( f_in, f_inv )
63
64
65	USE indices, &
66	ONLY: nx
67
68	USE kinds
69
70	USE transpose_indices, &
71	ONLY: nyn_x, nys_x, nzb_x, nzt_x
72
73	IMPLICIT NONE
74
75	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
76	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
77
78
79	INTEGER(iwp) :: i !<
80	INTEGER(iwp) :: j !<
81	INTEGER(iwp) :: k !<
82	!
83	!-- Rearrange indices of input array in order to make data to be send
84	!-- by MPI contiguous
85	!$OMP PARALLEL PRIVATE ( i, j, k )
86	!$OMP DO
87	#if __acc_fft_device
88	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
89	!$ACC PRESENT(f_inv, f_in)
90	#endif
91	DO k = nzb_x, nzt_x
92	DO j = nys_x, nyn_x
93	DO i = 0, nx
94	f_inv(j,k,i) = f_in(i,j,k)
95	ENDDO
96	ENDDO
97	ENDDO
98	!$OMP END PARALLEL
99
100	END SUBROUTINE resort_for_xy
101
102
103	!------------------------------------------------------------------------------!
104	! Description:
105	! ------------
106	!> Transposition of input array (f_in) from x to y. For the input array, all
107	!> elements along x reside on the same PE, while after transposition, all
108	!> elements along y reside on the same PE.
109	!------------------------------------------------------------------------------!
110	SUBROUTINE transpose_xy( f_inv, f_out )
111
112
113	USE cpulog, &
114	ONLY: cpu_log, cpu_log_nowait, log_point_s
115
116	USE indices, &
117	ONLY: nx, ny
118
119	USE kinds
120
121	USE pegrid
122
123	USE transpose_indices, &
124	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
125
126	IMPLICIT NONE
127
128	INTEGER(iwp) :: i !<
129	INTEGER(iwp) :: j !<
130	INTEGER(iwp) :: k !<
131	INTEGER(iwp) :: l !<
132	INTEGER(iwp) :: ys !<
133
134	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
135	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
136
137	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
138	#if __acc_fft_device
139	!$ACC DECLARE CREATE(work)
140	#endif
141
142
143	IF ( numprocs /= 1 ) THEN
144
145	#if defined( __parallel )
146	!
147	!-- Transpose array
148	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
149
150	#if __acc_fft_device
151	#ifndef __cuda_aware_mpi
152	!$ACC UPDATE HOST(f_inv)
153	#else
154	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
155	#endif
156	#endif
157
158	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
159	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
160	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
161	comm1dy, ierr )
162
163	#if __acc_fft_device
164	#ifndef __cuda_aware_mpi
165	!$ACC UPDATE DEVICE(work)
166	#else
167	!$ACC END HOST_DATA
168	#endif
169	#endif
170
171	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
172
173	!
174	!-- Reorder transposed array
175	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
176	DO l = 0, pdims(2) - 1
177	ys = 0 + l * ( nyn_x - nys_x + 1 )
178	#if __acc_fft_device
179	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
180	!$ACC PRESENT(f_out, work)
181	#endif
182	!$OMP DO
183	DO i = nxl_y, nxr_y
184	DO k = nzb_y, nzt_y
185	DO j = ys, ys + nyn_x - nys_x
186	f_out(j,i,k) = work(j-ys+1,k,i,l)
187	ENDDO
188	ENDDO
189	ENDDO
190	!$OMP END DO NOWAIT
191	ENDDO
192	!$OMP END PARALLEL
193	#endif
194
195	ELSE
196
197	!
198	!-- Reorder transposed array
199	!$OMP PARALLEL PRIVATE ( i, j, k )
200	!$OMP DO
201	#if __acc_fft_device
202	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
203	!$ACC PRESENT(f_out, f_inv)
204	#endif
205	DO k = nzb_y, nzt_y
206	DO i = nxl_y, nxr_y
207	DO j = 0, ny
208	f_out(j,i,k) = f_inv(j,k,i)
209	ENDDO
210	ENDDO
211	ENDDO
212	!$OMP END PARALLEL
213
214	ENDIF
215
216	END SUBROUTINE transpose_xy
217
218
219	!------------------------------------------------------------------------------!
220	! Description:
221	! ------------
222	!> Resorting data after the transposition from x to z. The transposition itself
223	!> is carried out in transpose_xz
224	!------------------------------------------------------------------------------!
225	SUBROUTINE resort_for_xz( f_inv, f_out )
226
227
228	USE indices, &
229	ONLY: nxl, nxr, nyn, nys, nz
230
231	USE kinds
232
233	IMPLICIT NONE
234
235	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
236	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
237
238	INTEGER(iwp) :: i !<
239	INTEGER(iwp) :: j !<
240	INTEGER(iwp) :: k !<
241	!
242	!-- Rearrange indices of input array in order to make data to be send
243	!-- by MPI contiguous.
244	!-- In case of parallel fft/transposition, scattered store is faster in
245	!-- backward direction!!!
246	!$OMP PARALLEL PRIVATE ( i, j, k )
247	!$OMP DO
248	#if __acc_fft_device
249	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
250	!$ACC PRESENT(f_out, f_inv)
251	#endif
252	DO i = nxl, nxr
253	DO j = nys, nyn
254	DO k = 1, nz
255	f_out(k,j,i) = f_inv(j,i,k)
256	ENDDO
257	ENDDO
258	ENDDO
259	!$OMP END PARALLEL
260
261	END SUBROUTINE resort_for_xz
262
263
264	!------------------------------------------------------------------------------!
265	! Description:
266	! ------------
267	!> Transposition of input array (f_in) from x to z. For the input array, all
268	!> elements along x reside on the same PE, while after transposition, all
269	!> elements along z reside on the same PE.
270	!------------------------------------------------------------------------------!
271	SUBROUTINE transpose_xz( f_in, f_inv )
272
273
274	USE cpulog, &
275	ONLY: cpu_log, cpu_log_nowait, log_point_s
276
277	USE fft_xy, &
278	ONLY: f_vec_x, temperton_fft_vec
279
280	USE indices, &
281	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
282
283	USE kinds
284
285	USE pegrid
286
287	USE transpose_indices, &
288	ONLY: nyn_x, nys_x, nzb_x, nzt_x
289
290	IMPLICIT NONE
291
292	INTEGER(iwp) :: i !<
293	INTEGER(iwp) :: j !<
294	INTEGER(iwp) :: k !<
295	INTEGER(iwp) :: l !<
296	INTEGER(iwp) :: mm !<
297	INTEGER(iwp) :: xs !<
298
299	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
300	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
301
302	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
303	#if __acc_fft_device
304	!$ACC DECLARE CREATE(work)
305	#endif
306
307	!
308	!-- If the PE grid is one-dimensional along y, the array has only to be
309	!-- reordered locally and therefore no transposition has to be done.
310	IF ( pdims(1) /= 1 ) THEN
311
312	#if defined( __parallel )
313	!
314	!-- Reorder input array for transposition. Data from the vectorized Temperton-fft is stored in
315	!-- different array format (f_vec_x).
316	IF ( temperton_fft_vec ) THEN
317
318	DO l = 0, pdims(1) - 1
319	xs = 0 + l * nnx
320	DO k = nzb_x, nzt_x
321	DO i = xs, xs + nnx - 1
322	DO j = nys_x, nyn_x
323	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
324	work(j,i-xs+1,k,l) = f_vec_x(mm,i)
325	ENDDO
326	ENDDO
327	ENDDO
328	ENDDO
329
330	ELSE
331
332	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
333	DO l = 0, pdims(1) - 1
334	xs = 0 + l * nnx
335	#if __acc_fft_device
336	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
337	!$ACC PRESENT(work, f_in)
338	#endif
339	!$OMP DO
340	DO k = nzb_x, nzt_x
341	DO i = xs, xs + nnx - 1
342	DO j = nys_x, nyn_x
343	work(j,i-xs+1,k,l) = f_in(i,j,k)
344	ENDDO
345	ENDDO
346	ENDDO
347	!$OMP END DO NOWAIT
348	ENDDO
349	!$OMP END PARALLEL
350
351	ENDIF
352
353	!
354	!-- Transpose array
355	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
356
357	#if __acc_fft_device
358	#ifndef __cuda_aware_mpi
359	!$ACC UPDATE HOST(work)
360	#else
361	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
362	#endif
363	#endif
364
365	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
366	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
367	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
368	comm1dx, ierr )
369
370	#if __acc_fft_device
371	#ifndef __cuda_aware_mpi
372	!$ACC UPDATE DEVICE(f_inv)
373	#else
374	!$ACC END HOST_DATA
375	#endif
376	#endif
377
378	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
379	#endif
380
381	ELSE
382
383	!
384	!-- Reorder the array in a way that the z index is in first position
385	!$OMP PARALLEL PRIVATE ( i, j, k )
386	!$OMP DO
387	#if __acc_fft_device
388	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
389	!$ACC PRESENT(f_inv, f_in)
390	#endif
391	DO i = nxl, nxr
392	DO j = nys, nyn
393	DO k = 1, nz
394	f_inv(j,i,k) = f_in(i,j,k)
395	ENDDO
396	ENDDO
397	ENDDO
398	!$OMP END PARALLEL
399
400	ENDIF
401
402	END SUBROUTINE transpose_xz
403
404
405	!------------------------------------------------------------------------------!
406	! Description:
407	! ------------
408	!> Resorting data after the transposition from y to x. The transposition itself
409	!> is carried out in transpose_yx
410	!------------------------------------------------------------------------------!
411	SUBROUTINE resort_for_yx( f_inv, f_out )
412
413
414	USE indices, &
415	ONLY: nx
416
417	USE kinds
418
419	USE transpose_indices, &
420	ONLY: nyn_x, nys_x, nzb_x, nzt_x
421
422	IMPLICIT NONE
423
424	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
425	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
426
427
428	INTEGER(iwp) :: i !<
429	INTEGER(iwp) :: j !<
430	INTEGER(iwp) :: k !<
431	!
432	!-- Rearrange indices of input array in order to make data to be send
433	!-- by MPI contiguous
434	!$OMP PARALLEL PRIVATE ( i, j, k )
435	!$OMP DO
436	#if __acc_fft_device
437	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
438	!$ACC PRESENT(f_out, f_inv)
439	#endif
440	DO k = nzb_x, nzt_x
441	DO j = nys_x, nyn_x
442	DO i = 0, nx
443	f_out(i,j,k) = f_inv(j,k,i)
444	ENDDO
445	ENDDO
446	ENDDO
447	!$OMP END PARALLEL
448
449	END SUBROUTINE resort_for_yx
450
451
452	!------------------------------------------------------------------------------!
453	! Description:
454	! ------------
455	!> Transposition of input array (f_in) from y to x. For the input array, all
456	!> elements along y reside on the same PE, while after transposition, all
457	!> elements along x reside on the same PE.
458	!------------------------------------------------------------------------------!
459	SUBROUTINE transpose_yx( f_in, f_inv )
460
461
462	USE cpulog, &
463	ONLY: cpu_log, cpu_log_nowait, log_point_s
464
465	USE indices, &
466	ONLY: nx, ny
467
468	USE kinds
469
470	USE pegrid
471
472	USE transpose_indices, &
473	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
474
475	IMPLICIT NONE
476
477	INTEGER(iwp) :: i !<
478	INTEGER(iwp) :: j !<
479	INTEGER(iwp) :: k !<
480	INTEGER(iwp) :: l !<
481	INTEGER(iwp) :: ys !<
482
483	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
484	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
485
486	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
487	#if __acc_fft_device
488	!$ACC DECLARE CREATE(work)
489	#endif
490
491
492	IF ( numprocs /= 1 ) THEN
493
494	#if defined( __parallel )
495	!
496	!-- Reorder input array for transposition
497	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
498	DO l = 0, pdims(2) - 1
499	ys = 0 + l * ( nyn_x - nys_x + 1 )
500	#if __acc_fft_device
501	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
502	!$ACC PRESENT(work, f_in)
503	#endif
504	!$OMP DO
505	DO i = nxl_y, nxr_y
506	DO k = nzb_y, nzt_y
507	DO j = ys, ys + nyn_x - nys_x
508	work(j-ys+1,k,i,l) = f_in(j,i,k)
509	ENDDO
510	ENDDO
511	ENDDO
512	!$OMP END DO NOWAIT
513	ENDDO
514	!$OMP END PARALLEL
515
516	!
517	!-- Transpose array
518	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
519
520	#if __acc_fft_device
521	#ifndef __cuda_aware_mpi
522	!$ACC UPDATE HOST(work)
523	#else
524	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
525	#endif
526	#endif
527
528	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
529	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
530	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
531	comm1dy, ierr )
532
533	#if __acc_fft_device
534	#ifndef __cuda_aware_mpi
535	!$ACC UPDATE DEVICE(f_inv)
536	#else
537	!$ACC END HOST_DATA
538	#endif
539	#endif
540
541	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
542	#endif
543
544	ELSE
545
546	!
547	!-- Reorder array f_in the same way as ALLTOALL did it
548	!$OMP PARALLEL PRIVATE ( i, j, k )
549	!$OMP DO
550	#if __acc_fft_device
551	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
552	!$ACC PRESENT(f_inv, f_in)
553	#endif
554	DO i = nxl_y, nxr_y
555	DO k = nzb_y, nzt_y
556	DO j = 0, ny
557	f_inv(j,k,i) = f_in(j,i,k)
558	ENDDO
559	ENDDO
560	ENDDO
561	!$OMP END PARALLEL
562
563	ENDIF
564
565	END SUBROUTINE transpose_yx
566
567
568	!------------------------------------------------------------------------------!
569	! Description:
570	! ------------
571	!> Transposition of input array (f_in) from y to x. For the input array, all
572	!> elements along y reside on the same PE, while after transposition, all
573	!> elements along x reside on the same PE.
574	!> This is a direct transposition for arrays with indices in regular order
575	!> (k,j,i) (cf. transpose_yx).
576	!------------------------------------------------------------------------------!
577	SUBROUTINE transpose_yxd( f_in, f_out )
578
579
580	USE cpulog, &
581	ONLY: cpu_log, log_point_s
582
583	USE indices, &
584	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
585
586	USE kinds
587
588	USE pegrid
589
590	USE transpose_indices, &
591	ONLY: nyn_x, nys_x, nzb_x, nzt_x
592
593	IMPLICIT NONE
594
595	INTEGER(iwp) :: i !<
596	INTEGER(iwp) :: j !<
597	INTEGER(iwp) :: k !<
598	INTEGER(iwp) :: l !<
599	INTEGER(iwp) :: m !<
600	INTEGER(iwp) :: xs !<
601
602	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
603	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
604	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
605	REAL(wp) :: work(nnxnnynnz) !<
606	#if defined( __parallel )
607
608	!
609	!-- Rearrange indices of input array in order to make data to be send
610	!-- by MPI contiguous
611	DO k = 1, nz
612	DO j = nys, nyn
613	DO i = nxl, nxr
614	f_inv(i,k,j) = f_in(k,j,i)
615	ENDDO
616	ENDDO
617	ENDDO
618
619	!
620	!-- Transpose array
621	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
622	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
623	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
624	work(1), sendrecvcount_xy, MPI_REAL, &
625	comm1dx, ierr )
626	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
627
628	!
629	!-- Reorder transposed array
630	m = 0
631	DO l = 0, pdims(1) - 1
632	xs = 0 + l * nnx
633	DO j = nys_x, nyn_x
634	DO k = 1, nz
635	DO i = xs, xs + nnx - 1
636	m = m + 1
637	f_out(i,j,k) = work(m)
638	ENDDO
639	ENDDO
640	ENDDO
641	ENDDO
642
643	#endif
644
645	END SUBROUTINE transpose_yxd
646
647
648	!------------------------------------------------------------------------------!
649	! Description:
650	! ------------
651	!> Resorting data for the transposition from y to z. The transposition itself
652	!> is carried out in transpose_yz
653	!------------------------------------------------------------------------------!
654	SUBROUTINE resort_for_yz( f_in, f_inv )
655
656
657	USE indices, &
658	ONLY: ny
659
660	USE kinds
661
662	USE transpose_indices, &
663	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
664
665	IMPLICIT NONE
666
667	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
668	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
669
670	INTEGER(iwp) :: i !<
671	INTEGER(iwp) :: j !<
672	INTEGER(iwp) :: k !<
673
674	!
675	!-- Rearrange indices of input array in order to make data to be send
676	!-- by MPI contiguous
677	!$OMP PARALLEL PRIVATE ( i, j, k )
678	!$OMP DO
679	#if __acc_fft_device
680	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
681	!$ACC PRESENT(f_inv, f_in)
682	#endif
683	DO k = nzb_y, nzt_y
684	DO i = nxl_y, nxr_y
685	DO j = 0, ny
686	f_inv(i,k,j) = f_in(j,i,k)
687	ENDDO
688	ENDDO
689	ENDDO
690	!$OMP END PARALLEL
691
692	END SUBROUTINE resort_for_yz
693
694
695	!------------------------------------------------------------------------------!
696	! Description:
697	! ------------
698	!> Transposition of input array (f_in) from y to z. For the input array, all
699	!> elements along y reside on the same PE, while after transposition, all
700	!> elements along z reside on the same PE.
701	!------------------------------------------------------------------------------!
702	SUBROUTINE transpose_yz( f_inv, f_out )
703
704
705	USE cpulog, &
706	ONLY: cpu_log, cpu_log_nowait, log_point_s
707
708	USE indices, &
709	ONLY: ny, nz
710
711	USE kinds
712
713	USE pegrid
714
715	USE transpose_indices, &
716	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
717
718	IMPLICIT NONE
719
720	INTEGER(iwp) :: i !<
721	INTEGER(iwp) :: j !<
722	INTEGER(iwp) :: k !<
723	INTEGER(iwp) :: l !<
724	INTEGER(iwp) :: zs !<
725
726	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
727	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
728
729	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
730	#if __acc_fft_device
731	!$ACC DECLARE CREATE(work)
732	#endif
733
734
735	!
736	!-- If the PE grid is one-dimensional along y, only local reordering
737	!-- of the data is necessary and no transposition has to be done.
738	IF ( pdims(1) == 1 ) THEN
739
740	!$OMP PARALLEL PRIVATE ( i, j, k )
741	!$OMP DO
742	#if __acc_fft_device
743	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
744	!$ACC PRESENT(f_out, f_inv)
745	#endif
746	DO j = 0, ny
747	DO k = nzb_y, nzt_y
748	DO i = nxl_y, nxr_y
749	f_out(i,j,k) = f_inv(i,k,j)
750	ENDDO
751	ENDDO
752	ENDDO
753	!$OMP END PARALLEL
754
755	ELSE
756
757	#if defined( __parallel )
758	!
759	!-- Transpose array
760	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
761
762	#if __acc_fft_device
763	#ifndef __cuda_aware_mpi
764	!$ACC UPDATE HOST(f_inv)
765	#else
766	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
767	#endif
768	#endif
769
770	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
771	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
772	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
773	comm1dx, ierr )
774
775	#if __acc_fft_device
776	#ifndef __cuda_aware_mpi
777	!$ACC UPDATE DEVICE(work)
778	#else
779	!$ACC END HOST_DATA
780	#endif
781	#endif
782
783	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
784
785	!
786	!-- Reorder transposed array
787	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
788	DO l = 0, pdims(1) - 1
789	zs = 1 + l * ( nzt_y - nzb_y + 1 )
790	#if __acc_fft_device
791	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
792	!$ACC PRESENT(f_out, work)
793	#endif
794	!$OMP DO
795	DO j = nys_z, nyn_z
796	DO k = zs, zs + nzt_y - nzb_y
797	DO i = nxl_z, nxr_z
798	f_out(i,j,k) = work(i,k-zs+1,j,l)
799	ENDDO
800	ENDDO
801	ENDDO
802	!$OMP END DO NOWAIT
803	ENDDO
804	!$OMP END PARALLEL
805	#endif
806
807	ENDIF
808
809	END SUBROUTINE transpose_yz
810
811
812	!------------------------------------------------------------------------------!
813	! Description:
814	! ------------
815	!> Resorting data for the transposition from z to x. The transposition itself
816	!> is carried out in transpose_zx
817	!------------------------------------------------------------------------------!
818	SUBROUTINE resort_for_zx( f_in, f_inv )
819
820
821	USE indices, &
822	ONLY: nxl, nxr, nyn, nys, nz
823
824	USE kinds
825
826	IMPLICIT NONE
827
828	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
829	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
830
831	INTEGER(iwp) :: i !<
832	INTEGER(iwp) :: j !<
833	INTEGER(iwp) :: k !<
834
835	!
836	!-- Rearrange indices of input array in order to make data to be send
837	!-- by MPI contiguous
838	!$OMP PARALLEL PRIVATE ( i, j, k )
839	!$OMP DO
840	#if __acc_fft_device
841	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
842	!$ACC PRESENT(f_in, f_inv)
843	#endif
844	DO i = nxl, nxr
845	DO j = nys, nyn
846	DO k = 1,nz
847	f_inv(j,i,k) = f_in(k,j,i)
848	ENDDO
849	ENDDO
850	ENDDO
851	!$OMP END PARALLEL
852
853	END SUBROUTINE resort_for_zx
854
855
856	!------------------------------------------------------------------------------!
857	! Description:
858	! ------------
859	!> Transposition of input array (f_in) from z to x. For the input array, all
860	!> elements along z reside on the same PE, while after transposition, all
861	!> elements along x reside on the same PE.
862	!------------------------------------------------------------------------------!
863	SUBROUTINE transpose_zx( f_inv, f_out )
864
865
866	USE cpulog, &
867	ONLY: cpu_log, cpu_log_nowait, log_point_s
868
869	USE fft_xy, &
870	ONLY: f_vec_x, temperton_fft_vec
871
872	USE indices, &
873	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
874
875	USE kinds
876
877	USE pegrid
878
879	USE transpose_indices, &
880	ONLY: nyn_x, nys_x, nzb_x, nzt_x
881
882	IMPLICIT NONE
883
884	INTEGER(iwp) :: i !<
885	INTEGER(iwp) :: j !<
886	INTEGER(iwp) :: k !<
887	INTEGER(iwp) :: l !<
888	INTEGER(iwp) :: mm !<
889	INTEGER(iwp) :: xs !<
890
891	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
892	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
893
894	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
895	#if __acc_fft_device
896	!$ACC DECLARE CREATE(work)
897	#endif
898
899
900	!
901	!-- If the PE grid is one-dimensional along y, only local reordering
902	!-- of the data is necessary and no transposition has to be done.
903	IF ( pdims(1) == 1 ) THEN
904
905	!$OMP PARALLEL PRIVATE ( i, j, k )
906	!$OMP DO
907	#if __acc_fft_device
908	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
909	!$ACC PRESENT(f_out, f_inv)
910	#endif
911	DO k = 1, nz
912	DO i = nxl, nxr
913	DO j = nys, nyn
914	f_out(i,j,k) = f_inv(j,i,k)
915	ENDDO
916	ENDDO
917	ENDDO
918	!$OMP END PARALLEL
919
920	ELSE
921
922	#if defined( __parallel )
923	!
924	!-- Transpose array
925	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
926
927	#if __acc_fft_device
928	#ifndef __cuda_aware_mpi
929	!$ACC UPDATE HOST(f_inv)
930	#else
931	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
932	#endif
933	#endif
934
935	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
936	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
937	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
938	comm1dx, ierr )
939
940	#if __acc_fft_device
941	#ifndef __cuda_aware_mpi
942	!$ACC UPDATE DEVICE(work)
943	#else
944	!$ACC END HOST_DATA
945	#endif
946	#endif
947
948	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
949
950	!
951	!-- Reorder transposed array.
952	!-- Data for the vectorized Temperton-fft is stored in different array format (f_vec_x) which
953	!-- saves additional data copy in fft_x.
954	IF ( temperton_fft_vec ) THEN
955
956	DO l = 0, pdims(1) - 1
957	xs = 0 + l * nnx
958	DO k = nzb_x, nzt_x
959	DO i = xs, xs + nnx - 1
960	DO j = nys_x, nyn_x
961	mm = j-nys_x+1+(k-nzb_x)*(nyn_x-nys_x+1)
962	f_vec_x(mm,i) = work(j,i-xs+1,k,l)
963	ENDDO
964	ENDDO
965	ENDDO
966	ENDDO
967
968	ELSE
969
970	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
971	DO l = 0, pdims(1) - 1
972	xs = 0 + l * nnx
973	#if __acc_fft_device
974	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
975	!$ACC PRESENT(f_out, work)
976	#endif
977	!$OMP DO
978	DO k = nzb_x, nzt_x
979	DO i = xs, xs + nnx - 1
980	DO j = nys_x, nyn_x
981	f_out(i,j,k) = work(j,i-xs+1,k,l)
982	ENDDO
983	ENDDO
984	ENDDO
985	!$OMP END DO NOWAIT
986	ENDDO
987	!$OMP END PARALLEL
988
989	ENDIF
990
991	#endif
992
993	ENDIF
994
995	END SUBROUTINE transpose_zx
996
997
998	!------------------------------------------------------------------------------!
999	! Description:
1000	! ------------
1001	!> Resorting data after the transposition from z to y. The transposition itself
1002	!> is carried out in transpose_zy
1003	!------------------------------------------------------------------------------!
1004	SUBROUTINE resort_for_zy( f_inv, f_out )
1005
1006
1007	USE indices, &
1008	ONLY: ny
1009
1010	USE kinds
1011
1012	USE transpose_indices, &
1013	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
1014
1015	IMPLICIT NONE
1016
1017	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1018	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
1019
1020
1021	INTEGER(iwp) :: i !<
1022	INTEGER(iwp) :: j !<
1023	INTEGER(iwp) :: k !<
1024
1025	!
1026	!-- Rearrange indices of input array in order to make data to be send
1027	!-- by MPI contiguous
1028	!$OMP PARALLEL PRIVATE ( i, j, k )
1029	!$OMP DO
1030	#if __acc_fft_device
1031	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1032	!$ACC PRESENT(f_out, f_inv)
1033	#endif
1034	DO k = nzb_y, nzt_y
1035	DO i = nxl_y, nxr_y
1036	DO j = 0, ny
1037	f_out(j,i,k) = f_inv(i,k,j)
1038	ENDDO
1039	ENDDO
1040	ENDDO
1041	!$OMP END PARALLEL
1042
1043	END SUBROUTINE resort_for_zy
1044
1045
1046	!------------------------------------------------------------------------------!
1047	! Description:cpu_log_nowait
1048	! ------------
1049	!> Transposition of input array (f_in) from z to y. For the input array, all
1050	!> elements along z reside on the same PE, while after transposition, all
1051	!> elements along y reside on the same PE.
1052	!------------------------------------------------------------------------------!
1053	SUBROUTINE transpose_zy( f_in, f_inv )
1054
1055
1056	USE cpulog, &
1057	ONLY: cpu_log, cpu_log_nowait, log_point_s
1058
1059	USE indices, &
1060	ONLY: ny, nz
1061
1062	USE kinds
1063
1064	USE pegrid
1065
1066	USE transpose_indices, &
1067	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
1068
1069	IMPLICIT NONE
1070
1071	INTEGER(iwp) :: i !<
1072	INTEGER(iwp) :: j !<
1073	INTEGER(iwp) :: k !<
1074	INTEGER(iwp) :: l !<
1075	INTEGER(iwp) :: zs !<
1076
1077	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
1078	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
1079
1080	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
1081	#if __acc_fft_device
1082	!$ACC DECLARE CREATE(work)
1083	#endif
1084
1085	!
1086	!-- If the PE grid is one-dimensional along y, the array has only to be
1087	!-- reordered locally and therefore no transposition has to be done.
1088	IF ( pdims(1) /= 1 ) THEN
1089
1090	#if defined( __parallel )
1091	!
1092	!-- Reorder input array for transposition
1093	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
1094	DO l = 0, pdims(1) - 1
1095	zs = 1 + l * ( nzt_y - nzb_y + 1 )
1096	#if __acc_fft_device
1097	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1098	!$ACC PRESENT(work, f_in)
1099	#endif
1100	!$OMP DO
1101	DO j = nys_z, nyn_z
1102	DO k = zs, zs + nzt_y - nzb_y
1103	DO i = nxl_z, nxr_z
1104	work(i,k-zs+1,j,l) = f_in(i,j,k)
1105	ENDDO
1106	ENDDO
1107	ENDDO
1108	!$OMP END DO NOWAIT
1109	ENDDO
1110	!$OMP END PARALLEL
1111
1112	!
1113	!-- Transpose array
1114	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1115
1116	#if __acc_fft_device
1117	#ifndef __cuda_aware_mpi
1118	!$ACC UPDATE HOST(work)
1119	#else
1120	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1121	#endif
1122	#endif
1123
1124	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1125	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1126	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
1127	comm1dx, ierr )
1128
1129	#if __acc_fft_device
1130	#ifndef __cuda_aware_mpi
1131	!$ACC UPDATE DEVICE(f_inv)
1132	#else
1133	!$ACC END HOST_DATA
1134	#endif
1135	#endif
1136
1137	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1138	#endif
1139
1140	ELSE
1141	!
1142	!-- Reorder the array in the same way like ALLTOALL did it
1143	!$OMP PARALLEL PRIVATE ( i, j, k )
1144	!$OMP DO
1145	#if __acc_fft_device
1146	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1147	!$ACC PRESENT(f_inv, f_in)
1148	#endif
1149	DO k = nzb_y, nzt_y
1150	DO j = 0, ny
1151	DO i = nxl_y, nxr_y
1152	f_inv(i,k,j) = f_in(i,j,k)
1153	ENDDO
1154	ENDDO
1155	ENDDO
1156	!$OMP END PARALLEL
1157
1158	ENDIF
1159
1160	END SUBROUTINE transpose_zy
1161
1162
1163	!------------------------------------------------------------------------------!
1164	! Description:
1165	! ------------
1166	!> Transposition of input array (f_in) from z to y. For the input array, all
1167	!> elements along z reside on the same PE, while after transposition, all
1168	!> elements along y reside on the same PE.
1169	!> This is a direct transposition for arrays with indices in regular order
1170	!> (k,j,i) (cf. transpose_zy).
1171	!------------------------------------------------------------------------------!
1172	SUBROUTINE transpose_zyd( f_in, f_out )
1173
1174
1175	USE cpulog, &
1176	ONLY: cpu_log, log_point_s
1177
1178	USE indices, &
1179	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1180
1181	USE kinds
1182
1183	USE pegrid
1184
1185	USE transpose_indices, &
1186	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1187
1188	IMPLICIT NONE
1189
1190	INTEGER(iwp) :: i !<
1191	INTEGER(iwp) :: j !<
1192	INTEGER(iwp) :: k !<
1193	INTEGER(iwp) :: l !<
1194	INTEGER(iwp) :: m !<
1195	INTEGER(iwp) :: ys !<
1196
1197	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1198	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1199	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1200	REAL(wp) :: work(nnxnnynnz) !<
1201
1202	#if defined( __parallel )
1203
1204	!
1205	!-- Rearrange indices of input array in order to make data to be send
1206	!-- by MPI contiguous
1207	DO i = nxl, nxr
1208	DO j = nys, nyn
1209	DO k = 1, nz
1210	f_inv(j,i,k) = f_in(k,j,i)
1211	ENDDO
1212	ENDDO
1213	ENDDO
1214
1215	!
1216	!-- Move data to different array, because memory location of work1 is
1217	!-- needed further below (work1 = work2).
1218	!-- If the PE grid is one-dimensional along x, only local reordering
1219	!-- of the data is necessary and no transposition has to be done.
1220	IF ( pdims(2) == 1 ) THEN
1221	DO k = 1, nz
1222	DO i = nxl, nxr
1223	DO j = nys, nyn
1224	f_out(j,i,k) = f_inv(j,i,k)
1225	ENDDO
1226	ENDDO
1227	ENDDO
1228	RETURN
1229	ENDIF
1230
1231	!
1232	!-- Transpose array
1233	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1234	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1235	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1236	work(1), sendrecvcount_zyd, MPI_REAL, &
1237	comm1dy, ierr )
1238	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1239
1240	!
1241	!-- Reorder transposed array
1242	m = 0
1243	DO l = 0, pdims(2) - 1
1244	ys = 0 + l * nny
1245	DO k = nzb_yd, nzt_yd
1246	DO i = nxl_yd, nxr_yd
1247	DO j = ys, ys + nny - 1
1248	m = m + 1
1249	f_out(j,i,k) = work(m)
1250	ENDDO
1251	ENDDO
1252	ENDDO
1253	ENDDO
1254
1255	#endif
1256
1257	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |