Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 3680

Last change on this file since 3680 was 3657, checked in by knoop, 5 years ago
OpenACC: cuda-aware-mpi in transpose and acc update async in exchange_horiz added
Property svn:keywords set to `Id`
File size: 32.8 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2019 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 3657 2019-01-07 20:14:18Z knoop $
27	! OpenACC port for SPEC
28	!
29	! 3241 2018-09-12 15:02:00Z raasch
30	! unused variables removed
31	!
32	! 2718 2018-01-02 08:49:38Z maronga
33	! Corrected "Former revisions" section
34	!
35	! 2696 2017-12-14 17:12:51Z kanani
36	! Change in file header (GPL part)
37	!
38	! 2119 2017-01-17 16:51:50Z raasch
39	!
40	! 2118 2017-01-17 16:38:49Z raasch
41	! OpenACC directives removed
42	!
43	! 2000 2016-08-20 18:09:15Z knoop
44	! Forced header and separation lines into 80 columns
45	!
46	! 1682 2015-10-07 23:56:08Z knoop
47	! Code annotations made doxygen readable
48	!
49	! 1324 2014-03-21 09:13:16Z suehring
50	! Bugfix: ONLY statement for module pegrid removed
51	!
52	! 1320 2014-03-20 08:40:49Z raasch
53	! ONLY-attribute added to USE-statements,
54	! kind-parameters added to all INTEGER and REAL declaration statements,
55	! kinds are defined in new module kinds,
56	! old module precision_kind is removed,
57	! revision history before 2012 removed,
58	! comment fields (!:) to be used for variable explanations added to
59	! all variable declaration statements
60	!
61	! 1318 2014-03-17 13:35:16Z raasch
62	! cpu_log_nowait parameter added to cpu measurements of the transpositions
63	! required for solving the Poisson equation (poisfft),
64	! module interfaces removed
65	!
66	! 1257 2013-11-08 15:18:40Z raasch
67	! openacc loop and loop vector clauses removed
68	!
69	! 1216 2013-08-26 09:31:42Z raasch
70	! re-sorting of the transposed / to be transposed arrays moved to separate
71	! routines resort_for_...
72	!
73	! 1111 2013-03-08 23:54:10Z raasch
74	! openACC directives added,
75	! resorting data from/to work changed, work got 4 dimensions instead of 1
76	!
77	! 1106 2013-03-04 05:31:38Z raasch
78	! preprocessor lines rearranged so that routines can also be used in serial
79	! (non-parallel) mode
80	!
81	! 1092 2013-02-02 11:24:22Z raasch
82	! unused variables removed
83	!
84	! 1036 2012-10-22 13:43:42Z raasch
85	! code put under GPL (PALM 3.9)
86	!
87	! 1003 2012-09-14 14:35:53Z raasch
88	! indices nxa, nya, etc. replaced by nx, ny, etc.
89	!
90	! Revision 1.1 1997/07/24 11:25:18 raasch
91	! Initial revision
92	!
93	!------------------------------------------------------------------------------!
94	! Description:
95	! ------------
96	!> Resorting data for the transposition from x to y. The transposition itself
97	!> is carried out in transpose_xy
98	!------------------------------------------------------------------------------!
99	SUBROUTINE resort_for_xy( f_in, f_inv )
100
101
102	USE indices, &
103	ONLY: nx
104
105	USE kinds
106
107	USE transpose_indices, &
108	ONLY: nyn_x, nys_x, nzb_x, nzt_x
109
110	IMPLICIT NONE
111
112	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
113	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
114
115
116	INTEGER(iwp) :: i !<
117	INTEGER(iwp) :: j !<
118	INTEGER(iwp) :: k !<
119	!
120	!-- Rearrange indices of input array in order to make data to be send
121	!-- by MPI contiguous
122	!$OMP PARALLEL PRIVATE ( i, j, k )
123	!$OMP DO
124	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
125	!$ACC PRESENT(f_inv, f_in)
126	DO i = 0, nx
127	DO k = nzb_x, nzt_x
128	DO j = nys_x, nyn_x
129	f_inv(j,k,i) = f_in(i,j,k)
130	ENDDO
131	ENDDO
132	ENDDO
133	!$OMP END PARALLEL
134
135	END SUBROUTINE resort_for_xy
136
137
138	!------------------------------------------------------------------------------!
139	! Description:
140	! ------------
141	!> Transposition of input array (f_in) from x to y. For the input array, all
142	!> elements along x reside on the same PE, while after transposition, all
143	!> elements along y reside on the same PE.
144	!------------------------------------------------------------------------------!
145	SUBROUTINE transpose_xy( f_inv, f_out )
146
147
148	USE cpulog, &
149	ONLY: cpu_log, cpu_log_nowait, log_point_s
150
151	USE indices, &
152	ONLY: nx, ny
153
154	USE kinds
155
156	USE pegrid
157
158	USE transpose_indices, &
159	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
160
161	IMPLICIT NONE
162
163	INTEGER(iwp) :: i !<
164	INTEGER(iwp) :: j !<
165	INTEGER(iwp) :: k !<
166	INTEGER(iwp) :: l !<
167	INTEGER(iwp) :: ys !<
168
169	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
170	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
171
172	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
173	!$ACC DECLARE CREATE(work)
174
175
176	IF ( numprocs /= 1 ) THEN
177
178	#if defined( __parallel )
179	!
180	!-- Transpose array
181	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
182	#ifndef __cuda_aware_mpi
183	!$ACC UPDATE HOST(f_inv)
184	#else
185	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
186	#endif
187	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
188	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
189	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
190	comm1dy, ierr )
191	#ifndef __cuda_aware_mpi
192	!$ACC UPDATE DEVICE(work)
193	#else
194	!$ACC END HOST_DATA
195	#endif
196	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
197
198	!
199	!-- Reorder transposed array
200	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
201	!$OMP DO
202	DO l = 0, pdims(2) - 1
203	ys = 0 + l * ( nyn_x - nys_x + 1 )
204	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
205	!$ACC PRESENT(f_out, work)
206	DO i = nxl_y, nxr_y
207	DO k = nzb_y, nzt_y
208	DO j = ys, ys + nyn_x - nys_x
209	f_out(j,i,k) = work(j-ys+1,k,i,l)
210	ENDDO
211	ENDDO
212	ENDDO
213	ENDDO
214	!$OMP END PARALLEL
215	#endif
216
217	ELSE
218
219	!
220	!-- Reorder transposed array
221	!$OMP PARALLEL PRIVATE ( i, j, k )
222	!$OMP DO
223	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
224	!$ACC PRESENT(f_out, f_inv)
225	DO k = nzb_y, nzt_y
226	DO i = nxl_y, nxr_y
227	DO j = 0, ny
228	f_out(j,i,k) = f_inv(j,k,i)
229	ENDDO
230	ENDDO
231	ENDDO
232	!$OMP END PARALLEL
233
234	ENDIF
235
236	END SUBROUTINE transpose_xy
237
238
239	!------------------------------------------------------------------------------!
240	! Description:
241	! ------------
242	!> Resorting data after the transposition from x to z. The transposition itself
243	!> is carried out in transpose_xz
244	!------------------------------------------------------------------------------!
245	SUBROUTINE resort_for_xz( f_inv, f_out )
246
247
248	USE indices, &
249	ONLY: nxl, nxr, nyn, nys, nz
250
251	USE kinds
252
253	IMPLICIT NONE
254
255	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
256	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
257
258	INTEGER(iwp) :: i !<
259	INTEGER(iwp) :: j !<
260	INTEGER(iwp) :: k !<
261	!
262	!-- Rearrange indices of input array in order to make data to be send
263	!-- by MPI contiguous.
264	!-- In case of parallel fft/transposition, scattered store is faster in
265	!-- backward direction!!!
266	!$OMP PARALLEL PRIVATE ( i, j, k )
267	!$OMP DO
268	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
269	!$ACC PRESENT(f_out, f_inv)
270	DO k = 1, nz
271	DO i = nxl, nxr
272	DO j = nys, nyn
273	f_out(k,j,i) = f_inv(j,i,k)
274	ENDDO
275	ENDDO
276	ENDDO
277	!$OMP END PARALLEL
278
279	END SUBROUTINE resort_for_xz
280
281
282	!------------------------------------------------------------------------------!
283	! Description:
284	! ------------
285	!> Transposition of input array (f_in) from x to z. For the input array, all
286	!> elements along x reside on the same PE, while after transposition, all
287	!> elements along z reside on the same PE.
288	!------------------------------------------------------------------------------!
289	SUBROUTINE transpose_xz( f_in, f_inv )
290
291
292	USE cpulog, &
293	ONLY: cpu_log, cpu_log_nowait, log_point_s
294
295	USE indices, &
296	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
297
298	USE kinds
299
300	USE pegrid
301
302	USE transpose_indices, &
303	ONLY: nyn_x, nys_x, nzb_x, nzt_x
304
305	IMPLICIT NONE
306
307	INTEGER(iwp) :: i !<
308	INTEGER(iwp) :: j !<
309	INTEGER(iwp) :: k !<
310	INTEGER(iwp) :: l !<
311	INTEGER(iwp) :: xs !<
312
313	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
314	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
315
316	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
317	!$ACC DECLARE CREATE(work)
318
319
320	!
321	!-- If the PE grid is one-dimensional along y, the array has only to be
322	!-- reordered locally and therefore no transposition has to be done.
323	IF ( pdims(1) /= 1 ) THEN
324
325	#if defined( __parallel )
326	!
327	!-- Reorder input array for transposition
328	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
329	!$OMP DO
330	DO l = 0, pdims(1) - 1
331	xs = 0 + l * nnx
332	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
333	!$ACC PRESENT(work, f_in)
334	DO k = nzb_x, nzt_x
335	DO i = xs, xs + nnx - 1
336	DO j = nys_x, nyn_x
337	work(j,i-xs+1,k,l) = f_in(i,j,k)
338	ENDDO
339	ENDDO
340	ENDDO
341	ENDDO
342	!$OMP END PARALLEL
343
344	!
345	!-- Transpose array
346	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
347	#ifndef __cuda_aware_mpi
348	!$ACC UPDATE HOST(work)
349	#else
350	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
351	#endif
352	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
353	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
354	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
355	comm1dx, ierr )
356	#ifndef __cuda_aware_mpi
357	!$ACC UPDATE DEVICE(f_inv)
358	#else
359	!$ACC END HOST_DATA
360	#endif
361	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
362	#endif
363
364	ELSE
365
366	!
367	!-- Reorder the array in a way that the z index is in first position
368	!$OMP PARALLEL PRIVATE ( i, j, k )
369	!$OMP DO
370	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
371	!$ACC PRESENT(f_inv, f_in)
372	DO i = nxl, nxr
373	DO j = nys, nyn
374	DO k = 1, nz
375	f_inv(j,i,k) = f_in(i,j,k)
376	ENDDO
377	ENDDO
378	ENDDO
379	!$OMP END PARALLEL
380
381	ENDIF
382
383	END SUBROUTINE transpose_xz
384
385
386	!------------------------------------------------------------------------------!
387	! Description:
388	! ------------
389	!> Resorting data after the transposition from y to x. The transposition itself
390	!> is carried out in transpose_yx
391	!------------------------------------------------------------------------------!
392	SUBROUTINE resort_for_yx( f_inv, f_out )
393
394
395	USE indices, &
396	ONLY: nx
397
398	USE kinds
399
400	USE transpose_indices, &
401	ONLY: nyn_x, nys_x, nzb_x, nzt_x
402
403	IMPLICIT NONE
404
405	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
406	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
407
408
409	INTEGER(iwp) :: i !<
410	INTEGER(iwp) :: j !<
411	INTEGER(iwp) :: k !<
412	!
413	!-- Rearrange indices of input array in order to make data to be send
414	!-- by MPI contiguous
415	!$OMP PARALLEL PRIVATE ( i, j, k )
416	!$OMP DO
417	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
418	!$ACC PRESENT(f_out, f_inv)
419	DO i = 0, nx
420	DO k = nzb_x, nzt_x
421	DO j = nys_x, nyn_x
422	f_out(i,j,k) = f_inv(j,k,i)
423	ENDDO
424	ENDDO
425	ENDDO
426	!$OMP END PARALLEL
427
428	END SUBROUTINE resort_for_yx
429
430
431	!------------------------------------------------------------------------------!
432	! Description:
433	! ------------
434	!> Transposition of input array (f_in) from y to x. For the input array, all
435	!> elements along y reside on the same PE, while after transposition, all
436	!> elements along x reside on the same PE.
437	!------------------------------------------------------------------------------!
438	SUBROUTINE transpose_yx( f_in, f_inv )
439
440
441	USE cpulog, &
442	ONLY: cpu_log, cpu_log_nowait, log_point_s
443
444	USE indices, &
445	ONLY: nx, ny
446
447	USE kinds
448
449	USE pegrid
450
451	USE transpose_indices, &
452	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
453
454	IMPLICIT NONE
455
456	INTEGER(iwp) :: i !<
457	INTEGER(iwp) :: j !<
458	INTEGER(iwp) :: k !<
459	INTEGER(iwp) :: l !<
460	INTEGER(iwp) :: ys !<
461
462	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
463	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
464
465	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
466	!$ACC DECLARE CREATE(work)
467
468
469	IF ( numprocs /= 1 ) THEN
470
471	#if defined( __parallel )
472	!
473	!-- Reorder input array for transposition
474	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
475	!$OMP DO
476	DO l = 0, pdims(2) - 1
477	ys = 0 + l * ( nyn_x - nys_x + 1 )
478	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
479	!$ACC PRESENT(work, f_in)
480	DO i = nxl_y, nxr_y
481	DO k = nzb_y, nzt_y
482	DO j = ys, ys + nyn_x - nys_x
483	work(j-ys+1,k,i,l) = f_in(j,i,k)
484	ENDDO
485	ENDDO
486	ENDDO
487	ENDDO
488	!$OMP END PARALLEL
489
490	!
491	!-- Transpose array
492	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
493	#ifndef __cuda_aware_mpi
494	!$ACC UPDATE HOST(work)
495	#else
496	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
497	#endif
498	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
499	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
500	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
501	comm1dy, ierr )
502	#ifndef __cuda_aware_mpi
503	!$ACC UPDATE DEVICE(f_inv)
504	#else
505	!$ACC END HOST_DATA
506	#endif
507	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
508	#endif
509
510	ELSE
511
512	!
513	!-- Reorder array f_in the same way as ALLTOALL did it
514	!$OMP PARALLEL PRIVATE ( i, j, k )
515	!$OMP DO
516	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
517	!$ACC PRESENT(f_inv, f_in)
518	DO i = nxl_y, nxr_y
519	DO k = nzb_y, nzt_y
520	DO j = 0, ny
521	f_inv(j,k,i) = f_in(j,i,k)
522	ENDDO
523	ENDDO
524	ENDDO
525	!$OMP END PARALLEL
526
527	ENDIF
528
529	END SUBROUTINE transpose_yx
530
531
532	!------------------------------------------------------------------------------!
533	! Description:
534	! ------------
535	!> Transposition of input array (f_in) from y to x. For the input array, all
536	!> elements along y reside on the same PE, while after transposition, all
537	!> elements along x reside on the same PE.
538	!> This is a direct transposition for arrays with indices in regular order
539	!> (k,j,i) (cf. transpose_yx).
540	!------------------------------------------------------------------------------!
541	SUBROUTINE transpose_yxd( f_in, f_out )
542
543
544	USE cpulog, &
545	ONLY: cpu_log, log_point_s
546
547	USE indices, &
548	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
549
550	USE kinds
551
552	USE pegrid
553
554	USE transpose_indices, &
555	ONLY: nyn_x, nys_x, nzb_x, nzt_x
556
557	IMPLICIT NONE
558
559	INTEGER(iwp) :: i !<
560	INTEGER(iwp) :: j !<
561	INTEGER(iwp) :: k !<
562	INTEGER(iwp) :: l !<
563	INTEGER(iwp) :: m !<
564	INTEGER(iwp) :: xs !<
565
566	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
567	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
568	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
569	REAL(wp) :: work(nnxnnynnz) !<
570	#if defined( __parallel )
571
572	!
573	!-- Rearrange indices of input array in order to make data to be send
574	!-- by MPI contiguous
575	DO k = 1, nz
576	DO j = nys, nyn
577	DO i = nxl, nxr
578	f_inv(i,k,j) = f_in(k,j,i)
579	ENDDO
580	ENDDO
581	ENDDO
582
583	!
584	!-- Transpose array
585	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
586	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
587	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
588	work(1), sendrecvcount_xy, MPI_REAL, &
589	comm1dx, ierr )
590	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
591
592	!
593	!-- Reorder transposed array
594	m = 0
595	DO l = 0, pdims(1) - 1
596	xs = 0 + l * nnx
597	DO j = nys_x, nyn_x
598	DO k = 1, nz
599	DO i = xs, xs + nnx - 1
600	m = m + 1
601	f_out(i,j,k) = work(m)
602	ENDDO
603	ENDDO
604	ENDDO
605	ENDDO
606
607	#endif
608
609	END SUBROUTINE transpose_yxd
610
611
612	!------------------------------------------------------------------------------!
613	! Description:
614	! ------------
615	!> Resorting data for the transposition from y to z. The transposition itself
616	!> is carried out in transpose_yz
617	!------------------------------------------------------------------------------!
618	SUBROUTINE resort_for_yz( f_in, f_inv )
619
620
621	USE indices, &
622	ONLY: ny
623
624	USE kinds
625
626	USE transpose_indices, &
627	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
628
629	IMPLICIT NONE
630
631	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
632	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
633
634	INTEGER(iwp) :: i !<
635	INTEGER(iwp) :: j !<
636	INTEGER(iwp) :: k !<
637
638	!
639	!-- Rearrange indices of input array in order to make data to be send
640	!-- by MPI contiguous
641	!$OMP PARALLEL PRIVATE ( i, j, k )
642	!$OMP DO
643	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
644	!$ACC PRESENT(f_inv, f_in)
645	DO j = 0, ny
646	DO k = nzb_y, nzt_y
647	DO i = nxl_y, nxr_y
648	f_inv(i,k,j) = f_in(j,i,k)
649	ENDDO
650	ENDDO
651	ENDDO
652	!$OMP END PARALLEL
653
654	END SUBROUTINE resort_for_yz
655
656
657	!------------------------------------------------------------------------------!
658	! Description:
659	! ------------
660	!> Transposition of input array (f_in) from y to z. For the input array, all
661	!> elements along y reside on the same PE, while after transposition, all
662	!> elements along z reside on the same PE.
663	!------------------------------------------------------------------------------!
664	SUBROUTINE transpose_yz( f_inv, f_out )
665
666
667	USE cpulog, &
668	ONLY: cpu_log, cpu_log_nowait, log_point_s
669
670	USE indices, &
671	ONLY: ny, nz
672
673	USE kinds
674
675	USE pegrid
676
677	USE transpose_indices, &
678	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
679
680	IMPLICIT NONE
681
682	INTEGER(iwp) :: i !<
683	INTEGER(iwp) :: j !<
684	INTEGER(iwp) :: k !<
685	INTEGER(iwp) :: l !<
686	INTEGER(iwp) :: zs !<
687
688	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
689	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
690
691	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
692	!$ACC DECLARE CREATE(work)
693
694
695	!
696	!-- If the PE grid is one-dimensional along y, only local reordering
697	!-- of the data is necessary and no transposition has to be done.
698	IF ( pdims(1) == 1 ) THEN
699
700	!$OMP PARALLEL PRIVATE ( i, j, k )
701	!$OMP DO
702	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
703	!$ACC PRESENT(f_out, f_inv)
704	DO j = 0, ny
705	DO k = nzb_y, nzt_y
706	DO i = nxl_y, nxr_y
707	f_out(i,j,k) = f_inv(i,k,j)
708	ENDDO
709	ENDDO
710	ENDDO
711	!$OMP END PARALLEL
712
713	ELSE
714
715	#if defined( __parallel )
716	!
717	!-- Transpose array
718	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
719	#ifndef __cuda_aware_mpi
720	!$ACC UPDATE HOST(f_inv)
721	#else
722	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
723	#endif
724	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
725	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
726	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
727	comm1dx, ierr )
728	#ifndef __cuda_aware_mpi
729	!$ACC UPDATE DEVICE(work)
730	#else
731	!$ACC END HOST_DATA
732	#endif
733	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
734
735	!
736	!-- Reorder transposed array
737	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
738	!$OMP DO
739	DO l = 0, pdims(1) - 1
740	zs = 1 + l * ( nzt_y - nzb_y + 1 )
741	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
742	!$ACC PRESENT(f_out, work)
743	DO j = nys_z, nyn_z
744	DO k = zs, zs + nzt_y - nzb_y
745	DO i = nxl_z, nxr_z
746	f_out(i,j,k) = work(i,k-zs+1,j,l)
747	ENDDO
748	ENDDO
749	ENDDO
750	ENDDO
751	!$OMP END PARALLEL
752	#endif
753
754	ENDIF
755
756	END SUBROUTINE transpose_yz
757
758
759	!------------------------------------------------------------------------------!
760	! Description:
761	! ------------
762	!> Resorting data for the transposition from z to x. The transposition itself
763	!> is carried out in transpose_zx
764	!------------------------------------------------------------------------------!
765	SUBROUTINE resort_for_zx( f_in, f_inv )
766
767
768	USE indices, &
769	ONLY: nxl, nxr, nyn, nys, nz
770
771	USE kinds
772
773	IMPLICIT NONE
774
775	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
776	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
777
778	INTEGER(iwp) :: i !<
779	INTEGER(iwp) :: j !<
780	INTEGER(iwp) :: k !<
781
782	!
783	!-- Rearrange indices of input array in order to make data to be send
784	!-- by MPI contiguous
785	!$OMP PARALLEL PRIVATE ( i, j, k )
786	!$OMP DO
787	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
788	!$ACC PRESENT(f_in, f_inv)
789	DO k = 1,nz
790	DO i = nxl, nxr
791	DO j = nys, nyn
792	f_inv(j,i,k) = f_in(k,j,i)
793	ENDDO
794	ENDDO
795	ENDDO
796	!$OMP END PARALLEL
797
798	END SUBROUTINE resort_for_zx
799
800
801	!------------------------------------------------------------------------------!
802	! Description:
803	! ------------
804	!> Transposition of input array (f_in) from z to x. For the input array, all
805	!> elements along z reside on the same PE, while after transposition, all
806	!> elements along x reside on the same PE.
807	!------------------------------------------------------------------------------!
808	SUBROUTINE transpose_zx( f_inv, f_out )
809
810
811	USE cpulog, &
812	ONLY: cpu_log, cpu_log_nowait, log_point_s
813
814	USE indices, &
815	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
816
817	USE kinds
818
819	USE pegrid
820
821	USE transpose_indices, &
822	ONLY: nyn_x, nys_x, nzb_x, nzt_x
823
824	IMPLICIT NONE
825
826	INTEGER(iwp) :: i !<
827	INTEGER(iwp) :: j !<
828	INTEGER(iwp) :: k !<
829	INTEGER(iwp) :: l !<
830	INTEGER(iwp) :: xs !<
831
832	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
833	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
834
835	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
836	!$ACC DECLARE CREATE(work)
837
838
839	!
840	!-- If the PE grid is one-dimensional along y, only local reordering
841	!-- of the data is necessary and no transposition has to be done.
842	IF ( pdims(1) == 1 ) THEN
843
844	!$OMP PARALLEL PRIVATE ( i, j, k )
845	!$OMP DO
846	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
847	!$ACC PRESENT(f_out, f_inv)
848	DO k = 1, nz
849	DO i = nxl, nxr
850	DO j = nys, nyn
851	f_out(i,j,k) = f_inv(j,i,k)
852	ENDDO
853	ENDDO
854	ENDDO
855	!$OMP END PARALLEL
856
857	ELSE
858
859	#if defined( __parallel )
860	!
861	!-- Transpose array
862	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
863	#ifndef __cuda_aware_mpi
864	!$ACC UPDATE HOST(f_inv)
865	#else
866	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
867	#endif
868	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
869	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
870	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
871	comm1dx, ierr )
872	#ifndef __cuda_aware_mpi
873	!$ACC UPDATE DEVICE(work)
874	#else
875	!$ACC END HOST_DATA
876	#endif
877	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
878
879	!
880	!-- Reorder transposed array
881	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
882	!$OMP DO
883	DO l = 0, pdims(1) - 1
884	xs = 0 + l * nnx
885	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
886	!$ACC PRESENT(f_out, work)
887	DO k = nzb_x, nzt_x
888	DO i = xs, xs + nnx - 1
889	DO j = nys_x, nyn_x
890	f_out(i,j,k) = work(j,i-xs+1,k,l)
891	ENDDO
892	ENDDO
893	ENDDO
894	ENDDO
895	!$OMP END PARALLEL
896	#endif
897
898	ENDIF
899
900	END SUBROUTINE transpose_zx
901
902
903	!------------------------------------------------------------------------------!
904	! Description:
905	! ------------
906	!> Resorting data after the transposition from z to y. The transposition itself
907	!> is carried out in transpose_zy
908	!------------------------------------------------------------------------------!
909	SUBROUTINE resort_for_zy( f_inv, f_out )
910
911
912	USE indices, &
913	ONLY: ny
914
915	USE kinds
916
917	USE transpose_indices, &
918	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
919
920	IMPLICIT NONE
921
922	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
923	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
924
925
926	INTEGER(iwp) :: i !<
927	INTEGER(iwp) :: j !<
928	INTEGER(iwp) :: k !<
929
930	!
931	!-- Rearrange indices of input array in order to make data to be send
932	!-- by MPI contiguous
933	!$OMP PARALLEL PRIVATE ( i, j, k )
934	!$OMP DO
935	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
936	!$ACC PRESENT(f_out, f_inv)
937	DO k = nzb_y, nzt_y
938	DO j = 0, ny
939	DO i = nxl_y, nxr_y
940	f_out(j,i,k) = f_inv(i,k,j)
941	ENDDO
942	ENDDO
943	ENDDO
944	!$OMP END PARALLEL
945
946	END SUBROUTINE resort_for_zy
947
948
949	!------------------------------------------------------------------------------!
950	! Description:cpu_log_nowait
951	! ------------
952	!> Transposition of input array (f_in) from z to y. For the input array, all
953	!> elements along z reside on the same PE, while after transposition, all
954	!> elements along y reside on the same PE.
955	!------------------------------------------------------------------------------!
956	SUBROUTINE transpose_zy( f_in, f_inv )
957
958
959	USE cpulog, &
960	ONLY: cpu_log, cpu_log_nowait, log_point_s
961
962	USE indices, &
963	ONLY: ny, nz
964
965	USE kinds
966
967	USE pegrid
968
969	USE transpose_indices, &
970	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
971
972	IMPLICIT NONE
973
974	INTEGER(iwp) :: i !<
975	INTEGER(iwp) :: j !<
976	INTEGER(iwp) :: k !<
977	INTEGER(iwp) :: l !<
978	INTEGER(iwp) :: zs !<
979
980	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
981	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
982
983	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
984	!$ACC DECLARE CREATE(work)
985
986	!
987	!-- If the PE grid is one-dimensional along y, the array has only to be
988	!-- reordered locally and therefore no transposition has to be done.
989	IF ( pdims(1) /= 1 ) THEN
990
991	#if defined( __parallel )
992	!
993	!-- Reorder input array for transposition
994	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
995	!$OMP DO
996	DO l = 0, pdims(1) - 1
997	zs = 1 + l * ( nzt_y - nzb_y + 1 )
998	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
999	!$ACC PRESENT(work, f_in)
1000	DO j = nys_z, nyn_z
1001	DO k = zs, zs + nzt_y - nzb_y
1002	DO i = nxl_z, nxr_z
1003	work(i,k-zs+1,j,l) = f_in(i,j,k)
1004	ENDDO
1005	ENDDO
1006	ENDDO
1007	ENDDO
1008	!$OMP END PARALLEL
1009
1010	!
1011	!-- Transpose array
1012	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
1013	#ifndef __cuda_aware_mpi
1014	!$ACC UPDATE HOST(work)
1015	#else
1016	!$ACC HOST_DATA USE_DEVICE(work, f_inv)
1017	#endif
1018	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1019	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
1020	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
1021	comm1dx, ierr )
1022	#ifndef __cuda_aware_mpi
1023	!$ACC UPDATE DEVICE(f_inv)
1024	#else
1025	!$ACC END HOST_DATA
1026	#endif
1027	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1028	#endif
1029
1030	ELSE
1031	!
1032	!-- Reorder the array in the same way like ALLTOALL did it
1033	!$OMP PARALLEL PRIVATE ( i, j, k )
1034	!$OMP DO
1035	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
1036	!$ACC PRESENT(f_inv, f_in)
1037	DO k = nzb_y, nzt_y
1038	DO j = 0, ny
1039	DO i = nxl_y, nxr_y
1040	f_inv(i,k,j) = f_in(i,j,k)
1041	ENDDO
1042	ENDDO
1043	ENDDO
1044	!$OMP END PARALLEL
1045
1046	ENDIF
1047
1048	END SUBROUTINE transpose_zy
1049
1050
1051	!------------------------------------------------------------------------------!
1052	! Description:
1053	! ------------
1054	!> Transposition of input array (f_in) from z to y. For the input array, all
1055	!> elements along z reside on the same PE, while after transposition, all
1056	!> elements along y reside on the same PE.
1057	!> This is a direct transposition for arrays with indices in regular order
1058	!> (k,j,i) (cf. transpose_zy).
1059	!------------------------------------------------------------------------------!
1060	SUBROUTINE transpose_zyd( f_in, f_out )
1061
1062
1063	USE cpulog, &
1064	ONLY: cpu_log, log_point_s
1065
1066	USE indices, &
1067	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1068
1069	USE kinds
1070
1071	USE pegrid
1072
1073	USE transpose_indices, &
1074	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1075
1076	IMPLICIT NONE
1077
1078	INTEGER(iwp) :: i !<
1079	INTEGER(iwp) :: j !<
1080	INTEGER(iwp) :: k !<
1081	INTEGER(iwp) :: l !<
1082	INTEGER(iwp) :: m !<
1083	INTEGER(iwp) :: ys !<
1084
1085	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1086	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1087	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1088	REAL(wp) :: work(nnxnnynnz) !<
1089
1090	#if defined( __parallel )
1091
1092	!
1093	!-- Rearrange indices of input array in order to make data to be send
1094	!-- by MPI contiguous
1095	DO i = nxl, nxr
1096	DO j = nys, nyn
1097	DO k = 1, nz
1098	f_inv(j,i,k) = f_in(k,j,i)
1099	ENDDO
1100	ENDDO
1101	ENDDO
1102
1103	!
1104	!-- Move data to different array, because memory location of work1 is
1105	!-- needed further below (work1 = work2).
1106	!-- If the PE grid is one-dimensional along x, only local reordering
1107	!-- of the data is necessary and no transposition has to be done.
1108	IF ( pdims(2) == 1 ) THEN
1109	DO k = 1, nz
1110	DO i = nxl, nxr
1111	DO j = nys, nyn
1112	f_out(j,i,k) = f_inv(j,i,k)
1113	ENDDO
1114	ENDDO
1115	ENDDO
1116	RETURN
1117	ENDIF
1118
1119	!
1120	!-- Transpose array
1121	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1122	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1123	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1124	work(1), sendrecvcount_zyd, MPI_REAL, &
1125	comm1dy, ierr )
1126	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1127
1128	!
1129	!-- Reorder transposed array
1130	m = 0
1131	DO l = 0, pdims(2) - 1
1132	ys = 0 + l * nny
1133	DO k = nzb_yd, nzt_yd
1134	DO i = nxl_yd, nxr_yd
1135	DO j = ys, ys + nny - 1
1136	m = m + 1
1137	f_out(j,i,k) = work(m)
1138	ENDDO
1139	ENDDO
1140	ENDDO
1141	ENDDO
1142
1143	#endif
1144
1145	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |