Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 3648

Last change on this file since 3648 was 3634, checked in by knoop, 5 years ago
OpenACC port for SPEC
Property svn:keywords set to `Id`
File size: 31.9 KB

Line
1	!> @file transpose.f90
2	!------------------------------------------------------------------------------!
3	! This file is part of the PALM model system.
4	!
5	! PALM is free software: you can redistribute it and/or modify it under the
6	! terms of the GNU General Public License as published by the Free Software
7	! Foundation, either version 3 of the License, or (at your option) any later
8	! version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2018 Leibniz Universitaet Hannover
18	!------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 3634 2018-12-18 12:31:28Z suehring $
27	! OpenACC port for SPEC
28	!
29	! 3241 2018-09-12 15:02:00Z raasch
30	! unused variables removed
31	!
32	! 2718 2018-01-02 08:49:38Z maronga
33	! Corrected "Former revisions" section
34	!
35	! 2696 2017-12-14 17:12:51Z kanani
36	! Change in file header (GPL part)
37	!
38	! 2119 2017-01-17 16:51:50Z raasch
39	!
40	! 2118 2017-01-17 16:38:49Z raasch
41	! OpenACC directives removed
42	!
43	! 2000 2016-08-20 18:09:15Z knoop
44	! Forced header and separation lines into 80 columns
45	!
46	! 1682 2015-10-07 23:56:08Z knoop
47	! Code annotations made doxygen readable
48	!
49	! 1324 2014-03-21 09:13:16Z suehring
50	! Bugfix: ONLY statement for module pegrid removed
51	!
52	! 1320 2014-03-20 08:40:49Z raasch
53	! ONLY-attribute added to USE-statements,
54	! kind-parameters added to all INTEGER and REAL declaration statements,
55	! kinds are defined in new module kinds,
56	! old module precision_kind is removed,
57	! revision history before 2012 removed,
58	! comment fields (!:) to be used for variable explanations added to
59	! all variable declaration statements
60	!
61	! 1318 2014-03-17 13:35:16Z raasch
62	! cpu_log_nowait parameter added to cpu measurements of the transpositions
63	! required for solving the Poisson equation (poisfft),
64	! module interfaces removed
65	!
66	! 1257 2013-11-08 15:18:40Z raasch
67	! openacc loop and loop vector clauses removed
68	!
69	! 1216 2013-08-26 09:31:42Z raasch
70	! re-sorting of the transposed / to be transposed arrays moved to separate
71	! routines resort_for_...
72	!
73	! 1111 2013-03-08 23:54:10Z raasch
74	! openACC directives added,
75	! resorting data from/to work changed, work got 4 dimensions instead of 1
76	!
77	! 1106 2013-03-04 05:31:38Z raasch
78	! preprocessor lines rearranged so that routines can also be used in serial
79	! (non-parallel) mode
80	!
81	! 1092 2013-02-02 11:24:22Z raasch
82	! unused variables removed
83	!
84	! 1036 2012-10-22 13:43:42Z raasch
85	! code put under GPL (PALM 3.9)
86	!
87	! 1003 2012-09-14 14:35:53Z raasch
88	! indices nxa, nya, etc. replaced by nx, ny, etc.
89	!
90	! Revision 1.1 1997/07/24 11:25:18 raasch
91	! Initial revision
92	!
93	!------------------------------------------------------------------------------!
94	! Description:
95	! ------------
96	!> Resorting data for the transposition from x to y. The transposition itself
97	!> is carried out in transpose_xy
98	!------------------------------------------------------------------------------!
99	SUBROUTINE resort_for_xy( f_in, f_inv )
100
101
102	USE indices, &
103	ONLY: nx
104
105	USE kinds
106
107	USE transpose_indices, &
108	ONLY: nyn_x, nys_x, nzb_x, nzt_x
109
110	IMPLICIT NONE
111
112	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
113	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
114
115
116	INTEGER(iwp) :: i !<
117	INTEGER(iwp) :: j !<
118	INTEGER(iwp) :: k !<
119	!
120	!-- Rearrange indices of input array in order to make data to be send
121	!-- by MPI contiguous
122	!$OMP PARALLEL PRIVATE ( i, j, k )
123	!$OMP DO
124	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
125	!$ACC PRESENT(f_inv, f_in)
126	DO i = 0, nx
127	DO k = nzb_x, nzt_x
128	DO j = nys_x, nyn_x
129	f_inv(j,k,i) = f_in(i,j,k)
130	ENDDO
131	ENDDO
132	ENDDO
133	!$OMP END PARALLEL
134
135	END SUBROUTINE resort_for_xy
136
137
138	!------------------------------------------------------------------------------!
139	! Description:
140	! ------------
141	!> Transposition of input array (f_in) from x to y. For the input array, all
142	!> elements along x reside on the same PE, while after transposition, all
143	!> elements along y reside on the same PE.
144	!------------------------------------------------------------------------------!
145	SUBROUTINE transpose_xy( f_inv, f_out )
146
147
148	USE cpulog, &
149	ONLY: cpu_log, cpu_log_nowait, log_point_s
150
151	USE indices, &
152	ONLY: nx, ny
153
154	USE kinds
155
156	USE pegrid
157
158	USE transpose_indices, &
159	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
160
161	IMPLICIT NONE
162
163	INTEGER(iwp) :: i !<
164	INTEGER(iwp) :: j !<
165	INTEGER(iwp) :: k !<
166	INTEGER(iwp) :: l !<
167	INTEGER(iwp) :: ys !<
168
169	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
170	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
171
172	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
173	!$ACC DECLARE CREATE(work)
174
175
176	IF ( numprocs /= 1 ) THEN
177
178	#if defined( __parallel )
179	!
180	!-- Transpose array
181	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
182	!$ACC UPDATE HOST(f_inv)
183	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
184	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
185	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
186	comm1dy, ierr )
187	!$ACC UPDATE DEVICE(work)
188	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
189
190	!
191	!-- Reorder transposed array
192	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
193	!$OMP DO
194	DO l = 0, pdims(2) - 1
195	ys = 0 + l * ( nyn_x - nys_x + 1 )
196	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
197	!$ACC PRESENT(f_out, work)
198	DO i = nxl_y, nxr_y
199	DO k = nzb_y, nzt_y
200	DO j = ys, ys + nyn_x - nys_x
201	f_out(j,i,k) = work(j-ys+1,k,i,l)
202	ENDDO
203	ENDDO
204	ENDDO
205	ENDDO
206	!$OMP END PARALLEL
207	#endif
208
209	ELSE
210
211	!
212	!-- Reorder transposed array
213	!$OMP PARALLEL PRIVATE ( i, j, k )
214	!$OMP DO
215	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
216	!$ACC PRESENT(f_out, f_inv)
217	DO k = nzb_y, nzt_y
218	DO i = nxl_y, nxr_y
219	DO j = 0, ny
220	f_out(j,i,k) = f_inv(j,k,i)
221	ENDDO
222	ENDDO
223	ENDDO
224	!$OMP END PARALLEL
225
226	ENDIF
227
228	END SUBROUTINE transpose_xy
229
230
231	!------------------------------------------------------------------------------!
232	! Description:
233	! ------------
234	!> Resorting data after the transposition from x to z. The transposition itself
235	!> is carried out in transpose_xz
236	!------------------------------------------------------------------------------!
237	SUBROUTINE resort_for_xz( f_inv, f_out )
238
239
240	USE indices, &
241	ONLY: nxl, nxr, nyn, nys, nz
242
243	USE kinds
244
245	IMPLICIT NONE
246
247	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
248	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !<
249
250	INTEGER(iwp) :: i !<
251	INTEGER(iwp) :: j !<
252	INTEGER(iwp) :: k !<
253	!
254	!-- Rearrange indices of input array in order to make data to be send
255	!-- by MPI contiguous.
256	!-- In case of parallel fft/transposition, scattered store is faster in
257	!-- backward direction!!!
258	!$OMP PARALLEL PRIVATE ( i, j, k )
259	!$OMP DO
260	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
261	!$ACC PRESENT(f_out, f_inv)
262	DO k = 1, nz
263	DO i = nxl, nxr
264	DO j = nys, nyn
265	f_out(k,j,i) = f_inv(j,i,k)
266	ENDDO
267	ENDDO
268	ENDDO
269	!$OMP END PARALLEL
270
271	END SUBROUTINE resort_for_xz
272
273
274	!------------------------------------------------------------------------------!
275	! Description:
276	! ------------
277	!> Transposition of input array (f_in) from x to z. For the input array, all
278	!> elements along x reside on the same PE, while after transposition, all
279	!> elements along z reside on the same PE.
280	!------------------------------------------------------------------------------!
281	SUBROUTINE transpose_xz( f_in, f_inv )
282
283
284	USE cpulog, &
285	ONLY: cpu_log, cpu_log_nowait, log_point_s
286
287	USE indices, &
288	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
289
290	USE kinds
291
292	USE pegrid
293
294	USE transpose_indices, &
295	ONLY: nyn_x, nys_x, nzb_x, nzt_x
296
297	IMPLICIT NONE
298
299	INTEGER(iwp) :: i !<
300	INTEGER(iwp) :: j !<
301	INTEGER(iwp) :: k !<
302	INTEGER(iwp) :: l !<
303	INTEGER(iwp) :: xs !<
304
305	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
306	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
307
308	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
309	!$ACC DECLARE CREATE(work)
310
311
312	!
313	!-- If the PE grid is one-dimensional along y, the array has only to be
314	!-- reordered locally and therefore no transposition has to be done.
315	IF ( pdims(1) /= 1 ) THEN
316
317	#if defined( __parallel )
318	!
319	!-- Reorder input array for transposition
320	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
321	!$OMP DO
322	DO l = 0, pdims(1) - 1
323	xs = 0 + l * nnx
324	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
325	!$ACC PRESENT(work, f_in)
326	DO k = nzb_x, nzt_x
327	DO i = xs, xs + nnx - 1
328	DO j = nys_x, nyn_x
329	work(j,i-xs+1,k,l) = f_in(i,j,k)
330	ENDDO
331	ENDDO
332	ENDDO
333	ENDDO
334	!$OMP END PARALLEL
335
336	!
337	!-- Transpose array
338	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
339	!$ACC UPDATE HOST(work)
340	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
341	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
342	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
343	comm1dx, ierr )
344	!$ACC UPDATE DEVICE(f_inv)
345	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
346	#endif
347
348	ELSE
349
350	!
351	!-- Reorder the array in a way that the z index is in first position
352	!$OMP PARALLEL PRIVATE ( i, j, k )
353	!$OMP DO
354	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
355	!$ACC PRESENT(f_inv, f_in)
356	DO i = nxl, nxr
357	DO j = nys, nyn
358	DO k = 1, nz
359	f_inv(j,i,k) = f_in(i,j,k)
360	ENDDO
361	ENDDO
362	ENDDO
363	!$OMP END PARALLEL
364
365	ENDIF
366
367	END SUBROUTINE transpose_xz
368
369
370	!------------------------------------------------------------------------------!
371	! Description:
372	! ------------
373	!> Resorting data after the transposition from y to x. The transposition itself
374	!> is carried out in transpose_yx
375	!------------------------------------------------------------------------------!
376	SUBROUTINE resort_for_yx( f_inv, f_out )
377
378
379	USE indices, &
380	ONLY: nx
381
382	USE kinds
383
384	USE transpose_indices, &
385	ONLY: nyn_x, nys_x, nzb_x, nzt_x
386
387	IMPLICIT NONE
388
389	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
390	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
391
392
393	INTEGER(iwp) :: i !<
394	INTEGER(iwp) :: j !<
395	INTEGER(iwp) :: k !<
396	!
397	!-- Rearrange indices of input array in order to make data to be send
398	!-- by MPI contiguous
399	!$OMP PARALLEL PRIVATE ( i, j, k )
400	!$OMP DO
401	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
402	!$ACC PRESENT(f_out, f_inv)
403	DO i = 0, nx
404	DO k = nzb_x, nzt_x
405	DO j = nys_x, nyn_x
406	f_out(i,j,k) = f_inv(j,k,i)
407	ENDDO
408	ENDDO
409	ENDDO
410	!$OMP END PARALLEL
411
412	END SUBROUTINE resort_for_yx
413
414
415	!------------------------------------------------------------------------------!
416	! Description:
417	! ------------
418	!> Transposition of input array (f_in) from y to x. For the input array, all
419	!> elements along y reside on the same PE, while after transposition, all
420	!> elements along x reside on the same PE.
421	!------------------------------------------------------------------------------!
422	SUBROUTINE transpose_yx( f_in, f_inv )
423
424
425	USE cpulog, &
426	ONLY: cpu_log, cpu_log_nowait, log_point_s
427
428	USE indices, &
429	ONLY: nx, ny
430
431	USE kinds
432
433	USE pegrid
434
435	USE transpose_indices, &
436	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
437
438	IMPLICIT NONE
439
440	INTEGER(iwp) :: i !<
441	INTEGER(iwp) :: j !<
442	INTEGER(iwp) :: k !<
443	INTEGER(iwp) :: l !<
444	INTEGER(iwp) :: ys !<
445
446	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
447	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !<
448
449	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !<
450	!$ACC DECLARE CREATE(work)
451
452
453	IF ( numprocs /= 1 ) THEN
454
455	#if defined( __parallel )
456	!
457	!-- Reorder input array for transposition
458	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
459	!$OMP DO
460	DO l = 0, pdims(2) - 1
461	ys = 0 + l * ( nyn_x - nys_x + 1 )
462	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
463	!$ACC PRESENT(work, f_in)
464	DO i = nxl_y, nxr_y
465	DO k = nzb_y, nzt_y
466	DO j = ys, ys + nyn_x - nys_x
467	work(j-ys+1,k,i,l) = f_in(j,i,k)
468	ENDDO
469	ENDDO
470	ENDDO
471	ENDDO
472	!$OMP END PARALLEL
473
474	!
475	!-- Transpose array
476	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
477	!$ACC UPDATE HOST(work)
478	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
479	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
480	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
481	comm1dy, ierr )
482	!$ACC UPDATE DEVICE(f_inv)
483	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
484	#endif
485
486	ELSE
487
488	!
489	!-- Reorder array f_in the same way as ALLTOALL did it
490	!$OMP PARALLEL PRIVATE ( i, j, k )
491	!$OMP DO
492	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
493	!$ACC PRESENT(f_inv, f_in)
494	DO i = nxl_y, nxr_y
495	DO k = nzb_y, nzt_y
496	DO j = 0, ny
497	f_inv(j,k,i) = f_in(j,i,k)
498	ENDDO
499	ENDDO
500	ENDDO
501	!$OMP END PARALLEL
502
503	ENDIF
504
505	END SUBROUTINE transpose_yx
506
507
508	!------------------------------------------------------------------------------!
509	! Description:
510	! ------------
511	!> Transposition of input array (f_in) from y to x. For the input array, all
512	!> elements along y reside on the same PE, while after transposition, all
513	!> elements along x reside on the same PE.
514	!> This is a direct transposition for arrays with indices in regular order
515	!> (k,j,i) (cf. transpose_yx).
516	!------------------------------------------------------------------------------!
517	SUBROUTINE transpose_yxd( f_in, f_out )
518
519
520	USE cpulog, &
521	ONLY: cpu_log, log_point_s
522
523	USE indices, &
524	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
525
526	USE kinds
527
528	USE pegrid
529
530	USE transpose_indices, &
531	ONLY: nyn_x, nys_x, nzb_x, nzt_x
532
533	IMPLICIT NONE
534
535	INTEGER(iwp) :: i !<
536	INTEGER(iwp) :: j !<
537	INTEGER(iwp) :: k !<
538	INTEGER(iwp) :: l !<
539	INTEGER(iwp) :: m !<
540	INTEGER(iwp) :: xs !<
541
542	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
543	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !<
544	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
545	REAL(wp) :: work(nnxnnynnz) !<
546	#if defined( __parallel )
547
548	!
549	!-- Rearrange indices of input array in order to make data to be send
550	!-- by MPI contiguous
551	DO k = 1, nz
552	DO j = nys, nyn
553	DO i = nxl, nxr
554	f_inv(i,k,j) = f_in(k,j,i)
555	ENDDO
556	ENDDO
557	ENDDO
558
559	!
560	!-- Transpose array
561	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
562	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
563	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
564	work(1), sendrecvcount_xy, MPI_REAL, &
565	comm1dx, ierr )
566	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
567
568	!
569	!-- Reorder transposed array
570	m = 0
571	DO l = 0, pdims(1) - 1
572	xs = 0 + l * nnx
573	DO j = nys_x, nyn_x
574	DO k = 1, nz
575	DO i = xs, xs + nnx - 1
576	m = m + 1
577	f_out(i,j,k) = work(m)
578	ENDDO
579	ENDDO
580	ENDDO
581	ENDDO
582
583	#endif
584
585	END SUBROUTINE transpose_yxd
586
587
588	!------------------------------------------------------------------------------!
589	! Description:
590	! ------------
591	!> Resorting data for the transposition from y to z. The transposition itself
592	!> is carried out in transpose_yz
593	!------------------------------------------------------------------------------!
594	SUBROUTINE resort_for_yz( f_in, f_inv )
595
596
597	USE indices, &
598	ONLY: ny
599
600	USE kinds
601
602	USE transpose_indices, &
603	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
604
605	IMPLICIT NONE
606
607	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
608	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
609
610	INTEGER(iwp) :: i !<
611	INTEGER(iwp) :: j !<
612	INTEGER(iwp) :: k !<
613
614	!
615	!-- Rearrange indices of input array in order to make data to be send
616	!-- by MPI contiguous
617	!$OMP PARALLEL PRIVATE ( i, j, k )
618	!$OMP DO
619	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
620	!$ACC PRESENT(f_inv, f_in)
621	DO j = 0, ny
622	DO k = nzb_y, nzt_y
623	DO i = nxl_y, nxr_y
624	f_inv(i,k,j) = f_in(j,i,k)
625	ENDDO
626	ENDDO
627	ENDDO
628	!$OMP END PARALLEL
629
630	END SUBROUTINE resort_for_yz
631
632
633	!------------------------------------------------------------------------------!
634	! Description:
635	! ------------
636	!> Transposition of input array (f_in) from y to z. For the input array, all
637	!> elements along y reside on the same PE, while after transposition, all
638	!> elements along z reside on the same PE.
639	!------------------------------------------------------------------------------!
640	SUBROUTINE transpose_yz( f_inv, f_out )
641
642
643	USE cpulog, &
644	ONLY: cpu_log, cpu_log_nowait, log_point_s
645
646	USE indices, &
647	ONLY: ny, nz
648
649	USE kinds
650
651	USE pegrid
652
653	USE transpose_indices, &
654	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
655
656	IMPLICIT NONE
657
658	INTEGER(iwp) :: i !<
659	INTEGER(iwp) :: j !<
660	INTEGER(iwp) :: k !<
661	INTEGER(iwp) :: l !<
662	INTEGER(iwp) :: zs !<
663
664	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
665	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
666
667	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
668	!$ACC DECLARE CREATE(work)
669
670
671	!
672	!-- If the PE grid is one-dimensional along y, only local reordering
673	!-- of the data is necessary and no transposition has to be done.
674	IF ( pdims(1) == 1 ) THEN
675
676	!$OMP PARALLEL PRIVATE ( i, j, k )
677	!$OMP DO
678	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
679	!$ACC PRESENT(f_out, f_inv)
680	DO j = 0, ny
681	DO k = nzb_y, nzt_y
682	DO i = nxl_y, nxr_y
683	f_out(i,j,k) = f_inv(i,k,j)
684	ENDDO
685	ENDDO
686	ENDDO
687	!$OMP END PARALLEL
688
689	ELSE
690
691	#if defined( __parallel )
692	!
693	!-- Transpose array
694	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
695	!$ACC UPDATE HOST(f_inv)
696	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
697	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
698	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
699	comm1dx, ierr )
700	!$ACC UPDATE DEVICE(work)
701	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
702
703	!
704	!-- Reorder transposed array
705	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
706	!$OMP DO
707	DO l = 0, pdims(1) - 1
708	zs = 1 + l * ( nzt_y - nzb_y + 1 )
709	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
710	!$ACC PRESENT(f_out, work)
711	DO j = nys_z, nyn_z
712	DO k = zs, zs + nzt_y - nzb_y
713	DO i = nxl_z, nxr_z
714	f_out(i,j,k) = work(i,k-zs+1,j,l)
715	ENDDO
716	ENDDO
717	ENDDO
718	ENDDO
719	!$OMP END PARALLEL
720	#endif
721
722	ENDIF
723
724	END SUBROUTINE transpose_yz
725
726
727	!------------------------------------------------------------------------------!
728	! Description:
729	! ------------
730	!> Resorting data for the transposition from z to x. The transposition itself
731	!> is carried out in transpose_zx
732	!------------------------------------------------------------------------------!
733	SUBROUTINE resort_for_zx( f_in, f_inv )
734
735
736	USE indices, &
737	ONLY: nxl, nxr, nyn, nys, nz
738
739	USE kinds
740
741	IMPLICIT NONE
742
743	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
744	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
745
746	INTEGER(iwp) :: i !<
747	INTEGER(iwp) :: j !<
748	INTEGER(iwp) :: k !<
749
750	!
751	!-- Rearrange indices of input array in order to make data to be send
752	!-- by MPI contiguous
753	!$OMP PARALLEL PRIVATE ( i, j, k )
754	!$OMP DO
755	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
756	!$ACC PRESENT(f_in, f_inv)
757	DO k = 1,nz
758	DO i = nxl, nxr
759	DO j = nys, nyn
760	f_inv(j,i,k) = f_in(k,j,i)
761	ENDDO
762	ENDDO
763	ENDDO
764	!$OMP END PARALLEL
765
766	END SUBROUTINE resort_for_zx
767
768
769	!------------------------------------------------------------------------------!
770	! Description:
771	! ------------
772	!> Transposition of input array (f_in) from z to x. For the input array, all
773	!> elements along z reside on the same PE, while after transposition, all
774	!> elements along x reside on the same PE.
775	!------------------------------------------------------------------------------!
776	SUBROUTINE transpose_zx( f_inv, f_out )
777
778
779	USE cpulog, &
780	ONLY: cpu_log, cpu_log_nowait, log_point_s
781
782	USE indices, &
783	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
784
785	USE kinds
786
787	USE pegrid
788
789	USE transpose_indices, &
790	ONLY: nyn_x, nys_x, nzb_x, nzt_x
791
792	IMPLICIT NONE
793
794	INTEGER(iwp) :: i !<
795	INTEGER(iwp) :: j !<
796	INTEGER(iwp) :: k !<
797	INTEGER(iwp) :: l !<
798	INTEGER(iwp) :: xs !<
799
800	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
801	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !<
802
803	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !<
804	!$ACC DECLARE CREATE(work)
805
806
807	!
808	!-- If the PE grid is one-dimensional along y, only local reordering
809	!-- of the data is necessary and no transposition has to be done.
810	IF ( pdims(1) == 1 ) THEN
811
812	!$OMP PARALLEL PRIVATE ( i, j, k )
813	!$OMP DO
814	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
815	!$ACC PRESENT(f_out, f_inv)
816	DO k = 1, nz
817	DO i = nxl, nxr
818	DO j = nys, nyn
819	f_out(i,j,k) = f_inv(j,i,k)
820	ENDDO
821	ENDDO
822	ENDDO
823	!$OMP END PARALLEL
824
825	ELSE
826
827	#if defined( __parallel )
828	!
829	!-- Transpose array
830	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
831	!$ACC UPDATE HOST(f_inv)
832	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
833	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
834	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
835	comm1dx, ierr )
836	!$ACC UPDATE DEVICE(work)
837	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
838
839	!
840	!-- Reorder transposed array
841	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
842	!$OMP DO
843	DO l = 0, pdims(1) - 1
844	xs = 0 + l * nnx
845	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
846	!$ACC PRESENT(f_out, work)
847	DO k = nzb_x, nzt_x
848	DO i = xs, xs + nnx - 1
849	DO j = nys_x, nyn_x
850	f_out(i,j,k) = work(j,i-xs+1,k,l)
851	ENDDO
852	ENDDO
853	ENDDO
854	ENDDO
855	!$OMP END PARALLEL
856	#endif
857
858	ENDIF
859
860	END SUBROUTINE transpose_zx
861
862
863	!------------------------------------------------------------------------------!
864	! Description:
865	! ------------
866	!> Resorting data after the transposition from z to y. The transposition itself
867	!> is carried out in transpose_zy
868	!------------------------------------------------------------------------------!
869	SUBROUTINE resort_for_zy( f_inv, f_out )
870
871
872	USE indices, &
873	ONLY: ny
874
875	USE kinds
876
877	USE transpose_indices, &
878	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
879
880	IMPLICIT NONE
881
882	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
883	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !<
884
885
886	INTEGER(iwp) :: i !<
887	INTEGER(iwp) :: j !<
888	INTEGER(iwp) :: k !<
889
890	!
891	!-- Rearrange indices of input array in order to make data to be send
892	!-- by MPI contiguous
893	!$OMP PARALLEL PRIVATE ( i, j, k )
894	!$OMP DO
895	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
896	!$ACC PRESENT(f_out, f_inv)
897	DO k = nzb_y, nzt_y
898	DO j = 0, ny
899	DO i = nxl_y, nxr_y
900	f_out(j,i,k) = f_inv(i,k,j)
901	ENDDO
902	ENDDO
903	ENDDO
904	!$OMP END PARALLEL
905
906	END SUBROUTINE resort_for_zy
907
908
909	!------------------------------------------------------------------------------!
910	! Description:cpu_log_nowait
911	! ------------
912	!> Transposition of input array (f_in) from z to y. For the input array, all
913	!> elements along z reside on the same PE, while after transposition, all
914	!> elements along y reside on the same PE.
915	!------------------------------------------------------------------------------!
916	SUBROUTINE transpose_zy( f_in, f_inv )
917
918
919	USE cpulog, &
920	ONLY: cpu_log, cpu_log_nowait, log_point_s
921
922	USE indices, &
923	ONLY: ny, nz
924
925	USE kinds
926
927	USE pegrid
928
929	USE transpose_indices, &
930	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
931
932	IMPLICIT NONE
933
934	INTEGER(iwp) :: i !<
935	INTEGER(iwp) :: j !<
936	INTEGER(iwp) :: k !<
937	INTEGER(iwp) :: l !<
938	INTEGER(iwp) :: zs !<
939
940	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !<
941	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !<
942
943	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !<
944	!$ACC DECLARE CREATE(work)
945
946	!
947	!-- If the PE grid is one-dimensional along y, the array has only to be
948	!-- reordered locally and therefore no transposition has to be done.
949	IF ( pdims(1) /= 1 ) THEN
950
951	#if defined( __parallel )
952	!
953	!-- Reorder input array for transposition
954	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
955	!$OMP DO
956	DO l = 0, pdims(1) - 1
957	zs = 1 + l * ( nzt_y - nzb_y + 1 )
958	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
959	!$ACC PRESENT(work, f_in)
960	DO j = nys_z, nyn_z
961	DO k = zs, zs + nzt_y - nzb_y
962	DO i = nxl_z, nxr_z
963	work(i,k-zs+1,j,l) = f_in(i,j,k)
964	ENDDO
965	ENDDO
966	ENDDO
967	ENDDO
968	!$OMP END PARALLEL
969
970	!
971	!-- Transpose array
972	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
973	!$ACC UPDATE HOST(work)
974	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
975	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
976	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
977	comm1dx, ierr )
978	!$ACC UPDATE DEVICE(f_inv)
979	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
980	#endif
981
982	ELSE
983	!
984	!-- Reorder the array in the same way like ALLTOALL did it
985	!$OMP PARALLEL PRIVATE ( i, j, k )
986	!$OMP DO
987	!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(i,j,k) &
988	!$ACC PRESENT(f_inv, f_in)
989	DO k = nzb_y, nzt_y
990	DO j = 0, ny
991	DO i = nxl_y, nxr_y
992	f_inv(i,k,j) = f_in(i,j,k)
993	ENDDO
994	ENDDO
995	ENDDO
996	!$OMP END PARALLEL
997
998	ENDIF
999
1000	END SUBROUTINE transpose_zy
1001
1002
1003	!------------------------------------------------------------------------------!
1004	! Description:
1005	! ------------
1006	!> Transposition of input array (f_in) from z to y. For the input array, all
1007	!> elements along z reside on the same PE, while after transposition, all
1008	!> elements along y reside on the same PE.
1009	!> This is a direct transposition for arrays with indices in regular order
1010	!> (k,j,i) (cf. transpose_zy).
1011	!------------------------------------------------------------------------------!
1012	SUBROUTINE transpose_zyd( f_in, f_out )
1013
1014
1015	USE cpulog, &
1016	ONLY: cpu_log, log_point_s
1017
1018	USE indices, &
1019	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
1020
1021	USE kinds
1022
1023	USE pegrid
1024
1025	USE transpose_indices, &
1026	ONLY: nxl_yd, nxr_yd, nzb_yd, nzt_yd
1027
1028	IMPLICIT NONE
1029
1030	INTEGER(iwp) :: i !<
1031	INTEGER(iwp) :: j !<
1032	INTEGER(iwp) :: k !<
1033	INTEGER(iwp) :: l !<
1034	INTEGER(iwp) :: m !<
1035	INTEGER(iwp) :: ys !<
1036
1037	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !<
1038	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !<
1039	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !<
1040	REAL(wp) :: work(nnxnnynnz) !<
1041
1042	#if defined( __parallel )
1043
1044	!
1045	!-- Rearrange indices of input array in order to make data to be send
1046	!-- by MPI contiguous
1047	DO i = nxl, nxr
1048	DO j = nys, nyn
1049	DO k = 1, nz
1050	f_inv(j,i,k) = f_in(k,j,i)
1051	ENDDO
1052	ENDDO
1053	ENDDO
1054
1055	!
1056	!-- Move data to different array, because memory location of work1 is
1057	!-- needed further below (work1 = work2).
1058	!-- If the PE grid is one-dimensional along x, only local reordering
1059	!-- of the data is necessary and no transposition has to be done.
1060	IF ( pdims(2) == 1 ) THEN
1061	DO k = 1, nz
1062	DO i = nxl, nxr
1063	DO j = nys, nyn
1064	f_out(j,i,k) = f_inv(j,i,k)
1065	ENDDO
1066	ENDDO
1067	ENDDO
1068	RETURN
1069	ENDIF
1070
1071	!
1072	!-- Transpose array
1073	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1074	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1075	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1076	work(1), sendrecvcount_zyd, MPI_REAL, &
1077	comm1dy, ierr )
1078	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1079
1080	!
1081	!-- Reorder transposed array
1082	m = 0
1083	DO l = 0, pdims(2) - 1
1084	ys = 0 + l * nny
1085	DO k = nzb_yd, nzt_yd
1086	DO i = nxl_yd, nxr_yd
1087	DO j = ys, ys + nny - 1
1088	m = m + 1
1089	f_out(j,i,k) = work(m)
1090	ENDDO
1091	ENDDO
1092	ENDDO
1093	ENDDO
1094
1095	#endif
1096
1097	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |