Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1634

Last change on this file since 1634 was 1325, checked in by suehring, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 31.0 KB

Line
1	SUBROUTINE resort_for_xy( f_in, f_inv )
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2014 Leibniz Universitaet Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	! Former revisions:
24	! -----------------
25	! $Id: transpose.f90 1325 2014-03-21 09:21:15Z hoffmann $
26	!
27	! 1324 2014-03-21 09:13:16Z suehring
28	! Bugfix: ONLY statement for module pegrid removed
29	!
30	! 1320 2014-03-20 08:40:49Z raasch
31	! ONLY-attribute added to USE-statements,
32	! kind-parameters added to all INTEGER and REAL declaration statements,
33	! kinds are defined in new module kinds,
34	! old module precision_kind is removed,
35	! revision history before 2012 removed,
36	! comment fields (!:) to be used for variable explanations added to
37	! all variable declaration statements
38	!
39	! 1318 2014-03-17 13:35:16Z raasch
40	! cpu_log_nowait parameter added to cpu measurements of the transpositions
41	! required for solving the Poisson equation (poisfft),
42	! module interfaces removed
43	!
44	! 1257 2013-11-08 15:18:40Z raasch
45	! openacc loop and loop vector clauses removed
46	!
47	! 1216 2013-08-26 09:31:42Z raasch
48	! re-sorting of the transposed / to be transposed arrays moved to separate
49	! routines resort_for_...
50	!
51	! 1111 2013-03-08 23:54:10Z raasch
52	! openACC directives added,
53	! resorting data from/to work changed, work got 4 dimensions instead of 1
54	!
55	! 1106 2013-03-04 05:31:38Z raasch
56	! preprocessor lines rearranged so that routines can also be used in serial
57	! (non-parallel) mode
58	!
59	! 1092 2013-02-02 11:24:22Z raasch
60	! unused variables removed
61	!
62	! 1036 2012-10-22 13:43:42Z raasch
63	! code put under GPL (PALM 3.9)
64	!
65	! 1003 2012-09-14 14:35:53Z raasch
66	! indices nxa, nya, etc. replaced by nx, ny, etc.
67	!
68	! Revision 1.1 1997/07/24 11:25:18 raasch
69	! Initial revision
70	!
71	!------------------------------------------------------------------------------!
72	! Description:
73	! ------------
74	! Resorting data for the transposition from x to y. The transposition itself
75	! is carried out in transpose_xy
76	!------------------------------------------------------------------------------!
77
78	USE indices, &
79	ONLY: nx
80
81	USE kinds
82
83	USE transpose_indices, &
84	ONLY: nxl_z, nxr_z, nyn_x, nyn_z, nys_x, nys_z, nzb_x, nzt_x
85
86	IMPLICIT NONE
87
88	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
89	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
90
91
92	INTEGER(iwp) :: i !:
93	INTEGER(iwp) :: j !:
94	INTEGER(iwp) :: k !:
95	!
96	!-- Rearrange indices of input array in order to make data to be send
97	!-- by MPI contiguous
98	!$OMP PARALLEL PRIVATE ( i, j, k )
99	!$OMP DO
100	!$acc kernels present( f_in, f_inv )
101	DO i = 0, nx
102	DO k = nzb_x, nzt_x
103	DO j = nys_x, nyn_x
104	f_inv(j,k,i) = f_in(i,j,k)
105	ENDDO
106	ENDDO
107	ENDDO
108	!$acc end kernels
109	!$OMP END PARALLEL
110
111	END SUBROUTINE resort_for_xy
112
113
114	SUBROUTINE transpose_xy( f_inv, f_out )
115
116	!------------------------------------------------------------------------------!
117	! Description:
118	! ------------
119	! Transposition of input array (f_in) from x to y. For the input array, all
120	! elements along x reside on the same PE, while after transposition, all
121	! elements along y reside on the same PE.
122	!------------------------------------------------------------------------------!
123
124	USE cpulog, &
125	ONLY: cpu_log, cpu_log_nowait, log_point_s
126
127	USE indices, &
128	ONLY: nx, ny
129
130	USE kinds
131
132	USE pegrid
133
134	USE transpose_indices, &
135	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
136
137	IMPLICIT NONE
138
139	INTEGER(iwp) :: i !:
140	INTEGER(iwp) :: j !:
141	INTEGER(iwp) :: k !:
142	INTEGER(iwp) :: l !:
143	INTEGER(iwp) :: ys !:
144
145	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
146	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
147
148	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !:
149
150
151	IF ( numprocs /= 1 ) THEN
152
153	#if defined( __parallel )
154	!
155	!-- Transpose array
156	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
157	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
158	!$acc update host( f_inv )
159	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
160	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
161	comm1dy, ierr )
162	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
163
164	!
165	!-- Reorder transposed array
166	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
167	!$OMP DO
168	!$acc data copyin( work )
169	DO l = 0, pdims(2) - 1
170	ys = 0 + l * ( nyn_x - nys_x + 1 )
171	!$acc kernels present( f_out, work )
172	DO i = nxl_y, nxr_y
173	DO k = nzb_y, nzt_y
174	DO j = ys, ys + nyn_x - nys_x
175	f_out(j,i,k) = work(j-ys+1,k,i,l)
176	ENDDO
177	ENDDO
178	ENDDO
179	!$acc end kernels
180	ENDDO
181	!$acc end data
182	!$OMP END PARALLEL
183	#endif
184
185	ELSE
186
187	!
188	!-- Reorder transposed array
189	!$OMP PARALLEL PRIVATE ( i, j, k )
190	!$OMP DO
191	!$acc kernels present( f_inv, f_out )
192	DO k = nzb_y, nzt_y
193	DO i = nxl_y, nxr_y
194	DO j = 0, ny
195	f_out(j,i,k) = f_inv(j,k,i)
196	ENDDO
197	ENDDO
198	ENDDO
199	!$acc end kernels
200	!$OMP END PARALLEL
201
202	ENDIF
203
204	END SUBROUTINE transpose_xy
205
206
207	SUBROUTINE resort_for_xz( f_inv, f_out )
208
209	!------------------------------------------------------------------------------!
210	! Description:
211	! ------------
212	! Resorting data after the transposition from x to z. The transposition itself
213	! is carried out in transpose_xz
214	!------------------------------------------------------------------------------!
215
216	USE indices, &
217	ONLY: nxl, nxr, nyn, nys, nz
218
219	USE kinds
220
221	IMPLICIT NONE
222
223	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
224	REAL(wp) :: f_out(1:nz,nys:nyn,nxl:nxr) !:
225
226	INTEGER(iwp) :: i !:
227	INTEGER(iwp) :: j !:
228	INTEGER(iwp) :: k !:
229	!
230	!-- Rearrange indices of input array in order to make data to be send
231	!-- by MPI contiguous.
232	!-- In case of parallel fft/transposition, scattered store is faster in
233	!-- backward direction!!!
234	!$OMP PARALLEL PRIVATE ( i, j, k )
235	!$OMP DO
236	!$acc kernels present( f_inv, f_out )
237	DO k = 1, nz
238	DO i = nxl, nxr
239	DO j = nys, nyn
240	f_out(k,j,i) = f_inv(j,i,k)
241	ENDDO
242	ENDDO
243	ENDDO
244	!$acc end kernels
245	!$OMP END PARALLEL
246
247	END SUBROUTINE resort_for_xz
248
249
250	SUBROUTINE transpose_xz( f_in, f_inv )
251
252	!------------------------------------------------------------------------------!
253	! Description:
254	! ------------
255	! Transposition of input array (f_in) from x to z. For the input array, all
256	! elements along x reside on the same PE, while after transposition, all
257	! elements along z reside on the same PE.
258	!------------------------------------------------------------------------------!
259
260	USE cpulog, &
261	ONLY: cpu_log, cpu_log_nowait, log_point_s
262
263	USE indices, &
264	ONLY: nnx, nx, nxl, nxr, ny, nyn, nys, nz
265
266	USE kinds
267
268	USE pegrid
269
270	USE transpose_indices, &
271	ONLY: nyn_x, nys_x, nzb_x, nzt_x
272
273	IMPLICIT NONE
274
275	INTEGER(iwp) :: i !:
276	INTEGER(iwp) :: j !:
277	INTEGER(iwp) :: k !:
278	INTEGER(iwp) :: l !:
279	INTEGER(iwp) :: xs !:
280
281	REAL(wp) :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
282	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
283
284	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !:
285
286
287	!
288	!-- If the PE grid is one-dimensional along y, the array has only to be
289	!-- reordered locally and therefore no transposition has to be done.
290	IF ( pdims(1) /= 1 ) THEN
291
292	#if defined( __parallel )
293	!
294	!-- Reorder input array for transposition
295	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
296	!$OMP DO
297	!$acc data copyout( work )
298	DO l = 0, pdims(1) - 1
299	xs = 0 + l * nnx
300	!$acc kernels present( f_in, work )
301	DO k = nzb_x, nzt_x
302	DO i = xs, xs + nnx - 1
303	DO j = nys_x, nyn_x
304	work(j,i-xs+1,k,l) = f_in(i,j,k)
305	ENDDO
306	ENDDO
307	ENDDO
308	!$acc end kernels
309	ENDDO
310	!$acc end data
311	!$OMP END PARALLEL
312
313	!
314	!-- Transpose array
315	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
316	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
317	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
318	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
319	comm1dx, ierr )
320	!$acc update device( f_inv )
321	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
322	#endif
323
324	ELSE
325
326	!
327	!-- Reorder the array in a way that the z index is in first position
328	!$OMP PARALLEL PRIVATE ( i, j, k )
329	!$OMP DO
330	!$acc kernels present( f_in, f_inv )
331	DO i = nxl, nxr
332	DO j = nys, nyn
333	DO k = 1, nz
334	f_inv(j,i,k) = f_in(i,j,k)
335	ENDDO
336	ENDDO
337	ENDDO
338	!$acc end kernels
339	!$OMP END PARALLEL
340
341	ENDIF
342
343	END SUBROUTINE transpose_xz
344
345
346	SUBROUTINE resort_for_yx( f_inv, f_out )
347
348	!------------------------------------------------------------------------------!
349	! Description:
350	! ------------
351	! Resorting data after the transposition from y to x. The transposition itself
352	! is carried out in transpose_yx
353	!------------------------------------------------------------------------------!
354
355	USE indices, &
356	ONLY: nx
357
358	USE kinds
359
360	USE transpose_indices, &
361	ONLY: nyn_x, nys_x, nzb_x, nzt_x
362
363	IMPLICIT NONE
364
365	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
366	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
367
368
369	INTEGER(iwp) :: i !:
370	INTEGER(iwp) :: j !:
371	INTEGER(iwp) :: k !:
372	!
373	!-- Rearrange indices of input array in order to make data to be send
374	!-- by MPI contiguous
375	!$OMP PARALLEL PRIVATE ( i, j, k )
376	!$OMP DO
377	!$acc kernels present( f_inv, f_out )
378	DO i = 0, nx
379	DO k = nzb_x, nzt_x
380	DO j = nys_x, nyn_x
381	f_out(i,j,k) = f_inv(j,k,i)
382	ENDDO
383	ENDDO
384	ENDDO
385	!$acc end kernels
386	!$OMP END PARALLEL
387
388	END SUBROUTINE resort_for_yx
389
390
391	SUBROUTINE transpose_yx( f_in, f_inv )
392
393	!------------------------------------------------------------------------------!
394	! Description:
395	! ------------
396	! Transposition of input array (f_in) from y to x. For the input array, all
397	! elements along y reside on the same PE, while after transposition, all
398	! elements along x reside on the same PE.
399	!------------------------------------------------------------------------------!
400
401	USE cpulog, &
402	ONLY: cpu_log, cpu_log_nowait, log_point_s
403
404	USE indices, &
405	ONLY: nx, ny
406
407	USE kinds
408
409	USE pegrid
410
411	USE transpose_indices, &
412	ONLY: nxl_y, nxr_y, nyn_x, nys_x, nzb_x, nzb_y, nzt_x, nzt_y
413
414	IMPLICIT NONE
415
416	INTEGER(iwp) :: i !:
417	INTEGER(iwp) :: j !:
418	INTEGER(iwp) :: k !:
419	INTEGER(iwp) :: l !:
420	INTEGER(iwp) :: ys !:
421
422	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
423	REAL(wp) :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx) !:
424
425	REAL(wp), DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work !:
426
427
428	IF ( numprocs /= 1 ) THEN
429
430	#if defined( __parallel )
431	!
432	!-- Reorder input array for transposition
433	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
434	!$OMP DO
435	!$acc data copyout( work )
436	DO l = 0, pdims(2) - 1
437	ys = 0 + l * ( nyn_x - nys_x + 1 )
438	!$acc kernels present( f_in, work )
439	DO i = nxl_y, nxr_y
440	DO k = nzb_y, nzt_y
441	DO j = ys, ys + nyn_x - nys_x
442	work(j-ys+1,k,i,l) = f_in(j,i,k)
443	ENDDO
444	ENDDO
445	ENDDO
446	!$acc end kernels
447	ENDDO
448	!$acc end data
449	!$OMP END PARALLEL
450
451	!
452	!-- Transpose array
453	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
454	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
455	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
456	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
457	comm1dy, ierr )
458	!$acc update device( f_inv )
459	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
460	#endif
461
462	ELSE
463
464	!
465	!-- Reorder array f_in the same way as ALLTOALL did it
466	!$OMP PARALLEL PRIVATE ( i, j, k )
467	!$OMP DO
468	!$acc kernels present( f_in, f_inv )
469	DO i = nxl_y, nxr_y
470	DO k = nzb_y, nzt_y
471	DO j = 0, ny
472	f_inv(j,k,i) = f_in(j,i,k)
473	ENDDO
474	ENDDO
475	ENDDO
476	!$acc end kernels
477	!$OMP END PARALLEL
478
479	ENDIF
480
481	END SUBROUTINE transpose_yx
482
483
484	SUBROUTINE transpose_yxd( f_in, f_out )
485
486	!------------------------------------------------------------------------------!
487	! Description:
488	! ------------
489	! Transposition of input array (f_in) from y to x. For the input array, all
490	! elements along y reside on the same PE, while after transposition, all
491	! elements along x reside on the same PE.
492	! This is a direct transposition for arrays with indices in regular order
493	! (k,j,i) (cf. transpose_yx).
494	!------------------------------------------------------------------------------!
495
496	USE cpulog, &
497	ONLY: cpu_log, cpu_log_nowait, log_point_s
498
499	USE indices, &
500	ONLY: nnx, nny, nnz, nx, nxl, nxr, nyn, nys, nz
501
502	USE kinds
503
504	USE pegrid
505
506	USE transpose_indices, &
507	ONLY: nyn_x, nys_x, nzb_x, nzt_x
508
509	IMPLICIT NONE
510
511	INTEGER(iwp) :: i !:
512	INTEGER(iwp) :: j !:
513	INTEGER(iwp) :: k !:
514	INTEGER(iwp) :: l !:
515	INTEGER(iwp) :: m !:
516	INTEGER(iwp) :: xs !:
517
518	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !:
519	REAL(wp) :: f_inv(nxl:nxr,1:nz,nys:nyn) !:
520	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
521	REAL(wp) :: work(nnxnnynnz) !:
522	#if defined( __parallel )
523
524	!
525	!-- Rearrange indices of input array in order to make data to be send
526	!-- by MPI contiguous
527	DO k = 1, nz
528	DO j = nys, nyn
529	DO i = nxl, nxr
530	f_inv(i,k,j) = f_in(k,j,i)
531	ENDDO
532	ENDDO
533	ENDDO
534
535	!
536	!-- Transpose array
537	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
538	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
539	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
540	work(1), sendrecvcount_xy, MPI_REAL, &
541	comm1dx, ierr )
542	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
543
544	!
545	!-- Reorder transposed array
546	m = 0
547	DO l = 0, pdims(1) - 1
548	xs = 0 + l * nnx
549	DO j = nys_x, nyn_x
550	DO k = 1, nz
551	DO i = xs, xs + nnx - 1
552	m = m + 1
553	f_out(i,j,k) = work(m)
554	ENDDO
555	ENDDO
556	ENDDO
557	ENDDO
558
559	#endif
560
561	END SUBROUTINE transpose_yxd
562
563
564	SUBROUTINE resort_for_yz( f_in, f_inv )
565
566	!------------------------------------------------------------------------------!
567	! Description:
568	! ------------
569	! Resorting data for the transposition from y to z. The transposition itself
570	! is carried out in transpose_yz
571	!------------------------------------------------------------------------------!
572
573	USE indices, &
574	ONLY: ny
575
576	USE kinds
577
578	USE transpose_indices, &
579	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
580
581	IMPLICIT NONE
582
583	REAL(wp) :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
584	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
585
586	INTEGER(iwp) :: i !:
587	INTEGER(iwp) :: j !:
588	INTEGER(iwp) :: k !:
589
590	!
591	!-- Rearrange indices of input array in order to make data to be send
592	!-- by MPI contiguous
593	!$OMP PARALLEL PRIVATE ( i, j, k )
594	!$OMP DO
595	!$acc kernels present( f_in, f_inv )
596	DO j = 0, ny
597	DO k = nzb_y, nzt_y
598	DO i = nxl_y, nxr_y
599	f_inv(i,k,j) = f_in(j,i,k)
600	ENDDO
601	ENDDO
602	ENDDO
603	!$acc end kernels
604	!$OMP END PARALLEL
605
606	END SUBROUTINE resort_for_yz
607
608
609	SUBROUTINE transpose_yz( f_inv, f_out )
610
611	!------------------------------------------------------------------------------!
612	! Description:
613	! ------------
614	! Transposition of input array (f_in) from y to z. For the input array, all
615	! elements along y reside on the same PE, while after transposition, all
616	! elements along z reside on the same PE.
617	!------------------------------------------------------------------------------!
618
619	USE cpulog, &
620	ONLY: cpu_log, cpu_log_nowait, log_point_s
621
622	USE indices, &
623	ONLY: ny, nz
624
625	USE kinds
626
627	USE pegrid
628
629	USE transpose_indices, &
630	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
631
632	IMPLICIT NONE
633
634	INTEGER(iwp) :: i !:
635	INTEGER(iwp) :: j !:
636	INTEGER(iwp) :: k !:
637	INTEGER(iwp) :: l !:
638	INTEGER(iwp) :: zs !:
639
640	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
641	REAL(wp) :: f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !:
642
643	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !:
644
645
646	!
647	!-- If the PE grid is one-dimensional along y, only local reordering
648	!-- of the data is necessary and no transposition has to be done.
649	IF ( pdims(1) == 1 ) THEN
650
651	!$OMP PARALLEL PRIVATE ( i, j, k )
652	!$OMP DO
653	!$acc kernels present( f_inv, f_out )
654	DO j = 0, ny
655	DO k = nzb_y, nzt_y
656	DO i = nxl_y, nxr_y
657	f_out(i,j,k) = f_inv(i,k,j)
658	ENDDO
659	ENDDO
660	ENDDO
661	!$acc end kernels
662	!$OMP END PARALLEL
663
664	ELSE
665
666	#if defined( __parallel )
667	!
668	!-- Transpose array
669	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
670	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
671	!$acc update host( f_inv )
672	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
673	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
674	comm1dx, ierr )
675	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
676
677	!
678	!-- Reorder transposed array
679	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
680	!$OMP DO
681	!$acc data copyin( work )
682	DO l = 0, pdims(1) - 1
683	zs = 1 + l * ( nzt_y - nzb_y + 1 )
684	!$acc kernels present( f_out )
685	DO j = nys_z, nyn_z
686	DO k = zs, zs + nzt_y - nzb_y
687	DO i = nxl_z, nxr_z
688	f_out(i,j,k) = work(i,k-zs+1,j,l)
689	ENDDO
690	ENDDO
691	ENDDO
692	!$acc end kernels
693	ENDDO
694	!$acc end data
695	!$OMP END PARALLEL
696	#endif
697
698	ENDIF
699
700	END SUBROUTINE transpose_yz
701
702
703	SUBROUTINE resort_for_zx( f_in, f_inv )
704
705	!------------------------------------------------------------------------------!
706	! Description:
707	! ------------
708	! Resorting data for the transposition from z to x. The transposition itself
709	! is carried out in transpose_zx
710	!------------------------------------------------------------------------------!
711
712	USE indices, &
713	ONLY: nxl, nxr, nyn, nys, nz
714
715	USE kinds
716
717	IMPLICIT NONE
718
719	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !:
720	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
721
722	INTEGER(iwp) :: i !:
723	INTEGER(iwp) :: j !:
724	INTEGER(iwp) :: k !:
725
726	!
727	!-- Rearrange indices of input array in order to make data to be send
728	!-- by MPI contiguous
729	!$OMP PARALLEL PRIVATE ( i, j, k )
730	!$OMP DO
731	!$acc kernels present( f_in, f_inv )
732	DO k = 1,nz
733	DO i = nxl, nxr
734	DO j = nys, nyn
735	f_inv(j,i,k) = f_in(k,j,i)
736	ENDDO
737	ENDDO
738	ENDDO
739	!$acc end kernels
740	!$OMP END PARALLEL
741
742	END SUBROUTINE resort_for_zx
743
744
745	SUBROUTINE transpose_zx( f_inv, f_out )
746
747	!------------------------------------------------------------------------------!
748	! Description:
749	! ------------
750	! Transposition of input array (f_in) from z to x. For the input array, all
751	! elements along z reside on the same PE, while after transposition, all
752	! elements along x reside on the same PE.
753	!------------------------------------------------------------------------------!
754
755	USE cpulog, &
756	ONLY: cpu_log, cpu_log_nowait, log_point_s
757
758	USE indices, &
759	ONLY: nnx, nx, nxl, nxr, nyn, nys, nz
760
761	USE kinds
762
763	USE pegrid
764
765	USE transpose_indices, &
766	ONLY: nyn_x, nys_x, nzb_x, nzt_x
767
768	IMPLICIT NONE
769
770	INTEGER(iwp) :: i !:
771	INTEGER(iwp) :: j !:
772	INTEGER(iwp) :: k !:
773	INTEGER(iwp) :: l !:
774	INTEGER(iwp) :: xs !:
775
776	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
777	REAL(wp) :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x) !:
778
779	REAL(wp), DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work !:
780
781
782	!
783	!-- If the PE grid is one-dimensional along y, only local reordering
784	!-- of the data is necessary and no transposition has to be done.
785	IF ( pdims(1) == 1 ) THEN
786
787	!$OMP PARALLEL PRIVATE ( i, j, k )
788	!$OMP DO
789	!$acc kernels present( f_inv, f_out )
790	DO k = 1, nz
791	DO i = nxl, nxr
792	DO j = nys, nyn
793	f_out(i,j,k) = f_inv(j,i,k)
794	ENDDO
795	ENDDO
796	ENDDO
797	!$acc end kernels
798	!$OMP END PARALLEL
799
800	ELSE
801
802	#if defined( __parallel )
803	!
804	!-- Transpose array
805	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
806	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
807	!$acc update host( f_inv )
808	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
809	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
810	comm1dx, ierr )
811	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
812
813	!
814	!-- Reorder transposed array
815	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
816	!$OMP DO
817	!$acc data copyin( work )
818	DO l = 0, pdims(1) - 1
819	xs = 0 + l * nnx
820	!$acc kernels present( f_out )
821	DO k = nzb_x, nzt_x
822	DO i = xs, xs + nnx - 1
823	DO j = nys_x, nyn_x
824	f_out(i,j,k) = work(j,i-xs+1,k,l)
825	ENDDO
826	ENDDO
827	ENDDO
828	!$acc end kernels
829	ENDDO
830	!$acc end data
831	!$OMP END PARALLEL
832	#endif
833
834	ENDIF
835
836	END SUBROUTINE transpose_zx
837
838
839	SUBROUTINE resort_for_zy( f_inv, f_out )
840
841	!------------------------------------------------------------------------------!
842	! Description:
843	! ------------
844	! Resorting data after the transposition from z to y. The transposition itself
845	! is carried out in transpose_zy
846	!------------------------------------------------------------------------------!
847
848	USE indices, &
849	ONLY: ny
850
851	USE kinds
852
853	USE transpose_indices, &
854	ONLY: nxl_y, nxr_y, nzb_y, nzt_y
855
856	IMPLICIT NONE
857
858	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
859	REAL(wp) :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y) !:
860
861
862	INTEGER(iwp) :: i !:
863	INTEGER(iwp) :: j !:
864	INTEGER(iwp) :: k !:
865
866	!
867	!-- Rearrange indices of input array in order to make data to be send
868	!-- by MPI contiguous
869	!$OMP PARALLEL PRIVATE ( i, j, k )
870	!$OMP DO
871	!$acc kernels present( f_inv, f_out )
872	DO k = nzb_y, nzt_y
873	DO j = 0, ny
874	DO i = nxl_y, nxr_y
875	f_out(j,i,k) = f_inv(i,k,j)
876	ENDDO
877	ENDDO
878	ENDDO
879	!$acc end kernels
880	!$OMP END PARALLEL
881
882	END SUBROUTINE resort_for_zy
883
884
885	SUBROUTINE transpose_zy( f_in, f_inv )
886
887	!------------------------------------------------------------------------------!
888	! Description:
889	! ------------
890	! Transposition of input array (f_in) from z to y. For the input array, all
891	! elements along z reside on the same PE, while after transposition, all
892	! elements along y reside on the same PE.
893	!------------------------------------------------------------------------------!
894
895	USE cpulog, &
896	ONLY: cpu_log, cpu_log_nowait, log_point_s
897
898	USE indices, &
899	ONLY: ny, nz
900
901	USE kinds
902
903	USE pegrid
904
905	USE transpose_indices, &
906	ONLY: nxl_y, nxl_z, nxr_y, nxr_z, nyn_z, nys_z, nzb_y, nzt_y
907
908	IMPLICIT NONE
909
910	INTEGER(iwp) :: i !:
911	INTEGER(iwp) :: j !:
912	INTEGER(iwp) :: k !:
913	INTEGER(iwp) :: l !:
914	INTEGER(iwp) :: zs !:
915
916	REAL(wp) :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz) !:
917	REAL(wp) :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny) !:
918
919	REAL(wp), DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work !:
920
921	!
922	!-- If the PE grid is one-dimensional along y, the array has only to be
923	!-- reordered locally and therefore no transposition has to be done.
924	IF ( pdims(1) /= 1 ) THEN
925
926	#if defined( __parallel )
927	!
928	!-- Reorder input array for transposition
929	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
930	!$OMP DO
931	!$acc data copyout( work )
932	DO l = 0, pdims(1) - 1
933	zs = 1 + l * ( nzt_y - nzb_y + 1 )
934	!$acc kernels present( f_in, work )
935	DO j = nys_z, nyn_z
936	DO k = zs, zs + nzt_y - nzb_y
937	DO i = nxl_z, nxr_z
938	work(i,k-zs+1,j,l) = f_in(i,j,k)
939	ENDDO
940	ENDDO
941	ENDDO
942	!$acc end kernels
943	ENDDO
944	!$acc end data
945	!$OMP END PARALLEL
946
947	!
948	!-- Transpose array
949	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
950	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
951	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
952	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
953	comm1dx, ierr )
954	!$acc update device( f_inv )
955	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
956	#endif
957
958	ELSE
959	!
960	!-- Reorder the array in the same way like ALLTOALL did it
961	!$OMP PARALLEL PRIVATE ( i, j, k )
962	!$OMP DO
963	!$acc kernels present( f_in, f_inv )
964	DO k = nzb_y, nzt_y
965	DO j = 0, ny
966	DO i = nxl_y, nxr_y
967	f_inv(i,k,j) = f_in(i,j,k)
968	ENDDO
969	ENDDO
970	ENDDO
971	!$acc end kernels
972	!$OMP END PARALLEL
973
974	ENDIF
975
976	END SUBROUTINE transpose_zy
977
978
979	SUBROUTINE transpose_zyd( f_in, f_out )
980
981	!------------------------------------------------------------------------------!
982	! Description:
983	! ------------
984	! Transposition of input array (f_in) from z to y. For the input array, all
985	! elements along z reside on the same PE, while after transposition, all
986	! elements along y reside on the same PE.
987	! This is a direct transposition for arrays with indices in regular order
988	! (k,j,i) (cf. transpose_zy).
989	!------------------------------------------------------------------------------!
990
991	USE cpulog, &
992	ONLY: cpu_log, cpu_log_nowait, log_point_s
993
994	USE indices, &
995	ONLY: nnx, nny, nnz, nxl, nxr, nyn, nys, ny, nz
996
997	USE kinds
998
999	USE pegrid
1000
1001	USE transpose_indices, &
1002	ONLY: nxl_y, nxl_yd, nxr_y, nxr_yd, nzb_y, nzb_yd, nzt_y, nzt_yd
1003
1004	IMPLICIT NONE
1005
1006	INTEGER(iwp) :: i !:
1007	INTEGER(iwp) :: j !:
1008	INTEGER(iwp) :: k !:
1009	INTEGER(iwp) :: l !:
1010	INTEGER(iwp) :: m !:
1011	INTEGER(iwp) :: ys !:
1012
1013	REAL(wp) :: f_in(1:nz,nys:nyn,nxl:nxr) !:
1014	REAL(wp) :: f_inv(nys:nyn,nxl:nxr,1:nz) !:
1015	REAL(wp) :: f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd) !:
1016	REAL(wp) :: work(nnxnnynnz) !:
1017
1018	#if defined( __parallel )
1019
1020	!
1021	!-- Rearrange indices of input array in order to make data to be send
1022	!-- by MPI contiguous
1023	DO i = nxl, nxr
1024	DO j = nys, nyn
1025	DO k = 1, nz
1026	f_inv(j,i,k) = f_in(k,j,i)
1027	ENDDO
1028	ENDDO
1029	ENDDO
1030
1031	!
1032	!-- Move data to different array, because memory location of work1 is
1033	!-- needed further below (work1 = work2).
1034	!-- If the PE grid is one-dimensional along x, only local reordering
1035	!-- of the data is necessary and no transposition has to be done.
1036	IF ( pdims(2) == 1 ) THEN
1037	DO k = 1, nz
1038	DO i = nxl, nxr
1039	DO j = nys, nyn
1040	f_out(j,i,k) = f_inv(j,i,k)
1041	ENDDO
1042	ENDDO
1043	ENDDO
1044	RETURN
1045	ENDIF
1046
1047	!
1048	!-- Transpose array
1049	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
1050	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
1051	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
1052	work(1), sendrecvcount_zyd, MPI_REAL, &
1053	comm1dy, ierr )
1054	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
1055
1056	!
1057	!-- Reorder transposed array
1058	m = 0
1059	DO l = 0, pdims(2) - 1
1060	ys = 0 + l * nny
1061	DO k = nzb_yd, nzt_yd
1062	DO i = nxl_yd, nxr_yd
1063	DO j = ys, ys + nny - 1
1064	m = m + 1
1065	f_out(j,i,k) = work(m)
1066	ENDDO
1067	ENDDO
1068	ENDDO
1069	ENDDO
1070
1071	#endif
1072
1073	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |