Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1280

Last change on this file since 1280 was 1258, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 25.9 KB

Line
1	SUBROUTINE resort_for_xy( f_in, f_inv )
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 1258 2013-11-08 16:09:09Z raasch $
27	!
28	! 1257 2013-11-08 15:18:40Z raasch
29	! openacc loop and loop vector clauses removed
30	!
31	! 1216 2013-08-26 09:31:42Z raasch
32	! re-sorting of the transposed / to be transposed arrays moved to separate
33	! routines resort_for_...
34	!
35	! 1111 2013-03-08 23:54:10Z raasch
36	! openACC directives added,
37	! resorting data from/to work changed, work got 4 dimensions instead of 1
38	!
39	! 1106 2013-03-04 05:31:38Z raasch
40	! preprocessor lines rearranged so that routines can also be used in serial
41	! (non-parallel) mode
42	!
43	! 1092 2013-02-02 11:24:22Z raasch
44	! unused variables removed
45	!
46	! 1036 2012-10-22 13:43:42Z raasch
47	! code put under GPL (PALM 3.9)
48	!
49	! 1003 2012-09-14 14:35:53Z raasch
50	! indices nxa, nya, etc. replaced by nx, ny, etc.
51	!
52	! 683 2011-02-09 14:25:15Z raasch
53	! openMP parallelization of transpositions for 2d-domain-decomposition
54	!
55	! 622 2010-12-10 08:08:13Z raasch
56	! optional barriers included in order to speed up collective operations
57	!
58	! 164 2008-05-15 08:46:15Z raasch
59	! f_inv changed from subroutine argument to automatic array in order to do
60	! re-ordering from f_in to f_inv in one step, one array work is needed instead
61	! of work1 and work2
62	!
63	! February 2007
64	! RCS Log replace by Id keyword, revision history cleaned up
65	!
66	! Revision 1.2 2004/04/30 13:12:17 raasch
67	! Switched from mpi_alltoallv to the simpler mpi_alltoall,
68	! all former transpose-routine files collected in this file, enlarged
69	! transposition arrays introduced
70	!
71	! Revision 1.1 2004/04/30 13:08:16 raasch
72	! Initial revision (collection of former routines transpose_xy, transpose_xz,
73	! transpose_yx, transpose_yz, transpose_zx, transpose_zy)
74	!
75	! Revision 1.1 1997/07/24 11:25:18 raasch
76	! Initial revision
77	!
78	!------------------------------------------------------------------------------!
79	! Description:
80	! ------------
81	! Resorting data for the transposition from x to y. The transposition itself
82	! is carried out in transpose_xy
83	!------------------------------------------------------------------------------!
84
85	USE indices
86	USE transpose_indices
87
88	IMPLICIT NONE
89
90	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
91	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
92
93
94	INTEGER :: i, j, k
95
96	!
97	!-- Rearrange indices of input array in order to make data to be send
98	!-- by MPI contiguous
99	!$OMP PARALLEL PRIVATE ( i, j, k )
100	!$OMP DO
101	!$acc kernels present( f_in, f_inv )
102	DO i = 0, nx
103	DO k = nzb_x, nzt_x
104	DO j = nys_x, nyn_x
105	f_inv(j,k,i) = f_in(i,j,k)
106	ENDDO
107	ENDDO
108	ENDDO
109	!$acc end kernels
110	!$OMP END PARALLEL
111
112	END SUBROUTINE resort_for_xy
113
114
115	SUBROUTINE transpose_xy( f_inv, f_out )
116
117	!------------------------------------------------------------------------------!
118	! Description:
119	! ------------
120	! Transposition of input array (f_in) from x to y. For the input array, all
121	! elements along x reside on the same PE, while after transposition, all
122	! elements along y reside on the same PE.
123	!------------------------------------------------------------------------------!
124
125	USE cpulog
126	USE indices
127	USE interfaces
128	USE pegrid
129	USE transpose_indices
130
131	IMPLICIT NONE
132
133	INTEGER :: i, j, k, l, ys
134
135	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
136
137	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
138
139
140	IF ( numprocs /= 1 ) THEN
141
142	#if defined( __parallel )
143	!
144	!-- Transpose array
145	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
146	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
147	!$acc update host( f_inv )
148	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
149	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
150	comm1dy, ierr )
151	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
152
153	!
154	!-- Reorder transposed array
155	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
156	!$OMP DO
157	!$acc data copyin( work )
158	DO l = 0, pdims(2) - 1
159	ys = 0 + l * ( nyn_x - nys_x + 1 )
160	!$acc kernels present( f_out, work )
161	DO i = nxl_y, nxr_y
162	DO k = nzb_y, nzt_y
163	DO j = ys, ys + nyn_x - nys_x
164	f_out(j,i,k) = work(j-ys+1,k,i,l)
165	ENDDO
166	ENDDO
167	ENDDO
168	!$acc end kernels
169	ENDDO
170	!$acc end data
171	!$OMP END PARALLEL
172	#endif
173
174	ELSE
175
176	!
177	!-- Reorder transposed array
178	!$OMP PARALLEL PRIVATE ( i, j, k )
179	!$OMP DO
180	!$acc kernels present( f_inv, f_out )
181	DO k = nzb_y, nzt_y
182	DO i = nxl_y, nxr_y
183	DO j = 0, ny
184	f_out(j,i,k) = f_inv(j,k,i)
185	ENDDO
186	ENDDO
187	ENDDO
188	!$acc end kernels
189	!$OMP END PARALLEL
190
191	ENDIF
192
193	END SUBROUTINE transpose_xy
194
195
196	SUBROUTINE resort_for_xz( f_inv, f_out )
197
198	!------------------------------------------------------------------------------!
199	! Description:
200	! ------------
201	! Resorting data after the transposition from x to z. The transposition itself
202	! is carried out in transpose_xz
203	!------------------------------------------------------------------------------!
204
205	USE indices
206	USE transpose_indices
207
208	IMPLICIT NONE
209
210	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
211	REAL :: f_out(1:nz,nys:nyn,nxl:nxr)
212
213
214	INTEGER :: i, j, k
215
216	!
217	!-- Rearrange indices of input array in order to make data to be send
218	!-- by MPI contiguous.
219	!-- In case of parallel fft/transposition, scattered store is faster in
220	!-- backward direction!!!
221	!$OMP PARALLEL PRIVATE ( i, j, k )
222	!$OMP DO
223	!$acc kernels present( f_inv, f_out )
224	DO k = 1, nz
225	DO i = nxl, nxr
226	DO j = nys, nyn
227	f_out(k,j,i) = f_inv(j,i,k)
228	ENDDO
229	ENDDO
230	ENDDO
231	!$acc end kernels
232	!$OMP END PARALLEL
233
234	END SUBROUTINE resort_for_xz
235
236
237	SUBROUTINE transpose_xz( f_in, f_inv )
238
239	!------------------------------------------------------------------------------!
240	! Description:
241	! ------------
242	! Transposition of input array (f_in) from x to z. For the input array, all
243	! elements along x reside on the same PE, while after transposition, all
244	! elements along z reside on the same PE.
245	!------------------------------------------------------------------------------!
246
247	USE cpulog
248	USE indices
249	USE interfaces
250	USE pegrid
251	USE transpose_indices
252
253	IMPLICIT NONE
254
255	INTEGER :: i, j, k, l, xs
256
257	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
258
259	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
260
261
262	!
263	!-- If the PE grid is one-dimensional along y, the array has only to be
264	!-- reordered locally and therefore no transposition has to be done.
265	IF ( pdims(1) /= 1 ) THEN
266
267	#if defined( __parallel )
268	!
269	!-- Reorder input array for transposition
270	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
271	!$OMP DO
272	!$acc data copyout( work )
273	DO l = 0, pdims(1) - 1
274	xs = 0 + l * nnx
275	!$acc kernels present( f_in, work )
276	DO k = nzb_x, nzt_x
277	DO i = xs, xs + nnx - 1
278	DO j = nys_x, nyn_x
279	work(j,i-xs+1,k,l) = f_in(i,j,k)
280	ENDDO
281	ENDDO
282	ENDDO
283	!$acc end kernels
284	ENDDO
285	!$acc end data
286	!$OMP END PARALLEL
287
288	!
289	!-- Transpose array
290	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
291	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
292	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
293	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
294	comm1dx, ierr )
295	!$acc update device( f_inv )
296	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
297	#endif
298
299	ELSE
300
301	!
302	!-- Reorder the array in a way that the z index is in first position
303	!$OMP PARALLEL PRIVATE ( i, j, k )
304	!$OMP DO
305	!$acc kernels present( f_in, f_inv )
306	DO i = nxl, nxr
307	DO j = nys, nyn
308	DO k = 1, nz
309	f_inv(j,i,k) = f_in(i,j,k)
310	ENDDO
311	ENDDO
312	ENDDO
313	!$acc end kernels
314	!$OMP END PARALLEL
315
316	ENDIF
317
318	END SUBROUTINE transpose_xz
319
320
321	SUBROUTINE resort_for_yx( f_inv, f_out )
322
323	!------------------------------------------------------------------------------!
324	! Description:
325	! ------------
326	! Resorting data after the transposition from y to x. The transposition itself
327	! is carried out in transpose_yx
328	!------------------------------------------------------------------------------!
329
330	USE indices
331	USE transpose_indices
332
333	IMPLICIT NONE
334
335	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
336	REAL :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
337
338
339	INTEGER :: i, j, k
340
341	!
342	!-- Rearrange indices of input array in order to make data to be send
343	!-- by MPI contiguous
344	!$OMP PARALLEL PRIVATE ( i, j, k )
345	!$OMP DO
346	!$acc kernels present( f_inv, f_out )
347	DO i = 0, nx
348	DO k = nzb_x, nzt_x
349	DO j = nys_x, nyn_x
350	f_out(i,j,k) = f_inv(j,k,i)
351	ENDDO
352	ENDDO
353	ENDDO
354	!$acc end kernels
355	!$OMP END PARALLEL
356
357	END SUBROUTINE resort_for_yx
358
359
360	SUBROUTINE transpose_yx( f_in, f_inv )
361
362	!------------------------------------------------------------------------------!
363	! Description:
364	! ------------
365	! Transposition of input array (f_in) from y to x. For the input array, all
366	! elements along y reside on the same PE, while after transposition, all
367	! elements along x reside on the same PE.
368	!------------------------------------------------------------------------------!
369
370	USE cpulog
371	USE indices
372	USE interfaces
373	USE pegrid
374	USE transpose_indices
375
376	IMPLICIT NONE
377
378	INTEGER :: i, j, k, l, ys
379
380	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
381
382	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
383
384
385	IF ( numprocs /= 1 ) THEN
386
387	#if defined( __parallel )
388	!
389	!-- Reorder input array for transposition
390	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
391	!$OMP DO
392	!$acc data copyout( work )
393	DO l = 0, pdims(2) - 1
394	ys = 0 + l * ( nyn_x - nys_x + 1 )
395	!$acc kernels present( f_in, work )
396	DO i = nxl_y, nxr_y
397	DO k = nzb_y, nzt_y
398	DO j = ys, ys + nyn_x - nys_x
399	work(j-ys+1,k,i,l) = f_in(j,i,k)
400	ENDDO
401	ENDDO
402	ENDDO
403	!$acc end kernels
404	ENDDO
405	!$acc end data
406	!$OMP END PARALLEL
407
408	!
409	!-- Transpose array
410	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
411	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
412	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
413	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
414	comm1dy, ierr )
415	!$acc update device( f_inv )
416	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
417	#endif
418
419	ELSE
420
421	!
422	!-- Reorder array f_in the same way as ALLTOALL did it
423	!$OMP PARALLEL PRIVATE ( i, j, k )
424	!$OMP DO
425	!$acc kernels present( f_in, f_inv )
426	DO i = nxl_y, nxr_y
427	DO k = nzb_y, nzt_y
428	DO j = 0, ny
429	f_inv(j,k,i) = f_in(j,i,k)
430	ENDDO
431	ENDDO
432	ENDDO
433	!$acc end kernels
434	!$OMP END PARALLEL
435
436	ENDIF
437
438	END SUBROUTINE transpose_yx
439
440
441	SUBROUTINE transpose_yxd( f_in, f_out )
442
443	!------------------------------------------------------------------------------!
444	! Description:
445	! ------------
446	! Transposition of input array (f_in) from y to x. For the input array, all
447	! elements along y reside on the same PE, while after transposition, all
448	! elements along x reside on the same PE.
449	! This is a direct transposition for arrays with indices in regular order
450	! (k,j,i) (cf. transpose_yx).
451	!------------------------------------------------------------------------------!
452
453	USE cpulog
454	USE indices
455	USE interfaces
456	USE pegrid
457	USE transpose_indices
458
459	IMPLICIT NONE
460
461	INTEGER :: i, j, k, l, m, xs
462
463	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
464	f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
465	work(nnxnnynnz)
466
467	#if defined( __parallel )
468
469	!
470	!-- Rearrange indices of input array in order to make data to be send
471	!-- by MPI contiguous
472	DO k = 1, nz
473	DO j = nys, nyn
474	DO i = nxl, nxr
475	f_inv(i,k,j) = f_in(k,j,i)
476	ENDDO
477	ENDDO
478	ENDDO
479
480	!
481	!-- Transpose array
482	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
483	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
484	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
485	work(1), sendrecvcount_xy, MPI_REAL, &
486	comm1dx, ierr )
487	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
488
489	!
490	!-- Reorder transposed array
491	m = 0
492	DO l = 0, pdims(1) - 1
493	xs = 0 + l * nnx
494	DO j = nys_x, nyn_x
495	DO k = 1, nz
496	DO i = xs, xs + nnx - 1
497	m = m + 1
498	f_out(i,j,k) = work(m)
499	ENDDO
500	ENDDO
501	ENDDO
502	ENDDO
503
504	#endif
505
506	END SUBROUTINE transpose_yxd
507
508
509	SUBROUTINE resort_for_yz( f_in, f_inv )
510
511	!------------------------------------------------------------------------------!
512	! Description:
513	! ------------
514	! Resorting data for the transposition from y to z. The transposition itself
515	! is carried out in transpose_yz
516	!------------------------------------------------------------------------------!
517
518	USE indices
519	USE transpose_indices
520
521	IMPLICIT NONE
522
523	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
524	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
525
526
527	INTEGER :: i, j, k
528
529	!
530	!-- Rearrange indices of input array in order to make data to be send
531	!-- by MPI contiguous
532	!$OMP PARALLEL PRIVATE ( i, j, k )
533	!$OMP DO
534	!$acc kernels present( f_in, f_inv )
535	DO j = 0, ny
536	DO k = nzb_y, nzt_y
537	DO i = nxl_y, nxr_y
538	f_inv(i,k,j) = f_in(j,i,k)
539	ENDDO
540	ENDDO
541	ENDDO
542	!$acc end kernels
543	!$OMP END PARALLEL
544
545	END SUBROUTINE resort_for_yz
546
547
548	SUBROUTINE transpose_yz( f_inv, f_out )
549
550	!------------------------------------------------------------------------------!
551	! Description:
552	! ------------
553	! Transposition of input array (f_in) from y to z. For the input array, all
554	! elements along y reside on the same PE, while after transposition, all
555	! elements along z reside on the same PE.
556	!------------------------------------------------------------------------------!
557
558	USE cpulog
559	USE indices
560	USE interfaces
561	USE pegrid
562	USE transpose_indices
563
564	IMPLICIT NONE
565
566	INTEGER :: i, j, k, l, zs
567
568	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
569
570	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
571
572
573	!
574	!-- If the PE grid is one-dimensional along y, only local reordering
575	!-- of the data is necessary and no transposition has to be done.
576	IF ( pdims(1) == 1 ) THEN
577
578	!$OMP PARALLEL PRIVATE ( i, j, k )
579	!$OMP DO
580	!$acc kernels present( f_inv, f_out )
581	DO j = 0, ny
582	DO k = nzb_y, nzt_y
583	DO i = nxl_y, nxr_y
584	f_out(i,j,k) = f_inv(i,k,j)
585	ENDDO
586	ENDDO
587	ENDDO
588	!$acc end kernels
589	!$OMP END PARALLEL
590
591	ELSE
592
593	#if defined( __parallel )
594	!
595	!-- Transpose array
596	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
597	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
598	!$acc update host( f_inv )
599	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
600	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
601	comm1dx, ierr )
602	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
603
604	!
605	!-- Reorder transposed array
606	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
607	!$OMP DO
608	!$acc data copyin( work )
609	DO l = 0, pdims(1) - 1
610	zs = 1 + l * ( nzt_y - nzb_y + 1 )
611	!$acc kernels present( f_out )
612	DO j = nys_z, nyn_z
613	DO k = zs, zs + nzt_y - nzb_y
614	DO i = nxl_z, nxr_z
615	f_out(i,j,k) = work(i,k-zs+1,j,l)
616	ENDDO
617	ENDDO
618	ENDDO
619	!$acc end kernels
620	ENDDO
621	!$acc end data
622	!$OMP END PARALLEL
623	#endif
624
625	ENDIF
626
627	END SUBROUTINE transpose_yz
628
629
630	SUBROUTINE resort_for_zx( f_in, f_inv )
631
632	!------------------------------------------------------------------------------!
633	! Description:
634	! ------------
635	! Resorting data for the transposition from z to x. The transposition itself
636	! is carried out in transpose_zx
637	!------------------------------------------------------------------------------!
638
639	USE indices
640	USE transpose_indices
641
642	IMPLICIT NONE
643
644	REAL :: f_in(1:nz,nys:nyn,nxl:nxr)
645	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
646
647
648	INTEGER :: i, j, k
649
650	!
651	!-- Rearrange indices of input array in order to make data to be send
652	!-- by MPI contiguous
653	!$OMP PARALLEL PRIVATE ( i, j, k )
654	!$OMP DO
655	!$acc kernels present( f_in, f_inv )
656	DO k = 1,nz
657	DO i = nxl, nxr
658	DO j = nys, nyn
659	f_inv(j,i,k) = f_in(k,j,i)
660	ENDDO
661	ENDDO
662	ENDDO
663	!$acc end kernels
664	!$OMP END PARALLEL
665
666	END SUBROUTINE resort_for_zx
667
668
669	SUBROUTINE transpose_zx( f_inv, f_out )
670
671	!------------------------------------------------------------------------------!
672	! Description:
673	! ------------
674	! Transposition of input array (f_in) from z to x. For the input array, all
675	! elements along z reside on the same PE, while after transposition, all
676	! elements along x reside on the same PE.
677	!------------------------------------------------------------------------------!
678
679	USE cpulog
680	USE indices
681	USE interfaces
682	USE pegrid
683	USE transpose_indices
684
685	IMPLICIT NONE
686
687	INTEGER :: i, j, k, l, xs
688
689	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
690
691	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
692
693
694	!
695	!-- If the PE grid is one-dimensional along y, only local reordering
696	!-- of the data is necessary and no transposition has to be done.
697	IF ( pdims(1) == 1 ) THEN
698
699	!$OMP PARALLEL PRIVATE ( i, j, k )
700	!$OMP DO
701	!$acc kernels present( f_inv, f_out )
702	DO k = 1, nz
703	DO i = nxl, nxr
704	DO j = nys, nyn
705	f_out(i,j,k) = f_inv(j,i,k)
706	ENDDO
707	ENDDO
708	ENDDO
709	!$acc end kernels
710	!$OMP END PARALLEL
711
712	ELSE
713
714	#if defined( __parallel )
715	!
716	!-- Transpose array
717	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
718	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
719	!$acc update host( f_inv )
720	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
721	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
722	comm1dx, ierr )
723	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
724
725	!
726	!-- Reorder transposed array
727	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
728	!$OMP DO
729	!$acc data copyin( work )
730	DO l = 0, pdims(1) - 1
731	xs = 0 + l * nnx
732	!$acc kernels present( f_out )
733	DO k = nzb_x, nzt_x
734	DO i = xs, xs + nnx - 1
735	DO j = nys_x, nyn_x
736	f_out(i,j,k) = work(j,i-xs+1,k,l)
737	ENDDO
738	ENDDO
739	ENDDO
740	!$acc end kernels
741	ENDDO
742	!$acc end data
743	!$OMP END PARALLEL
744	#endif
745
746	ENDIF
747
748	END SUBROUTINE transpose_zx
749
750
751	SUBROUTINE resort_for_zy( f_inv, f_out )
752
753	!------------------------------------------------------------------------------!
754	! Description:
755	! ------------
756	! Resorting data after the transposition from z to y. The transposition itself
757	! is carried out in transpose_zy
758	!------------------------------------------------------------------------------!
759
760	USE indices
761	USE transpose_indices
762
763	IMPLICIT NONE
764
765	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
766	REAL :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
767
768
769	INTEGER :: i, j, k
770
771	!
772	!-- Rearrange indices of input array in order to make data to be send
773	!-- by MPI contiguous
774	!$OMP PARALLEL PRIVATE ( i, j, k )
775	!$OMP DO
776	!$acc kernels present( f_inv, f_out )
777	DO k = nzb_y, nzt_y
778	DO j = 0, ny
779	DO i = nxl_y, nxr_y
780	f_out(j,i,k) = f_inv(i,k,j)
781	ENDDO
782	ENDDO
783	ENDDO
784	!$acc end kernels
785	!$OMP END PARALLEL
786
787	END SUBROUTINE resort_for_zy
788
789
790	SUBROUTINE transpose_zy( f_in, f_inv )
791
792	!------------------------------------------------------------------------------!
793	! Description:
794	! ------------
795	! Transposition of input array (f_in) from z to y. For the input array, all
796	! elements along z reside on the same PE, while after transposition, all
797	! elements along y reside on the same PE.
798	!------------------------------------------------------------------------------!
799
800	USE cpulog
801	USE indices
802	USE interfaces
803	USE pegrid
804	USE transpose_indices
805
806	IMPLICIT NONE
807
808	INTEGER :: i, j, k, l, zs
809
810	REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
811
812	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
813
814
815	!
816	!-- If the PE grid is one-dimensional along y, the array has only to be
817	!-- reordered locally and therefore no transposition has to be done.
818	IF ( pdims(1) /= 1 ) THEN
819
820	#if defined( __parallel )
821	!
822	!-- Reorder input array for transposition
823	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
824	!$OMP DO
825	!$acc data copyout( work )
826	DO l = 0, pdims(1) - 1
827	zs = 1 + l * ( nzt_y - nzb_y + 1 )
828	!$acc kernels present( f_in, work )
829	DO j = nys_z, nyn_z
830	DO k = zs, zs + nzt_y - nzb_y
831	DO i = nxl_z, nxr_z
832	work(i,k-zs+1,j,l) = f_in(i,j,k)
833	ENDDO
834	ENDDO
835	ENDDO
836	!$acc end kernels
837	ENDDO
838	!$acc end data
839	!$OMP END PARALLEL
840
841	!
842	!-- Transpose array
843	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
844	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
845	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
846	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
847	comm1dx, ierr )
848	!$acc update device( f_inv )
849	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
850	#endif
851
852	ELSE
853	!
854	!-- Reorder the array in the same way like ALLTOALL did it
855	!$OMP PARALLEL PRIVATE ( i, j, k )
856	!$OMP DO
857	!$acc kernels present( f_in, f_inv )
858	DO k = nzb_y, nzt_y
859	DO j = 0, ny
860	DO i = nxl_y, nxr_y
861	f_inv(i,k,j) = f_in(i,j,k)
862	ENDDO
863	ENDDO
864	ENDDO
865	!$acc end kernels
866	!$OMP END PARALLEL
867
868	ENDIF
869
870	END SUBROUTINE transpose_zy
871
872
873	SUBROUTINE transpose_zyd( f_in, f_out )
874
875	!------------------------------------------------------------------------------!
876	! Description:
877	! ------------
878	! Transposition of input array (f_in) from z to y. For the input array, all
879	! elements along z reside on the same PE, while after transposition, all
880	! elements along y reside on the same PE.
881	! This is a direct transposition for arrays with indices in regular order
882	! (k,j,i) (cf. transpose_zy).
883	!------------------------------------------------------------------------------!
884
885	USE cpulog
886	USE indices
887	USE interfaces
888	USE pegrid
889	USE transpose_indices
890
891	IMPLICIT NONE
892
893	INTEGER :: i, j, k, l, m, ys
894
895	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
896	f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd), &
897	work(nnxnnynnz)
898
899	#if defined( __parallel )
900
901	!
902	!-- Rearrange indices of input array in order to make data to be send
903	!-- by MPI contiguous
904	DO i = nxl, nxr
905	DO j = nys, nyn
906	DO k = 1, nz
907	f_inv(j,i,k) = f_in(k,j,i)
908	ENDDO
909	ENDDO
910	ENDDO
911
912	!
913	!-- Move data to different array, because memory location of work1 is
914	!-- needed further below (work1 = work2).
915	!-- If the PE grid is one-dimensional along x, only local reordering
916	!-- of the data is necessary and no transposition has to be done.
917	IF ( pdims(2) == 1 ) THEN
918	DO k = 1, nz
919	DO i = nxl, nxr
920	DO j = nys, nyn
921	f_out(j,i,k) = f_inv(j,i,k)
922	ENDDO
923	ENDDO
924	ENDDO
925	RETURN
926	ENDIF
927
928	!
929	!-- Transpose array
930	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
931	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
932	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
933	work(1), sendrecvcount_zyd, MPI_REAL, &
934	comm1dy, ierr )
935	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
936
937	!
938	!-- Reorder transposed array
939	m = 0
940	DO l = 0, pdims(2) - 1
941	ys = 0 + l * nny
942	DO k = nzb_yd, nzt_yd
943	DO i = nxl_yd, nxr_yd
944	DO j = ys, ys + nny - 1
945	m = m + 1
946	f_out(j,i,k) = work(m)
947	ENDDO
948	ENDDO
949	ENDDO
950	ENDDO
951
952	#endif
953
954	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |