Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1248

Last change on this file since 1248 was 1217, checked in by raasch, 11 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 26.8 KB

Line
1	SUBROUTINE resort_for_xy( f_in, f_inv )
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 1217 2013-08-26 11:30:05Z heinze $
27	!
28	! 1216 2013-08-26 09:31:42Z raasch
29	! re-sorting of the transposed / to be transposed arrays moved to separate
30	! routines resort_for_...
31	!
32	! 1111 2013-03-08 23:54:10Z raasch
33	! openACC directives added,
34	! resorting data from/to work changed, work got 4 dimensions instead of 1
35	!
36	! 1106 2013-03-04 05:31:38Z raasch
37	! preprocessor lines rearranged so that routines can also be used in serial
38	! (non-parallel) mode
39	!
40	! 1092 2013-02-02 11:24:22Z raasch
41	! unused variables removed
42	!
43	! 1036 2012-10-22 13:43:42Z raasch
44	! code put under GPL (PALM 3.9)
45	!
46	! 1003 2012-09-14 14:35:53Z raasch
47	! indices nxa, nya, etc. replaced by nx, ny, etc.
48	!
49	! 683 2011-02-09 14:25:15Z raasch
50	! openMP parallelization of transpositions for 2d-domain-decomposition
51	!
52	! 622 2010-12-10 08:08:13Z raasch
53	! optional barriers included in order to speed up collective operations
54	!
55	! 164 2008-05-15 08:46:15Z raasch
56	! f_inv changed from subroutine argument to automatic array in order to do
57	! re-ordering from f_in to f_inv in one step, one array work is needed instead
58	! of work1 and work2
59	!
60	! February 2007
61	! RCS Log replace by Id keyword, revision history cleaned up
62	!
63	! Revision 1.2 2004/04/30 13:12:17 raasch
64	! Switched from mpi_alltoallv to the simpler mpi_alltoall,
65	! all former transpose-routine files collected in this file, enlarged
66	! transposition arrays introduced
67	!
68	! Revision 1.1 2004/04/30 13:08:16 raasch
69	! Initial revision (collection of former routines transpose_xy, transpose_xz,
70	! transpose_yx, transpose_yz, transpose_zx, transpose_zy)
71	!
72	! Revision 1.1 1997/07/24 11:25:18 raasch
73	! Initial revision
74	!
75	!------------------------------------------------------------------------------!
76	! Description:
77	! ------------
78	! Resorting data for the transposition from x to y. The transposition itself
79	! is carried out in transpose_xy
80	!------------------------------------------------------------------------------!
81
82	USE indices
83	USE transpose_indices
84
85	IMPLICIT NONE
86
87	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
88	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
89
90
91	INTEGER :: i, j, k
92
93	!
94	!-- Rearrange indices of input array in order to make data to be send
95	!-- by MPI contiguous
96	!$OMP PARALLEL PRIVATE ( i, j, k )
97	!$OMP DO
98	!$acc kernels present( f_in, f_inv )
99	!$acc loop
100	DO i = 0, nx
101	DO k = nzb_x, nzt_x
102	!$acc loop vector( 32 )
103	DO j = nys_x, nyn_x
104	f_inv(j,k,i) = f_in(i,j,k)
105	ENDDO
106	ENDDO
107	ENDDO
108	!$acc end kernels
109	!$OMP END PARALLEL
110
111	END SUBROUTINE resort_for_xy
112
113
114	SUBROUTINE transpose_xy( f_inv, f_out )
115
116	!------------------------------------------------------------------------------!
117	! Description:
118	! ------------
119	! Transposition of input array (f_in) from x to y. For the input array, all
120	! elements along x reside on the same PE, while after transposition, all
121	! elements along y reside on the same PE.
122	!------------------------------------------------------------------------------!
123
124	USE cpulog
125	USE indices
126	USE interfaces
127	USE pegrid
128	USE transpose_indices
129
130	IMPLICIT NONE
131
132	INTEGER :: i, j, k, l, ys
133
134	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
135
136	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
137
138
139	IF ( numprocs /= 1 ) THEN
140
141	#if defined( __parallel )
142	!
143	!-- Transpose array
144	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
145	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
146	!$acc update host( f_inv )
147	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
148	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
149	comm1dy, ierr )
150	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
151
152	!
153	!-- Reorder transposed array
154	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
155	!$OMP DO
156	!$acc data copyin( work )
157	DO l = 0, pdims(2) - 1
158	ys = 0 + l * ( nyn_x - nys_x + 1 )
159	!$acc kernels present( f_out, work )
160	!$acc loop
161	DO i = nxl_y, nxr_y
162	DO k = nzb_y, nzt_y
163	!$acc loop vector( 32 )
164	DO j = ys, ys + nyn_x - nys_x
165	f_out(j,i,k) = work(j-ys+1,k,i,l)
166	ENDDO
167	ENDDO
168	ENDDO
169	!$acc end kernels
170	ENDDO
171	!$acc end data
172	!$OMP END PARALLEL
173	#endif
174
175	ELSE
176
177	!
178	!-- Reorder transposed array
179	!$OMP PARALLEL PRIVATE ( i, j, k )
180	!$OMP DO
181	!$acc kernels present( f_inv, f_out )
182	!$acc loop
183	DO k = nzb_y, nzt_y
184	DO i = nxl_y, nxr_y
185	!$acc loop vector( 32 )
186	DO j = 0, ny
187	f_out(j,i,k) = f_inv(j,k,i)
188	ENDDO
189	ENDDO
190	ENDDO
191	!$acc end kernels
192	!$OMP END PARALLEL
193
194	ENDIF
195
196	END SUBROUTINE transpose_xy
197
198
199	SUBROUTINE resort_for_xz( f_inv, f_out )
200
201	!------------------------------------------------------------------------------!
202	! Description:
203	! ------------
204	! Resorting data after the transposition from x to z. The transposition itself
205	! is carried out in transpose_xz
206	!------------------------------------------------------------------------------!
207
208	USE indices
209	USE transpose_indices
210
211	IMPLICIT NONE
212
213	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
214	REAL :: f_out(1:nz,nys:nyn,nxl:nxr)
215
216
217	INTEGER :: i, j, k
218
219	!
220	!-- Rearrange indices of input array in order to make data to be send
221	!-- by MPI contiguous.
222	!-- In case of parallel fft/transposition, scattered store is faster in
223	!-- backward direction!!!
224	!$OMP PARALLEL PRIVATE ( i, j, k )
225	!$OMP DO
226	!$acc kernels present( f_inv, f_out )
227	!$acc loop
228	DO k = 1, nz
229	DO i = nxl, nxr
230	!$acc loop vector( 32 )
231	DO j = nys, nyn
232	f_out(k,j,i) = f_inv(j,i,k)
233	ENDDO
234	ENDDO
235	ENDDO
236	!$acc end kernels
237	!$OMP END PARALLEL
238
239	END SUBROUTINE resort_for_xz
240
241
242	SUBROUTINE transpose_xz( f_in, f_inv )
243
244	!------------------------------------------------------------------------------!
245	! Description:
246	! ------------
247	! Transposition of input array (f_in) from x to z. For the input array, all
248	! elements along x reside on the same PE, while after transposition, all
249	! elements along z reside on the same PE.
250	!------------------------------------------------------------------------------!
251
252	USE cpulog
253	USE indices
254	USE interfaces
255	USE pegrid
256	USE transpose_indices
257
258	IMPLICIT NONE
259
260	INTEGER :: i, j, k, l, xs
261
262	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
263
264	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
265
266
267	!
268	!-- If the PE grid is one-dimensional along y, the array has only to be
269	!-- reordered locally and therefore no transposition has to be done.
270	IF ( pdims(1) /= 1 ) THEN
271
272	#if defined( __parallel )
273	!
274	!-- Reorder input array for transposition
275	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
276	!$OMP DO
277	!$acc data copyout( work )
278	DO l = 0, pdims(1) - 1
279	xs = 0 + l * nnx
280	!$acc kernels present( f_in, work )
281	!$acc loop
282	DO k = nzb_x, nzt_x
283	DO i = xs, xs + nnx - 1
284	!$acc loop vector( 32 )
285	DO j = nys_x, nyn_x
286	work(j,i-xs+1,k,l) = f_in(i,j,k)
287	ENDDO
288	ENDDO
289	ENDDO
290	!$acc end kernels
291	ENDDO
292	!$acc end data
293	!$OMP END PARALLEL
294
295	!
296	!-- Transpose array
297	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
298	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
299	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
300	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
301	comm1dx, ierr )
302	!$acc update device( f_inv )
303	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
304	#endif
305
306	ELSE
307
308	!
309	!-- Reorder the array in a way that the z index is in first position
310	!$OMP PARALLEL PRIVATE ( i, j, k )
311	!$OMP DO
312	!$acc kernels present( f_in, f_inv )
313	!$acc loop
314	DO i = nxl, nxr
315	DO j = nys, nyn
316	!$acc loop vector( 32 )
317	DO k = 1, nz
318	f_inv(j,i,k) = f_in(i,j,k)
319	ENDDO
320	ENDDO
321	ENDDO
322	!$acc end kernels
323	!$OMP END PARALLEL
324
325	ENDIF
326
327	END SUBROUTINE transpose_xz
328
329
330	SUBROUTINE resort_for_yx( f_inv, f_out )
331
332	!------------------------------------------------------------------------------!
333	! Description:
334	! ------------
335	! Resorting data after the transposition from y to x. The transposition itself
336	! is carried out in transpose_yx
337	!------------------------------------------------------------------------------!
338
339	USE indices
340	USE transpose_indices
341
342	IMPLICIT NONE
343
344	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
345	REAL :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
346
347
348	INTEGER :: i, j, k
349
350	!
351	!-- Rearrange indices of input array in order to make data to be send
352	!-- by MPI contiguous
353	!$OMP PARALLEL PRIVATE ( i, j, k )
354	!$OMP DO
355	!$acc kernels present( f_inv, f_out )
356	!$acc loop
357	DO i = 0, nx
358	DO k = nzb_x, nzt_x
359	!$acc loop vector( 32 )
360	DO j = nys_x, nyn_x
361	f_out(i,j,k) = f_inv(j,k,i)
362	ENDDO
363	ENDDO
364	ENDDO
365	!$acc end kernels
366	!$OMP END PARALLEL
367
368	END SUBROUTINE resort_for_yx
369
370
371	SUBROUTINE transpose_yx( f_in, f_inv )
372
373	!------------------------------------------------------------------------------!
374	! Description:
375	! ------------
376	! Transposition of input array (f_in) from y to x. For the input array, all
377	! elements along y reside on the same PE, while after transposition, all
378	! elements along x reside on the same PE.
379	!------------------------------------------------------------------------------!
380
381	USE cpulog
382	USE indices
383	USE interfaces
384	USE pegrid
385	USE transpose_indices
386
387	IMPLICIT NONE
388
389	INTEGER :: i, j, k, l, ys
390
391	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
392
393	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
394
395
396	IF ( numprocs /= 1 ) THEN
397
398	#if defined( __parallel )
399	!
400	!-- Reorder input array for transposition
401	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
402	!$OMP DO
403	!$acc data copyout( work )
404	DO l = 0, pdims(2) - 1
405	ys = 0 + l * ( nyn_x - nys_x + 1 )
406	!$acc kernels present( f_in, work )
407	!$acc loop
408	DO i = nxl_y, nxr_y
409	DO k = nzb_y, nzt_y
410	!$acc loop vector( 32 )
411	DO j = ys, ys + nyn_x - nys_x
412	work(j-ys+1,k,i,l) = f_in(j,i,k)
413	ENDDO
414	ENDDO
415	ENDDO
416	!$acc end kernels
417	ENDDO
418	!$acc end data
419	!$OMP END PARALLEL
420
421	!
422	!-- Transpose array
423	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
424	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
425	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
426	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
427	comm1dy, ierr )
428	!$acc update device( f_inv )
429	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
430	#endif
431
432	ELSE
433
434	!
435	!-- Reorder array f_in the same way as ALLTOALL did it
436	!$OMP PARALLEL PRIVATE ( i, j, k )
437	!$OMP DO
438	!$acc kernels present( f_in, f_inv )
439	!$acc loop
440	DO i = nxl_y, nxr_y
441	DO k = nzb_y, nzt_y
442	!$acc loop vector( 32 )
443	DO j = 0, ny
444	f_inv(j,k,i) = f_in(j,i,k)
445	ENDDO
446	ENDDO
447	ENDDO
448	!$acc end kernels
449	!$OMP END PARALLEL
450
451	ENDIF
452
453	END SUBROUTINE transpose_yx
454
455
456	SUBROUTINE transpose_yxd( f_in, f_out )
457
458	!------------------------------------------------------------------------------!
459	! Description:
460	! ------------
461	! Transposition of input array (f_in) from y to x. For the input array, all
462	! elements along y reside on the same PE, while after transposition, all
463	! elements along x reside on the same PE.
464	! This is a direct transposition for arrays with indices in regular order
465	! (k,j,i) (cf. transpose_yx).
466	!------------------------------------------------------------------------------!
467
468	USE cpulog
469	USE indices
470	USE interfaces
471	USE pegrid
472	USE transpose_indices
473
474	IMPLICIT NONE
475
476	INTEGER :: i, j, k, l, m, xs
477
478	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
479	f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
480	work(nnxnnynnz)
481
482	#if defined( __parallel )
483
484	!
485	!-- Rearrange indices of input array in order to make data to be send
486	!-- by MPI contiguous
487	DO k = 1, nz
488	DO j = nys, nyn
489	DO i = nxl, nxr
490	f_inv(i,k,j) = f_in(k,j,i)
491	ENDDO
492	ENDDO
493	ENDDO
494
495	!
496	!-- Transpose array
497	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
498	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
499	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
500	work(1), sendrecvcount_xy, MPI_REAL, &
501	comm1dx, ierr )
502	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
503
504	!
505	!-- Reorder transposed array
506	m = 0
507	DO l = 0, pdims(1) - 1
508	xs = 0 + l * nnx
509	DO j = nys_x, nyn_x
510	DO k = 1, nz
511	DO i = xs, xs + nnx - 1
512	m = m + 1
513	f_out(i,j,k) = work(m)
514	ENDDO
515	ENDDO
516	ENDDO
517	ENDDO
518
519	#endif
520
521	END SUBROUTINE transpose_yxd
522
523
524	SUBROUTINE resort_for_yz( f_in, f_inv )
525
526	!------------------------------------------------------------------------------!
527	! Description:
528	! ------------
529	! Resorting data for the transposition from y to z. The transposition itself
530	! is carried out in transpose_yz
531	!------------------------------------------------------------------------------!
532
533	USE indices
534	USE transpose_indices
535
536	IMPLICIT NONE
537
538	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
539	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
540
541
542	INTEGER :: i, j, k
543
544	!
545	!-- Rearrange indices of input array in order to make data to be send
546	!-- by MPI contiguous
547	!$OMP PARALLEL PRIVATE ( i, j, k )
548	!$OMP DO
549	!$acc kernels present( f_in, f_inv )
550	!$acc loop
551	DO j = 0, ny
552	DO k = nzb_y, nzt_y
553	!$acc loop vector( 32 )
554	DO i = nxl_y, nxr_y
555	f_inv(i,k,j) = f_in(j,i,k)
556	ENDDO
557	ENDDO
558	ENDDO
559	!$acc end kernels
560	!$OMP END PARALLEL
561
562	END SUBROUTINE resort_for_yz
563
564
565	SUBROUTINE transpose_yz( f_inv, f_out )
566
567	!------------------------------------------------------------------------------!
568	! Description:
569	! ------------
570	! Transposition of input array (f_in) from y to z. For the input array, all
571	! elements along y reside on the same PE, while after transposition, all
572	! elements along z reside on the same PE.
573	!------------------------------------------------------------------------------!
574
575	USE cpulog
576	USE indices
577	USE interfaces
578	USE pegrid
579	USE transpose_indices
580
581	IMPLICIT NONE
582
583	INTEGER :: i, j, k, l, zs
584
585	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
586
587	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
588
589
590	!
591	!-- If the PE grid is one-dimensional along y, only local reordering
592	!-- of the data is necessary and no transposition has to be done.
593	IF ( pdims(1) == 1 ) THEN
594
595	!$OMP PARALLEL PRIVATE ( i, j, k )
596	!$OMP DO
597	!$acc kernels present( f_inv, f_out )
598	!$acc loop
599	DO j = 0, ny
600	DO k = nzb_y, nzt_y
601	!$acc loop vector( 32 )
602	DO i = nxl_y, nxr_y
603	f_out(i,j,k) = f_inv(i,k,j)
604	ENDDO
605	ENDDO
606	ENDDO
607	!$acc end kernels
608	!$OMP END PARALLEL
609
610	ELSE
611
612	#if defined( __parallel )
613	!
614	!-- Transpose array
615	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
616	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
617	!$acc update host( f_inv )
618	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
619	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
620	comm1dx, ierr )
621	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
622
623	!
624	!-- Reorder transposed array
625	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
626	!$OMP DO
627	!$acc data copyin( work )
628	DO l = 0, pdims(1) - 1
629	zs = 1 + l * ( nzt_y - nzb_y + 1 )
630	!$acc kernels present( f_out )
631	!$acc loop
632	DO j = nys_z, nyn_z
633	DO k = zs, zs + nzt_y - nzb_y
634	!$acc loop vector( 32 )
635	DO i = nxl_z, nxr_z
636	f_out(i,j,k) = work(i,k-zs+1,j,l)
637	ENDDO
638	ENDDO
639	ENDDO
640	!$acc end kernels
641	ENDDO
642	!$acc end data
643	!$OMP END PARALLEL
644	#endif
645
646	ENDIF
647
648	END SUBROUTINE transpose_yz
649
650
651	SUBROUTINE resort_for_zx( f_in, f_inv )
652
653	!------------------------------------------------------------------------------!
654	! Description:
655	! ------------
656	! Resorting data for the transposition from z to x. The transposition itself
657	! is carried out in transpose_zx
658	!------------------------------------------------------------------------------!
659
660	USE indices
661	USE transpose_indices
662
663	IMPLICIT NONE
664
665	REAL :: f_in(1:nz,nys:nyn,nxl:nxr)
666	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
667
668
669	INTEGER :: i, j, k
670
671	!
672	!-- Rearrange indices of input array in order to make data to be send
673	!-- by MPI contiguous
674	!$OMP PARALLEL PRIVATE ( i, j, k )
675	!$OMP DO
676	!$acc kernels present( f_in, f_inv )
677	!$acc loop
678	DO k = 1,nz
679	DO i = nxl, nxr
680	!$acc loop vector( 32 )
681	DO j = nys, nyn
682	f_inv(j,i,k) = f_in(k,j,i)
683	ENDDO
684	ENDDO
685	ENDDO
686	!$acc end kernels
687	!$OMP END PARALLEL
688
689	END SUBROUTINE resort_for_zx
690
691
692	SUBROUTINE transpose_zx( f_inv, f_out )
693
694	!------------------------------------------------------------------------------!
695	! Description:
696	! ------------
697	! Transposition of input array (f_in) from z to x. For the input array, all
698	! elements along z reside on the same PE, while after transposition, all
699	! elements along x reside on the same PE.
700	!------------------------------------------------------------------------------!
701
702	USE cpulog
703	USE indices
704	USE interfaces
705	USE pegrid
706	USE transpose_indices
707
708	IMPLICIT NONE
709
710	INTEGER :: i, j, k, l, xs
711
712	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
713
714	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
715
716
717	!
718	!-- If the PE grid is one-dimensional along y, only local reordering
719	!-- of the data is necessary and no transposition has to be done.
720	IF ( pdims(1) == 1 ) THEN
721
722	!$OMP PARALLEL PRIVATE ( i, j, k )
723	!$OMP DO
724	!$acc kernels present( f_inv, f_out )
725	!$acc loop
726	DO k = 1, nz
727	DO i = nxl, nxr
728	!$acc loop vector( 32 )
729	DO j = nys, nyn
730	f_out(i,j,k) = f_inv(j,i,k)
731	ENDDO
732	ENDDO
733	ENDDO
734	!$acc end kernels
735	!$OMP END PARALLEL
736
737	ELSE
738
739	#if defined( __parallel )
740	!
741	!-- Transpose array
742	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
743	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
744	!$acc update host( f_inv )
745	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
746	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
747	comm1dx, ierr )
748	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
749
750	!
751	!-- Reorder transposed array
752	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
753	!$OMP DO
754	!$acc data copyin( work )
755	DO l = 0, pdims(1) - 1
756	xs = 0 + l * nnx
757	!$acc kernels present( f_out )
758	!$acc loop
759	DO k = nzb_x, nzt_x
760	DO i = xs, xs + nnx - 1
761	!$acc loop vector( 32 )
762	DO j = nys_x, nyn_x
763	f_out(i,j,k) = work(j,i-xs+1,k,l)
764	ENDDO
765	ENDDO
766	ENDDO
767	!$acc end kernels
768	ENDDO
769	!$acc end data
770	!$OMP END PARALLEL
771	#endif
772
773	ENDIF
774
775	END SUBROUTINE transpose_zx
776
777
778	SUBROUTINE resort_for_zy( f_inv, f_out )
779
780	!------------------------------------------------------------------------------!
781	! Description:
782	! ------------
783	! Resorting data after the transposition from z to y. The transposition itself
784	! is carried out in transpose_zy
785	!------------------------------------------------------------------------------!
786
787	USE indices
788	USE transpose_indices
789
790	IMPLICIT NONE
791
792	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
793	REAL :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
794
795
796	INTEGER :: i, j, k
797
798	!
799	!-- Rearrange indices of input array in order to make data to be send
800	!-- by MPI contiguous
801	!$OMP PARALLEL PRIVATE ( i, j, k )
802	!$OMP DO
803	!$acc kernels present( f_inv, f_out )
804	!$acc loop
805	DO k = nzb_y, nzt_y
806	DO j = 0, ny
807	!$acc loop vector( 32 )
808	DO i = nxl_y, nxr_y
809	f_out(j,i,k) = f_inv(i,k,j)
810	ENDDO
811	ENDDO
812	ENDDO
813	!$acc end kernels
814	!$OMP END PARALLEL
815
816	END SUBROUTINE resort_for_zy
817
818
819	SUBROUTINE transpose_zy( f_in, f_inv )
820
821	!------------------------------------------------------------------------------!
822	! Description:
823	! ------------
824	! Transposition of input array (f_in) from z to y. For the input array, all
825	! elements along z reside on the same PE, while after transposition, all
826	! elements along y reside on the same PE.
827	!------------------------------------------------------------------------------!
828
829	USE cpulog
830	USE indices
831	USE interfaces
832	USE pegrid
833	USE transpose_indices
834
835	IMPLICIT NONE
836
837	INTEGER :: i, j, k, l, zs
838
839	REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
840
841	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
842
843
844	!
845	!-- If the PE grid is one-dimensional along y, the array has only to be
846	!-- reordered locally and therefore no transposition has to be done.
847	IF ( pdims(1) /= 1 ) THEN
848
849	#if defined( __parallel )
850	!
851	!-- Reorder input array for transposition
852	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
853	!$OMP DO
854	!$acc data copyout( work )
855	DO l = 0, pdims(1) - 1
856	zs = 1 + l * ( nzt_y - nzb_y + 1 )
857	!$acc kernels present( f_in, work )
858	!$acc loop
859	DO j = nys_z, nyn_z
860	DO k = zs, zs + nzt_y - nzb_y
861	!$acc loop vector( 32 )
862	DO i = nxl_z, nxr_z
863	work(i,k-zs+1,j,l) = f_in(i,j,k)
864	ENDDO
865	ENDDO
866	ENDDO
867	!$acc end kernels
868	ENDDO
869	!$acc end data
870	!$OMP END PARALLEL
871
872	!
873	!-- Transpose array
874	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
875	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
876	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
877	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
878	comm1dx, ierr )
879	!$acc update device( f_inv )
880	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
881	#endif
882
883	ELSE
884	!
885	!-- Reorder the array in the same way like ALLTOALL did it
886	!$OMP PARALLEL PRIVATE ( i, j, k )
887	!$OMP DO
888	!$acc kernels present( f_in, f_inv )
889	!$acc loop
890	DO k = nzb_y, nzt_y
891	DO j = 0, ny
892	!$acc loop vector( 32 )
893	DO i = nxl_y, nxr_y
894	f_inv(i,k,j) = f_in(i,j,k)
895	ENDDO
896	ENDDO
897	ENDDO
898	!$acc end kernels
899	!$OMP END PARALLEL
900
901	ENDIF
902
903	END SUBROUTINE transpose_zy
904
905
906	SUBROUTINE transpose_zyd( f_in, f_out )
907
908	!------------------------------------------------------------------------------!
909	! Description:
910	! ------------
911	! Transposition of input array (f_in) from z to y. For the input array, all
912	! elements along z reside on the same PE, while after transposition, all
913	! elements along y reside on the same PE.
914	! This is a direct transposition for arrays with indices in regular order
915	! (k,j,i) (cf. transpose_zy).
916	!------------------------------------------------------------------------------!
917
918	USE cpulog
919	USE indices
920	USE interfaces
921	USE pegrid
922	USE transpose_indices
923
924	IMPLICIT NONE
925
926	INTEGER :: i, j, k, l, m, ys
927
928	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
929	f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd), &
930	work(nnxnnynnz)
931
932	#if defined( __parallel )
933
934	!
935	!-- Rearrange indices of input array in order to make data to be send
936	!-- by MPI contiguous
937	DO i = nxl, nxr
938	DO j = nys, nyn
939	DO k = 1, nz
940	f_inv(j,i,k) = f_in(k,j,i)
941	ENDDO
942	ENDDO
943	ENDDO
944
945	!
946	!-- Move data to different array, because memory location of work1 is
947	!-- needed further below (work1 = work2).
948	!-- If the PE grid is one-dimensional along x, only local reordering
949	!-- of the data is necessary and no transposition has to be done.
950	IF ( pdims(2) == 1 ) THEN
951	DO k = 1, nz
952	DO i = nxl, nxr
953	DO j = nys, nyn
954	f_out(j,i,k) = f_inv(j,i,k)
955	ENDDO
956	ENDDO
957	ENDDO
958	RETURN
959	ENDIF
960
961	!
962	!-- Transpose array
963	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
964	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
965	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
966	work(1), sendrecvcount_zyd, MPI_REAL, &
967	comm1dy, ierr )
968	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
969
970	!
971	!-- Reorder transposed array
972	m = 0
973	DO l = 0, pdims(2) - 1
974	ys = 0 + l * nny
975	DO k = nzb_yd, nzt_yd
976	DO i = nxl_yd, nxr_yd
977	DO j = ys, ys + nny - 1
978	m = m + 1
979	f_out(j,i,k) = work(m)
980	ENDDO
981	ENDDO
982	ENDDO
983	ENDDO
984
985	#endif
986
987	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |