Home

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1319

Last change on this file since 1319 was 1319, checked in by raasch, 10 years ago
last commit documented
Property svn:keywords set to `Id`
File size: 26.1 KB

Line
1	SUBROUTINE resort_for_xy( f_in, f_inv )
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2014 Leibniz Universitaet Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	!
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 1319 2014-03-17 15:08:44Z raasch $
27	!
28	! 1318 2014-03-17 13:35:16Z raasch
29	! cpu_log_nowait parameter added to cpu measurements of the transpositions
30	! required for solving the Poisson equation (poisfft),
31	! module interfaces removed
32	!
33	! 1257 2013-11-08 15:18:40Z raasch
34	! openacc loop and loop vector clauses removed
35	!
36	! 1216 2013-08-26 09:31:42Z raasch
37	! re-sorting of the transposed / to be transposed arrays moved to separate
38	! routines resort_for_...
39	!
40	! 1111 2013-03-08 23:54:10Z raasch
41	! openACC directives added,
42	! resorting data from/to work changed, work got 4 dimensions instead of 1
43	!
44	! 1106 2013-03-04 05:31:38Z raasch
45	! preprocessor lines rearranged so that routines can also be used in serial
46	! (non-parallel) mode
47	!
48	! 1092 2013-02-02 11:24:22Z raasch
49	! unused variables removed
50	!
51	! 1036 2012-10-22 13:43:42Z raasch
52	! code put under GPL (PALM 3.9)
53	!
54	! 1003 2012-09-14 14:35:53Z raasch
55	! indices nxa, nya, etc. replaced by nx, ny, etc.
56	!
57	! 683 2011-02-09 14:25:15Z raasch
58	! openMP parallelization of transpositions for 2d-domain-decomposition
59	!
60	! 622 2010-12-10 08:08:13Z raasch
61	! optional barriers included in order to speed up collective operations
62	!
63	! 164 2008-05-15 08:46:15Z raasch
64	! f_inv changed from subroutine argument to automatic array in order to do
65	! re-ordering from f_in to f_inv in one step, one array work is needed instead
66	! of work1 and work2
67	!
68	! February 2007
69	! RCS Log replace by Id keyword, revision history cleaned up
70	!
71	! Revision 1.2 2004/04/30 13:12:17 raasch
72	! Switched from mpi_alltoallv to the simpler mpi_alltoall,
73	! all former transpose-routine files collected in this file, enlarged
74	! transposition arrays introduced
75	!
76	! Revision 1.1 2004/04/30 13:08:16 raasch
77	! Initial revision (collection of former routines transpose_xy, transpose_xz,
78	! transpose_yx, transpose_yz, transpose_zx, transpose_zy)
79	!
80	! Revision 1.1 1997/07/24 11:25:18 raasch
81	! Initial revision
82	!
83	!------------------------------------------------------------------------------!
84	! Description:
85	! ------------
86	! Resorting data for the transposition from x to y. The transposition itself
87	! is carried out in transpose_xy
88	!------------------------------------------------------------------------------!
89
90	USE indices
91	USE transpose_indices
92
93	IMPLICIT NONE
94
95	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
96	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
97
98
99	INTEGER :: i, j, k
100
101	!
102	!-- Rearrange indices of input array in order to make data to be send
103	!-- by MPI contiguous
104	!$OMP PARALLEL PRIVATE ( i, j, k )
105	!$OMP DO
106	!$acc kernels present( f_in, f_inv )
107	DO i = 0, nx
108	DO k = nzb_x, nzt_x
109	DO j = nys_x, nyn_x
110	f_inv(j,k,i) = f_in(i,j,k)
111	ENDDO
112	ENDDO
113	ENDDO
114	!$acc end kernels
115	!$OMP END PARALLEL
116
117	END SUBROUTINE resort_for_xy
118
119
120	SUBROUTINE transpose_xy( f_inv, f_out )
121
122	!------------------------------------------------------------------------------!
123	! Description:
124	! ------------
125	! Transposition of input array (f_in) from x to y. For the input array, all
126	! elements along x reside on the same PE, while after transposition, all
127	! elements along y reside on the same PE.
128	!------------------------------------------------------------------------------!
129
130	USE cpulog
131	USE indices
132	USE pegrid
133	USE transpose_indices
134
135	IMPLICIT NONE
136
137	INTEGER :: i, j, k, l, ys
138
139	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
140
141	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
142
143
144	IF ( numprocs /= 1 ) THEN
145
146	#if defined( __parallel )
147	!
148	!-- Transpose array
149	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
150	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
151	!$acc update host( f_inv )
152	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
153	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
154	comm1dy, ierr )
155	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
156
157	!
158	!-- Reorder transposed array
159	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
160	!$OMP DO
161	!$acc data copyin( work )
162	DO l = 0, pdims(2) - 1
163	ys = 0 + l * ( nyn_x - nys_x + 1 )
164	!$acc kernels present( f_out, work )
165	DO i = nxl_y, nxr_y
166	DO k = nzb_y, nzt_y
167	DO j = ys, ys + nyn_x - nys_x
168	f_out(j,i,k) = work(j-ys+1,k,i,l)
169	ENDDO
170	ENDDO
171	ENDDO
172	!$acc end kernels
173	ENDDO
174	!$acc end data
175	!$OMP END PARALLEL
176	#endif
177
178	ELSE
179
180	!
181	!-- Reorder transposed array
182	!$OMP PARALLEL PRIVATE ( i, j, k )
183	!$OMP DO
184	!$acc kernels present( f_inv, f_out )
185	DO k = nzb_y, nzt_y
186	DO i = nxl_y, nxr_y
187	DO j = 0, ny
188	f_out(j,i,k) = f_inv(j,k,i)
189	ENDDO
190	ENDDO
191	ENDDO
192	!$acc end kernels
193	!$OMP END PARALLEL
194
195	ENDIF
196
197	END SUBROUTINE transpose_xy
198
199
200	SUBROUTINE resort_for_xz( f_inv, f_out )
201
202	!------------------------------------------------------------------------------!
203	! Description:
204	! ------------
205	! Resorting data after the transposition from x to z. The transposition itself
206	! is carried out in transpose_xz
207	!------------------------------------------------------------------------------!
208
209	USE indices
210	USE transpose_indices
211
212	IMPLICIT NONE
213
214	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
215	REAL :: f_out(1:nz,nys:nyn,nxl:nxr)
216
217
218	INTEGER :: i, j, k
219
220	!
221	!-- Rearrange indices of input array in order to make data to be send
222	!-- by MPI contiguous.
223	!-- In case of parallel fft/transposition, scattered store is faster in
224	!-- backward direction!!!
225	!$OMP PARALLEL PRIVATE ( i, j, k )
226	!$OMP DO
227	!$acc kernels present( f_inv, f_out )
228	DO k = 1, nz
229	DO i = nxl, nxr
230	DO j = nys, nyn
231	f_out(k,j,i) = f_inv(j,i,k)
232	ENDDO
233	ENDDO
234	ENDDO
235	!$acc end kernels
236	!$OMP END PARALLEL
237
238	END SUBROUTINE resort_for_xz
239
240
241	SUBROUTINE transpose_xz( f_in, f_inv )
242
243	!------------------------------------------------------------------------------!
244	! Description:
245	! ------------
246	! Transposition of input array (f_in) from x to z. For the input array, all
247	! elements along x reside on the same PE, while after transposition, all
248	! elements along z reside on the same PE.
249	!------------------------------------------------------------------------------!
250
251	USE cpulog
252	USE indices
253	USE pegrid
254	USE transpose_indices
255
256	IMPLICIT NONE
257
258	INTEGER :: i, j, k, l, xs
259
260	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
261
262	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
263
264
265	!
266	!-- If the PE grid is one-dimensional along y, the array has only to be
267	!-- reordered locally and therefore no transposition has to be done.
268	IF ( pdims(1) /= 1 ) THEN
269
270	#if defined( __parallel )
271	!
272	!-- Reorder input array for transposition
273	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
274	!$OMP DO
275	!$acc data copyout( work )
276	DO l = 0, pdims(1) - 1
277	xs = 0 + l * nnx
278	!$acc kernels present( f_in, work )
279	DO k = nzb_x, nzt_x
280	DO i = xs, xs + nnx - 1
281	DO j = nys_x, nyn_x
282	work(j,i-xs+1,k,l) = f_in(i,j,k)
283	ENDDO
284	ENDDO
285	ENDDO
286	!$acc end kernels
287	ENDDO
288	!$acc end data
289	!$OMP END PARALLEL
290
291	!
292	!-- Transpose array
293	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
294	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
295	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
296	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
297	comm1dx, ierr )
298	!$acc update device( f_inv )
299	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
300	#endif
301
302	ELSE
303
304	!
305	!-- Reorder the array in a way that the z index is in first position
306	!$OMP PARALLEL PRIVATE ( i, j, k )
307	!$OMP DO
308	!$acc kernels present( f_in, f_inv )
309	DO i = nxl, nxr
310	DO j = nys, nyn
311	DO k = 1, nz
312	f_inv(j,i,k) = f_in(i,j,k)
313	ENDDO
314	ENDDO
315	ENDDO
316	!$acc end kernels
317	!$OMP END PARALLEL
318
319	ENDIF
320
321	END SUBROUTINE transpose_xz
322
323
324	SUBROUTINE resort_for_yx( f_inv, f_out )
325
326	!------------------------------------------------------------------------------!
327	! Description:
328	! ------------
329	! Resorting data after the transposition from y to x. The transposition itself
330	! is carried out in transpose_yx
331	!------------------------------------------------------------------------------!
332
333	USE indices
334	USE transpose_indices
335
336	IMPLICIT NONE
337
338	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
339	REAL :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
340
341
342	INTEGER :: i, j, k
343
344	!
345	!-- Rearrange indices of input array in order to make data to be send
346	!-- by MPI contiguous
347	!$OMP PARALLEL PRIVATE ( i, j, k )
348	!$OMP DO
349	!$acc kernels present( f_inv, f_out )
350	DO i = 0, nx
351	DO k = nzb_x, nzt_x
352	DO j = nys_x, nyn_x
353	f_out(i,j,k) = f_inv(j,k,i)
354	ENDDO
355	ENDDO
356	ENDDO
357	!$acc end kernels
358	!$OMP END PARALLEL
359
360	END SUBROUTINE resort_for_yx
361
362
363	SUBROUTINE transpose_yx( f_in, f_inv )
364
365	!------------------------------------------------------------------------------!
366	! Description:
367	! ------------
368	! Transposition of input array (f_in) from y to x. For the input array, all
369	! elements along y reside on the same PE, while after transposition, all
370	! elements along x reside on the same PE.
371	!------------------------------------------------------------------------------!
372
373	USE cpulog
374	USE indices
375	USE pegrid
376	USE transpose_indices
377
378	IMPLICIT NONE
379
380	INTEGER :: i, j, k, l, ys
381
382	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
383
384	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
385
386
387	IF ( numprocs /= 1 ) THEN
388
389	#if defined( __parallel )
390	!
391	!-- Reorder input array for transposition
392	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
393	!$OMP DO
394	!$acc data copyout( work )
395	DO l = 0, pdims(2) - 1
396	ys = 0 + l * ( nyn_x - nys_x + 1 )
397	!$acc kernels present( f_in, work )
398	DO i = nxl_y, nxr_y
399	DO k = nzb_y, nzt_y
400	DO j = ys, ys + nyn_x - nys_x
401	work(j-ys+1,k,i,l) = f_in(j,i,k)
402	ENDDO
403	ENDDO
404	ENDDO
405	!$acc end kernels
406	ENDDO
407	!$acc end data
408	!$OMP END PARALLEL
409
410	!
411	!-- Transpose array
412	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
413	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
414	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
415	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
416	comm1dy, ierr )
417	!$acc update device( f_inv )
418	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
419	#endif
420
421	ELSE
422
423	!
424	!-- Reorder array f_in the same way as ALLTOALL did it
425	!$OMP PARALLEL PRIVATE ( i, j, k )
426	!$OMP DO
427	!$acc kernels present( f_in, f_inv )
428	DO i = nxl_y, nxr_y
429	DO k = nzb_y, nzt_y
430	DO j = 0, ny
431	f_inv(j,k,i) = f_in(j,i,k)
432	ENDDO
433	ENDDO
434	ENDDO
435	!$acc end kernels
436	!$OMP END PARALLEL
437
438	ENDIF
439
440	END SUBROUTINE transpose_yx
441
442
443	SUBROUTINE transpose_yxd( f_in, f_out )
444
445	!------------------------------------------------------------------------------!
446	! Description:
447	! ------------
448	! Transposition of input array (f_in) from y to x. For the input array, all
449	! elements along y reside on the same PE, while after transposition, all
450	! elements along x reside on the same PE.
451	! This is a direct transposition for arrays with indices in regular order
452	! (k,j,i) (cf. transpose_yx).
453	!------------------------------------------------------------------------------!
454
455	USE cpulog
456	USE indices
457	USE pegrid
458	USE transpose_indices
459
460	IMPLICIT NONE
461
462	INTEGER :: i, j, k, l, m, xs
463
464	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
465	f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
466	work(nnxnnynnz)
467
468	#if defined( __parallel )
469
470	!
471	!-- Rearrange indices of input array in order to make data to be send
472	!-- by MPI contiguous
473	DO k = 1, nz
474	DO j = nys, nyn
475	DO i = nxl, nxr
476	f_inv(i,k,j) = f_in(k,j,i)
477	ENDDO
478	ENDDO
479	ENDDO
480
481	!
482	!-- Transpose array
483	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
484	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
485	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
486	work(1), sendrecvcount_xy, MPI_REAL, &
487	comm1dx, ierr )
488	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
489
490	!
491	!-- Reorder transposed array
492	m = 0
493	DO l = 0, pdims(1) - 1
494	xs = 0 + l * nnx
495	DO j = nys_x, nyn_x
496	DO k = 1, nz
497	DO i = xs, xs + nnx - 1
498	m = m + 1
499	f_out(i,j,k) = work(m)
500	ENDDO
501	ENDDO
502	ENDDO
503	ENDDO
504
505	#endif
506
507	END SUBROUTINE transpose_yxd
508
509
510	SUBROUTINE resort_for_yz( f_in, f_inv )
511
512	!------------------------------------------------------------------------------!
513	! Description:
514	! ------------
515	! Resorting data for the transposition from y to z. The transposition itself
516	! is carried out in transpose_yz
517	!------------------------------------------------------------------------------!
518
519	USE indices
520	USE transpose_indices
521
522	IMPLICIT NONE
523
524	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
525	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
526
527
528	INTEGER :: i, j, k
529
530	!
531	!-- Rearrange indices of input array in order to make data to be send
532	!-- by MPI contiguous
533	!$OMP PARALLEL PRIVATE ( i, j, k )
534	!$OMP DO
535	!$acc kernels present( f_in, f_inv )
536	DO j = 0, ny
537	DO k = nzb_y, nzt_y
538	DO i = nxl_y, nxr_y
539	f_inv(i,k,j) = f_in(j,i,k)
540	ENDDO
541	ENDDO
542	ENDDO
543	!$acc end kernels
544	!$OMP END PARALLEL
545
546	END SUBROUTINE resort_for_yz
547
548
549	SUBROUTINE transpose_yz( f_inv, f_out )
550
551	!------------------------------------------------------------------------------!
552	! Description:
553	! ------------
554	! Transposition of input array (f_in) from y to z. For the input array, all
555	! elements along y reside on the same PE, while after transposition, all
556	! elements along z reside on the same PE.
557	!------------------------------------------------------------------------------!
558
559	USE cpulog
560	USE indices
561	USE pegrid
562	USE transpose_indices
563
564	IMPLICIT NONE
565
566	INTEGER :: i, j, k, l, zs
567
568	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
569
570	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
571
572
573	!
574	!-- If the PE grid is one-dimensional along y, only local reordering
575	!-- of the data is necessary and no transposition has to be done.
576	IF ( pdims(1) == 1 ) THEN
577
578	!$OMP PARALLEL PRIVATE ( i, j, k )
579	!$OMP DO
580	!$acc kernels present( f_inv, f_out )
581	DO j = 0, ny
582	DO k = nzb_y, nzt_y
583	DO i = nxl_y, nxr_y
584	f_out(i,j,k) = f_inv(i,k,j)
585	ENDDO
586	ENDDO
587	ENDDO
588	!$acc end kernels
589	!$OMP END PARALLEL
590
591	ELSE
592
593	#if defined( __parallel )
594	!
595	!-- Transpose array
596	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
597	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
598	!$acc update host( f_inv )
599	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
600	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
601	comm1dx, ierr )
602	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
603
604	!
605	!-- Reorder transposed array
606	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
607	!$OMP DO
608	!$acc data copyin( work )
609	DO l = 0, pdims(1) - 1
610	zs = 1 + l * ( nzt_y - nzb_y + 1 )
611	!$acc kernels present( f_out )
612	DO j = nys_z, nyn_z
613	DO k = zs, zs + nzt_y - nzb_y
614	DO i = nxl_z, nxr_z
615	f_out(i,j,k) = work(i,k-zs+1,j,l)
616	ENDDO
617	ENDDO
618	ENDDO
619	!$acc end kernels
620	ENDDO
621	!$acc end data
622	!$OMP END PARALLEL
623	#endif
624
625	ENDIF
626
627	END SUBROUTINE transpose_yz
628
629
630	SUBROUTINE resort_for_zx( f_in, f_inv )
631
632	!------------------------------------------------------------------------------!
633	! Description:
634	! ------------
635	! Resorting data for the transposition from z to x. The transposition itself
636	! is carried out in transpose_zx
637	!------------------------------------------------------------------------------!
638
639	USE indices
640	USE transpose_indices
641
642	IMPLICIT NONE
643
644	REAL :: f_in(1:nz,nys:nyn,nxl:nxr)
645	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
646
647
648	INTEGER :: i, j, k
649
650	!
651	!-- Rearrange indices of input array in order to make data to be send
652	!-- by MPI contiguous
653	!$OMP PARALLEL PRIVATE ( i, j, k )
654	!$OMP DO
655	!$acc kernels present( f_in, f_inv )
656	DO k = 1,nz
657	DO i = nxl, nxr
658	DO j = nys, nyn
659	f_inv(j,i,k) = f_in(k,j,i)
660	ENDDO
661	ENDDO
662	ENDDO
663	!$acc end kernels
664	!$OMP END PARALLEL
665
666	END SUBROUTINE resort_for_zx
667
668
669	SUBROUTINE transpose_zx( f_inv, f_out )
670
671	!------------------------------------------------------------------------------!
672	! Description:
673	! ------------
674	! Transposition of input array (f_in) from z to x. For the input array, all
675	! elements along z reside on the same PE, while after transposition, all
676	! elements along x reside on the same PE.
677	!------------------------------------------------------------------------------!
678
679	USE cpulog
680	USE indices
681	USE pegrid
682	USE transpose_indices
683
684	IMPLICIT NONE
685
686	INTEGER :: i, j, k, l, xs
687
688	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
689
690	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
691
692
693	!
694	!-- If the PE grid is one-dimensional along y, only local reordering
695	!-- of the data is necessary and no transposition has to be done.
696	IF ( pdims(1) == 1 ) THEN
697
698	!$OMP PARALLEL PRIVATE ( i, j, k )
699	!$OMP DO
700	!$acc kernels present( f_inv, f_out )
701	DO k = 1, nz
702	DO i = nxl, nxr
703	DO j = nys, nyn
704	f_out(i,j,k) = f_inv(j,i,k)
705	ENDDO
706	ENDDO
707	ENDDO
708	!$acc end kernels
709	!$OMP END PARALLEL
710
711	ELSE
712
713	#if defined( __parallel )
714	!
715	!-- Transpose array
716	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
717	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
718	!$acc update host( f_inv )
719	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
720	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
721	comm1dx, ierr )
722	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
723
724	!
725	!-- Reorder transposed array
726	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
727	!$OMP DO
728	!$acc data copyin( work )
729	DO l = 0, pdims(1) - 1
730	xs = 0 + l * nnx
731	!$acc kernels present( f_out )
732	DO k = nzb_x, nzt_x
733	DO i = xs, xs + nnx - 1
734	DO j = nys_x, nyn_x
735	f_out(i,j,k) = work(j,i-xs+1,k,l)
736	ENDDO
737	ENDDO
738	ENDDO
739	!$acc end kernels
740	ENDDO
741	!$acc end data
742	!$OMP END PARALLEL
743	#endif
744
745	ENDIF
746
747	END SUBROUTINE transpose_zx
748
749
750	SUBROUTINE resort_for_zy( f_inv, f_out )
751
752	!------------------------------------------------------------------------------!
753	! Description:
754	! ------------
755	! Resorting data after the transposition from z to y. The transposition itself
756	! is carried out in transpose_zy
757	!------------------------------------------------------------------------------!
758
759	USE indices
760	USE transpose_indices
761
762	IMPLICIT NONE
763
764	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
765	REAL :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
766
767
768	INTEGER :: i, j, k
769
770	!
771	!-- Rearrange indices of input array in order to make data to be send
772	!-- by MPI contiguous
773	!$OMP PARALLEL PRIVATE ( i, j, k )
774	!$OMP DO
775	!$acc kernels present( f_inv, f_out )
776	DO k = nzb_y, nzt_y
777	DO j = 0, ny
778	DO i = nxl_y, nxr_y
779	f_out(j,i,k) = f_inv(i,k,j)
780	ENDDO
781	ENDDO
782	ENDDO
783	!$acc end kernels
784	!$OMP END PARALLEL
785
786	END SUBROUTINE resort_for_zy
787
788
789	SUBROUTINE transpose_zy( f_in, f_inv )
790
791	!------------------------------------------------------------------------------!
792	! Description:
793	! ------------
794	! Transposition of input array (f_in) from z to y. For the input array, all
795	! elements along z reside on the same PE, while after transposition, all
796	! elements along y reside on the same PE.
797	!------------------------------------------------------------------------------!
798
799	USE cpulog
800	USE indices
801	USE pegrid
802	USE transpose_indices
803
804	IMPLICIT NONE
805
806	INTEGER :: i, j, k, l, zs
807
808	REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
809
810	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
811
812
813	!
814	!-- If the PE grid is one-dimensional along y, the array has only to be
815	!-- reordered locally and therefore no transposition has to be done.
816	IF ( pdims(1) /= 1 ) THEN
817
818	#if defined( __parallel )
819	!
820	!-- Reorder input array for transposition
821	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
822	!$OMP DO
823	!$acc data copyout( work )
824	DO l = 0, pdims(1) - 1
825	zs = 1 + l * ( nzt_y - nzb_y + 1 )
826	!$acc kernels present( f_in, work )
827	DO j = nys_z, nyn_z
828	DO k = zs, zs + nzt_y - nzb_y
829	DO i = nxl_z, nxr_z
830	work(i,k-zs+1,j,l) = f_in(i,j,k)
831	ENDDO
832	ENDDO
833	ENDDO
834	!$acc end kernels
835	ENDDO
836	!$acc end data
837	!$OMP END PARALLEL
838
839	!
840	!-- Transpose array
841	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start', cpu_log_nowait )
842	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
843	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
844	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
845	comm1dx, ierr )
846	!$acc update device( f_inv )
847	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
848	#endif
849
850	ELSE
851	!
852	!-- Reorder the array in the same way like ALLTOALL did it
853	!$OMP PARALLEL PRIVATE ( i, j, k )
854	!$OMP DO
855	!$acc kernels present( f_in, f_inv )
856	DO k = nzb_y, nzt_y
857	DO j = 0, ny
858	DO i = nxl_y, nxr_y
859	f_inv(i,k,j) = f_in(i,j,k)
860	ENDDO
861	ENDDO
862	ENDDO
863	!$acc end kernels
864	!$OMP END PARALLEL
865
866	ENDIF
867
868	END SUBROUTINE transpose_zy
869
870
871	SUBROUTINE transpose_zyd( f_in, f_out )
872
873	!------------------------------------------------------------------------------!
874	! Description:
875	! ------------
876	! Transposition of input array (f_in) from z to y. For the input array, all
877	! elements along z reside on the same PE, while after transposition, all
878	! elements along y reside on the same PE.
879	! This is a direct transposition for arrays with indices in regular order
880	! (k,j,i) (cf. transpose_zy).
881	!------------------------------------------------------------------------------!
882
883	USE cpulog
884	USE indices
885	USE pegrid
886	USE transpose_indices
887
888	IMPLICIT NONE
889
890	INTEGER :: i, j, k, l, m, ys
891
892	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
893	f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd), &
894	work(nnxnnynnz)
895
896	#if defined( __parallel )
897
898	!
899	!-- Rearrange indices of input array in order to make data to be send
900	!-- by MPI contiguous
901	DO i = nxl, nxr
902	DO j = nys, nyn
903	DO k = 1, nz
904	f_inv(j,i,k) = f_in(k,j,i)
905	ENDDO
906	ENDDO
907	ENDDO
908
909	!
910	!-- Move data to different array, because memory location of work1 is
911	!-- needed further below (work1 = work2).
912	!-- If the PE grid is one-dimensional along x, only local reordering
913	!-- of the data is necessary and no transposition has to be done.
914	IF ( pdims(2) == 1 ) THEN
915	DO k = 1, nz
916	DO i = nxl, nxr
917	DO j = nys, nyn
918	f_out(j,i,k) = f_inv(j,i,k)
919	ENDDO
920	ENDDO
921	ENDDO
922	RETURN
923	ENDIF
924
925	!
926	!-- Transpose array
927	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
928	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
929	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
930	work(1), sendrecvcount_zyd, MPI_REAL, &
931	comm1dy, ierr )
932	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
933
934	!
935	!-- Reorder transposed array
936	m = 0
937	DO l = 0, pdims(2) - 1
938	ys = 0 + l * nny
939	DO k = nzb_yd, nzt_yd
940	DO i = nxl_yd, nxr_yd
941	DO j = ys, ys + nny - 1
942	m = m + 1
943	f_out(j,i,k) = work(m)
944	ENDDO
945	ENDDO
946	ENDDO
947	ENDDO
948
949	#endif
950
951	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Download in other formats:

| Impressum | ©Leibniz Universität Hannover |