Home

Context Navigation

transpose.f90 @ 1257

Last change on this file since 1257 was 1257, checked in by raasch, 10 years ago

New:
---

openACC porting of timestep calculation
(modules, timestep, time_integration)

Changed:

openACC loop directives and vector clauses removed (because they do not give any performance improvement with PGI
compiler versions > 13.6)
(advec_ws, buoyancy, coriolis, diffusion_e, diffusion_s, diffusion_u, diffusion_v, diffusion_w, diffusivities, exchange_horiz, fft_xy, pres, production_e, transpose, tridia_solver, wall_fluxes)

openACC loop independent clauses added
(boundary_conds, prandtl_fluxes, pres)

openACC declare create statements moved after FORTRAN declaration statement
(diffusion_u, diffusion_v, diffusion_w, fft_xy, poisfft, production_e, tridia_solver)

openACC end parallel replaced by end parallel loop
(flow_statistics, pres)

openACC "kernels do" replaced by "kernels loop"
(prandtl_fluxes)

output format for theta* changed to avoid output of *
(run_control)

Errors:

bugfix for calculation of advective timestep (old version may cause wrong timesteps in case of
vertixcally stretched grids)
Attention: standard run-control output has changed!
(timestep)

Property svn:keywords set to Id

File size: 25.9 KB

Line
1	SUBROUTINE resort_for_xy( f_in, f_inv )
2
3	!--------------------------------------------------------------------------------!
4	! This file is part of PALM.
5	!
6	! PALM is free software: you can redistribute it and/or modify it under the terms
7	! of the GNU General Public License as published by the Free Software Foundation,
8	! either version 3 of the License, or (at your option) any later version.
9	!
10	! PALM is distributed in the hope that it will be useful, but WITHOUT ANY
11	! WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
12	! A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13	!
14	! You should have received a copy of the GNU General Public License along with
15	! PALM. If not, see <http://www.gnu.org/licenses/>.
16	!
17	! Copyright 1997-2012 Leibniz University Hannover
18	!--------------------------------------------------------------------------------!
19	!
20	! Current revisions:
21	! -----------------
22	! openacc loop and loop vector clauses removed
23	!
24	! Former revisions:
25	! -----------------
26	! $Id: transpose.f90 1257 2013-11-08 15:18:40Z raasch $
27	!
28	! 1216 2013-08-26 09:31:42Z raasch
29	! re-sorting of the transposed / to be transposed arrays moved to separate
30	! routines resort_for_...
31	!
32	! 1111 2013-03-08 23:54:10Z raasch
33	! openACC directives added,
34	! resorting data from/to work changed, work got 4 dimensions instead of 1
35	!
36	! 1106 2013-03-04 05:31:38Z raasch
37	! preprocessor lines rearranged so that routines can also be used in serial
38	! (non-parallel) mode
39	!
40	! 1092 2013-02-02 11:24:22Z raasch
41	! unused variables removed
42	!
43	! 1036 2012-10-22 13:43:42Z raasch
44	! code put under GPL (PALM 3.9)
45	!
46	! 1003 2012-09-14 14:35:53Z raasch
47	! indices nxa, nya, etc. replaced by nx, ny, etc.
48	!
49	! 683 2011-02-09 14:25:15Z raasch
50	! openMP parallelization of transpositions for 2d-domain-decomposition
51	!
52	! 622 2010-12-10 08:08:13Z raasch
53	! optional barriers included in order to speed up collective operations
54	!
55	! 164 2008-05-15 08:46:15Z raasch
56	! f_inv changed from subroutine argument to automatic array in order to do
57	! re-ordering from f_in to f_inv in one step, one array work is needed instead
58	! of work1 and work2
59	!
60	! February 2007
61	! RCS Log replace by Id keyword, revision history cleaned up
62	!
63	! Revision 1.2 2004/04/30 13:12:17 raasch
64	! Switched from mpi_alltoallv to the simpler mpi_alltoall,
65	! all former transpose-routine files collected in this file, enlarged
66	! transposition arrays introduced
67	!
68	! Revision 1.1 2004/04/30 13:08:16 raasch
69	! Initial revision (collection of former routines transpose_xy, transpose_xz,
70	! transpose_yx, transpose_yz, transpose_zx, transpose_zy)
71	!
72	! Revision 1.1 1997/07/24 11:25:18 raasch
73	! Initial revision
74	!
75	!------------------------------------------------------------------------------!
76	! Description:
77	! ------------
78	! Resorting data for the transposition from x to y. The transposition itself
79	! is carried out in transpose_xy
80	!------------------------------------------------------------------------------!
81
82	USE indices
83	USE transpose_indices
84
85	IMPLICIT NONE
86
87	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
88	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
89
90
91	INTEGER :: i, j, k
92
93	!
94	!-- Rearrange indices of input array in order to make data to be send
95	!-- by MPI contiguous
96	!$OMP PARALLEL PRIVATE ( i, j, k )
97	!$OMP DO
98	!$acc kernels present( f_in, f_inv )
99	DO i = 0, nx
100	DO k = nzb_x, nzt_x
101	DO j = nys_x, nyn_x
102	f_inv(j,k,i) = f_in(i,j,k)
103	ENDDO
104	ENDDO
105	ENDDO
106	!$acc end kernels
107	!$OMP END PARALLEL
108
109	END SUBROUTINE resort_for_xy
110
111
112	SUBROUTINE transpose_xy( f_inv, f_out )
113
114	!------------------------------------------------------------------------------!
115	! Description:
116	! ------------
117	! Transposition of input array (f_in) from x to y. For the input array, all
118	! elements along x reside on the same PE, while after transposition, all
119	! elements along y reside on the same PE.
120	!------------------------------------------------------------------------------!
121
122	USE cpulog
123	USE indices
124	USE interfaces
125	USE pegrid
126	USE transpose_indices
127
128	IMPLICIT NONE
129
130	INTEGER :: i, j, k, l, ys
131
132	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx), f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
133
134	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
135
136
137	IF ( numprocs /= 1 ) THEN
138
139	#if defined( __parallel )
140	!
141	!-- Transpose array
142	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
143	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
144	!$acc update host( f_inv )
145	CALL MPI_ALLTOALL( f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
146	work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
147	comm1dy, ierr )
148	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
149
150	!
151	!-- Reorder transposed array
152	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
153	!$OMP DO
154	!$acc data copyin( work )
155	DO l = 0, pdims(2) - 1
156	ys = 0 + l * ( nyn_x - nys_x + 1 )
157	!$acc kernels present( f_out, work )
158	DO i = nxl_y, nxr_y
159	DO k = nzb_y, nzt_y
160	DO j = ys, ys + nyn_x - nys_x
161	f_out(j,i,k) = work(j-ys+1,k,i,l)
162	ENDDO
163	ENDDO
164	ENDDO
165	!$acc end kernels
166	ENDDO
167	!$acc end data
168	!$OMP END PARALLEL
169	#endif
170
171	ELSE
172
173	!
174	!-- Reorder transposed array
175	!$OMP PARALLEL PRIVATE ( i, j, k )
176	!$OMP DO
177	!$acc kernels present( f_inv, f_out )
178	DO k = nzb_y, nzt_y
179	DO i = nxl_y, nxr_y
180	DO j = 0, ny
181	f_out(j,i,k) = f_inv(j,k,i)
182	ENDDO
183	ENDDO
184	ENDDO
185	!$acc end kernels
186	!$OMP END PARALLEL
187
188	ENDIF
189
190	END SUBROUTINE transpose_xy
191
192
193	SUBROUTINE resort_for_xz( f_inv, f_out )
194
195	!------------------------------------------------------------------------------!
196	! Description:
197	! ------------
198	! Resorting data after the transposition from x to z. The transposition itself
199	! is carried out in transpose_xz
200	!------------------------------------------------------------------------------!
201
202	USE indices
203	USE transpose_indices
204
205	IMPLICIT NONE
206
207	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
208	REAL :: f_out(1:nz,nys:nyn,nxl:nxr)
209
210
211	INTEGER :: i, j, k
212
213	!
214	!-- Rearrange indices of input array in order to make data to be send
215	!-- by MPI contiguous.
216	!-- In case of parallel fft/transposition, scattered store is faster in
217	!-- backward direction!!!
218	!$OMP PARALLEL PRIVATE ( i, j, k )
219	!$OMP DO
220	!$acc kernels present( f_inv, f_out )
221	DO k = 1, nz
222	DO i = nxl, nxr
223	DO j = nys, nyn
224	f_out(k,j,i) = f_inv(j,i,k)
225	ENDDO
226	ENDDO
227	ENDDO
228	!$acc end kernels
229	!$OMP END PARALLEL
230
231	END SUBROUTINE resort_for_xz
232
233
234	SUBROUTINE transpose_xz( f_in, f_inv )
235
236	!------------------------------------------------------------------------------!
237	! Description:
238	! ------------
239	! Transposition of input array (f_in) from x to z. For the input array, all
240	! elements along x reside on the same PE, while after transposition, all
241	! elements along z reside on the same PE.
242	!------------------------------------------------------------------------------!
243
244	USE cpulog
245	USE indices
246	USE interfaces
247	USE pegrid
248	USE transpose_indices
249
250	IMPLICIT NONE
251
252	INTEGER :: i, j, k, l, xs
253
254	REAL :: f_in(0:nx,nys_x:nyn_x,nzb_x:nzt_x), f_inv(nys:nyn,nxl:nxr,1:nz)
255
256	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
257
258
259	!
260	!-- If the PE grid is one-dimensional along y, the array has only to be
261	!-- reordered locally and therefore no transposition has to be done.
262	IF ( pdims(1) /= 1 ) THEN
263
264	#if defined( __parallel )
265	!
266	!-- Reorder input array for transposition
267	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
268	!$OMP DO
269	!$acc data copyout( work )
270	DO l = 0, pdims(1) - 1
271	xs = 0 + l * nnx
272	!$acc kernels present( f_in, work )
273	DO k = nzb_x, nzt_x
274	DO i = xs, xs + nnx - 1
275	DO j = nys_x, nyn_x
276	work(j,i-xs+1,k,l) = f_in(i,j,k)
277	ENDDO
278	ENDDO
279	ENDDO
280	!$acc end kernels
281	ENDDO
282	!$acc end data
283	!$OMP END PARALLEL
284
285	!
286	!-- Transpose array
287	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
288	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
289	CALL MPI_ALLTOALL( work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
290	f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
291	comm1dx, ierr )
292	!$acc update device( f_inv )
293	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
294	#endif
295
296	ELSE
297
298	!
299	!-- Reorder the array in a way that the z index is in first position
300	!$OMP PARALLEL PRIVATE ( i, j, k )
301	!$OMP DO
302	!$acc kernels present( f_in, f_inv )
303	DO i = nxl, nxr
304	DO j = nys, nyn
305	DO k = 1, nz
306	f_inv(j,i,k) = f_in(i,j,k)
307	ENDDO
308	ENDDO
309	ENDDO
310	!$acc end kernels
311	!$OMP END PARALLEL
312
313	ENDIF
314
315	END SUBROUTINE transpose_xz
316
317
318	SUBROUTINE resort_for_yx( f_inv, f_out )
319
320	!------------------------------------------------------------------------------!
321	! Description:
322	! ------------
323	! Resorting data after the transposition from y to x. The transposition itself
324	! is carried out in transpose_yx
325	!------------------------------------------------------------------------------!
326
327	USE indices
328	USE transpose_indices
329
330	IMPLICIT NONE
331
332	REAL :: f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
333	REAL :: f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
334
335
336	INTEGER :: i, j, k
337
338	!
339	!-- Rearrange indices of input array in order to make data to be send
340	!-- by MPI contiguous
341	!$OMP PARALLEL PRIVATE ( i, j, k )
342	!$OMP DO
343	!$acc kernels present( f_inv, f_out )
344	DO i = 0, nx
345	DO k = nzb_x, nzt_x
346	DO j = nys_x, nyn_x
347	f_out(i,j,k) = f_inv(j,k,i)
348	ENDDO
349	ENDDO
350	ENDDO
351	!$acc end kernels
352	!$OMP END PARALLEL
353
354	END SUBROUTINE resort_for_yx
355
356
357	SUBROUTINE transpose_yx( f_in, f_inv )
358
359	!------------------------------------------------------------------------------!
360	! Description:
361	! ------------
362	! Transposition of input array (f_in) from y to x. For the input array, all
363	! elements along y reside on the same PE, while after transposition, all
364	! elements along x reside on the same PE.
365	!------------------------------------------------------------------------------!
366
367	USE cpulog
368	USE indices
369	USE interfaces
370	USE pegrid
371	USE transpose_indices
372
373	IMPLICIT NONE
374
375	INTEGER :: i, j, k, l, ys
376
377	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y), f_inv(nys_x:nyn_x,nzb_x:nzt_x,0:nx)
378
379	REAL, DIMENSION(nyn_x-nys_x+1,nzb_y:nzt_y,nxl_y:nxr_y,0:pdims(2)-1) :: work
380
381
382	IF ( numprocs /= 1 ) THEN
383
384	#if defined( __parallel )
385	!
386	!-- Reorder input array for transposition
387	!$OMP PARALLEL PRIVATE ( i, j, k, l, ys )
388	!$OMP DO
389	!$acc data copyout( work )
390	DO l = 0, pdims(2) - 1
391	ys = 0 + l * ( nyn_x - nys_x + 1 )
392	!$acc kernels present( f_in, work )
393	DO i = nxl_y, nxr_y
394	DO k = nzb_y, nzt_y
395	DO j = ys, ys + nyn_x - nys_x
396	work(j-ys+1,k,i,l) = f_in(j,i,k)
397	ENDDO
398	ENDDO
399	ENDDO
400	!$acc end kernels
401	ENDDO
402	!$acc end data
403	!$OMP END PARALLEL
404
405	!
406	!-- Transpose array
407	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
408	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
409	CALL MPI_ALLTOALL( work(1,nzb_y,nxl_y,0), sendrecvcount_xy, MPI_REAL, &
410	f_inv(nys_x,nzb_x,0), sendrecvcount_xy, MPI_REAL, &
411	comm1dy, ierr )
412	!$acc update device( f_inv )
413	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
414	#endif
415
416	ELSE
417
418	!
419	!-- Reorder array f_in the same way as ALLTOALL did it
420	!$OMP PARALLEL PRIVATE ( i, j, k )
421	!$OMP DO
422	!$acc kernels present( f_in, f_inv )
423	DO i = nxl_y, nxr_y
424	DO k = nzb_y, nzt_y
425	DO j = 0, ny
426	f_inv(j,k,i) = f_in(j,i,k)
427	ENDDO
428	ENDDO
429	ENDDO
430	!$acc end kernels
431	!$OMP END PARALLEL
432
433	ENDIF
434
435	END SUBROUTINE transpose_yx
436
437
438	SUBROUTINE transpose_yxd( f_in, f_out )
439
440	!------------------------------------------------------------------------------!
441	! Description:
442	! ------------
443	! Transposition of input array (f_in) from y to x. For the input array, all
444	! elements along y reside on the same PE, while after transposition, all
445	! elements along x reside on the same PE.
446	! This is a direct transposition for arrays with indices in regular order
447	! (k,j,i) (cf. transpose_yx).
448	!------------------------------------------------------------------------------!
449
450	USE cpulog
451	USE indices
452	USE interfaces
453	USE pegrid
454	USE transpose_indices
455
456	IMPLICIT NONE
457
458	INTEGER :: i, j, k, l, m, xs
459
460	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nxl:nxr,1:nz,nys:nyn), &
461	f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x), &
462	work(nnxnnynnz)
463
464	#if defined( __parallel )
465
466	!
467	!-- Rearrange indices of input array in order to make data to be send
468	!-- by MPI contiguous
469	DO k = 1, nz
470	DO j = nys, nyn
471	DO i = nxl, nxr
472	f_inv(i,k,j) = f_in(k,j,i)
473	ENDDO
474	ENDDO
475	ENDDO
476
477	!
478	!-- Transpose array
479	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
480	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
481	CALL MPI_ALLTOALL( f_inv(nxl,1,nys), sendrecvcount_xy, MPI_REAL, &
482	work(1), sendrecvcount_xy, MPI_REAL, &
483	comm1dx, ierr )
484	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
485
486	!
487	!-- Reorder transposed array
488	m = 0
489	DO l = 0, pdims(1) - 1
490	xs = 0 + l * nnx
491	DO j = nys_x, nyn_x
492	DO k = 1, nz
493	DO i = xs, xs + nnx - 1
494	m = m + 1
495	f_out(i,j,k) = work(m)
496	ENDDO
497	ENDDO
498	ENDDO
499	ENDDO
500
501	#endif
502
503	END SUBROUTINE transpose_yxd
504
505
506	SUBROUTINE resort_for_yz( f_in, f_inv )
507
508	!------------------------------------------------------------------------------!
509	! Description:
510	! ------------
511	! Resorting data for the transposition from y to z. The transposition itself
512	! is carried out in transpose_yz
513	!------------------------------------------------------------------------------!
514
515	USE indices
516	USE transpose_indices
517
518	IMPLICIT NONE
519
520	REAL :: f_in(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
521	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
522
523
524	INTEGER :: i, j, k
525
526	!
527	!-- Rearrange indices of input array in order to make data to be send
528	!-- by MPI contiguous
529	!$OMP PARALLEL PRIVATE ( i, j, k )
530	!$OMP DO
531	!$acc kernels present( f_in, f_inv )
532	DO j = 0, ny
533	DO k = nzb_y, nzt_y
534	DO i = nxl_y, nxr_y
535	f_inv(i,k,j) = f_in(j,i,k)
536	ENDDO
537	ENDDO
538	ENDDO
539	!$acc end kernels
540	!$OMP END PARALLEL
541
542	END SUBROUTINE resort_for_yz
543
544
545	SUBROUTINE transpose_yz( f_inv, f_out )
546
547	!------------------------------------------------------------------------------!
548	! Description:
549	! ------------
550	! Transposition of input array (f_in) from y to z. For the input array, all
551	! elements along y reside on the same PE, while after transposition, all
552	! elements along z reside on the same PE.
553	!------------------------------------------------------------------------------!
554
555	USE cpulog
556	USE indices
557	USE interfaces
558	USE pegrid
559	USE transpose_indices
560
561	IMPLICIT NONE
562
563	INTEGER :: i, j, k, l, zs
564
565	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny), f_out(nxl_z:nxr_z,nys_z:nyn_z,1:nz)
566
567	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
568
569
570	!
571	!-- If the PE grid is one-dimensional along y, only local reordering
572	!-- of the data is necessary and no transposition has to be done.
573	IF ( pdims(1) == 1 ) THEN
574
575	!$OMP PARALLEL PRIVATE ( i, j, k )
576	!$OMP DO
577	!$acc kernels present( f_inv, f_out )
578	DO j = 0, ny
579	DO k = nzb_y, nzt_y
580	DO i = nxl_y, nxr_y
581	f_out(i,j,k) = f_inv(i,k,j)
582	ENDDO
583	ENDDO
584	ENDDO
585	!$acc end kernels
586	!$OMP END PARALLEL
587
588	ELSE
589
590	#if defined( __parallel )
591	!
592	!-- Transpose array
593	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
594	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
595	!$acc update host( f_inv )
596	CALL MPI_ALLTOALL( f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
597	work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
598	comm1dx, ierr )
599	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
600
601	!
602	!-- Reorder transposed array
603	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
604	!$OMP DO
605	!$acc data copyin( work )
606	DO l = 0, pdims(1) - 1
607	zs = 1 + l * ( nzt_y - nzb_y + 1 )
608	!$acc kernels present( f_out )
609	DO j = nys_z, nyn_z
610	DO k = zs, zs + nzt_y - nzb_y
611	DO i = nxl_z, nxr_z
612	f_out(i,j,k) = work(i,k-zs+1,j,l)
613	ENDDO
614	ENDDO
615	ENDDO
616	!$acc end kernels
617	ENDDO
618	!$acc end data
619	!$OMP END PARALLEL
620	#endif
621
622	ENDIF
623
624	END SUBROUTINE transpose_yz
625
626
627	SUBROUTINE resort_for_zx( f_in, f_inv )
628
629	!------------------------------------------------------------------------------!
630	! Description:
631	! ------------
632	! Resorting data for the transposition from z to x. The transposition itself
633	! is carried out in transpose_zx
634	!------------------------------------------------------------------------------!
635
636	USE indices
637	USE transpose_indices
638
639	IMPLICIT NONE
640
641	REAL :: f_in(1:nz,nys:nyn,nxl:nxr)
642	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz)
643
644
645	INTEGER :: i, j, k
646
647	!
648	!-- Rearrange indices of input array in order to make data to be send
649	!-- by MPI contiguous
650	!$OMP PARALLEL PRIVATE ( i, j, k )
651	!$OMP DO
652	!$acc kernels present( f_in, f_inv )
653	DO k = 1,nz
654	DO i = nxl, nxr
655	DO j = nys, nyn
656	f_inv(j,i,k) = f_in(k,j,i)
657	ENDDO
658	ENDDO
659	ENDDO
660	!$acc end kernels
661	!$OMP END PARALLEL
662
663	END SUBROUTINE resort_for_zx
664
665
666	SUBROUTINE transpose_zx( f_inv, f_out )
667
668	!------------------------------------------------------------------------------!
669	! Description:
670	! ------------
671	! Transposition of input array (f_in) from z to x. For the input array, all
672	! elements along z reside on the same PE, while after transposition, all
673	! elements along x reside on the same PE.
674	!------------------------------------------------------------------------------!
675
676	USE cpulog
677	USE indices
678	USE interfaces
679	USE pegrid
680	USE transpose_indices
681
682	IMPLICIT NONE
683
684	INTEGER :: i, j, k, l, xs
685
686	REAL :: f_inv(nys:nyn,nxl:nxr,1:nz), f_out(0:nx,nys_x:nyn_x,nzb_x:nzt_x)
687
688	REAL, DIMENSION(nys_x:nyn_x,nnx,nzb_x:nzt_x,0:pdims(1)-1) :: work
689
690
691	!
692	!-- If the PE grid is one-dimensional along y, only local reordering
693	!-- of the data is necessary and no transposition has to be done.
694	IF ( pdims(1) == 1 ) THEN
695
696	!$OMP PARALLEL PRIVATE ( i, j, k )
697	!$OMP DO
698	!$acc kernels present( f_inv, f_out )
699	DO k = 1, nz
700	DO i = nxl, nxr
701	DO j = nys, nyn
702	f_out(i,j,k) = f_inv(j,i,k)
703	ENDDO
704	ENDDO
705	ENDDO
706	!$acc end kernels
707	!$OMP END PARALLEL
708
709	ELSE
710
711	#if defined( __parallel )
712	!
713	!-- Transpose array
714	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
715	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
716	!$acc update host( f_inv )
717	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zx, MPI_REAL, &
718	work(nys_x,1,nzb_x,0), sendrecvcount_zx, MPI_REAL, &
719	comm1dx, ierr )
720	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
721
722	!
723	!-- Reorder transposed array
724	!$OMP PARALLEL PRIVATE ( i, j, k, l, xs )
725	!$OMP DO
726	!$acc data copyin( work )
727	DO l = 0, pdims(1) - 1
728	xs = 0 + l * nnx
729	!$acc kernels present( f_out )
730	DO k = nzb_x, nzt_x
731	DO i = xs, xs + nnx - 1
732	DO j = nys_x, nyn_x
733	f_out(i,j,k) = work(j,i-xs+1,k,l)
734	ENDDO
735	ENDDO
736	ENDDO
737	!$acc end kernels
738	ENDDO
739	!$acc end data
740	!$OMP END PARALLEL
741	#endif
742
743	ENDIF
744
745	END SUBROUTINE transpose_zx
746
747
748	SUBROUTINE resort_for_zy( f_inv, f_out )
749
750	!------------------------------------------------------------------------------!
751	! Description:
752	! ------------
753	! Resorting data after the transposition from z to y. The transposition itself
754	! is carried out in transpose_zy
755	!------------------------------------------------------------------------------!
756
757	USE indices
758	USE transpose_indices
759
760	IMPLICIT NONE
761
762	REAL :: f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
763	REAL :: f_out(0:ny,nxl_y:nxr_y,nzb_y:nzt_y)
764
765
766	INTEGER :: i, j, k
767
768	!
769	!-- Rearrange indices of input array in order to make data to be send
770	!-- by MPI contiguous
771	!$OMP PARALLEL PRIVATE ( i, j, k )
772	!$OMP DO
773	!$acc kernels present( f_inv, f_out )
774	DO k = nzb_y, nzt_y
775	DO j = 0, ny
776	DO i = nxl_y, nxr_y
777	f_out(j,i,k) = f_inv(i,k,j)
778	ENDDO
779	ENDDO
780	ENDDO
781	!$acc end kernels
782	!$OMP END PARALLEL
783
784	END SUBROUTINE resort_for_zy
785
786
787	SUBROUTINE transpose_zy( f_in, f_inv )
788
789	!------------------------------------------------------------------------------!
790	! Description:
791	! ------------
792	! Transposition of input array (f_in) from z to y. For the input array, all
793	! elements along z reside on the same PE, while after transposition, all
794	! elements along y reside on the same PE.
795	!------------------------------------------------------------------------------!
796
797	USE cpulog
798	USE indices
799	USE interfaces
800	USE pegrid
801	USE transpose_indices
802
803	IMPLICIT NONE
804
805	INTEGER :: i, j, k, l, zs
806
807	REAL :: f_in(nxl_z:nxr_z,nys_z:nyn_z,1:nz), f_inv(nxl_y:nxr_y,nzb_y:nzt_y,0:ny)
808
809	REAL, DIMENSION(nxl_z:nxr_z,nzt_y-nzb_y+1,nys_z:nyn_z,0:pdims(1)-1) :: work
810
811
812	!
813	!-- If the PE grid is one-dimensional along y, the array has only to be
814	!-- reordered locally and therefore no transposition has to be done.
815	IF ( pdims(1) /= 1 ) THEN
816
817	#if defined( __parallel )
818	!
819	!-- Reorder input array for transposition
820	!$OMP PARALLEL PRIVATE ( i, j, k, l, zs )
821	!$OMP DO
822	!$acc data copyout( work )
823	DO l = 0, pdims(1) - 1
824	zs = 1 + l * ( nzt_y - nzb_y + 1 )
825	!$acc kernels present( f_in, work )
826	DO j = nys_z, nyn_z
827	DO k = zs, zs + nzt_y - nzb_y
828	DO i = nxl_z, nxr_z
829	work(i,k-zs+1,j,l) = f_in(i,j,k)
830	ENDDO
831	ENDDO
832	ENDDO
833	!$acc end kernels
834	ENDDO
835	!$acc end data
836	!$OMP END PARALLEL
837
838	!
839	!-- Transpose array
840	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
841	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
842	CALL MPI_ALLTOALL( work(nxl_z,1,nys_z,0), sendrecvcount_yz, MPI_REAL, &
843	f_inv(nxl_y,nzb_y,0), sendrecvcount_yz, MPI_REAL, &
844	comm1dx, ierr )
845	!$acc update device( f_inv )
846	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
847	#endif
848
849	ELSE
850	!
851	!-- Reorder the array in the same way like ALLTOALL did it
852	!$OMP PARALLEL PRIVATE ( i, j, k )
853	!$OMP DO
854	!$acc kernels present( f_in, f_inv )
855	DO k = nzb_y, nzt_y
856	DO j = 0, ny
857	DO i = nxl_y, nxr_y
858	f_inv(i,k,j) = f_in(i,j,k)
859	ENDDO
860	ENDDO
861	ENDDO
862	!$acc end kernels
863	!$OMP END PARALLEL
864
865	ENDIF
866
867	END SUBROUTINE transpose_zy
868
869
870	SUBROUTINE transpose_zyd( f_in, f_out )
871
872	!------------------------------------------------------------------------------!
873	! Description:
874	! ------------
875	! Transposition of input array (f_in) from z to y. For the input array, all
876	! elements along z reside on the same PE, while after transposition, all
877	! elements along y reside on the same PE.
878	! This is a direct transposition for arrays with indices in regular order
879	! (k,j,i) (cf. transpose_zy).
880	!------------------------------------------------------------------------------!
881
882	USE cpulog
883	USE indices
884	USE interfaces
885	USE pegrid
886	USE transpose_indices
887
888	IMPLICIT NONE
889
890	INTEGER :: i, j, k, l, m, ys
891
892	REAL :: f_in(1:nz,nys:nyn,nxl:nxr), f_inv(nys:nyn,nxl:nxr,1:nz), &
893	f_out(0:ny,nxl_yd:nxr_yd,nzb_yd:nzt_yd), &
894	work(nnxnnynnz)
895
896	#if defined( __parallel )
897
898	!
899	!-- Rearrange indices of input array in order to make data to be send
900	!-- by MPI contiguous
901	DO i = nxl, nxr
902	DO j = nys, nyn
903	DO k = 1, nz
904	f_inv(j,i,k) = f_in(k,j,i)
905	ENDDO
906	ENDDO
907	ENDDO
908
909	!
910	!-- Move data to different array, because memory location of work1 is
911	!-- needed further below (work1 = work2).
912	!-- If the PE grid is one-dimensional along x, only local reordering
913	!-- of the data is necessary and no transposition has to be done.
914	IF ( pdims(2) == 1 ) THEN
915	DO k = 1, nz
916	DO i = nxl, nxr
917	DO j = nys, nyn
918	f_out(j,i,k) = f_inv(j,i,k)
919	ENDDO
920	ENDDO
921	ENDDO
922	RETURN
923	ENDIF
924
925	!
926	!-- Transpose array
927	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'start' )
928	IF ( collective_wait ) CALL MPI_BARRIER( comm2d, ierr )
929	CALL MPI_ALLTOALL( f_inv(nys,nxl,1), sendrecvcount_zyd, MPI_REAL, &
930	work(1), sendrecvcount_zyd, MPI_REAL, &
931	comm1dy, ierr )
932	CALL cpu_log( log_point_s(32), 'mpi_alltoall', 'stop' )
933
934	!
935	!-- Reorder transposed array
936	m = 0
937	DO l = 0, pdims(2) - 1
938	ys = 0 + l * nny
939	DO k = nzb_yd, nzt_yd
940	DO i = nxl_yd, nxr_yd
941	DO j = ys, ys + nny - 1
942	m = m + 1
943	f_out(j,i,k) = work(m)
944	ENDDO
945	ENDDO
946	ENDDO
947	ENDDO
948
949	#endif
950
951	END SUBROUTINE transpose_zyd

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: palm/trunk/SOURCE/transpose.f90 @ 1257

Download in other formats: