source: palm/trunk/SOURCE/cpu_statistics.f90 @ 1016

Last change on this file since 1016 was 1015, checked in by raasch, 12 years ago

Starting with changes required for GPU optimization. OpenACC statements for using NVIDIA GPUs added.
Adjustment of mixing length to the Prandtl mixing length at first grid point above ground removed.
mask array is set to zero at ghost boundaries

  • Property svn:keywords set to Id
File size: 9.7 KB
 SUBROUTINE cpu_statistics

!------------------------------------------------------------------------------!
! Current revisions:
! -----------------
! output of accelerator board information
!
! Former revisions:
! -----------------
! $Id: cpu_statistics.f90 1015 2012-09-27 09:23:24Z raasch $
!
! 683 2011-02-09 14:25:15Z raasch
! output of handling of ghostpoint exchange
!
! 622 2010-12-10 08:08:13Z raasch
! output of handling of collective operations
!
! 222 2009-01-12 16:04:16Z letzel
! Bugfix for nonparallel execution
!
! 197 2008-09-16 15:29:03Z raasch
! Format adjustments in order to allow CPU# > 999,
! data are collected from PE0 in an ordered sequence which seems to avoid
! hanging of processes on SGI-ICE
!
! 82 2007-04-16 15:40:52Z raasch
! Preprocessor directives for old systems removed
!
! RCS Log replaced by Id keyword, revision history cleaned up
!
! Revision 1.13  2006/04/26 12:10:51  raasch
! Output of number of threads per task, max = min in case of 1 PE
!
! Revision 1.1  1997/07/24 11:11:11  raasch
! Initial revision
!
!
! Description:
! ------------
! Analysis and output of the measured cpu-times. The results of all PEs
! are collected on PE0 in order to calculate the mean cpu-time over all
! PEs and further statistics. The output is sorted in descending order of
! consumed cpu-time and written by PE0.
!------------------------------------------------------------------------------!
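!
!-- Note on usage: log_point and log_point_s are assumed to be filled by
!-- paired calls of the cpulog routine cpu_log, sketched here in outline
!-- (the exact argument list is defined in the cpulog module):
!--
!--    CALL cpu_log( log_point(n), 'some place', 'start' )
!--    ... code section to be measured ...
!--    CALL cpu_log( log_point(n), 'some place', 'stop' )
!--
!-- Each such pair accumulates the elapsed time in log_point(n)%sum and
!-- increments log_point(n)%counts.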

    USE cpulog
    USE pegrid
    USE control_parameters

    IMPLICIT NONE

    INTEGER    ::  i, ii(1), iii, lp, sender
    REAL, SAVE ::  norm = 1.0
    REAL, DIMENSION(:),   ALLOCATABLE ::  pe_max, pe_min, pe_rms, sum
    REAL, DIMENSION(:,:), ALLOCATABLE ::  pe_log_points


!
!-- Compute cpu-times in seconds
    log_point%mtime  = log_point%mtime  / norm
    log_point%sum    = log_point%sum    / norm
    log_point%vector = log_point%vector / norm
    WHERE ( log_point%counts /= 0 )
       log_point%mean = log_point%sum / log_point%counts
    END WHERE

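!
!-- Note: norm is a unit-conversion factor; with norm = 1.0 the raw
!-- measurements are assumed to be in seconds already, so the divisions
!-- above are effectively no-ops, kept as a conversion hook for systems
!-- whose timer returns other units.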

!
!-- Collect cpu-times from all PEs and calculate statistics
    IF ( myid == 0 )  THEN
!
!--    Allocate and initialize temporary arrays needed for statistics
       ALLOCATE( pe_max( SIZE( log_point ) ), pe_min( SIZE( log_point ) ), &
                 pe_rms( SIZE( log_point ) ),                              &
                 pe_log_points( SIZE( log_point ), 0:numprocs-1 ) )
       pe_min = log_point%sum
       pe_max = log_point%sum    ! must be set in case of 1 PE
       pe_rms = 0.0

#if defined( __parallel )
!
!--    Receive data from all PEs
       DO  i = 1, numprocs-1
          CALL MPI_RECV( pe_max(1), SIZE( log_point ), MPI_REAL, &
                         i, i, comm2d, status, ierr )
          sender = status(MPI_SOURCE)
          pe_log_points(:,sender) = pe_max
       ENDDO
       pe_log_points(:,0) = log_point%sum   ! Results from PE0
!
!--    Calculate the mean over all PEs, store it in log_point%sum,
!--    and find the minimum and maximum
       DO  iii = 1, SIZE( log_point )
          DO  i = 1, numprocs-1
             log_point(iii)%sum = log_point(iii)%sum + pe_log_points(iii,i)
             pe_min(iii) = MIN( pe_min(iii), pe_log_points(iii,i) )
             pe_max(iii) = MAX( pe_max(iii), pe_log_points(iii,i) )
          ENDDO
          log_point(iii)%sum = log_point(iii)%sum / numprocs
!
!--       Calculate rms
          DO  i = 0, numprocs-1
             pe_rms(iii) = pe_rms(iii) + ( &
                                 pe_log_points(iii,i) - log_point(iii)%sum &
                                         )**2
          ENDDO
          pe_rms(iii) = SQRT( pe_rms(iii) / numprocs )
       ENDDO
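!
!--    Note: pe_rms is the standard deviation of the per-PE times about
!--    their mean; a value that is large relative to the mean indicates
!--    load imbalance between the PEs for that measured code section.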
    ELSE
!
!--    Send data to PE0 (pe_max is used as temporary storage in order to
!--    avoid sending the derived data type itself)
       ALLOCATE( pe_max( SIZE( log_point ) ) )
       pe_max = log_point%sum
       CALL MPI_SEND( pe_max(1), SIZE( log_point ), MPI_REAL, 0, myid, comm2d, &
                      ierr )
#endif

    ENDIF
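!
!-- In a serial run the statistics degenerate to pe_min = pe_max = mean
!-- and pe_rms = 0 (cf. "max = min in case of 1 PE" in the revision
!-- history above).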

!
!-- Write cpu-times
    IF ( myid == 0 )  THEN
!
!--    Store the sums in a work array; entries without any counts are
!--    flagged with -1.0
       ALLOCATE( sum( SIZE( log_point ) ) )
       WHERE ( log_point%counts /= 0 )
          sum = log_point%sum
       ELSEWHERE
          sum = -1.0
       ENDWHERE

!
!--    Write cpu-times sorted by size
       CALL check_open( 18 )
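!
!--    Note: unit 18 is assumed to be the cpu-measures output file here
!--    (opened via check_open; in PALM this is typically the local file
!--    CPU_MEASURES).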
#if defined( __parallel )
       WRITE ( 18, 100 )  TRIM( run_description_header ),        &
                          numprocs * threads_per_task, pdims(1), pdims(2), &
                          threads_per_task
       IF ( num_acc_per_node /= 0 )  WRITE ( 18, 108 )  num_acc_per_node
       WRITE ( 18, 110 )
#else
       WRITE ( 18, 100 )  TRIM( run_description_header ),        &
                          numprocs * threads_per_task, 1, 1, &
                          threads_per_task
       IF ( num_acc_per_node /= 0 )  WRITE ( 18, 109 )  num_acc_per_node
       WRITE ( 18, 110 )
#endif
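!
!--    Output in descending order of consumed cpu-time: repeatedly pick
!--    the largest remaining entry via MAXLOC and invalidate it with the
!--    -1.0 sentinel (effectively a selection sort).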
       DO
          ii = MAXLOC( sum )
          i = ii(1)
          IF ( sum(i) /= -1.0 )  THEN
             WRITE ( 18, 102 ) &
                log_point(i)%place, log_point(i)%sum,                &
                log_point(i)%sum / log_point(1)%sum * 100.0,         &
                log_point(i)%counts, pe_min(i), pe_max(i), pe_rms(i)
             sum(i) = -1.0
          ELSE
             EXIT
          ENDIF
       ENDDO
    ENDIF


!
!-- The same procedure again for the individual measurements.
!
!-- Compute cpu-times in seconds
    log_point_s%mtime  = log_point_s%mtime  / norm
    log_point_s%sum    = log_point_s%sum    / norm
    log_point_s%vector = log_point_s%vector / norm
    WHERE ( log_point_s%counts /= 0 )
       log_point_s%mean = log_point_s%sum / log_point_s%counts
    END WHERE

!
!-- Collect cpu-times from all PEs and calculate statistics
#if defined( __parallel )
!
!-- Set a barrier to prevent PE0 from receiving log_point_s data while it
!-- is still busy receiving log_point data (see above)
    CALL MPI_BARRIER( comm2d, ierr )
#endif
    IF ( myid == 0 )  THEN
!
!--    Initialize temporary arrays needed for statistics
       pe_min = log_point_s%sum
       pe_max = log_point_s%sum    ! must be set in case of 1 PE
       pe_rms = 0.0

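!
!--    Note: SIZE( log_point ) is also used as the message length for the
!--    log_point_s transfers below; this relies on log_point_s having the
!--    same number of entries as log_point (both are declared in the
!--    cpulog module).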
#if defined( __parallel )
!
!--    Receive data from all PEs
       DO  i = 1, numprocs-1
          CALL MPI_RECV( pe_max(1), SIZE( log_point ), MPI_REAL, &
                         MPI_ANY_SOURCE, MPI_ANY_TAG, comm2d, status, ierr )
          sender = status(MPI_SOURCE)
          pe_log_points(:,sender) = pe_max
       ENDDO
       pe_log_points(:,0) = log_point_s%sum   ! Results from PE0
!
!--    Calculate the mean over all PEs, store it in log_point_s%sum,
!--    and find the minimum and maximum
       DO  iii = 1, SIZE( log_point )
          DO  i = 1, numprocs-1
             log_point_s(iii)%sum = log_point_s(iii)%sum + pe_log_points(iii,i)
             pe_min(iii) = MIN( pe_min(iii), pe_log_points(iii,i) )
             pe_max(iii) = MAX( pe_max(iii), pe_log_points(iii,i) )
          ENDDO
          log_point_s(iii)%sum = log_point_s(iii)%sum / numprocs
!
!--       Calculate rms
          DO  i = 0, numprocs-1
             pe_rms(iii) = pe_rms(iii) + ( &
                                 pe_log_points(iii,i) - log_point_s(iii)%sum &
                                         )**2
          ENDDO
          pe_rms(iii) = SQRT( pe_rms(iii) / numprocs )
       ENDDO
    ELSE
!
!--    Send data to PE0 (pe_max is used as temporary storage in order to
!--    avoid sending the derived data type itself)
       pe_max = log_point_s%sum
       CALL MPI_SEND( pe_max(1), SIZE( log_point ), MPI_REAL, 0, 0, comm2d, &
                      ierr )
#endif

    ENDIF
!
!-- Write cpu-times
    IF ( myid == 0 )  THEN
!
!--    Store the sums again; entries without any counts are flagged
!--    with -1.0
       WHERE ( log_point_s%counts /= 0 )
          sum = log_point_s%sum
       ELSEWHERE
          sum = -1.0
       ENDWHERE

!
!--    Write cpu-times sorted by size
       WRITE ( 18, 101 )
       DO
          ii = MAXLOC( sum )
          i = ii(1)
          IF ( sum(i) /= -1.0 )  THEN
             WRITE ( 18, 102 ) &
                log_point_s(i)%place, log_point_s(i)%sum, &
                log_point_s(i)%sum / log_point(1)%sum * 100.0, &
                log_point_s(i)%counts, pe_min(i), pe_max(i), pe_rms(i)
             sum(i) = -1.0
          ELSE
             EXIT
          ENDIF
       ENDDO

!
!--    Output of the handling of MPI operations
       IF ( collective_wait )  THEN
          WRITE ( 18, 103 )
       ELSE
          WRITE ( 18, 104 )
       ENDIF
       IF ( synchronous_exchange )  THEN
          WRITE ( 18, 105 )
       ELSE
          WRITE ( 18, 106 )
       ENDIF

!
!--    Write empty lines to separate these results from those of possible
!--    model continuation runs
       WRITE ( 18, 107 )

!
!--    Unit 18 is not needed anymore
       CALL close_file( 18 )

    ENDIF


100 FORMAT (A/11('-')//'CPU measures for ',I5,' PEs (',I5,'(x) * ',I5,'(y', &
            &') tasks *',I5,' threads):')

101 FORMAT (/'special measures:'/ &
            &'-----------------------------------------------------------', &
            &'--------------------')

102 FORMAT (A20,2X,F9.3,2X,F7.2,1X,I7,3(1X,F9.3))
103 FORMAT (/'Barriers are set in front of collective operations')
104 FORMAT (/'No barriers are set in front of collective operations')
105 FORMAT (/'Exchange of ghostpoints via MPI_SENDRECV')
106 FORMAT (/'Exchange of ghostpoints via MPI_ISEND/MPI_IRECV')
107 FORMAT (//)
108 FORMAT ('Accelerator boards per node: ',I2)
109 FORMAT ('Accelerator boards: ',I2)
110 FORMAT ('----------------------------------------------------------',   &
            &'------------'//&
            &'place:                        mean        counts      min  ', &
            &'     max       rms'/ &
            &'                           sec.      %                sec. ', &
            &'     sec.      sec.'/  &
            &'-----------------------------------------------------------', &
            &'-------------------')

 END SUBROUTINE cpu_statistics
