1 // Copyright 2022 The ChromiumOS Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 use std::fmt;
6 use std::fmt::Write as _;
7 use std::mem;
8 use std::sync::Arc;
9 use std::sync::Mutex;
10 use std::sync::Weak;
11 use std::thread;
12 use std::thread::JoinHandle;
13 use std::time::Duration;
14
15 use base::error;
16 use base::AsRawDescriptor;
17 use base::Error as SysError;
18 use base::Event;
19 use base::EventToken;
20 use base::FromRawDescriptor;
21 use base::SafeDescriptor;
22 use base::WaitContext;
23 use chrono::DateTime;
24 use chrono::Local;
25 use winapi::shared::minwindef::DWORD;
26 use winapi::shared::minwindef::FILETIME;
27 use winapi::um::processthreadsapi::GetProcessTimes;
28 use winapi::um::processthreadsapi::GetSystemTimes;
29 use winapi::um::processthreadsapi::OpenProcess;
30 use winapi::um::psapi::GetProcessMemoryInfo;
31 use winapi::um::psapi::PROCESS_MEMORY_COUNTERS;
32 use winapi::um::psapi::PROCESS_MEMORY_COUNTERS_EX;
33 use winapi::um::winbase::GetProcessIoCounters;
34 use winapi::um::winnt::IO_COUNTERS;
35 use winapi::um::winnt::LARGE_INTEGER;
36 use winapi::um::winnt::LONGLONG;
37 use winapi::um::winnt::PROCESS_QUERY_LIMITED_INFORMATION;
38 use winapi::um::winnt::PROCESS_VM_READ;
39 use winapi::um::winnt::SYNCHRONIZE;
40
41 use crate::log_metric;
42 use crate::windows::Error;
43 use crate::windows::Result;
44 use crate::windows::METRIC_UPLOAD_INTERVAL_SECONDS;
45 use crate::MetricEventType;
46
47 const BYTES_PER_MB: usize = 1024 * 1024;
48 const WORKER_REPORT_INTERVAL: Duration = Duration::from_secs(1);
49
50 type SysResult<T> = std::result::Result<T, SysError>;
51
52 /// A worker job which periodically logs system metrics.
53 struct Worker {
54 exit_evt: Event,
55 io: Arc<Mutex<Option<ProcessIoRecord>>>,
56 measurements: Arc<Mutex<Option<Measurements>>>,
57 memory: Arc<Mutex<ProcessMemory>>,
58 memory_acc: Arc<Mutex<Option<ProcessMemoryAccumulated>>>,
59 metrics_string: Arc<Mutex<String>>,
60 }
61
62 impl Worker {
run(&mut self)63 fn run(&mut self) {
64 #[derive(EventToken)]
65 enum Token {
66 Exit,
67 }
68 let event_ctx: WaitContext<Token> =
69 match WaitContext::build_with(&[(&self.exit_evt, Token::Exit)]) {
70 Ok(event_ctx) => event_ctx,
71 Err(e) => {
72 error!("failed creating WaitContext: {}", e);
73 return;
74 }
75 };
76 let mut last_metric_upload_time = Local::now();
77 'poll: loop {
78 let events = match event_ctx.wait_timeout(WORKER_REPORT_INTERVAL) {
79 Ok(events) => events,
80 Err(e) => {
81 error!("failed polling for events: {}", e);
82 return;
83 }
84 };
85 if events.is_empty() {
86 self.collect_metrics();
87 // Time budget for UI thread is very limited.
88 // We make the metric string for displaying in UI in the
89 // worker thread for best performance.
90 self.make_metrics_string();
91
92 self.upload_metrics(&mut last_metric_upload_time);
93 }
94
95 if events.into_iter().any(|e| e.is_readable) {
96 break 'poll;
97 }
98 }
99 }
100
make_metrics_string(&mut self)101 fn make_metrics_string(&mut self) {
102 let mut metrics_string = self.metrics_string.lock().unwrap();
103 *metrics_string = format!(
104 "{}\n{}",
105 self.cpu_metrics_string(),
106 self.mem_metrics_string()
107 );
108 }
109
upload_metrics(&self, last_metric_upload_time: &mut DateTime<Local>)110 fn upload_metrics(&self, last_metric_upload_time: &mut DateTime<Local>) {
111 let time_elapsed = (Local::now() - *last_metric_upload_time).num_seconds();
112 if time_elapsed >= METRIC_UPLOAD_INTERVAL_SECONDS {
113 let mut memory_acc = self.memory_acc.lock().unwrap();
114 if let Some(acc) = &*memory_acc {
115 let mem = acc.accumulated.physical / acc.accumulated_count / BYTES_PER_MB;
116 // The i64 cast will not cause overflow because the mem is at most 10TB for
117 // Windows 10.
118 log_metric(MetricEventType::MemoryUsage, mem as i64);
119 }
120 *memory_acc = None;
121
122 let mut cpu_measurements = self.measurements.lock().unwrap();
123 if let Some(measurements) = &*cpu_measurements {
124 let sys_time = measurements.current.sys_time;
125 let process_time = measurements.current.process_time;
126 let prev_sys_time = measurements.last_upload.sys_time;
127 let prev_process_time = measurements.last_upload.process_time;
128
129 let diff_systime_kernel =
130 compute_filetime_subtraction(sys_time.kernel, prev_sys_time.kernel);
131 let diff_systime_user =
132 compute_filetime_subtraction(sys_time.user, prev_sys_time.user);
133
134 let diff_processtime_kernel =
135 compute_filetime_subtraction(process_time.kernel, prev_process_time.kernel);
136 let diff_processtime_user =
137 compute_filetime_subtraction(process_time.user, prev_process_time.user);
138
139 let total_systime = diff_systime_kernel + diff_systime_user;
140 let total_processtime = diff_processtime_kernel + diff_processtime_user;
141
142 if total_systime > 0 {
143 let cpu_usage = 100 * total_processtime / total_systime;
144 // The i64 cast will not cause overflow because the usage is at most 100.
145 log_metric(MetricEventType::CpuUsage, cpu_usage as i64);
146 }
147 }
148 *cpu_measurements = None;
149
150 let mut io = self.io.lock().unwrap();
151 if let Some(io_record) = &*io {
152 let new_io_read_bytes =
153 io_record.current.read_bytes - io_record.last_upload.read_bytes;
154 let new_io_write_bytes =
155 io_record.current.write_bytes - io_record.last_upload.write_bytes;
156
157 let ms_elapsed =
158 (io_record.current_time - io_record.last_upload_time).num_milliseconds();
159 if ms_elapsed > 0 {
160 let io_read_bytes_per_sec =
161 (new_io_read_bytes as f32) / (ms_elapsed as f32) * 1000.0;
162 let io_write_bytes_per_sec =
163 (new_io_write_bytes as f32) / (ms_elapsed as f32) * 1000.0;
164 log_metric(MetricEventType::ReadIo, io_read_bytes_per_sec as i64);
165 log_metric(MetricEventType::WriteIo, io_write_bytes_per_sec as i64);
166 }
167 }
168 *io = None;
169 *last_metric_upload_time = Local::now();
170 }
171 }
172
collect_metrics(&mut self)173 fn collect_metrics(&mut self) {
174 match self.get_cpu_metrics() {
175 Ok(new_measurement) => {
176 let mut measurements = self.measurements.lock().unwrap();
177 let next_measurements = match *measurements {
178 Some(Measurements {
179 current,
180 last_upload,
181 ..
182 }) => Measurements {
183 current: new_measurement,
184 previous: current,
185 last_upload,
186 },
187 None => Measurements {
188 current: new_measurement,
189 previous: new_measurement,
190 last_upload: new_measurement,
191 },
192 };
193 *measurements = Some(next_measurements);
194 }
195 Err(e) => {
196 // Do not panic because of cpu query related failures.
197 error!("Get cpu measurement failed: {}", e);
198 }
199 }
200
201 match self.get_mem_metrics() {
202 Ok(mem) => {
203 // Keep running sum and count to calculate averages.
204 let mut memory_acc = self.memory_acc.lock().unwrap();
205 let updated_memory_acc = match *memory_acc {
206 Some(acc) => accumulate_process_memory(acc, mem),
207 None => ProcessMemoryAccumulated {
208 accumulated: mem,
209 accumulated_count: 1,
210 },
211 };
212 *memory_acc = Some(updated_memory_acc);
213 *self.memory.lock().unwrap() = mem
214 }
215 Err(e) => {
216 // Do not panic because of memory query failures.
217 error!("Get cpu measurement failed: {}", e);
218 }
219 }
220
221 match self.get_io_metrics() {
222 Ok(new_io) => {
223 let mut io_record = self.io.lock().unwrap();
224 let updated_io = match *io_record {
225 Some(io) => ProcessIoRecord {
226 current: new_io,
227 current_time: Local::now(),
228 last_upload: io.last_upload,
229 last_upload_time: io.last_upload_time,
230 },
231 None => ProcessIoRecord {
232 current: new_io,
233 current_time: Local::now(),
234 last_upload: new_io,
235 last_upload_time: Local::now(),
236 },
237 };
238 *io_record = Some(updated_io);
239 }
240 Err(e) => {
241 // Do not panic because of io query failures.
242 error!("Get io measurement failed: {}", e);
243 }
244 }
245 }
246
get_mem_metrics(&self) -> SysResult<ProcessMemory>247 fn get_mem_metrics(&self) -> SysResult<ProcessMemory> {
248 let process_handle = CoreWinMetrics::get_process_handle()?;
249
250 let mut counters = PROCESS_MEMORY_COUNTERS_EX::default();
251
252 // Safe because we own the process handle and all memory was allocated.
253 let result = unsafe {
254 GetProcessMemoryInfo(
255 process_handle.as_raw_descriptor(),
256 // Converting is necessary because the `winapi`' GetProcessMemoryInfo
257 // does NOT support `PROCESS_MEMORY_COUNTERS_EX`, but only
258 // 'PROCESS_MEMORY_COUNTERS'. The casting is safe because the underlining
259 // Windows api does `PROCESS_MEMORY_COUNTERS_EX`.
260 &mut counters as *mut PROCESS_MEMORY_COUNTERS_EX as *mut PROCESS_MEMORY_COUNTERS,
261 mem::size_of::<PROCESS_MEMORY_COUNTERS_EX>() as DWORD,
262 )
263 };
264 if result == 0 {
265 return Err(SysError::last());
266 }
267
268 Ok(ProcessMemory {
269 page_fault_count: counters.PageFaultCount,
270 working_set_size: counters.WorkingSetSize,
271 working_set_peak: counters.PeakWorkingSetSize,
272 page_file_usage: counters.PagefileUsage,
273 page_file_peak: counters.PeakPagefileUsage,
274 physical: counters.PrivateUsage,
275 })
276 }
277
get_cpu_metrics(&self) -> SysResult<Measurement>278 fn get_cpu_metrics(&self) -> SysResult<Measurement> {
279 let mut sys_time: SystemCpuTime = Default::default();
280 let mut process_time: ProcessCpuTime = Default::default();
281 let sys_time_success: i32;
282
283 // Safe because memory is allocated for sys_time before the windows call.
284 // And the value were initilized to 0s.
285 unsafe {
286 // First get kernel cpu time.
287 sys_time_success =
288 GetSystemTimes(&mut sys_time.idle, &mut sys_time.kernel, &mut sys_time.user);
289 }
290 if sys_time_success == 0 {
291 error!("Systime collection failed.\n");
292 return Err(SysError::last());
293 } else {
294 // Query current process cpu time.
295 let process_handle = CoreWinMetrics::get_process_handle()?;
296 let process_time_success: i32;
297 // Safe because memory is allocated for process_time before the windows call.
298 // And the value were initilized to 0s.
299 unsafe {
300 process_time_success = GetProcessTimes(
301 process_handle.as_raw_descriptor(),
302 &mut process_time.create,
303 &mut process_time.exit,
304 &mut process_time.kernel,
305 &mut process_time.user,
306 );
307 }
308 if process_time_success == 0 {
309 error!("Systime collection failed.\n");
310 return Err(SysError::last());
311 }
312 }
313 Ok(Measurement {
314 sys_time: SystemCpuTime {
315 idle: sys_time.idle,
316 kernel: sys_time.kernel,
317 user: sys_time.user,
318 },
319 process_time: ProcessCpuTime {
320 create: process_time.create,
321 exit: process_time.exit,
322 kernel: process_time.kernel,
323 user: process_time.user,
324 },
325 })
326 }
327
get_io_metrics(&self) -> SysResult<ProcessIo>328 fn get_io_metrics(&self) -> SysResult<ProcessIo> {
329 let process_handle = CoreWinMetrics::get_process_handle()?;
330 let mut io_counters = IO_COUNTERS::default();
331 // Safe because we own the process handle and all memory was allocated.
332 let result = unsafe {
333 GetProcessIoCounters(
334 process_handle.as_raw_descriptor(),
335 &mut io_counters as *mut IO_COUNTERS,
336 )
337 };
338 if result == 0 {
339 return Err(SysError::last());
340 }
341 Ok(ProcessIo {
342 read_bytes: io_counters.ReadTransferCount,
343 write_bytes: io_counters.WriteTransferCount,
344 })
345 }
346
mem_metrics_string(&self) -> String347 fn mem_metrics_string(&self) -> String {
348 let guard = self.memory.lock().unwrap();
349 let memory: ProcessMemory = *guard;
350 let mut buf = format!(
351 "Physical memory used: {} mb.\n",
352 memory.physical / BYTES_PER_MB
353 );
354 let _ = writeln!(
355 buf,
356 "Total working memory: {} mb.",
357 memory.working_set_size / BYTES_PER_MB
358 );
359 let _ = writeln!(
360 buf,
361 "Peak working memory: {} mb.",
362 memory.working_set_peak / BYTES_PER_MB
363 );
364 let _ = writeln!(buf, "Page fault count: {}.", memory.page_fault_count);
365 let _ = writeln!(
366 buf,
367 "Page file used: {} mb.",
368 memory.page_file_usage / BYTES_PER_MB
369 );
370 let _ = writeln!(
371 buf,
372 "Peak page file used: {} mb.",
373 memory.page_file_peak / BYTES_PER_MB
374 );
375 buf
376 }
377
cpu_metrics_string(&self) -> String378 fn cpu_metrics_string(&self) -> String {
379 let guard = self.measurements.lock().unwrap();
380 let mut buf = String::new();
381
382 // Now we use current and last cpu measurment data to calculate cpu usage
383 // as a percentage.
384 if let Some(measurements) = &*guard {
385 let sys_time = measurements.current.sys_time;
386 let process_time = measurements.current.process_time;
387 let prev_sys_time = measurements.previous.sys_time;
388 let prev_process_time = measurements.previous.process_time;
389
390 let diff_systime_kernel =
391 compute_filetime_subtraction(sys_time.kernel, prev_sys_time.kernel);
392 let diff_systime_user = compute_filetime_subtraction(sys_time.user, prev_sys_time.user);
393
394 let diff_processtime_kernel =
395 compute_filetime_subtraction(process_time.kernel, prev_process_time.kernel);
396 let diff_processtime_user =
397 compute_filetime_subtraction(process_time.user, prev_process_time.user);
398
399 let total_systime = diff_systime_kernel + diff_systime_user;
400 let total_processtime = diff_processtime_kernel + diff_processtime_user;
401
402 let mut process_cpu = String::from("still calculating...");
403 if total_systime > 0 {
404 process_cpu = format!("{}%", (100 * total_processtime / total_systime));
405 }
406 let _ = writeln!(buf, "Process cpu usage is: {}", process_cpu);
407
408 #[cfg(debug_assertions)]
409 {
410 // Show data supporting our cpu usage calculation.
411 // Output system cpu time.
412 let _ = writeln!(
413 buf,
414 "Systime Idle: low {} / high {}",
415 sys_time.idle.dwLowDateTime, sys_time.idle.dwHighDateTime
416 );
417 let _ = writeln!(
418 buf,
419 "Systime User: low {} / high {}",
420 sys_time.user.dwLowDateTime, sys_time.user.dwHighDateTime
421 );
422 let _ = writeln!(
423 buf,
424 "Systime kernel: low {} / high {}",
425 sys_time.kernel.dwLowDateTime, sys_time.kernel.dwHighDateTime
426 );
427 // Output process cpu time.
428 let _ = writeln!(
429 buf,
430 "Process Create: low {} / high {}",
431 process_time.create.dwLowDateTime, process_time.create.dwHighDateTime
432 );
433 let _ = writeln!(
434 buf,
435 "Process Exit: low {} / high {}",
436 process_time.exit.dwLowDateTime, process_time.exit.dwHighDateTime
437 );
438 let _ = writeln!(
439 buf,
440 "Process kernel: low {} / high {}",
441 process_time.kernel.dwLowDateTime, process_time.kernel.dwHighDateTime
442 );
443 let _ = writeln!(
444 buf,
445 "Process user: low {} / high {}",
446 process_time.user.dwLowDateTime, process_time.user.dwHighDateTime
447 );
448 }
449 } else {
450 let _ = write!(buf, "Calculating cpu usage...");
451 }
452 buf
453 }
454 }
455
compute_filetime_subtraction(fta: FILETIME, ftb: FILETIME) -> LONGLONG456 fn compute_filetime_subtraction(fta: FILETIME, ftb: FILETIME) -> LONGLONG {
457 // safe because we are initializing the struct to 0s.
458 unsafe {
459 let mut a: LARGE_INTEGER = mem::zeroed::<LARGE_INTEGER>();
460 a.u_mut().LowPart = fta.dwLowDateTime;
461 a.u_mut().HighPart = fta.dwHighDateTime as i32;
462 let mut b: LARGE_INTEGER = mem::zeroed::<LARGE_INTEGER>();
463 b.u_mut().LowPart = ftb.dwLowDateTime;
464 b.u_mut().HighPart = ftb.dwHighDateTime as i32;
465 a.QuadPart() - b.QuadPart()
466 }
467 }
468
469 // Adds to a running total of memory metrics over the course of a collection period.
470 // Can divide these sums to calculate averages.
accumulate_process_memory( acc: ProcessMemoryAccumulated, mem: ProcessMemory, ) -> ProcessMemoryAccumulated471 fn accumulate_process_memory(
472 acc: ProcessMemoryAccumulated,
473 mem: ProcessMemory,
474 ) -> ProcessMemoryAccumulated {
475 ProcessMemoryAccumulated {
476 accumulated: ProcessMemory {
477 page_fault_count: mem.page_fault_count,
478 working_set_size: acc.accumulated.working_set_size + mem.working_set_size,
479 working_set_peak: mem.working_set_peak,
480 page_file_usage: acc.accumulated.page_file_usage + mem.page_file_usage,
481 page_file_peak: mem.page_file_peak,
482 physical: acc.accumulated.physical + mem.physical,
483 },
484 accumulated_count: acc.accumulated_count + 1,
485 }
486 }
487
488 #[derive(Copy, Clone, Default)]
489 struct SystemCpuTime {
490 idle: FILETIME,
491 kernel: FILETIME,
492 user: FILETIME,
493 }
494
495 #[derive(Copy, Clone, Default)]
496 struct ProcessCpuTime {
497 create: FILETIME,
498 exit: FILETIME,
499 kernel: FILETIME,
500 user: FILETIME,
501 }
502
503 #[derive(Copy, Clone, Default)]
504 struct ProcessMemory {
505 page_fault_count: u32,
506 working_set_size: usize,
507 working_set_peak: usize,
508 page_file_usage: usize,
509 page_file_peak: usize,
510 physical: usize,
511 }
512
513 #[derive(Copy, Clone)]
514 struct ProcessMemoryAccumulated {
515 accumulated: ProcessMemory,
516 accumulated_count: usize,
517 }
518
519 #[derive(Copy, Clone, Default)]
520 struct ProcessIo {
521 read_bytes: u64,
522 write_bytes: u64,
523 }
524
525 #[derive(Copy, Clone)]
526 struct ProcessIoRecord {
527 current: ProcessIo,
528 current_time: DateTime<Local>,
529 last_upload: ProcessIo,
530 last_upload_time: DateTime<Local>,
531 }
532
533 #[derive(Copy, Clone)]
534 struct Measurement {
535 sys_time: SystemCpuTime,
536 process_time: ProcessCpuTime,
537 }
538
539 struct Measurements {
540 current: Measurement,
541 previous: Measurement,
542 last_upload: Measurement,
543 }
544
545 /// A managing struct for a job which defines regular logging of core Windows system metrics.
546 pub(crate) struct CoreWinMetrics {
547 metrics_string: Weak<Mutex<String>>,
548 exit_evt: Event,
549 worker_thread: Option<JoinHandle<()>>,
550 }
551
552 impl CoreWinMetrics {
new() -> Result<Self>553 pub fn new() -> Result<Self> {
554 let exit_evt = match Event::new() {
555 Ok(evt) => evt,
556 Err(_e) => return Err(Error::CannotInstantiateEvent),
557 };
558
559 let metrics_string = String::new();
560 let arc_metrics_memory = Arc::new(Mutex::new(metrics_string));
561 let weak_metrics_memory = Arc::downgrade(&arc_metrics_memory);
562
563 let mut me = Self {
564 metrics_string: weak_metrics_memory,
565 exit_evt,
566 worker_thread: None,
567 };
568 let exit_evt_clone = match me.exit_evt.try_clone() {
569 Ok(evt) => evt,
570 Err(_) => return Err(Error::CannotCloneEvent),
571 };
572 me.worker_thread.replace(thread::spawn(|| {
573 Worker {
574 exit_evt: exit_evt_clone,
575 io: Arc::new(Mutex::new(None)),
576 measurements: Arc::new(Mutex::new(None)),
577 memory: Arc::new(Mutex::new(Default::default())),
578 memory_acc: Arc::new(Mutex::new(None)),
579 metrics_string: arc_metrics_memory,
580 }
581 .run();
582 }));
583 Ok(me)
584 }
585
get_process_handle() -> SysResult<SafeDescriptor>586 fn get_process_handle() -> SysResult<SafeDescriptor> {
587 // Safe because we own the current process.
588 let process_handle = unsafe {
589 OpenProcess(
590 PROCESS_QUERY_LIMITED_INFORMATION | PROCESS_VM_READ | SYNCHRONIZE,
591 0,
592 std::process::id(),
593 )
594 };
595 if process_handle.is_null() {
596 return Err(SysError::last());
597 }
598 // Safe as the SafeDescriptor is the only thing with access to the handle after this.
599 Ok(unsafe { SafeDescriptor::from_raw_descriptor(process_handle) })
600 }
601 }
602
603 impl Drop for CoreWinMetrics {
drop(&mut self)604 fn drop(&mut self) {
605 if let Some(join_handle) = self.worker_thread.take() {
606 let _ = self.exit_evt.signal();
607 join_handle
608 .join()
609 .expect("fail to join the worker thread of a win core metrics collector.");
610 }
611 }
612 }
613
614 impl fmt::Display for CoreWinMetrics {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result615 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
616 match self.metrics_string.upgrade() {
617 Some(metrics_string) => write!(f, "{}", *metrics_string.lock().unwrap()),
618 None => write!(f, ""),
619 }
620 }
621 }
622