The tracker module provides real-time memory tracking with alerts, OOM detection, and event logging.

Classes

TrackingEvent

Represents a memory tracking event.
from gpumemprof import TrackingEvent

Attributes

timestamp (float): Unix timestamp of the event
event_type (str): Event type: 'allocation', 'deallocation', 'peak', 'warning', 'error', 'critical'
memory_allocated (int): Allocated memory in bytes at event time
memory_reserved (int): Reserved memory in bytes at event time
memory_change (int): Memory change in bytes (positive for allocation, negative for deallocation)
device_id (int): GPU device ID
context (Optional[str], default: None): Contextual information about the event
metadata (Optional[Dict[str, Any]], default: None): Additional metadata
active_memory (Optional[int], default: None): Active memory in the allocator
inactive_memory (Optional[int], default: None): Inactive memory in the allocator
device_used (Optional[int], default: None): Device memory used
device_free (Optional[int], default: None): Device memory free
device_total (Optional[int], default: None): Total device memory
backend (str, default: 'cuda'): Backend type ('cuda', 'rocm', 'mps')
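
A minimal sketch of reading these fields from recorded events; it assumes a MemoryTracker (documented below) has already collected events and that get_events() is used to retrieve them:

from gpumemprof import MemoryTracker

tracker = MemoryTracker()
tracker.start_tracking()
# ... run your workload ...
tracker.stop_tracking()

for event in tracker.get_events(last_n=5):
    allocated_mb = event.memory_allocated / 1024**2
    change_mb = event.memory_change / 1024**2
    print(f"[{event.event_type}] {allocated_mb:.1f} MB allocated "
          f"({change_mb:+.1f} MB) on device {event.device_id}, backend={event.backend}")
    if event.device_used is not None and event.device_total:
        # Device-level utilization is only available when these optional fields are populated
        print(f"  device utilization: {event.device_used / event.device_total:.1%}")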

MemoryTracker

Real-time memory tracker with alerts and monitoring.
from gpumemprof import MemoryTracker

tracker = MemoryTracker(
    device="cuda:0",
    sampling_interval=0.1,
    enable_alerts=True,
    enable_oom_flight_recorder=True
)

Constructor

device (Optional[Union[str, int, torch.device]], default: None): GPU device to track; auto-detects if None
sampling_interval (float, default: 0.1): Sampling interval in seconds for continuous tracking
max_events (int, default: 10000): Maximum number of events to keep in memory (ring buffer)
enable_alerts (bool, default: True): Whether to enable memory threshold alerts
enable_oom_flight_recorder (bool, default: False): Enable automatic OOM dump artifacts
oom_dump_dir (str, default: 'oom_dumps'): Directory for OOM dump bundles
oom_buffer_size (Optional[int], default: None): Event ring-buffer size for OOM dumps (defaults to max_events)
oom_max_dumps (int, default: 5): Maximum number of retained OOM dump bundles
oom_max_total_mb (int, default: 256): Maximum total storage for OOM dumps in MB
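
A sketch of a constructor call that tunes the OOM flight recorder's retention; the directory and limits below are illustrative values, not recommendations:

from gpumemprof import MemoryTracker

tracker = MemoryTracker(
    device="cuda:0",
    enable_oom_flight_recorder=True,
    oom_dump_dir="./oom_dumps",   # where dump bundles are written
    oom_buffer_size=5000,         # events kept in the OOM ring buffer
    oom_max_dumps=3,              # retain at most 3 dump bundles
    oom_max_total_mb=128,         # cap total dump storage at 128 MB
)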

Methods

start_tracking()
Start real-time memory tracking.
tracker.start_tracking()
# ... run your code ...
tracker.stop_tracking()
stop_tracking()
Stop real-time memory tracking.
tracker.stop_tracking()
handle_exception()
Capture OOM diagnostics for recognized OOM exceptions.
try:
    model = LargeModel().cuda()
except Exception as e:
    dump_path = tracker.handle_exception(e, context="model_loading")
    if dump_path:
        print(f"OOM dump saved to: {dump_path}")
    raise
exc (BaseException): Exception to analyze
context (Optional[str], default: None): Context description
metadata (Optional[Dict[str, Any]], default: None): Additional metadata to include in the dump
Returns (Optional[str]): Path to the OOM dump directory if one was created, None otherwise
capture_oom()
Context manager to capture OOM diagnostics.
with tracker.capture_oom(context="training", metadata={"batch_size": 32}):
    output = model(large_batch)
context (str, default: 'runtime'): Context description
metadata (Optional[Dict[str, Any]], default: None): Additional metadata
add_alert_callback()
Add a callback function to be called on alerts.
def my_alert_handler(event: TrackingEvent):
    print(f"Alert: {event.context}")

tracker.add_alert_callback(my_alert_handler)
callback (Callable[[TrackingEvent], None]): Function to call when alerts are triggered
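
A sketch of a callback that reacts only to critical alerts and appends them to a plain log file; the filename and log format are illustrative:

def log_critical_alerts(event: TrackingEvent):
    if event.event_type == "critical":
        with open("critical_alerts.log", "a") as fh:
            fh.write(f"{event.timestamp} device={event.device_id} "
                     f"allocated={event.memory_allocated} context={event.context}\n")

tracker.add_alert_callback(log_critical_alerts)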
remove_alert_callback()
Remove an alert callback.
tracker.remove_alert_callback(my_alert_handler)
get_events()
Get tracking events with optional filtering.
# Get all events
events = tracker.get_events()

# Get only warnings
warnings = tracker.get_events(event_type="warning")

# Get last 100 events
recent = tracker.get_events(last_n=100)

# Get events since timestamp
import time
events = tracker.get_events(since=time.time() - 3600)
event_type (Optional[str], default: None): Filter by event type
last_n (Optional[int], default: None): Return only the last N events
since (Optional[float], default: None): Return only events since this timestamp
Returns (List[TrackingEvent]): Filtered list of tracking events
get_memory_timeline()
Get memory usage timeline with specified interval.
timeline = tracker.get_memory_timeline(interval=1.0)
print(f"Timestamps: {timeline['timestamps']}")
print(f"Allocated: {timeline['allocated']}")
interval (float, default: 1.0): Time interval in seconds for aggregation
Returns (Dict[str, List]): Dictionary with 'timestamps', 'allocated', and 'reserved' lists
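
A sketch that uses the returned lists to estimate net memory growth over the tracked window, assuming the 'allocated' values are reported in bytes as elsewhere in this module:

timeline = tracker.get_memory_timeline(interval=1.0)
allocated = timeline["allocated"]
if len(allocated) >= 2:
    # Difference between the last and first samples, converted to MB
    growth_mb = (allocated[-1] - allocated[0]) / 1024**2
    print(f"Net allocated growth over {len(allocated)} samples: {growth_mb:+.1f} MB")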
get_statistics()
Get comprehensive tracking statistics.
stats = tracker.get_statistics()
print(f"Peak memory: {stats['peak_memory']}")
print(f"Total events: {stats['total_events']}")
print(f"Backend: {stats['backend']}")
Returns (Dict[str, Any]): Statistics including peak memory, event counts, backend info, and OOM recorder status
export_events()
Export tracking events to file.
# Export as CSV
tracker.export_events("memory_events.csv", format="csv")

# Export as JSON
tracker.export_events("memory_events.json", format="json")
filename (str): Output filename
format (str, default: 'csv'): Export format, 'csv' or 'json'
clear_events()
Clear all tracking events.
tracker.clear_events()
set_threshold()
Set alert threshold.
tracker.set_threshold("memory_warning_percent", 85.0)
tracker.set_threshold("memory_critical_percent", 97.0)
threshold_name (str): Name of the threshold to set
value (Union[int, float]): Threshold value
Available thresholds (the last two are sketched below):
  • memory_warning_percent: Warn at this % of total memory (default: 80.0)
  • memory_critical_percent: Critical at this % of total memory (default: 95.0)
  • memory_leak_threshold: Memory growth in bytes to flag as a potential leak (default: 100 MB)
  • fragmentation_threshold: Fragmentation ratio at which to warn (default: 0.3)
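
A minimal sketch of setting the leak and fragmentation thresholds; the values shown are illustrative, not tuned recommendations:

# Flag sustained growth beyond ~256 MB as a potential leak (value is in bytes)
tracker.set_threshold("memory_leak_threshold", 256 * 1024**2)

# Warn once the fragmentation ratio exceeds 0.4
tracker.set_threshold("fragmentation_threshold", 0.4)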
get_alerts()
Get all alert events.
# Get all alerts
alerts = tracker.get_alerts()

# Get last 10 alerts
recent_alerts = tracker.get_alerts(last_n=10)
last_n (Optional[int], default: None): Return only the last N alerts
Returns (List[TrackingEvent]): List of alert events (warnings, critical, errors)
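
A sketch that summarizes alerts by type with the standard library; it relies only on the event_type field documented above:

from collections import Counter

alerts = tracker.get_alerts()
counts = Counter(event.event_type for event in alerts)
print(f"warnings={counts.get('warning', 0)} "
      f"critical={counts.get('critical', 0)} errors={counts.get('error', 0)}")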

Context Manager Support

with MemoryTracker(device="cuda:0") as tracker:
    # Tracking starts automatically
    model.train()
    # Tracking stops automatically on exit

MemoryWatchdog

Memory watchdog for automated memory management.
from gpumemprof import MemoryWatchdog

tracker = MemoryTracker()
watchdog = MemoryWatchdog(
    tracker=tracker,
    auto_cleanup=True,
    cleanup_threshold=0.9,
    aggressive_cleanup_threshold=0.95
)

Constructor

tracker (MemoryTracker): MemoryTracker instance to monitor
auto_cleanup (bool, default: True): Whether to automatically clean up memory on alerts
cleanup_threshold (float, default: 0.9): Memory usage threshold (0-1) that triggers cleanup
aggressive_cleanup_threshold (float, default: 0.95): Threshold for aggressive cleanup, including garbage collection

Methods

force_cleanup()
Force immediate memory cleanup.
watchdog.force_cleanup(aggressive=True)
aggressive (bool, default: False): Whether to perform aggressive cleanup (includes gc.collect())
get_cleanup_stats()
Get cleanup statistics.
stats = watchdog.get_cleanup_stats()
print(f"Cleanups performed: {stats['cleanup_count']}")
Returns (Dict[str, Any]): Statistics about cleanup operations

Example Usage

import torch
from gpumemprof import MemoryTracker, MemoryWatchdog

# Create tracker with OOM recording
tracker = MemoryTracker(
    device="cuda:0",
    sampling_interval=0.5,
    enable_oom_flight_recorder=True,
    oom_dump_dir="./oom_diagnostics"
)

# Add watchdog for automatic cleanup
watchdog = MemoryWatchdog(tracker, auto_cleanup=True)

# Start tracking
tracker.start_tracking()

try:
    with tracker.capture_oom(context="training"):
        model = MyModel().cuda()
        for batch in dataloader:
            output = model(batch)
except torch.cuda.OutOfMemoryError as e:
    print(f"OOM dump location: {tracker.last_oom_dump_path}")
    raise
finally:
    tracker.stop_tracking()

# Analyze results
stats = tracker.get_statistics()
print(f"Peak memory: {stats['peak_memory'] / 1024**3:.2f} GB")

# Export events for analysis
tracker.export_events("memory_timeline.csv")
