SAM 3 provides powerful video segmentation and dense tracking capabilities, allowing you to segment objects on one frame and automatically track them throughout the entire video.
Setup
Configure GPU usage
import os

import sam3
import torch

sam3_root = os.path.join(os.path.dirname(sam3.__file__), "..")

# Use all available GPUs on the machine
gpus_to_use = range(torch.cuda.device_count())
# Or use only a single GPU:
# gpus_to_use = [torch.cuda.current_device()]
Build the video predictor
from sam3.model_builder import build_sam3_video_predictor

predictor = build_sam3_video_predictor(gpus_to_use=gpus_to_use)
Load video frames
import glob

import cv2

video_path = f"{sam3_root}/assets/videos/0001"

# For MP4 videos:
if isinstance(video_path, str) and video_path.endswith(".mp4"):
    cap = cv2.VideoCapture(video_path)
    video_frames_for_vis = []
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        video_frames_for_vis.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
    cap.release()
# For JPEG frames:
else:
    video_frames_for_vis = glob.glob(os.path.join(video_path, "*.jpg"))
    video_frames_for_vis.sort(
        key=lambda p: int(os.path.splitext(os.path.basename(p))[0])
    )
You can extract JPEG frames from MP4 videos using ffmpeg: ffmpeg -i <your_video>.mp4 -q:v 2 -start_number 0 <output_dir>/'%05d.jpg'
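If you prefer to stay in Python, here is a minimal sketch that shells out to ffmpeg with the same flags via the standard library. The helper name and paths are illustrative, not part of the SAM 3 API, and ffmpeg must be on your PATH:

import os
import subprocess

def extract_frames(video_file, output_dir):
    # Illustrative helper: runs the ffmpeg command shown above.
    os.makedirs(output_dir, exist_ok=True)
    subprocess.run(
        [
            "ffmpeg", "-i", video_file,
            "-q:v", "2",           # high JPEG quality
            "-start_number", "0",  # name frames 00000.jpg, 00001.jpg, ...
            os.path.join(output_dir, "%05d.jpg"),
        ],
        check=True,
    )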
Starting an Inference Session
SAM 3 requires stateful inference for interactive video segmentation:
response = predictor.handle_request(
    request=dict(
        type="start_session",
        resource_path=video_path,
    )
)
session_id = response["session_id"]
Each session is tied to a single video. Always close sessions after inference to free GPU resources.
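One way to guarantee cleanup is to wrap the session lifecycle in try/finally. A minimal sketch, using only the request types shown in this guide:

response = predictor.handle_request(
    request=dict(type="start_session", resource_path=video_path)
)
session_id = response["session_id"]
try:
    ...  # add prompts and propagate here
finally:
    # Runs even if inference raises, so the session's GPU memory is released.
    predictor.handle_request(
        request=dict(type="close_session", session_id=session_id)
    )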
Text Prompts for Video
Segment all instances of an object throughout the video using natural language:
prompt_text_str = "person"
frame_idx = 0  # Add prompt on frame 0

response = predictor.handle_request(
    request=dict(
        type="add_prompt",
        session_id=session_id,
        frame_index=frame_idx,
        text=prompt_text_str,
    )
)
out = response["outputs"]
Propagating Through Video
After adding prompts, propagate the segmentation across all frames:
def propagate_in_video(predictor, session_id):
    outputs_per_frame = {}
    for response in predictor.handle_stream_request(
        request=dict(
            type="propagate_in_video",
            session_id=session_id,
        )
    ):
        outputs_per_frame[response["frame_index"]] = response["outputs"]
    return outputs_per_frame

outputs_per_frame = propagate_in_video(predictor, session_id)
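Because handle_stream_request yields one response per frame, you can also consume results incrementally, for example to report progress on long videos. A sketch, assuming only the response fields used above:

for response in predictor.handle_stream_request(
    request=dict(type="propagate_in_video", session_id=session_id)
):
    frame_index = response["frame_index"]
    if frame_index % 100 == 0:
        # Print a lightweight progress update every 100 frames.
        print(f"propagated through frame {frame_index}")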
Point Prompts for Refinement
Refine segmentation masks using positive and negative clicks:
import numpy as np
from PIL import Image

from sam3.visualization_utils import load_frame

# Get image dimensions
sample_img = Image.fromarray(load_frame(video_frames_for_vis[0]))
IMG_WIDTH, IMG_HEIGHT = sample_img.size

# Define points (in absolute pixel coordinates)
frame_idx = 0
obj_id = 2
points_abs = np.array([
    [760, 550],  # positive click
])
labels = np.array([1])  # 1 = positive, 0 = negative

# Convert to relative coordinates
def abs_to_rel_coords(coords, IMG_WIDTH, IMG_HEIGHT, coord_type="point"):
    if coord_type == "point":
        return [[x / IMG_WIDTH, y / IMG_HEIGHT] for x, y in coords]

points_tensor = torch.tensor(
    abs_to_rel_coords(points_abs, IMG_WIDTH, IMG_HEIGHT, coord_type="point"),
    dtype=torch.float32,
)
points_labels_tensor = torch.tensor(labels, dtype=torch.int32)

response = predictor.handle_request(
    request=dict(
        type="add_prompt",
        session_id=session_id,
        frame_index=frame_idx,
        points=points_tensor,
        point_labels=points_labels_tensor,
        obj_id=obj_id,
    )
)
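A negative click (label 0) works the same way and tells the model which pixels to exclude from the mask. A minimal sketch reusing the helper above; the coordinates are illustrative:

# One positive click on the object, one negative click on a region to exclude.
points_abs = np.array([
    [760, 550],  # positive click
    [820, 400],  # negative click (illustrative coordinates)
])
labels = np.array([1, 0])

response = predictor.handle_request(
    request=dict(
        type="add_prompt",
        session_id=session_id,
        frame_index=frame_idx,
        points=torch.tensor(
            abs_to_rel_coords(points_abs, IMG_WIDTH, IMG_HEIGHT),
            dtype=torch.float32,
        ),
        point_labels=torch.tensor(labels, dtype=torch.int32),
        obj_id=obj_id,
    )
)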
Removing Objects
Remove individual objects by their ID:
obj_id = 2  # Object to remove

response = predictor.handle_request(
    request=dict(
        type="remove_object",
        session_id=session_id,
        obj_id=obj_id,
    )
)
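Removal changes the session state, but any outputs you collected earlier still contain the object. To refresh them, re-run propagation with the helper defined above:

# Re-propagate so outputs_per_frame no longer includes the removed object.
outputs_per_frame = propagate_in_video(predictor, session_id)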
Visualizing Results
from sam3.visualization_utils import (
    prepare_masks_for_visualization,
    visualize_formatted_frame_output,
)

# Reformat outputs for visualization
outputs_per_frame = prepare_masks_for_visualization(outputs_per_frame)

# Plot every 60th frame
vis_frame_stride = 60
for frame_idx in range(0, len(outputs_per_frame), vis_frame_stride):
    visualize_formatted_frame_output(
        frame_idx,
        video_frames_for_vis,
        outputs_list=[outputs_per_frame],
        titles=["SAM 3 Dense Tracking outputs"],
        figsize=(6, 4),
    )
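To keep the renderings instead of only displaying them, you can save each figure with matplotlib. This is a sketch that assumes visualize_formatted_frame_output draws onto the current matplotlib figure, which is not guaranteed by the snippet above:

import matplotlib.pyplot as plt

for frame_idx in range(0, len(outputs_per_frame), vis_frame_stride):
    visualize_formatted_frame_output(
        frame_idx,
        video_frames_for_vis,
        outputs_list=[outputs_per_frame],
        titles=["SAM 3 Dense Tracking outputs"],
        figsize=(6, 4),
    )
    # Assumption: the helper renders via matplotlib, so the active figure
    # can be saved before it is closed.
    plt.savefig(f"tracking_{frame_idx:05d}.png", bbox_inches="tight")
    plt.close()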
Closing Sessions
Always close sessions when finished:
_ = predictor.handle_request(
    request=dict(
        type="close_session",
        session_id=session_id,
    )
)
Complete Workflow Example
Start session and add text prompt
# Start session
response = predictor.handle_request(
    request=dict(type="start_session", resource_path=video_path)
)
session_id = response["session_id"]

# Add text prompt
response = predictor.handle_request(
    request=dict(
        type="add_prompt",
        session_id=session_id,
        frame_index=0,
        text="person",
    )
)
Propagate and visualize
# Propagate through video
outputs_per_frame = propagate_in_video(predictor, session_id)

# Visualize results
outputs_per_frame = prepare_masks_for_visualization(outputs_per_frame)
for frame_idx in range(0, len(outputs_per_frame), 60):
    visualize_formatted_frame_output(
        frame_idx, video_frames_for_vis, [outputs_per_frame]
    )
Close session
predictor.handle_request(
    request=dict(type="close_session", session_id=session_id)
)
Next Steps
Image Inference: Learn the basics with single-image segmentation.
Interactive Refinement: Refine video segmentations interactively.