Skip to main content
SAM 3 provides powerful video segmentation and dense tracking capabilities, allowing you to segment objects on one frame and automatically track them throughout the entire video.

Setup

1

Configure GPU usage

import os
import sam3
import torch

# Locate the repository root relative to the installed sam3 package,
# so bundled assets (e.g. demo videos) can be found below.
sam3_root = os.path.join(os.path.dirname(sam3.__file__), "..")

# Use all available GPUs on the machine
gpus_to_use = range(torch.cuda.device_count())
# Or use only a single GPU:
# gpus_to_use = [torch.cuda.current_device()]
2

Build the video predictor

from sam3.model_builder import build_sam3_video_predictor

# Build the stateful video predictor on the GPUs selected above.
predictor = build_sam3_video_predictor(gpus_to_use=gpus_to_use)
3

Load video frames

import glob
import cv2

video_path = f"{sam3_root}/assets/videos/0001"

if isinstance(video_path, str) and video_path.endswith(".mp4"):
    # MP4 input: decode every frame and convert BGR -> RGB for display.
    capture = cv2.VideoCapture(video_path)
    video_frames_for_vis = []
    ok, frame = capture.read()
    while ok:
        video_frames_for_vis.append(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        ok, frame = capture.read()
    capture.release()
else:
    # Directory of JPEG frames: collect the paths and order them by the
    # numeric frame id encoded in each filename (e.g. "00012.jpg" -> 12).
    def _frame_number(path):
        return int(os.path.splitext(os.path.basename(path))[0])

    frame_paths = glob.glob(os.path.join(video_path, "*.jpg"))
    video_frames_for_vis = sorted(frame_paths, key=_frame_number)
You can extract JPEG frames from MP4 videos using ffmpeg:
ffmpeg -i <your_video>.mp4 -q:v 2 -start_number 0 <output_dir>/'%05d.jpg'

Starting an Inference Session

SAM 3 requires stateful inference for interactive video segmentation:
# Open an inference session bound to this video; the returned session_id
# identifies the session in every subsequent request.
start_request = {"type": "start_session", "resource_path": video_path}
response = predictor.handle_request(request=start_request)
session_id = response["session_id"]
Each session is tied to a single video. Always close sessions after inference to free GPU resources.

Text Prompts for Video

Segment all instances of an object throughout the video using natural language:
# Prompt with free-form text to segment every matching instance.
prompt_text_str = "person"
frame_idx = 0  # attach the prompt to the first frame

text_prompt_request = {
    "type": "add_prompt",
    "session_id": session_id,
    "frame_index": frame_idx,
    "text": prompt_text_str,
}
response = predictor.handle_request(request=text_prompt_request)
out = response["outputs"]

Propagating Through Video

After adding prompts, propagate the segmentation across all frames:
def propagate_in_video(predictor, session_id):
    """Propagate the current prompts across the whole video.

    Streams per-frame responses from the predictor and collects them into
    a mapping of frame_index -> outputs (later responses for the same
    frame index overwrite earlier ones).
    """
    stream = predictor.handle_stream_request(
        request={"type": "propagate_in_video", "session_id": session_id}
    )
    return {resp["frame_index"]: resp["outputs"] for resp in stream}

# Run propagation once and keep the per-frame outputs for visualization.
outputs_per_frame = propagate_in_video(predictor, session_id)

Point Prompts for Refinement

Refine segmentation masks using positive and negative clicks:
import numpy as np  # BUGFIX: `np` was used below but never imported in this page
from PIL import Image

from sam3.visualization_utils import load_frame

# Read one frame to learn the video resolution so that absolute pixel
# clicks can be normalized to [0, 1] relative coordinates below.
sample_img = Image.fromarray(load_frame(video_frames_for_vis[0]))
IMG_WIDTH, IMG_HEIGHT = sample_img.size

# Define points (in absolute pixel coordinates)
frame_idx = 0  # frame the refinement clicks apply to
obj_id = 2  # id of the object instance being refined
points_abs = np.array([
    [760, 550],  # positive click
])
labels = np.array([1])  # 1 = positive, 0 = negative

def abs_to_rel_coords(coords, IMG_WIDTH, IMG_HEIGHT, coord_type="point"):
    """Convert absolute pixel coordinates to relative [0, 1] coordinates.

    Args:
        coords: iterable of (x, y) absolute pixel coordinates.
        IMG_WIDTH: image width in pixels.
        IMG_HEIGHT: image height in pixels.
        coord_type: kind of coordinates; only "point" is supported.

    Returns:
        A list of ``[x / IMG_WIDTH, y / IMG_HEIGHT]`` pairs.

    Raises:
        ValueError: for an unsupported ``coord_type`` (the original
            implementation silently returned ``None`` in that case).
    """
    if coord_type != "point":
        raise ValueError(f"Unsupported coord_type: {coord_type!r}")
    return [[x / IMG_WIDTH, y / IMG_HEIGHT] for x, y in coords]

# Normalize the clicks to [0, 1] and package points/labels as tensors.
rel_points = abs_to_rel_coords(points_abs, IMG_WIDTH, IMG_HEIGHT, coord_type="point")
points_tensor = torch.tensor(rel_points, dtype=torch.float32)
points_labels_tensor = torch.tensor(labels, dtype=torch.int32)

# Attach the clicks to the chosen object on the chosen frame.
click_request = {
    "type": "add_prompt",
    "session_id": session_id,
    "frame_index": frame_idx,
    "points": points_tensor,
    "point_labels": points_labels_tensor,
    "obj_id": obj_id,
}
response = predictor.handle_request(request=click_request)

Removing Objects

Remove individual objects by their ID:
# Drop a tracked object from the session by its id.
obj_id = 2
removal_request = {
    "type": "remove_object",
    "session_id": session_id,
    "obj_id": obj_id,
}
response = predictor.handle_request(request=removal_request)

Visualizing Results

from sam3.visualization_utils import (
    prepare_masks_for_visualization,
    visualize_formatted_frame_output,
)

# Reformat outputs for visualization
outputs_per_frame = prepare_masks_for_visualization(outputs_per_frame)

# Plot every 60th frame
# NOTE(review): iterating range(0, len(outputs_per_frame), stride) assumes
# outputs_per_frame is keyed by contiguous frame indices starting at 0 —
# confirm propagation always yields an entry for every frame.
vis_frame_stride = 60
for frame_idx in range(0, len(outputs_per_frame), vis_frame_stride):
    visualize_formatted_frame_output(
        frame_idx,
        video_frames_for_vis,
        outputs_list=[outputs_per_frame],
        titles=["SAM 3 Dense Tracking outputs"],
        figsize=(6, 4),
    )

Closing Sessions

Always close sessions when finished:
# Always close the session when finished to release GPU resources.
close_request = {"type": "close_session", "session_id": session_id}
_ = predictor.handle_request(request=close_request)

Complete Workflow Example

1

Start session and add text prompt

# Open a session for the video.
response = predictor.handle_request(
    request={"type": "start_session", "resource_path": video_path}
)
session_id = response["session_id"]

# Segment every "person" instance, prompting on the first frame.
response = predictor.handle_request(
    request={
        "type": "add_prompt",
        "session_id": session_id,
        "frame_index": 0,
        "text": "person",
    }
)
2

Propagate and visualize

# Track the prompted objects across all frames.
outputs_per_frame = propagate_in_video(predictor, session_id)

# Reformat the outputs, then plot every 60th frame.
outputs_per_frame = prepare_masks_for_visualization(outputs_per_frame)
vis_stride = 60
for frame_idx in range(0, len(outputs_per_frame), vis_stride):
    visualize_formatted_frame_output(
        frame_idx, video_frames_for_vis, [outputs_per_frame]
    )
3

Close session

# Tear down the session to free its GPU resources.
predictor.handle_request(
    request={"type": "close_session", "session_id": session_id}
)

Next Steps

Image Inference

Learn the basics with single image segmentation

Interactive Refinement

Refine video segmentations interactively

Build docs developers (and LLMs) love