Instance Segmentation

RF-DETR supports instance segmentation with the same consistent API as its detection models. The segmentation models are trained on Microsoft COCO and produce pixel-level instance masks alongside bounding boxes and class labels.

Model sizes

RF-DETR-Seg offers model sizes from Nano to 2XLarge. All latency numbers were measured on an NVIDIA T4 using TensorRT, FP16, and batch size 1.

Size	Python class	Inference alias	COCO AP₅₀	COCO AP₅₀:₉₅	Latency (ms)	Params (M)	Resolution	License
N	`RFDETRSegNano`	`rfdetr-seg-nano`	63.0	40.3	3.4	33.6	312x312	Apache 2.0
S	`RFDETRSegSmall`	`rfdetr-seg-small`	66.2	43.1	4.4	33.7	384x384	Apache 2.0
M	`RFDETRSegMedium`	`rfdetr-seg-medium`	68.4	45.3	5.9	35.7	432x432	Apache 2.0
L	`RFDETRSegLarge`	`rfdetr-seg-large`	70.5	47.1	8.8	36.2	504x504	Apache 2.0
XL	`RFDETRSegXLarge`	`rfdetr-seg-xlarge`	72.2	48.8	13.5	38.1	624x624	Apache 2.0
2XL	`RFDETRSeg2XLarge`	`rfdetr-seg-2xlarge`	73.1	49.9	21.8	38.6	768x768	Apache 2.0

All segmentation model sizes are licensed under Apache 2.0.

Run on an image

Single image
Video file
Webcam stream
RTSP stream

import supervision as sv
from rfdetr import RFDETRSegMedium
from rfdetr.assets.coco_classes import COCO_CLASSES

# Load the medium segmentation model.
model = RFDETRSegMedium()

# Run inference on a single image referenced by URL; predictions scoring
# below the 0.5 threshold are excluded.
detections = model.predict("https://media.roboflow.com/dog.jpg", threshold=0.5)

# Map every predicted class ID to its COCO class name for the label overlay.
labels = [COCO_CLASSES[class_id] for class_id in detections.class_id]

# Draw the instance masks first, then the text labels on top of them.
# The scene is read back from the detections' "source_image" data entry.
annotated_image = sv.MaskAnnotator().annotate(detections.data["source_image"], detections)
annotated_image = sv.LabelAnnotator().annotate(annotated_image, detections, labels)

Use sv.MaskAnnotator() to render instance masks. For detections without masks (e.g., when comparing with detection models), use sv.BoxAnnotator() instead.

import cv2
import supervision as sv
from rfdetr import RFDETRSegMedium
from rfdetr.assets.coco_classes import COCO_CLASSES

# Load the medium segmentation model.
model = RFDETRSegMedium()

# Both annotators use default settings for every frame, so build them once
# instead of re-creating them inside the frame loop.
mask_annotator = sv.MaskAnnotator()
label_annotator = sv.LabelAnnotator()

video_capture = cv2.VideoCapture("<SOURCE_VIDEO_PATH>")
if not video_capture.isOpened():
    raise RuntimeError("Failed to open video source: <SOURCE_VIDEO_PATH>")

# try/finally guarantees the capture handle and the preview window are
# released even when inference or display raises mid-stream.
try:
    while True:
        success, frame_bgr = video_capture.read()
        if not success:
            # No more frames (or a read error): stop processing.
            break

        # OpenCV delivers BGR frames; the model is fed the RGB conversion.
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        detections = model.predict(frame_rgb, threshold=0.5)

        labels = [COCO_CLASSES[class_id] for class_id in detections.class_id]

        # Draw on the original BGR frame because cv2.imshow expects BGR.
        annotated_frame = mask_annotator.annotate(frame_bgr, detections)
        annotated_frame = label_annotator.annotate(annotated_frame, detections, labels)

        cv2.imshow("RF-DETR-Seg Video", annotated_frame)
        # Press "q" in the preview window to quit early.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    video_capture.release()
    cv2.destroyAllWindows()

Replace <SOURCE_VIDEO_PATH> with your video file path.

import cv2
import supervision as sv
from rfdetr import RFDETRSegMedium
from rfdetr.assets.coco_classes import COCO_CLASSES

# Load the medium segmentation model.
model = RFDETRSegMedium()

# Both annotators use default settings for every frame, so build them once
# instead of re-creating them inside the frame loop.
mask_annotator = sv.MaskAnnotator()
label_annotator = sv.LabelAnnotator()

WEBCAM_INDEX = 0  # Change this to the desired webcam index (e.g., 1, 2, ...)
video_capture = cv2.VideoCapture(WEBCAM_INDEX)
if not video_capture.isOpened():
    raise RuntimeError(f"Failed to open webcam: {WEBCAM_INDEX}")

# try/finally guarantees the camera and the preview window are released
# even when inference or display raises mid-stream.
try:
    while True:
        success, frame_bgr = video_capture.read()
        if not success:
            # The camera stopped delivering frames: stop processing.
            break

        # OpenCV delivers BGR frames; the model is fed the RGB conversion.
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        detections = model.predict(frame_rgb, threshold=0.5)

        labels = [COCO_CLASSES[class_id] for class_id in detections.class_id]

        # Draw on the original BGR frame because cv2.imshow expects BGR.
        annotated_frame = mask_annotator.annotate(frame_bgr, detections)
        annotated_frame = label_annotator.annotate(annotated_frame, detections, labels)

        cv2.imshow("RF-DETR-Seg Webcam", annotated_frame)
        # Press "q" in the preview window to quit.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    video_capture.release()
    cv2.destroyAllWindows()

WEBCAM_INDEX is usually 0 for the default camera. Press q to quit.

import cv2
import supervision as sv
from rfdetr import RFDETRSegMedium
from rfdetr.assets.coco_classes import COCO_CLASSES

# Load the medium segmentation model.
model = RFDETRSegMedium()

# Both annotators use default settings for every frame, so build them once
# instead of re-creating them inside the frame loop.
mask_annotator = sv.MaskAnnotator()
label_annotator = sv.LabelAnnotator()

video_capture = cv2.VideoCapture("<RTSP_STREAM_URL>")
if not video_capture.isOpened():
    raise RuntimeError("Failed to open RTSP stream: <RTSP_STREAM_URL>")

# try/finally guarantees the stream connection and the preview window are
# released even when inference or display raises mid-stream.
try:
    while True:
        success, frame_bgr = video_capture.read()
        if not success:
            # The stream ended or a frame could not be read: stop processing.
            break

        # OpenCV delivers BGR frames; the model is fed the RGB conversion.
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
        detections = model.predict(frame_rgb, threshold=0.5)

        labels = [COCO_CLASSES[class_id] for class_id in detections.class_id]

        # Draw on the original BGR frame because cv2.imshow expects BGR.
        annotated_frame = mask_annotator.annotate(frame_bgr, detections)
        annotated_frame = label_annotator.annotate(annotated_frame, detections, labels)

        cv2.imshow("RF-DETR-Seg RTSP", annotated_frame)
        # Press "q" in the preview window to quit.
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
finally:
    video_capture.release()
    cv2.destroyAllWindows()

Replace <RTSP_STREAM_URL> with your stream URL (e.g., rtsp://<USER>:<PASSWORD>@<HOST>/stream).

Batch inference

Pass a list of images to predict() to process multiple images in a single forward pass. The method returns a list of supervision.Detections objects, each containing bounding boxes, class IDs, confidence scores, and instance masks.

import io
import requests
import supervision as sv
from PIL import Image
from rfdetr import RFDETRSegMedium
from rfdetr.assets.coco_classes import COCO_CLASSES

# Load the medium segmentation model.
model = RFDETRSegMedium()

urls = [
    "https://media.roboflow.com/notebooks/examples/dog-2.jpeg",
    "https://media.roboflow.com/notebooks/examples/dog-3.jpeg",
]

# Download every image up front. The timeout keeps a stalled connection from
# hanging the script, and raise_for_status() surfaces HTTP errors directly
# instead of handing an error page to Image.open().
images = []
for url in urls:
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    images.append(Image.open(io.BytesIO(response.content)))

# A list input yields one detections object per image, in the same order.
detections_list = model.predict(images, threshold=0.5)

# Both annotators use default settings for every image, so build them once.
mask_annotator = sv.MaskAnnotator()
label_annotator = sv.LabelAnnotator()

for image, detections in zip(images, detections_list):
    # Label format: "<class name> <confidence with two decimals>".
    labels = [
        f"{COCO_CLASSES[class_id]} {confidence:.2f}"
        for class_id, confidence in zip(detections.class_id, detections.confidence)
    ]

    # Annotate a copy so the downloaded image itself stays untouched.
    annotated_image = image.copy()
    annotated_image = mask_annotator.annotate(annotated_image, detections)
    annotated_image = label_annotator.annotate(annotated_image, detections, labels)

    sv.plot_image(annotated_image)

Run with Roboflow Inference

You can also run RF-DETR-Seg using the Inference library. To switch model size, use the corresponding inference alias from the table above.

import requests
import supervision as sv
from PIL import Image
from inference import get_model

# Load the model through Roboflow Inference using its inference alias.
model = get_model("rfdetr-seg-medium")

# Download the image. The timeout keeps a stalled connection from hanging the
# script, and raise_for_status() surfaces HTTP errors directly instead of
# handing an error page to Image.open().
response = requests.get("https://media.roboflow.com/dog.jpg", stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# infer() returns a list of results; take the first (only one image was
# passed) and convert it to a supervision Detections object.
predictions = model.infer(image, confidence=0.5)[0]
detections = sv.Detections.from_inference(predictions)

# Draw the instance masks first, then the labels on top of them.
annotated_image = sv.MaskAnnotator().annotate(image, detections)
annotated_image = sv.LabelAnnotator().annotate(annotated_image, detections)

Pretrained models

Full model comparison table with accuracy, latency, and parameter counts.

Object detection

Run RF-DETR for bounding box object detection.

Train a model

Fine-tune RF-DETR-Seg on your own dataset.

Deploy to Roboflow

Deploy your segmentation model to the Roboflow platform.

Get Started

Run Models

Train Models

Deploy & Export

Instance Segmentation

Model sizes

Run on an image

Batch inference

Run with Roboflow Inference

Pretrained models

Object detection

Train a model

Deploy to Roboflow

Build docs developers (and LLMs) love

Get Started

Run Models

Train Models

Deploy & Export

​Model sizes

​Run on an image

​Batch inference

​Run with Roboflow Inference

Pretrained models

Object detection

Train a model

Deploy to Roboflow

Build docs developers (and LLMs) love

Model sizes

Run on an image

Batch inference

Run with Roboflow Inference