The CVAT SDK provides an auto-annotation framework that allows you to apply machine learning models to automatically annotate tasks.
Installation
For auto-annotation with built-in functions, install the pytorch extra:
pip install "cvat-sdk[pytorch]"
Overview
The auto-annotation system supports two types of functions:
- Detection Functions: Apply a model independently to each frame
- Tracking Functions: Track shapes across multiple frames
Detection Functions
Detection functions process each image independently and return annotations.
Using annotate_task
The main entry point for auto-annotation:
from cvat_sdk import Client
from cvat_sdk.auto_annotation import annotate_task
client = Client(url="https://cvat.example.com")
client.login(("username", "password"))
# Apply detection function to task
annotate_task(
client=client,
task_id=123,
function=my_detection_function,
clear_existing=False, # Keep existing annotations
conf_threshold=0.5, # Confidence threshold
conv_mask_to_poly=False # Keep masks as-is
)
Function Parameters
task_id
int
required
ID of the task to annotate
function
DetectionFunction
required
Detection function to apply
pbar
ProgressReporter
default:"None"
Progress reporter for tracking progress
clear_existing
bool
default:"False"
If True, remove existing annotations before adding new ones
allow_unmatched_labels
bool
default:"False"
If True, ignore function labels not in the task. If False, raise error.
conf_threshold
float | None
default:"None"
Confidence threshold (0-1) passed to the function. Function may apply its own default.
conv_mask_to_poly
bool
default:"False"
If True, function must convert mask shapes to polygons
Creating Detection Functions
Function Interface
A detection function must implement the DetectionFunction protocol:
import PIL.Image
from cvat_sdk import models
from cvat_sdk.auto_annotation import (
DetectionFunction,
DetectionFunctionContext,
DetectionFunctionSpec,
label_spec,
rectangle,
tag
)
class MyDetectionFunction:
def __init__(self, model):
self._model = model
self._spec = DetectionFunctionSpec(
labels=[
label_spec(name="car", id=0),
label_spec(name="person", id=1)
]
)
@property
def spec(self) -> DetectionFunctionSpec:
"""Return function specification."""
return self._spec
def detect(
self,
context: DetectionFunctionContext,
image: PIL.Image.Image
) -> list[models.LabeledShapeRequest | models.LabeledImageRequest]:
"""Detect objects in image and return annotations."""
# Run model inference
detections = self._model.predict(image)
# Filter by confidence threshold
conf_threshold = context.conf_threshold or 0.5
detections = [d for d in detections if d.confidence >= conf_threshold]
# Convert to CVAT annotations
annotations = []
for det in detections:
# Use helper functions to create annotations
annotations.append(
rectangle(
label_id=det.class_id, # Must match spec label IDs
points=[det.x1, det.y1, det.x2, det.y2]
)
)
return annotations
# Use the function
my_function = MyDetectionFunction(my_model)
annotate_task(client, task_id=123, function=my_function)
Detection Function Spec
The spec defines what labels your function supports:
from cvat_sdk.auto_annotation import (
DetectionFunctionSpec,
label_spec,
skeleton_label_spec,
keypoint_spec,
attribute_spec
)
spec = DetectionFunctionSpec(
labels=[
# Simple label
label_spec(name="car", id=0, type="rectangle", color="#ff0000"),
# Label with attributes
label_spec(
name="person",
id=1,
type="rectangle",
attributes=[
attribute_spec(
name="gender",
id=0,
input_type="select",
values=["male", "female"]
)
]
),
# Skeleton label
skeleton_label_spec(
name="person_skeleton",
id=2,
sublabels=[
keypoint_spec(name="head", id=0),
keypoint_spec(name="left_hand", id=1),
keypoint_spec(name="right_hand", id=2)
]
)
]
)
Creating Annotations
Use helper functions to create annotations:
from cvat_sdk.auto_annotation import (
tag,
rectangle,
polygon,
mask,
skeleton,
keypoint
)
from cvat_sdk import models
def detect(self, context, image):
annotations = []
# Create a tag (image-level label)
annotations.append(
tag(label_id=0) # frame=0 set automatically
)
# Create a rectangle
annotations.append(
rectangle(
label_id=1,
points=[100, 100, 200, 200] # x1, y1, x2, y2
)
)
# Create a polygon
annotations.append(
polygon(
label_id=1,
points=[10, 10, 20, 10, 20, 20, 10, 20] # x1,y1, x2,y2, ...
)
)
# Create annotation with attributes
annotations.append(
rectangle(
label_id=1,
points=[50, 50, 150, 150],
attributes=[
models.AttributeValRequest(spec_id=0, value="male")
]
)
)
# Create skeleton
annotations.append(
skeleton(
label_id=2,
elements=[
keypoint(label_id=0, points=[100, 50]), # head
keypoint(label_id=1, points=[90, 100]), # left hand
keypoint(label_id=2, points=[110, 100]) # right hand
]
)
)
return annotations
Detection Function Context
The context provides frame-specific information:
def detect(self, context: DetectionFunctionContext, image: PIL.Image.Image):
# Get frame filename
print(f"Processing: {context.frame_name}")
# Get confidence threshold
threshold = context.conf_threshold or 0.5
# Check if masks should be converted
if context.conv_mask_to_poly:
# Must return polygons instead of masks
pass
# Process image...
return annotations
Built-in Detection Functions
The SDK includes Torchvision-based detection functions:
Object Detection
from cvat_sdk.auto_annotation.functions import torchvision_detection
import torchvision.models as models
# Create detection function
model = models.detection.fasterrcnn_resnet50_fpn(weights="DEFAULT")
detection_function = torchvision_detection.create(
model=model,
transforms=None, # Optional preprocessing
label_map={ # Map model classes to label names
1: "person",
2: "bicycle",
3: "car"
}
)
# Apply to task
annotate_task(
client=client,
task_id=123,
function=detection_function,
conf_threshold=0.7
)
Instance Segmentation
from cvat_sdk.auto_annotation.functions import torchvision_instance_segmentation
model = models.detection.maskrcnn_resnet50_fpn(weights="DEFAULT")
segmentation_function = torchvision_instance_segmentation.create(
model=model,
label_map={1: "person", 2: "car"}
)
annotate_task(
client=client,
task_id=123,
function=segmentation_function,
conv_mask_to_poly=True # Convert masks to polygons
)
Classification
from cvat_sdk.auto_annotation.functions import torchvision_classification
model = models.resnet50(weights="DEFAULT")
classification_function = torchvision_classification.create(
model=model,
label_map={0: "cat", 1: "dog", 2: "bird"}
)
annotate_task(
client=client,
task_id=123,
function=classification_function
)
Keypoint Detection
from cvat_sdk.auto_annotation.functions import torchvision_keypoint_detection
model = models.detection.keypointrcnn_resnet50_fpn(weights="DEFAULT")
keypoint_function = torchvision_keypoint_detection.create(
model=model,
keypoint_names=["nose", "left_eye", "right_eye", "left_ear", "right_ear"]
)
annotate_task(
client=client,
task_id=123,
function=keypoint_function
)
Custom Model Example
Here’s a complete example with a custom YOLOv5 model:
import torch
import PIL.Image
from cvat_sdk import Client, models
from cvat_sdk.auto_annotation import (
annotate_task,
DetectionFunctionSpec,
DetectionFunctionContext,
label_spec,
rectangle
)
class YOLOv5DetectionFunction:
def __init__(self, model_path: str):
# Load YOLOv5 model
self._model = torch.hub.load('ultralytics/yolov5', 'custom', path=model_path)
self._model.eval()
# Define spec with COCO classes
self._spec = DetectionFunctionSpec(
labels=[
label_spec(name="person", id=0),
label_spec(name="car", id=1),
label_spec(name="truck", id=2),
# Add more classes...
]
)
@property
def spec(self) -> DetectionFunctionSpec:
return self._spec
def detect(
self,
context: DetectionFunctionContext,
image: PIL.Image.Image
) -> list[models.LabeledShapeRequest]:
# Run inference
results = self._model(image)
# Get predictions
predictions = results.pandas().xyxy[0]
# Filter by confidence
conf_threshold = context.conf_threshold or 0.25
predictions = predictions[predictions['confidence'] >= conf_threshold]
# Convert to CVAT annotations
annotations = []
for _, row in predictions.iterrows():
annotations.append(
rectangle(
label_id=int(row['class']),
points=[
float(row['xmin']),
float(row['ymin']),
float(row['xmax']),
float(row['ymax'])
]
)
)
return annotations
def main():
# Connect to CVAT
client = Client(url="https://cvat.example.com")
client.login(("username", "password"))
# Create function
function = YOLOv5DetectionFunction("yolov5s.pt")
# Annotate task
annotate_task(
client=client,
task_id=123,
function=function,
conf_threshold=0.5,
clear_existing=False
)
print("Auto-annotation complete!")
if __name__ == "__main__":
main()
Label Mapping
When your function labels don’t exactly match task labels:
from cvat_sdk import models
# If allow_unmatched_labels=False (default)
# Function must have ALL task labels
# Error raised if function has labels not in task
# If allow_unmatched_labels=True
# Function labels not in task are ignored
# Annotations with unmatched labels are dropped
annotate_task(
client=client,
task_id=123,
function=my_function,
allow_unmatched_labels=True # Ignore extra labels
)
Progress Reporting
Track progress with tqdm:
from tqdm import tqdm
with tqdm(desc="Auto-annotating", unit="frames") as pbar:
annotate_task(
client=client,
task_id=123,
function=my_function,
pbar=pbar
)
Error Handling
from cvat_sdk.auto_annotation.exceptions import BadFunctionError
try:
annotate_task(client, task_id=123, function=my_function)
except BadFunctionError as e:
print(f"Function validation error: {e}")
# Common errors:
# - Label IDs don't match spec
# - Invalid annotation format
# - Incompatible label types
Limitations
Current auto-annotation limitations:
- Only 2D image tasks are supported (not video)
- Only detection functions are fully implemented
- Tracking functions are defined but not yet integrated
Best Practices
- Test on Small Tasks: Start with a small task to verify your function works correctly
- Use Confidence Thresholds: Set appropriate confidence thresholds to filter low-quality predictions
- Clear Existing Annotations Carefully: Use clear_existing=True only when you’re sure
- Handle Label Mapping: Ensure your function labels match task labels, or use allow_unmatched_labels=True
- Monitor Progress: Use progress reporters for long-running operations
- Validate Spec: Make sure your DetectionFunctionSpec has unique IDs and valid attributes
Next Steps