Skip to main content

Overview

The Perception Module provides vision capabilities for the robotic arm system. It integrates camera hardware, image processing, and YOLO-based object detection to identify and classify objects in the environment.

Module Structure

perception/
├── vision/
│   ├── camera/
│   │   └── main.py           # CameraManager
│   ├── detection/
│   │   ├── main.py            # DetectionModel interface
│   │   └── model_loader.py    # YOLO model loading
│   └── image_processing.py    # ImageProcessor pipeline

CameraManager

Initialization (perception/vision/camera/main.py:5-9)

class CameraManager:
    """Owns a cv2.VideoCapture device and saves captured frames to disk."""

    def __init__(self, camera_index: int = 0, 
                 width: int = 1280, 
                 height: int = 720):
        """Open the capture device and request the given frame size.

        Args:
            camera_index: OS index of the camera device (0 = default camera).
            width: Requested frame width in pixels.
            height: Requested frame height in pixels.

        NOTE(review): cv2.VideoCapture does not raise when the device is
        absent, and `set` requests are best-effort — consider checking
        self.cap.isOpened() here; verify against callers' error handling.
        """
        self.cap = cv2.VideoCapture(camera_index)
        self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
        self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)

Image Capture (perception/vision/camera/main.py:11-26)

def capture_image(self):
    """Grab a fresh frame and save it as objects_images/<timestamp>.png.

    Returns:
        The saved file path on success, or None when the camera read fails.
    """
    # Discard buffered frames so the decoded frame is current, not stale.
    for _ in range(5):
        self.cap.grab()

    ok, frame = self.cap.read()
    if not ok:
        return None

    base_dir = os.path.dirname(os.path.abspath(__file__))
    stamp = time.strftime("%Y%m%d-%H%M%S")
    out_file = f"{base_dir}/objects_images/{stamp}.png"

    # Create the output directory on first use.
    os.makedirs(os.path.dirname(out_file), exist_ok=True)
    cv2.imwrite(out_file, frame)
    return out_file
The camera buffer is flushed with 5 frame grabs to ensure the captured image is current, not stale data from the buffer.

Resource Management (perception/vision/camera/main.py:28-29)

def __del__(self):
    """Best-effort release of the capture device at garbage collection.

    Guarded with getattr: if __init__ failed before self.cap was assigned
    (e.g. cv2.VideoCapture raised), __del__ still runs and would otherwise
    raise a spurious AttributeError during interpreter teardown.
    """
    cap = getattr(self, "cap", None)
    if cap is not None:
        cap.release()

ImageProcessor Pipeline

Initialization (perception/vision/image_processing.py:8-11)

class ImageProcessor:
    """Runs object detection on images and tracks the best detection."""

    def __init__(self, confidence_threshold: float = 0.45):
        # Concrete YOLO-backed detector behind the abstract interface.
        self.detection: DetectionModelInterface = DetectionModel()
        # Application-level confidence floor applied after model inference.
        self.conf_threshold = confidence_threshold

Processing Pipeline

Main Processing Method (perception/vision/image_processing.py:13-22)

def read_image_path(self, path: str, 
                    draw_results: bool = True, 
                    save_drawn_img: bool = True):
    """Load an image from disk, run detection, and optionally draw/save it.

    Args:
        path: Filesystem path of the image to process.
        draw_results: Draw the best detection's box/label onto the image.
        save_drawn_img: Persist the annotated image next to the original.

    Returns:
        (processed_image, best_detection). Both are None when the file
        cannot be read; best_detection is None on processing errors.
    """
    object_image = cv2.imread(path)
    # BUGFIX: cv2.imread returns None (it does not raise) for a missing or
    # corrupt file. Fail fast here instead of feeding None into the
    # detection pipeline and relying on its broad exception handler.
    if object_image is None:
        log.error(f'Could not read image: {path}')
        return None, None

    processed_img, best_detection = self.process_image(
        object_image, 
        self.conf_threshold
    )

    # Only draw when a real detection was kept (the initial tracker dict
    # carries confidence 0.0).
    if draw_results and best_detection is not None and best_detection.get('confidence', 0) > 0:
        self._draw_detection(processed_img, best_detection)
        if save_drawn_img:
            self._save_drawn_image(processed_img, path)

    return processed_img, best_detection

Image Processing Logic (perception/vision/image_processing.py:24-72)

def process_image(self, image: np.ndarray, 
                  confidence_threshold: float = 0.45):
    """Run inference and return the highest-confidence detection.

    Args:
        image: BGR image array to analyse.
        confidence_threshold: Minimum confidence to keep a detection.

    Returns:
        (image, best_detection) where best_detection holds class /
        confidence / box / class_id; its confidence stays 0.0 when nothing
        clears the threshold. Returns (image, None) on unexpected errors.
    """
    try:
        # 1. Run inference on a copy so the model cannot mutate the
        #    caller's image.
        copy_image = image.copy()
        object_results, object_classes = self.detection.inference(copy_image)

        # 2. Initialize best detection tracker
        best_detection = {
            'class': '', 
            'confidence': 0.0, 
            'box': [], 
            'class_id': -1
        }

        # 3. Process all detections
        for res in object_results:
            boxes = res.boxes
            num_boxes = boxes.shape[0]
            if num_boxes == 0:
                continue

            confs = boxes.conf.cpu().numpy()
            coords = boxes.xyxy.cpu().numpy()

            # BUGFIX: iterate every box in the result. The previous code
            # only inspected index [0], silently dropping all other
            # detections despite intending to keep the best overall.
            for i in range(num_boxes):
                confidence = confs[i]
                if confidence < confidence_threshold:
                    continue

                class_id = int(boxes.cls[i])
                detected_class = object_classes[class_id]
                # Only the three target classes keep their label; anything
                # else is reported as 'default'.
                if detected_class in ['apple', 'orange', 'bottle']:
                    clss_object = detected_class
                else:
                    clss_object = 'default'

                log.info(f'class: {clss_object}')

                # Keep highest confidence detection
                if confidence > best_detection['confidence']:
                    best_detection.update({
                        'class': str(clss_object),
                        'confidence': float(confidence),
                        'box': coords[i],
                        'class_id': class_id
                    })

        # 4. Return final result (both paths return the same tuple; the
        #    branch only selects the log message).
        if best_detection['confidence'] >= confidence_threshold:
            log.info(f"Best detection: {best_detection}")
        else:
            log.info("No detections found")
        return image, best_detection
    except Exception as e:
        # Error level, not info: a failed pipeline should not be buried
        # in routine logs.
        log.error(f'Error in image processing: {e}')
        return image, None

Detection Data Structure

{
    'class': 'apple',           # Object class name
    'confidence': 0.87,         # Detection confidence [0-1]
    'box': [x1, y1, x2, y2],   # Bounding box coordinates
    'class_id': 47              # COCO dataset class ID
}
The system only considers three target classes: apple, orange, and bottle. All other detections are labeled as default.

Visualization

Drawing Detections (perception/vision/image_processing.py:74-87)

def _draw_detection(self, image: np.ndarray, detection: dict):
    """Overlay the detection's bounding box and class/confidence label."""
    x1, y1, x2, y2 = (int(coord) for coord in detection['box'])
    green = (0, 255, 0)  # OpenCV channel order is BGR

    cv2.rectangle(image, (x1, y1), (x2, y2), green, 2)

    caption = f"{detection['class']} {detection['confidence']:.2f}"
    # Place the label just above the box's top-left corner.
    cv2.putText(
        image, caption, (x1, y1 - 10),
        cv2.FONT_HERSHEY_SIMPLEX, 0.7, green, 2
    )

Saving Annotated Images (perception/vision/image_processing.py:89-95)

def _save_drawn_image(self, image: np.ndarray, original_path: str):
    """Save the annotated image next to the original as <name>_detected.<ext>.

    BUGFIX: the previous str.replace('.jpg', '_detected.jpg') was a no-op
    for non-jpg paths, so the annotated version of the .png files that
    CameraManager.capture_image produces silently overwrote the original
    capture. splitext handles any extension.
    """
    import os  # local import keeps this fix self-contained

    root, ext = os.path.splitext(original_path)
    out_path = f"{root}_detected{ext}"
    cv2.imwrite(out_path, image)
    log.info(f"Saved image with detections: {out_path}")

DetectionModel Interface

Abstract Interface (perception/vision/detection/main.py:9-12)

class DetectionModelInterface(ABC):
    """Abstract contract for object-detection backends.

    Implementations return raw inference results together with the model's
    class-id -> class-name mapping.
    """

    @abstractmethod
    def inference(self, image: np.ndarray) -> Tuple[Results, Dict[int, str]]:
        """Run detection on an image; return (results, class-name map)."""
        pass

YOLO Implementation (perception/vision/detection/main.py:15-22)

class DetectionModel(DetectionModelInterface):
    """YOLO-backed implementation of DetectionModelInterface."""

    def __init__(self):
        # Model weights/configuration are resolved by ModelLoader.
        self.object_model = ModelLoader().get_model()

    def inference(self, image: np.ndarray) -> tuple[list[Results], Dict[int, str]]:
        """Run YOLO prediction on a single image.

        Returns the prediction results plus the model's id->name mapping.

        NOTE(review): with stream=True, ultralytics predict() returns a
        lazy generator rather than the list the annotation claims, so any
        inference error surfaces when the caller iterates — confirm
        against the ultralytics documentation.
        """
        results = self.object_model.predict(
            image, 
            conf=0.55,      # Model-level confidence threshold
            verbose=False,  # Suppress per-frame console output
            imgsz=640,      # Inference input size in pixels
            stream=True,    # Yield results lazily (generator)
            task='detect',  # Detection task
            half=True       # FP16 inference on supported hardware
        )
        return results, self.object_model.names
The model uses FP16 (half precision) inference for improved performance on compatible hardware.

Integration with Communication Module

The perception module is tightly integrated with the communication system:

Initialization in CommunicationManager (serial_manager.py:49-50)

self.camera = CameraManager(camera_index=camera_index)
self.object_detect_model = ImageProcessor(
    confidence_threshold=0.45
)

Detection Flow (serial_manager.py:174-202)

def _handle_object_detection(self, data: dict):
    """Capture a frame, run YOLO detection, and notify the scan service.

    Runs on a worker thread: all failures are logged, never raised.
    Mutates `data` in place with the detection result before invoking the
    'scan_service' callback.
    """
    try:
        # 1. Capture image
        img_path = self.camera.capture_image()
        if not img_path:
            log.error("Camera could not capture image")
            return
        
        # 2. YOLO detection
        image, yolo_result = self.object_detect_model.read_image_path(
            img_path, 
            draw_results=True, 
            save_drawn_img=True
        )
        # BUGFIX: read_image_path returns its initialized tracker dict
        # (class '', confidence 0.0) when nothing was detected — None only
        # on errors. Filter both cases so the scan_service callback never
        # receives an empty detection.
        if yolo_result is None or yolo_result.get('confidence', 0) <= 0:
            log.info("No detections.")
            return
        
        # 3. Update data
        data.update({
            'class': yolo_result['class'],
            'confidence': yolo_result['confidence'],
            'timestamp': time.time(),
            'image_path': img_path
        })
        
        # 4. Notify the central system
        if self.callbacks.get('scan_service'):
            self.callbacks['scan_service'](data)
            
    except Exception as e:
        log.error(f"Error in object detection: {str(e)}")

Complete Detection Sequence

Performance Considerations

Camera Buffer Management

Without flushing the buffer, captured images may be stale. The 5-frame grab ensures fresh data.
for _ in range(5):
    self.cap.grab()  # Flush old frames

Threading for Non-Blocking Detection

Detection runs in a separate thread (serial_manager.py:153):
Thread(
    target=self._handle_object_detection, 
    args=(data,)
).start()
This prevents blocking the serial communication thread during inference.

Confidence Thresholding

Two-stage filtering:
  1. Model-level: conf=0.55 in YOLO inference
  2. Application-level: confidence_threshold=0.45 in ImageProcessor
Note that because the model-level threshold (0.55) is stricter than the application-level one (0.45), the second stage only takes effect if the thresholds are reconfigured.
if confidence < confidence_threshold:
    continue

Error Handling

Camera Errors

ret, frame = self.cap.read()
if not ret:
    return None

Detection Errors

try:
    # Processing logic
    ...
except Exception as e:
    log.info(f'Error in image processing: {e}')
    return image, None

Dependencies

| Package | Purpose |
| --- | --- |
| opencv-python (cv2) | Image capture and processing |
| numpy | Array operations |
| ultralytics | YOLO11 model inference |
| logging | Error and info logging |

Configuration

Camera Settings

camera_index = 0        # Default camera
width = 1280           # Frame width
height = 720           # Frame height

Detection Settings

confidence_threshold = 0.45  # Minimum confidence
model_conf = 0.55           # YOLO confidence
imgsz = 640                 # Input image size
half = True                 # FP16 inference

Target Classes

target_classes = ['apple', 'orange', 'bottle']

Usage Example

Standalone usage:
from perception.vision.camera.main import CameraManager
from perception.vision.image_processing import ImageProcessor

# Initialize
camera = CameraManager(camera_index=0)
processor = ImageProcessor(confidence_threshold=0.45)

# Capture and process
img_path = camera.capture_image()
image, detection = processor.read_image_path(
    img_path, 
    draw_results=True, 
    save_drawn_img=True
)

if detection and detection['confidence'] > 0:
    print(f"Detected: {detection['class']}")
    print(f"Confidence: {detection['confidence']:.2f}")

Next Steps

Communication

See how perception integrates with serial communication

Control

Learn how detection results guide robot movements

Build docs developers (and LLMs) love