Overview
The Perception Module provides vision capabilities for the robotic arm system. It integrates camera hardware, image processing, and YOLO-based object detection to identify and classify objects in the environment.
Module Structure
perception/
├── vision/
│ ├── camera/
│ │ └── main.py # CameraManager
│ ├── detection/
│ │ ├── main.py # DetectionModel interface
│ │ └── model_loader.py # YOLO model loading
│ └── image_processing.py # ImageProcessor pipeline
CameraManager
Initialization (perception/vision/camera/main.py:5-9)
class CameraManager:
    def __init__(self, camera_index: int = 0,
                 width: int = 1280,
                 height: int = 720):
        self.cap = cv2.VideoCapture(camera_index)
        self.cap.set(cv2.CAP_PROP_FRAME_WIDTH, width)
        self.cap.set(cv2.CAP_PROP_FRAME_HEIGHT, height)
Image Capture (perception/vision/camera/main.py:11-26)
def capture_image(self):
    # Flush camera buffer with 5 grabs
    for _ in range(5):
        self.cap.grab()
    ret, frame = self.cap.read()
    if not ret:
        return None
    current_dir = os.path.dirname(os.path.abspath(__file__))
    timestamp = time.strftime("%Y%m%d-%H%M%S")
    filename = f"{current_dir}/objects_images/{timestamp}.png"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    cv2.imwrite(filename, frame)
    return filename
The camera buffer is flushed with 5 frame grabs to ensure the captured image is current, not stale data from the buffer.
Resource Management (perception/vision/camera/main.py:28-29)
def __del__(self):
    self.cap.release()
ImageProcessor Pipeline
Initialization (perception/vision/image_processing.py:8-11)
class ImageProcessor:
    def __init__(self, confidence_threshold: float = 0.45):
        self.detection: DetectionModelInterface = DetectionModel()
        self.conf_threshold = confidence_threshold
Processing Pipeline
Main Processing Method (perception/vision/image_processing.py:13-22)
def read_image_path(self, path: str,
                    draw_results: bool = True,
                    save_drawn_img: bool = True):
    object_image = cv2.imread(path)
    processed_img, best_detection = self.process_image(
        object_image,
        self.conf_threshold
    )
    if draw_results and best_detection is not None and best_detection.get('confidence', 0) > 0:
        self._draw_detection(processed_img, best_detection)
        if save_drawn_img:
            self._save_drawn_image(processed_img, path)
    return processed_img, best_detection
Image Processing Logic (perception/vision/image_processing.py:24-72)
def process_image(self, image: np.ndarray,
                  confidence_threshold: float = 0.45):
    try:
        # 1. Run inference
        copy_image = image.copy()
        object_results, object_classes = self.detection.inference(copy_image)

        # 2. Initialize best detection tracker
        best_detection = {
            'class': '',
            'confidence': 0.0,
            'box': [],
            'class_id': -1
        }

        # 3. Process all detections
        for res in object_results:
            boxes = res.boxes
            if boxes.shape[0] == 0:
                continue
            confidence = boxes.conf.cpu().numpy()[0]
            class_id = int(boxes.cls[0])
            box_data = boxes.xyxy.cpu().numpy()[0]
            if confidence < confidence_threshold:
                continue
            detected_class = object_classes[class_id]
            clss_object = 'default'
            # Filter for target classes
            if detected_class in ['apple', 'orange', 'bottle']:
                clss_object = detected_class
            log.info(f'class: {clss_object}')
            # Keep highest confidence detection
            if confidence > best_detection['confidence']:
                best_detection.update({
                    'class': str(clss_object),
                    'confidence': float(confidence),
                    'box': box_data,
                    'class_id': class_id
                })

        # 4. Return final result
        if best_detection['confidence'] >= confidence_threshold:
            log.info(f"Best detection: {best_detection}")
            return image, best_detection
        else:
            log.info("No detections found")
            return image, best_detection
    except Exception as e:
        log.info(f'Error in image processing: {e}')
        return image, None
Detection Data Structure
{
    'class': 'apple',         # Object class name
    'confidence': 0.87,       # Detection confidence [0-1]
    'box': [x1, y1, x2, y2],  # Bounding box coordinates
    'class_id': 47            # COCO dataset class ID
}
The system only considers three target classes: apple, orange, and bottle. All other detections are labeled as default.
Visualization
Drawing Detections (perception/vision/image_processing.py:74-87)
def _draw_detection(self, image: np.ndarray, detection: dict):
    """Draw bounding box and label on image"""
    box = detection['box']
    class_name = detection['class']
    confidence = detection['confidence']
    x1, y1, x2, y2 = map(int, box)
    color = (0, 255, 0)  # BGR - green
    cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)
    label = f"{class_name} {confidence:.2f}"
    cv2.putText(
        image, label, (x1, y1 - 10),
        cv2.FONT_HERSHEY_SIMPLEX, 0.7, color, 2
    )
Saving Annotated Images (perception/vision/image_processing.py:89-95)
def _save_drawn_image(self, image: np.ndarray, original_path: str):
    """Save image with drawn detections"""
    out_path = original_path.replace('.jpg', '_detected.jpg')
    cv2.imwrite(out_path, image)
    log.info(f"Saved image with detections: {out_path}")

Note (review): `capture_image` saves `.png` files, so the `'.jpg'` replacement never matches and `out_path` equals `original_path` — the annotated image overwrites the original capture. This inconsistency should be confirmed against the source.
DetectionModel Interface
Abstract Interface (perception/vision/detection/main.py:9-12)
class DetectionModelInterface(ABC):
    @abstractmethod
    def inference(self, image: np.ndarray) -> Tuple[Results, Dict[int, str]]:
        pass
YOLO Implementation (perception/vision/detection/main.py:15-22)
class DetectionModel(DetectionModelInterface):
    def __init__(self):
        self.object_model = ModelLoader().get_model()

    def inference(self, image: np.ndarray) -> tuple[list[Results], Dict[int, str]]:
        results = self.object_model.predict(
            image,
            conf=0.55,       # Confidence threshold
            verbose=False,   # Suppress output
            imgsz=640,       # Input size
            stream=True,     # Stream results
            task='detect',   # Detection task
            half=True        # FP16 inference
        )
        return results, self.object_model.names
The model uses FP16 (half precision) inference for improved performance on compatible hardware.
Integration with Communication Module
The perception module is tightly integrated with the communication system:
Initialization in CommunicationManager (serial_manager.py:49-50)
self.camera = CameraManager(camera_index=camera_index)
self.object_detect_model = ImageProcessor(
    confidence_threshold=0.45
)
Detection Flow (serial_manager.py:174-202)
def _handle_object_detection(self, data: dict):
    """Object detection in real time"""
    try:
        # 1. Capture image
        img_path = self.camera.capture_image()
        if not img_path:
            log.error("Camera could not capture image")
            return
        # 2. YOLO detection
        image, yolo_result = self.object_detect_model.read_image_path(
            img_path,
            draw_results=True,
            save_drawn_img=True
        )
        if yolo_result is None:
            log.info("No detections.")
            return
        # 3. Update data
        data.update({
            'class': yolo_result['class'],
            'confidence': yolo_result['confidence'],
            'timestamp': time.time(),
            'image_path': img_path
        })
        # 4. Notify the central system
        if self.callbacks.get('scan_service'):
            self.callbacks['scan_service'](data)
    except Exception as e:
        log.error(f"Error in object detection: {str(e)}")
Complete Detection Sequence
Camera Buffer Management
Without flushing the buffer, captured images may be stale. The 5-frame grab ensures fresh data.
for _ in range(5):
    self.cap.grab()  # Flush old frames
Threading for Non-Blocking Detection
Detection runs in a separate thread (serial_manager.py:153):
Thread(
    target=self._handle_object_detection,
    args=(data,)
).start()
This prevents blocking the serial communication thread during inference.
Confidence Thresholding
Two-stage filtering:
Model-level : conf=0.55 in YOLO inference
Application-level : confidence_threshold=0.45 in ImageProcessor
if confidence < confidence_threshold:
    continue
Error Handling
Camera Errors
ret, frame = self.cap.read()
if not ret:
    return None
Detection Errors
try:
    # Processing logic
    ...
except Exception as e:
    log.info(f'Error in image processing: {e}')
    return image, None
Dependencies
| Package | Purpose |
| --- | --- |
| opencv-python (cv2) | Image capture and processing |
| numpy | Array operations |
| ultralytics | YOLO11 model inference |
| logging | Error and info logging |
Configuration
Camera Settings
camera_index = 0  # Default camera
width = 1280      # Frame width
height = 720      # Frame height
Detection Settings
confidence_threshold = 0.45  # Minimum confidence
model_conf = 0.55            # YOLO confidence
imgsz = 640                  # Input image size
half = True                  # FP16 inference
Target Classes
target_classes = ['apple', 'orange', 'bottle']
Usage Example
Standalone usage:
from perception.vision.camera.main import CameraManager
from perception.vision.image_processing import ImageProcessor

# Initialize
camera = CameraManager(camera_index=0)
processor = ImageProcessor(confidence_threshold=0.45)

# Capture and process
img_path = camera.capture_image()
image, detection = processor.read_image_path(
    img_path,
    draw_results=True,
    save_drawn_img=True
)

if detection and detection['confidence'] > 0:
    print(f"Detected: {detection['class']}")
    print(f"Confidence: {detection['confidence']:.2f}")
Next Steps
Communication See how perception integrates with serial communication
Control Learn how detection results guide robot movements