Skip to main content

Combined Audio/Video Streaming

This advanced example demonstrates how to simultaneously stream video from a USB camera (UVC) while capturing and playing audio (UAC) - essentially creating a complete USB multimedia system on ESP32.

What This Example Demonstrates

  • Simultaneous UVC camera and UAC audio streaming
  • Concurrent microphone capture and speaker output
  • Efficient resource management for multiple streams
  • Coordinating callbacks for different streams
  • Performance optimization techniques
  • Real-world application patterns

Hardware Setup

Required Components:
  • ESP32-S3 development board (recommended for better performance)
  • USB camera with UVC support
  • USB microphone (UAC-compatible)
  • USB speaker or audio output (UAC-compatible)
  • Powered USB hub (recommended for multiple devices)
  • Adequate power supply (5V 2A minimum)
Connection Options:

Option 1: USB Hub (Recommended)
ESP32-S3 USB Port → Powered USB Hub
                     ├─ USB Camera
                     ├─ USB Microphone
                     └─ USB Speaker
Option 2: USB Headset + Camera
ESP32-S3 USB Port → USB Hub
                     ├─ USB Camera
                     └─ USB Headset (mic + speaker combined)
Power Considerations:
  • Each device draws power from USB
  • Camera: 200-500mA
  • Microphone: 50-100mA
  • Speaker: 100-500mA
  • Total: Use powered USB hub for stability

Complete Code

#include <Arduino.h>
#include "USB_STREAM.h"

// Global statistics
// Running counters for stream health, bumped from the UVC/UAC callbacks
// and reported periodically by printStatistics().
struct StreamStats {
    uint32_t cameraFrames;        // total camera frames received
    uint32_t micFrames;           // total microphone frames received
    uint32_t spkFrames;           // total speaker frames filled
    unsigned long lastPrintTime;  // millis() timestamp of the last report
} stats = {};                     // value-initialized: all counters start at zero

/* Camera frame callback */
/* Camera frame callback.
 *
 * Invoked by the USB host stack for every decoded UVC frame. Runs on the
 * USB task, not the Arduino loop() task: keep it short and non-blocking
 * (no delay(), no heavy processing).
 *
 * @param frame    Decoded camera frame (dimensions, format, payload).
 * @param user_ptr User pointer passed to uvcCamRegisterCb() (unused here).
 */
static void onCameraFrameCallback(uvc_frame *frame, void *user_ptr)
{
    stats.cameraFrames++;

    // Log roughly every 2 seconds at 15 FPS to avoid flooding the UART.
    if (stats.cameraFrames % 30 == 0) {
        // Cast explicitly so the format specifiers stay correct on every
        // toolchain (uint32_t may be `unsigned int` or `unsigned long`,
        // and the frame fields' exact widths are library-defined).
        Serial.printf("[CAM] Frame #%lu: %ux%u, format=%d, %lu bytes\n",
                     (unsigned long)stats.cameraFrames,
                     (unsigned)frame->width, (unsigned)frame->height,
                     (int)frame->frame_format,
                     (unsigned long)frame->data_bytes);
    }

    // Your video processing here:
    // - Save frame to SD card
    // - Detect motion
    // - Send over WiFi
    // - Process with AI/ML
}

/* Microphone callback */
/* Microphone frame callback.
 *
 * Invoked by the USB host stack for each captured UAC audio chunk
 * (~20 ms of samples at the negotiated rate). Runs on the USB task:
 * keep it short and non-blocking.
 *
 * @param frame Captured audio (sample rate, bit depth, payload).
 * @param ptr   User pointer passed to uacMicRegisterCb() (unused here).
 */
static void onMicFrameCallback(mic_frame_t *frame, void *ptr)
{
    stats.micFrames++;

    // Log roughly once per second (50 frames x ~20 ms chunks).
    if (stats.micFrames % 50 == 0) {
        // Cast explicitly so the format specifiers stay correct regardless
        // of the library-defined widths of the frame fields and of uint32_t.
        Serial.printf("[MIC] Frame #%lu: %u Hz, %u-bit, %lu bytes\n",
                     (unsigned long)stats.micFrames,
                     (unsigned)frame->samples_frequence,
                     (unsigned)frame->bit_resolution,
                     (unsigned long)frame->data_bytes);
    }

    // Your audio processing here:
    // - Voice activity detection
    // - Audio level metering
    // - Save to SD card
    // - Speech recognition
    // - Echo to speaker (see combined example below)
}

/* Speaker callback */
/* Speaker frame callback.
 *
 * Invoked whenever the UAC speaker needs a buffer of audio; fills it with
 * a continuous 440 Hz sine tone. Runs on the USB task: finish well within
 * the frame period (~20 ms).
 *
 * @param frame Output frame: write frame->data_bytes bytes of PCM into
 *              frame->data. Assumes 16-bit samples — TODO confirm against
 *              the negotiated speaker bit depth.
 * @param ptr   User pointer passed to uacSpkRegisterCb() (unused here).
 */
static void onSpeakerFrameCallback(spk_frame_t *frame, void *ptr)
{
    stats.spkFrames++;

    // Phase persists across callbacks so the tone is continuous frame-to-frame.
    static float phase = 0.0;
    int16_t *samples = (int16_t *)frame->data;
    int numSamples = frame->data_bytes / 2;  // 2 bytes per 16-bit sample

    // The sample rate is constant for the duration of one frame, so compute
    // the per-sample phase increment once instead of once per sample.
    const float phaseStep = 2.0 * PI * 440.0 / frame->samples_frequence;

    for (int i = 0; i < numSamples; i++) {
        samples[i] = (int16_t)(sin(phase) * 5000);  // Quiet tone (~15% of full scale)
        phase += phaseStep;
        if (phase >= 2.0 * PI) phase -= 2.0 * PI;   // wrap to keep float precision
    }

    // Alternative: Play audio from buffer/SD/network
    // Alternative: Echo microphone input (see example below)
}

/* Print stream counters and free heap, rate-limited to one report every
 * 5 seconds. Call from loop(); it is a cheap no-op between reports.
 * Watch for counters that stop increasing (a stream stalled) and a
 * steadily shrinking heap (memory leak). */
void printStatistics()
{
    unsigned long now = millis();
    // Unsigned subtraction stays correct across millis() rollover (~49 days).
    if (now - stats.lastPrintTime >= 5000) {
        stats.lastPrintTime = now;
        Serial.println("\n=== Streaming Statistics ===");
        // Casts keep "%lu" correct whether uint32_t is unsigned int or
        // unsigned long on the active toolchain.
        Serial.printf("Camera frames: %lu\n", (unsigned long)stats.cameraFrames);
        Serial.printf("Mic frames:    %lu\n", (unsigned long)stats.micFrames);
        Serial.printf("Speaker frames: %lu\n", (unsigned long)stats.spkFrames);
        Serial.printf("Free heap:     %lu bytes\n", (unsigned long)ESP.getFreeHeap());
        Serial.println("============================\n");
    }
}

/* One-time initialization: allocate UVC buffers, configure the camera and
 * audio streams, register callbacks, and start USB streaming.
 *
 * Order matters: uvcConfiguration() -> uacConfiguration() -> register all
 * callbacks -> start() once (start() launches every configured stream). */
void setup()
{
    Serial.begin(115200);
    Serial.println("\n\nESP32 Combined USB Streaming Example");
    Serial.println("Starting UVC camera + UAC audio...");

    // Instantiate USB_STREAM object. Intentionally never deleted: it must
    // live for as long as the device streams (i.e. forever).
    USB_STREAM *usb = new USB_STREAM();

    // ===== Allocate UVC Camera Buffers =====
    // assert() is compiled out when NDEBUG is defined (typical release
    // builds), so check the allocations explicitly and halt with a clear
    // diagnostic instead of crashing later on a null pointer.
    uint8_t *_xferBufferA = (uint8_t *)malloc(55 * 1024);
    uint8_t *_xferBufferB = (uint8_t *)malloc(55 * 1024);
    uint8_t *_frameBuffer = (uint8_t *)malloc(55 * 1024);
    if (_xferBufferA == NULL || _xferBufferB == NULL || _frameBuffer == NULL) {
        Serial.println("[FATAL] Camera buffer allocation failed - halting");
        while (true) { delay(1000); }  // nothing sensible to do without buffers
    }
    Serial.println("[INIT] Camera buffers allocated");

    // ===== Configure UVC Camera =====
    // Resolution: any, FPS: 15, transfer/frame buffers: 55 KB each.
    usb->uvcConfiguration(
        FRAME_RESOLUTION_ANY,    // Width: any
        FRAME_RESOLUTION_ANY,    // Height: any
        FRAME_INTERVAL_FPS_15,   // 15 FPS
        55 * 1024,               // Transfer buffer size
        _xferBufferA,            // Transfer buffer A
        _xferBufferB,            // Transfer buffer B
        55 * 1024,               // Frame buffer size
        _frameBuffer             // Frame buffer
    );
    Serial.println("[INIT] Camera configured");

    // ===== Configure UAC Audio =====
    // Mic and speaker both accept any format; 6400-byte buffers each.
    usb->uacConfiguration(
        UAC_CH_ANY,              // Mic: any channels
        UAC_BITS_ANY,            // Mic: any bit depth
        UAC_FREQUENCY_ANY,       // Mic: any sample rate
        6400,                    // Mic: buffer size
        UAC_CH_ANY,              // Speaker: any channels
        UAC_BITS_ANY,            // Speaker: any bit depth
        UAC_FREQUENCY_ANY,       // Speaker: any sample rate
        6400                     // Speaker: buffer size
    );
    Serial.println("[INIT] Audio configured");

    // ===== Register All Callbacks (before start) =====
    usb->uvcCamRegisterCb(&onCameraFrameCallback, NULL);
    usb->uacMicRegisterCb(&onMicFrameCallback, NULL);
    usb->uacSpkRegisterCb(&onSpeakerFrameCallback, NULL);
    Serial.println("[INIT] Callbacks registered");

    // ===== Start Streaming (camera + mic + speaker together) =====
    usb->start();
    Serial.println("[INIT] USB streaming started");

    // Block until devices enumerate, up to 2 seconds.
    Serial.println("[INIT] Waiting for devices...");
    usb->connectWait(2000);

    Serial.println("\n*** Streaming active! ***\n");
    stats.lastPrintTime = millis();

    // Demo only: let the streams run for 10 s before adjusting volume.
    // Remove this blocking delay in a real application.
    delay(10000);

    Serial.println("[DEMO] Setting volumes...");
    // NOTE(review): the library passes the level through a void* — confirm
    // the expected encoding against the USB_STREAM header.
    usb->uacMicVolume((void *)70);   // Mic: 70%
    usb->uacSpkVolume((void *)60);   // Speaker: 60%

    // Do NOT free _xferBufferA/_xferBufferB/_frameBuffer while streaming is
    // active: the driver reads and writes them for as long as start() runs.
}

// Main loop: only prints periodic statistics — all streaming work happens
// in the USB callbacks. vTaskDelay() yields the CPU to the USB/audio tasks;
// its argument is in FreeRTOS ticks (100 ms at the default 1 kHz tick rate —
// use pdMS_TO_TICKS(100) to be explicit about milliseconds).
void loop()
{
    printStatistics();
    vTaskDelay(100);
}

Code Explanation

1. Global Statistics Tracking

struct StreamStats {
    uint32_t cameraFrames;
    uint32_t micFrames;
    uint32_t spkFrames;
    unsigned long lastPrintTime;
} stats = {0, 0, 0, 0};
Why track statistics?
  • Monitor stream health (are all callbacks firing?)
  • Detect performance issues (frame rate drops)
  • Debug resource usage
  • Verify synchronization

2. Coordinated Callbacks

Each stream has its own callback running independently:
onCameraFrameCallback()   // ~15 times/second (15 FPS)
onMicFrameCallback()      // ~50 times/second (20ms audio chunks)
onSpeakerFrameCallback()  // ~50 times/second (20ms audio chunks)
Critical considerations:
  • Callbacks run on different threads
  • Keep each callback fast (< 10ms)
  • Avoid blocking operations
  • Use thread-safe data structures for inter-callback communication

3. Memory Allocation Strategy

// UVC: 3 buffers × 55KB = 165KB
uint8_t *_xferBufferA = (uint8_t *)malloc(55 * 1024);
uint8_t *_xferBufferB = (uint8_t *)malloc(55 * 1024);
uint8_t *_frameBuffer = (uint8_t *)malloc(55 * 1024);

// UAC: Managed internally, 2 × 6400 bytes = 12.5KB
// Total: ~178KB minimum
ESP32-S3 Memory:
  • Total RAM: ~400KB
  • After buffers: ~220KB free
  • Monitor with ESP.getFreeHeap()
  • Use PSRAM for larger buffers if available

4. Configuration for Multiple Streams

// Camera configuration
usb->uvcConfiguration(...);

// Audio configuration (both mic and speaker)
usb->uacConfiguration(
    /* Microphone params */, /* Speaker params */
);
Order matters:
  1. Configure UVC first
  2. Configure UAC second
  3. Register all callbacks
  4. Call start() once (starts everything)

5. Resource Monitoring

void printStatistics()
{
    Serial.printf("Free heap: %lu bytes\n", ESP.getFreeHeap());
}
Watch for:
  • Decreasing free heap (memory leak)
  • Callbacks not incrementing (stream stopped)
  • Uneven frame rates (performance issue)

Advanced Usage Patterns

Pattern 1: Audio Echo (Microphone → Speaker)

Create a real-time audio passthrough:
#include "freertos/queue.h"

// Ring buffer for audio data
QueueHandle_t audioQueue;
const int QUEUE_SIZE = 10;

void setup() {
    // Create queue for audio buffers
    audioQueue = xQueueCreate(QUEUE_SIZE, 6400);  // 10 buffers of 6400 bytes
    // ... rest of setup
}

// Pattern 1, capture side: snapshot each mic frame into the queue
// (xQueueSend copies the item, so `buffer` may be reused immediately).
// NOTE(review): the 6400-byte local lives on the USB task's stack — confirm
// that stack is large enough. Also assumes frame->data_bytes <= 6400; a
// larger frame would overflow `buffer`. Add a size guard in production code.
static void onMicFrameCallback(mic_frame_t *frame, void *ptr)
{
    // Copy microphone data to queue
    uint8_t buffer[6400];
    memcpy(buffer, frame->data, frame->data_bytes);
    xQueueSend(audioQueue, buffer, 0);  // Non-blocking
}

// Pattern 1, playback side: dequeue one captured mic chunk and copy it into
// the speaker frame; output silence when the queue is empty.
// NOTE(review): copies frame->data_bytes bytes out of a 6400-byte queued
// item — assumes the speaker frame size never exceeds the mic chunk size
// and that mic/speaker formats (rate, bit depth, channels) match. Verify
// both against the negotiated UAC formats.
static void onSpeakerFrameCallback(spk_frame_t *frame, void *ptr)
{
    // Read from queue and play to speaker
    uint8_t buffer[6400];
    if (xQueueReceive(audioQueue, buffer, 0) == pdTRUE) {
        memcpy(frame->data, buffer, frame->data_bytes);
    } else {
        // No data available - output silence
        memset(frame->data, 0, frame->data_bytes);
    }
}
Result: Real-time audio monitoring with ~20-200ms latency.

Pattern 2: Video Recording with Audio

Synchronize audio and video recording:
#include <SD.h>

File videoFile;
File audioFile;
bool recording = false;
unsigned long recordingStartTime = 0;

void startRecording() {
    recordingStartTime = millis();
    videoFile = SD.open("/video.mjpeg", FILE_WRITE);
    audioFile = SD.open("/audio.raw", FILE_WRITE);
    recording = true;
    Serial.println("Recording started");
}

void stopRecording() {
    recording = false;
    videoFile.close();
    audioFile.close();
    Serial.println("Recording stopped");
}

static void onCameraFrameCallback(uvc_frame *frame, void *user_ptr)
{
    if (recording && videoFile) {
        // Write timestamp + frame
        uint32_t timestamp = millis() - recordingStartTime;
        videoFile.write((uint8_t*)&timestamp, 4);
        videoFile.write((uint8_t*)&frame->data_bytes, 4);
        videoFile.write(frame->data, frame->data_bytes);
    }
}

static void onMicFrameCallback(mic_frame_t *frame, void *ptr)
{
    if (recording && audioFile) {
        // Write timestamp + audio
        uint32_t timestamp = millis() - recordingStartTime;
        audioFile.write((uint8_t*)&timestamp, 4);
        audioFile.write(frame->data, frame->data_bytes);
    }
}

Pattern 3: WiFi Streaming (Video + Audio)

Stream to network client:
#include <WiFi.h>

WiFiClient videoClient;
WiFiClient audioClient;

void setup() {
    WiFi.begin("SSID", "password");
    while (WiFi.status() != WL_CONNECTED) delay(500);
    
    videoClient.connect("192.168.1.100", 8080);  // Video stream
    audioClient.connect("192.168.1.100", 8081);  // Audio stream
    
    // ... USB setup
}

static void onCameraFrameCallback(uvc_frame *frame, void *user_ptr)
{
    if (videoClient.connected()) {
        // Send frame size, then frame data
        videoClient.write((uint8_t*)&frame->data_bytes, 4);
        videoClient.write(frame->data, frame->data_bytes);
    }
}

static void onMicFrameCallback(mic_frame_t *frame, void *ptr)
{
    if (audioClient.connected()) {
        audioClient.write(frame->data, frame->data_bytes);
    }
}

Pattern 4: Motion Detection with Audio Alert

Detect motion in video, trigger audio alert:
bool motionDetected = false;
unsigned long motionTime = 0;

// Simple motion detection (compare frames)
uint8_t previousFrame[55 * 1024];
bool hasPixelFrame = false;

static void onCameraFrameCallback(uvc_frame *frame, void *user_ptr)
{
    if (hasPixelFrame) {
        // Compare with previous frame
        int differences = 0;
        for (int i = 0; i < frame->data_bytes; i += 100) {  // Sample pixels
            if (abs(frame->data[i] - previousFrame[i]) > 20) {
                differences++;
            }
        }
        
        if (differences > 50) {  // Threshold
            motionDetected = true;
            motionTime = millis();
            Serial.println("[ALERT] Motion detected!");
        }
    }
    
    // Save current frame for next comparison
    memcpy(previousFrame, frame->data, frame->data_bytes);
    hasPixelFrame = true;
}

static void onSpeakerFrameCallback(spk_frame_t *frame, void *ptr)
{
    if (motionDetected && (millis() - motionTime < 2000)) {
        // Play alert tone for 2 seconds after motion
        static float phase = 0.0;
        int16_t *samples = (int16_t *)frame->data;
        int numSamples = frame->data_bytes / 2;
        
        for (int i = 0; i < numSamples; i++) {
            // Alternating 800Hz and 1000Hz (alarm sound)
            float freq = ((millis() / 500) % 2) ? 800.0 : 1000.0;
            samples[i] = (int16_t)(sin(phase) * 15000);
            phase += 2.0 * PI * freq / frame->samples_frequence;
            if (phase >= 2.0 * PI) phase -= 2.0 * PI;
        }
    } else {
        // Silence
        memset(frame->data, 0, frame->data_bytes);
        motionDetected = false;
    }
}

Performance Optimization

1. Reduce Frame Rate

Lower FPS reduces CPU load:
// Instead of 15 FPS:
usb->uvcConfiguration(..., FRAME_INTERVAL_FPS_15, ...);

// Try 10 FPS for better stability:
usb->uvcConfiguration(..., FRAME_INTERVAL_FPS_10, ...);

2. Lower Camera Resolution

// Request specific resolution
usb->uvcConfiguration(
    320,                     // Width: 320px
    240,                     // Height: 240px
    FRAME_INTERVAL_FPS_15,
    // Smaller buffers needed:
    20 * 1024, _xferBufferA, _xferBufferB,
    20 * 1024, _frameBuffer
);

3. Minimize Callback Processing

Bad (blocks callback):
static void onCameraFrameCallback(uvc_frame *frame, void *user_ptr) {
    // Heavy processing in callback - BAD!
    processImageWithAI(frame->data);
    sendOverWiFi(frame->data, frame->data_bytes);
    delay(50);
}
Good (quick handoff):
// Queue of uint8_t* frame-data pointers handed from the USB callback to loop().
QueueHandle_t frameQueue;

// WARNING(review): this queues only the POINTER into the driver's frame
// buffer, not the pixel data. The driver may overwrite that buffer with the
// next frame before loop() dequeues it — fine as a sketch of the handoff
// idea, but production code must copy the payload (or use a ring of owned
// buffers) before the callback returns.
static void onCameraFrameCallback(uvc_frame *frame, void *user_ptr) {
    // Quick copy to queue
    xQueueSend(frameQueue, &frame->data, 0);
}

// Drain the queue from the main task so heavy processing never runs inside
// the USB callback.
void loop() {
    uint8_t *frameData;
    if (xQueueReceive(frameQueue, &frameData, 10) == pdTRUE) {
        // Process in main loop, not callback
        processImageWithAI(frameData);
    }
}

4. Use PSRAM (ESP32-S3)

If your ESP32-S3 has PSRAM:
void setup() {
    // Allocate large buffers in PSRAM
    uint8_t *_xferBufferA = (uint8_t *)heap_caps_malloc(100 * 1024, MALLOC_CAP_SPIRAM);
    uint8_t *_xferBufferB = (uint8_t *)heap_caps_malloc(100 * 1024, MALLOC_CAP_SPIRAM);
    uint8_t *_frameBuffer = (uint8_t *)heap_caps_malloc(100 * 1024, MALLOC_CAP_SPIRAM);
    
    // Supports larger frames (720p)
}

Troubleshooting

Problem: Only camera or audio works, not both

Cause: Configuration order or missing callback registration. Solution:
// Correct order:
usb->uvcConfiguration(...);         // 1. Camera first
usb->uacConfiguration(...);         // 2. Audio second
usb->uvcCamRegisterCb(...);         // 3. Register callbacks
usb->uacMicRegisterCb(...);
usb->uacSpkRegisterCb(...);
usb->start();                       // 4. Start once

Problem: System crashes or resets

Causes:
  • Memory allocation failed
  • Buffer overflow
  • Stack overflow in callback
Solutions:
// 1. Check all allocations
assert(_xferBufferA != NULL);

// 2. Monitor heap
if (ESP.getFreeHeap() < 50000) {
    Serial.println("LOW MEMORY WARNING!");
}

// 3. Increase stack size for tasks
// In sdkconfig or menuconfig

Problem: Audio/video out of sync

Cause: Different callback rates and buffering delays. Solution: Add timestamps to synchronize:
struct TimestampedFrame {
    unsigned long timestamp;
    uint8_t data[6400];
};

static void onMicFrameCallback(mic_frame_t *frame, void *ptr) {
    TimestampedFrame tf;
    tf.timestamp = micros();  // High precision timestamp
    memcpy(tf.data, frame->data, frame->data_bytes);
    // Store with timestamp
}

Problem: Dropped frames

Symptoms: The camera frame counter advances more slowly than the configured frame rate (e.g. fewer than ~15 increments per second at FRAME_INTERVAL_FPS_15). Solutions:
  • Reduce frame rate
  • Increase buffer sizes
  • Minimize callback processing
  • Use lower resolution
  • Check USB hub power

Real-World Applications

1. Video Conferencing Device

  • Camera: Capture video
  • Microphone: Capture voice
  • Speaker: Play remote audio
  • WiFi: Stream bidirectional A/V

2. Security Camera with Audio

  • Camera: Motion detection
  • Microphone: Audio events (glass breaking, etc.)
  • Speaker: Two-way communication
  • SD Card: Local recording

3. Baby Monitor

  • Camera: Night vision camera
  • Microphone: Cry detection
  • Speaker: Soothing sounds
  • WiFi: Stream to phone app

4. Podcast Recording Studio

  • Camera: Video recording
  • Microphone: Voice capture
  • Speaker: Monitoring playback
  • SD Card: High-quality recording

UVC Camera

Camera-only streaming basics

UAC Microphone

Microphone capture fundamentals

UAC Speaker

Speaker output basics

Build docs developers (and LLMs) love