Skip to main content

Overview

The Audio Track analyzes raw audio streams to detect distress signals, voice characteristics, and acoustic hazards independently of speech content.

Source Code

Location: app/agents/audio_track.py
def analyze_audio_frame(frame: bytes) -> dict:
    """Placeholder analyzer: reports every frame as calm with no hazards.

    Args:
        frame: raw PCM audio bytes (currently ignored).

    Returns:
        dict with a zero ``distress_score`` and an empty ``hazards`` list.
    """
    # TODO: replace with real feature extraction (pitch, volume, hazards).
    return dict(distress_score=0.0, hazards=[])
Current implementation is a stub that returns placeholder values. Production version would analyze audio features like pitch, volume, speech rate, and voice quality.

Distress Detection

Audio Features

The audio analyzer would extract:
  • Pitch Analysis: High pitch indicates stress
  • Volume Dynamics: Sudden changes suggest urgency
  • Speech Rate: Rapid speech correlates with panic
  • Voice Quality: Trembling, crying, shouting
  • Background Sounds: Sirens, screams, crashes

Distress Score

The output dict carries a distress score as a float between 0.0 and 1.0:
{
    "distress_score": 0.75,  # 0.0 = calm, 1.0 = extreme distress
    "hazards": ["background_screaming", "loud_crash"]
}

Implementation Pattern

Frame-by-Frame Analysis

import numpy as np
from scipy import signal

def analyze_audio_frame(frame: bytes) -> dict:
    """Analyze one audio frame for distress indicators.

    Args:
        frame: Raw PCM audio bytes, 16-bit samples (typically 20-100ms worth).

    Returns:
        dict with ``distress_score`` (float) and ``hazards`` (list[str]).
    """
    # Decode the raw PCM bytes into 16-bit samples.
    samples = np.frombuffer(frame, dtype=np.int16)

    # Per-frame feature extraction.
    rms = compute_rms_volume(samples)
    f0 = estimate_pitch(samples)
    spectral = compute_spectral_features(samples)

    # Fuse the features into a single 0-1 distress score.
    score = aggregate_distress(
        volume=rms,
        pitch=f0,
        spectral_flux=spectral['flux'],
    )

    return {
        "distress_score": float(score),
        # Acoustic hazard tags detected in the background.
        "hazards": detect_hazards(samples, spectral),
    }

Feature Extraction

Volume (RMS)
def compute_rms_volume(audio: np.ndarray) -> float:
    """Return the root-mean-square amplitude of one frame.

    Args:
        audio: 1-D array of PCM samples (any numeric dtype).

    Returns:
        RMS amplitude as a float; 0.0 for an empty frame.
    """
    if audio.size == 0:
        # np.mean of an empty array yields NaN (with a RuntimeWarning);
        # treat an empty frame as silence instead.
        return 0.0
    # Cast to float64 before squaring so int16 samples cannot overflow.
    return float(np.sqrt(np.mean(audio.astype(np.float64) ** 2)))
Pitch Estimation
def estimate_pitch(audio: np.ndarray, sample_rate: int = 8000) -> float:
    """Estimate the fundamental frequency using autocorrelation.

    Args:
        audio: 1-D array of PCM samples (e.g. int16).
        sample_rate: sampling rate in Hz (default 8000, telephony rate).

    Returns:
        Estimated fundamental frequency in Hz, or 0.0 when no periodic
        structure is found (silence, empty frame).
    """
    if audio.size == 0:
        return 0.0
    # Cast to float64 first: autocorrelating int16 samples in their native
    # dtype overflows and corrupts the peak structure.
    x = audio.astype(np.float64)
    autocorr = np.correlate(x, x, mode='full')
    # Keep only non-negative lags (autocorrelation is symmetric).
    autocorr = autocorr[len(autocorr) // 2:]

    # First peak after zero lag approximates the fundamental period.
    peaks = signal.find_peaks(autocorr)[0]
    if len(peaks) == 0:
        return 0.0

    period = peaks[0]
    return sample_rate / period if period > 0 else 0.0
Spectral Features
def compute_spectral_features(audio: np.ndarray, sample_rate: int = 8000) -> dict:
    """Compute frequency-domain features for one frame.

    Args:
        audio: 1-D array of PCM samples.
        sample_rate: sampling rate in Hz, used to report centroid/rolloff
            in Hz rather than FFT-bin indices (default 8000).

    Returns:
        dict with:
            'flux': sum of squared differences between adjacent magnitude
                bins (a crude spectral-change measure),
            'centroid': magnitude-weighted mean frequency in Hz,
            'rolloff': frequency in Hz below which 85% of the spectral
                magnitude accumulates.
        All values are 0.0 for an empty or silent frame.
    """
    # Empty or silent frames previously crashed here (rfft of an empty
    # array raises; a zero spectrum divides by zero and indexes an empty
    # np.where result). Report all-zero features instead.
    if audio.size == 0:
        return {'flux': 0.0, 'centroid': 0.0, 'rolloff': 0.0}

    magnitude = np.abs(np.fft.rfft(audio.astype(np.float64)))
    total = np.sum(magnitude)
    if total == 0:
        return {'flux': 0.0, 'centroid': 0.0, 'rolloff': 0.0}

    # Bin-center frequencies in Hz. Reporting raw bin indices (as before)
    # made Hz-valued thresholds downstream (e.g. centroid > 4000) unreachable.
    freqs = np.fft.rfftfreq(len(audio), d=1.0 / sample_rate)
    # First bin where the cumulative magnitude reaches 85% of the total.
    rolloff_idx = int(np.searchsorted(np.cumsum(magnitude), 0.85 * total))

    return {
        'flux': float(np.sum(np.diff(magnitude) ** 2)),
        'centroid': float(np.sum(magnitude * freqs) / total),
        'rolloff': float(freqs[min(rolloff_idx, len(freqs) - 1)]),
    }

Distress Aggregation

Rule-Based Scoring

def aggregate_distress(volume: float, pitch: float, spectral_flux: float) -> float:
    """Fuse per-frame audio features into one distress score in [0, 1].

    Loudness (shouting), elevated pitch (vocal stress) and rapid spectral
    change (trembling voice) each push the score upward.
    """
    # Normalise each feature to [0, 1] with a fixed cap.
    normalised = (
        min(volume / 10000, 1.0),        # loudness
        min(pitch / 400, 1.0),           # 400 Hz is high for stressed speech
        min(spectral_flux / 1e6, 1.0),   # spectral change
    )
    # Fixed weighted combination of the three features.
    weights = (0.4, 0.3, 0.3)
    score = sum(w * f for w, f in zip(weights, normalised))

    # Clamp defensively to the documented [0, 1] range.
    return min(max(score, 0.0), 1.0)

ML-Based Scoring

Production systems might use trained models:
import torch
from transformers import Wav2Vec2ForSequenceClassification

class DistressClassifier:
    """Wav2Vec2-based binary distress classifier for raw audio frames."""

    def __init__(self):
        # Local import keeps this snippet self-contained; the model and its
        # matching feature extractor must come from the same checkpoint.
        from transformers import (
            Wav2Vec2FeatureExtractor,
            Wav2Vec2ForSequenceClassification,
        )
        self.model = Wav2Vec2ForSequenceClassification.from_pretrained(
            "distress-detector-v1"
        )
        # Bug fix: predict() used self.processor, but it was never created.
        self.processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "distress-detector-v1"
        )

    def predict(self, audio: np.ndarray) -> float:
        """Return the probability of distress for one frame (float in [0, 1]).

        Args:
            audio: 1-D array of PCM samples at 8 kHz.
        """
        inputs = self.processor(audio, sampling_rate=8000, return_tensors="pt")
        with torch.no_grad():  # inference only; skip autograd bookkeeping
            logits = self.model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)
        # Index 1 assumed to be the distress class — TODO confirm label order
        # against the checkpoint's id2label mapping.
        return probs[0][1].item()

Hazard Detection

Acoustic Event Detection

def detect_hazards(audio: np.ndarray, spectral: dict) -> list[str]:
    """Scan one frame for specific background acoustic hazards.

    Args:
        audio: raw PCM samples for the frame.
        spectral: spectral feature dict (reads 'centroid' and 'flux').

    Returns:
        List of detected hazard tags like ['gunshot', 'glass_break'].
    """
    # (condition, tag) pairs evaluated in a fixed order; the lambdas defer
    # each detector until its entry is actually inspected.
    checks = (
        # Gunshot: loud impulse with a characteristic frequency signature.
        (lambda: is_impulsive(audio) and has_gunshot_spectrum(spectral),
         'gunshot'),
        # Glass breaking: bright (high-centroid), rapidly changing spectrum.
        (lambda: spectral['centroid'] > 4000 and spectral['flux'] > 1e6,
         'glass_break'),
        # Screaming: sustained high-pitched vocalisation.
        (lambda: has_scream_pattern(audio), 'background_screaming'),
        # Siren: periodic frequency modulation.
        (lambda: has_siren_pattern(audio), 'siren'),
    )
    return [tag for fired, tag in checks if fired()]

Integration with Emotion Agent

The distress score feeds into emotion classification:
from app.agents.emotion import analyze_emotion

# Audio track provides distress score
# NOTE(review): `audio_chunk` is assumed to be raw PCM bytes supplied by the
# surrounding pipeline — it is not defined in this snippet.
audio_result = analyze_audio_frame(audio_chunk)
distress = audio_result['distress_score']

# Emotion agent combines with transcript
# NOTE: `await` is only valid inside an async function (or a REPL with
# top-level await); this snippet is illustrative, not runnable as-is.
emotion = await analyze_emotion(
    transcript="I need help, someone's been shot!",
    distress=distress  # Audio distress enhances text analysis
)
The distress score is critical for detecting emergencies where the caller sounds calm (shock, dissociation) but the content is life-threatening.

Performance

Real-Time Processing

  • Frame Size: 20ms (160 samples at 8kHz)
  • Processing Time: < 5ms per frame
  • Latency: Near-zero (processes as audio arrives)

Resource Usage

# Efficient NumPy operations
import numpy as np

# Process 1 second of audio (8000 samples)
# NOTE(review): `pcm_data` is assumed to be defined upstream as raw 16-bit
# PCM bytes — it is not declared in this snippet.
audio = np.frombuffer(pcm_data, dtype=np.int16)
# Processing time: ~2-3ms on modern CPU

Testing

Unit Tests

import pytest
from app.agents.audio_track import analyze_audio_frame

def test_analyze_audio_frame():
    """Silence should yield a well-formed result within the documented contract."""
    # 20 ms of 16-bit silence at 8 kHz: 160 samples * 2 bytes each.
    silent_frame = bytes(320)

    result = analyze_audio_frame(silent_frame)

    # Contract: a distress score in [0, 1] plus a (possibly empty) hazard list.
    assert 'distress_score' in result
    assert 0.0 <= result['distress_score'] <= 1.0
    assert isinstance(result['hazards'], list)

def test_high_distress_audio():
    """A loud, high-pitched tone should score as high distress."""
    # Bug fix: this snippet used `np` without importing it anywhere in the
    # test file's shown imports; import locally so it is self-contained.
    import numpy as np

    # Generate loud, high-pitch audio (distress): 20ms at 8kHz.
    t = np.linspace(0, 0.02, 160)
    audio = (np.sin(2 * np.pi * 400 * t) * 10000).astype(np.int16)
    frame = audio.tobytes()

    result = analyze_audio_frame(frame)

    # Should detect high distress.
    # NOTE(review): this only passes against the full analyzer — the stub
    # implementation always returns 0.0 and would fail this assertion.
    assert result['distress_score'] > 0.5

Next Steps

NLP Track

Text analysis pipeline

Emotion Agent

How distress scores are used

Build docs developers (and LLMs) love