Overview
The Audio Track analyzes raw audio streams to detect distress signals, voice characteristics, and acoustic hazards independently of speech content.
Source Code
Location: app/agents/audio_track.py
def analyze_audio_frame(frame: bytes) -> dict:
    """Placeholder analyzer: returns a neutral result for any input frame.

    Args:
        frame: Raw audio bytes (ignored by this stub).

    Returns:
        dict with a zero distress_score and an empty hazards list.
    """
    return {"distress_score": 0.0, "hazards": []}
Current implementation is a stub that returns placeholder values. Production version would analyze audio features like pitch, volume, speech rate, and voice quality.
Distress Detection
Audio Features
The audio analyzer would extract:
Pitch Analysis : High pitch indicates stress
Volume Dynamics : Sudden changes suggest urgency
Speech Rate : Rapid speech correlates with panic
Voice Quality : Trembling, crying, shouting
Background Sounds : Sirens, screams, crashes
Distress Score
Output is a float between 0.0 and 1.0:
{
    "distress_score": 0.75,  # 0.0 = calm, 1.0 = extreme distress
    "hazards": ["background_screaming", "loud_crash"]
}
Implementation Pattern
Frame-by-Frame Analysis
import numpy as np
from scipy import signal
def analyze_audio_frame(frame: bytes) -> dict:
    """
    Analyze a single audio frame for distress indicators.

    Args:
        frame: Raw PCM audio bytes (typically 20-100ms worth)

    Returns:
        dict with distress_score (float) and hazards (list[str])
    """
    # Interpret the raw bytes as 16-bit signed PCM samples.
    samples = np.frombuffer(frame, dtype=np.int16)

    # Extract the low-level features the scorer consumes.
    loudness = compute_rms_volume(samples)
    fundamental = estimate_pitch(samples)
    spectral = compute_spectral_features(samples)

    # Fold the features into a single 0-1 distress score.
    score = aggregate_distress(
        volume=loudness,
        pitch=fundamental,
        spectral_flux=spectral['flux'],
    )

    # Scan for discrete acoustic hazard events alongside the score.
    return {
        "distress_score": float(score),
        "hazards": detect_hazards(samples, spectral),
    }
Volume (RMS)
def compute_rms_volume(audio: np.ndarray) -> float:
    """Root-mean-square amplitude of the frame (0 for pure silence)."""
    samples = audio.astype(float)
    return np.sqrt(np.square(samples).mean())
Pitch Estimation
def estimate_pitch(audio: np.ndarray, sample_rate: int = 8000) -> float:
    """Fundamental frequency estimation using autocorrelation.

    Args:
        audio: 1-D array of PCM samples (any numeric dtype).
        sample_rate: Sampling rate in Hz; defaults to 8 kHz telephony audio.

    Returns:
        Estimated fundamental frequency in Hz, or 0.0 when no
        periodicity is found (e.g. silence or aperiodic noise).
    """
    # BUG FIX: cast to float64 before correlating. int16 * int16 products
    # overflow the integer dtype (10000^2 >> 32767), corrupting the
    # autocorrelation and therefore the peak search.
    samples = audio.astype(np.float64)
    autocorr = np.correlate(samples, samples, mode='full')
    # Keep non-negative lags only (autocorrelation is symmetric).
    autocorr = autocorr[len(autocorr) // 2:]
    # The first peak after zero lag marks the fundamental period.
    peaks = signal.find_peaks(autocorr)[0]
    if len(peaks) == 0:
        return 0.0
    period = peaks[0]
    return sample_rate / period if period > 0 else 0.0
Spectral Features
def compute_spectral_features(audio: np.ndarray) -> dict:
    """Frequency-domain features of one frame.

    Args:
        audio: 1-D array of PCM samples.

    Returns:
        dict with:
            'flux':     sum of squared bin-to-bin magnitude differences
                        (a proxy for spectral change)
            'centroid': magnitude-weighted mean bin index
            'rolloff':  first bin index reaching 85% of cumulative magnitude
    """
    magnitude = np.abs(np.fft.rfft(audio))
    total = np.sum(magnitude)
    # BUG FIX: a silent frame has zero total magnitude, which made the
    # centroid 0/0 = NaN. Return neutral values instead.
    if total == 0:
        return {'flux': 0.0, 'centroid': 0.0, 'rolloff': 0}
    return {
        'flux': np.sum(np.diff(magnitude) ** 2),  # Spectral change
        'centroid': np.sum(magnitude * np.arange(len(magnitude))) / total,
        'rolloff': np.where(np.cumsum(magnitude) >= 0.85 * total)[0][0],
    }
Distress Aggregation
Rule-Based Scoring
def aggregate_distress(volume: float, pitch: float, spectral_flux: float) -> float:
    """
    Combine audio features into distress score.

    High distress indicators:
    - High volume (shouting)
    - High pitch (stress)
    - High spectral flux (voice trembling)
    """
    # Each feature is squashed into [0, 1] against an empirical ceiling,
    # then combined with a fixed weight.
    weighted = (
        (0.4, volume / 10000),
        (0.3, pitch / 400),  # 400 Hz is high for stressed speech
        (0.3, spectral_flux / 1e6),
    )
    score = sum(weight * min(value, 1.0) for weight, value in weighted)
    # Clamp to the documented 0.0-1.0 output range.
    return min(max(score, 0.0), 1.0)
ML-Based Scoring
Production systems might use trained models:
import torch
from transformers import Wav2Vec2ForSequenceClassification
class DistressClassifier:
    """Wav2Vec2-based binary classifier: calm vs. distressed speech."""

    def __init__(self):
        self.model = Wav2Vec2ForSequenceClassification.from_pretrained(
            "distress-detector-v1"
        )
        # BUG FIX: predict() reads self.processor, which was never
        # initialized, so the first call raised AttributeError. Load the
        # feature extractor that matches the checkpoint. Local import so
        # this snippet only needs the extra name when instantiated.
        from transformers import Wav2Vec2FeatureExtractor
        self.processor = Wav2Vec2FeatureExtractor.from_pretrained(
            "distress-detector-v1"
        )

    def predict(self, audio: np.ndarray) -> float:
        """Return the probability that `audio` contains distressed speech.

        Args:
            audio: 1-D waveform sampled at 8 kHz.

        Returns:
            Probability of the distress class, in [0, 1].
        """
        inputs = self.processor(audio, sampling_rate=8000, return_tensors="pt")
        logits = self.model(**inputs).logits
        probs = torch.softmax(logits, dim=-1)
        return probs[0][1].item()  # Probability of distress class
Hazard Detection
Acoustic Event Detection
def detect_hazards(audio: np.ndarray, spectral: dict) -> list[str]:
    """
    Detect specific acoustic hazards in background.

    Returns:
        List of detected hazard tags like ['gunshot', 'glass_break']
    """
    # Tag -> lazy predicate table. Lambdas preserve the original
    # evaluation order and short-circuiting of each condition.
    checks = (
        # Gunshot: loud impulse with specific frequency signature
        ('gunshot', lambda: is_impulsive(audio) and has_gunshot_spectrum(spectral)),
        # Glass breaking: high-frequency shattering sound
        ('glass_break', lambda: spectral['centroid'] > 4000 and spectral['flux'] > 1e6),
        # Screaming: high-pitch sustained sound
        ('background_screaming', lambda: has_scream_pattern(audio)),
        # Siren: periodic frequency modulation
        ('siren', lambda: has_siren_pattern(audio)),
    )
    return [tag for tag, fires in checks if fires()]
Integration with Emotion Agent
The distress score feeds into emotion classification:
from app.agents.emotion import analyze_emotion

# Audio track provides distress score
audio_result = analyze_audio_frame(audio_chunk)
distress = audio_result['distress_score']

# Emotion agent combines with transcript.
# NOTE(review): analyze_emotion is awaited, so this snippet must run
# inside an async function / event loop — confirm against the caller.
emotion = await analyze_emotion(
    transcript="I need help, someone's been shot!",
    distress=distress  # Audio distress enhances text analysis
)
The distress score is critical for detecting emergencies where the caller sounds calm (shock, dissociation) but the content is life-threatening.
Real-Time Processing
Frame Size : 20ms (160 samples at 8kHz)
Processing Time : < 5ms per frame
Latency : Near-zero (processes as audio arrives)
Resource Usage
# Efficient NumPy operations
import numpy as np

# Process 1 second of audio (8000 samples).
# NOTE(review): `pcm_data` is not defined in this snippet — presumably
# 16000 bytes of 16-bit PCM captured elsewhere; verify against caller.
audio = np.frombuffer(pcm_data, dtype=np.int16)
# Processing time: ~2-3ms on modern CPU
Testing
Unit Tests
import pytest
from app.agents.audio_track import analyze_audio_frame
def test_analyze_audio_frame():
    """A silent frame must still yield a well-formed result dict."""
    # 20ms of silence at 8kHz, 16-bit PCM: 160 samples * 2 bytes.
    silence = b'\x00' * 320
    result = analyze_audio_frame(silence)
    assert 'distress_score' in result
    assert 0.0 <= result['distress_score'] <= 1.0
    assert isinstance(result['hazards'], list)
def test_high_distress_audio():
    """A loud, high-pitched tone should score above the distress midpoint."""
    # BUG FIX: this snippet used `np` without importing it anywhere in the
    # test file; import locally so the example is self-contained.
    import numpy as np

    # 20ms of a 400 Hz tone at high amplitude (loud + high pitch).
    t = np.linspace(0, 0.02, 160)
    audio = (np.sin(2 * np.pi * 400 * t) * 10000).astype(np.int16)
    frame = audio.tobytes()

    result = analyze_audio_frame(frame)
    # Should detect high distress
    assert result['distress_score'] > 0.5
Next Steps
NLP Track Text analysis pipeline
Emotion Agent How distress scores are used