Skip to main content

Overview

Data processing utilities for audio analysis, feature extraction, and text vectorization:
  • Audio Processing - Recording, loading, and preprocessing audio data
  • MFCC Features - Mel-Frequency Cepstral Coefficients extraction
  • TF-IDF Vectorization - Text feature extraction for NLP
  • Dataset Preparation - Loading and validating training data

Audio Processing

Configuration

Standard audio parameters used across the system.
Audio Parameters
# Core Settings
SAMPLE_RATE = 16000   # 16 kHz (standard for speech)
DURATION = 5          # Audio clip length in seconds
CHANNELS = 1          # Mono audio

# File Paths (requires `import os`; replace BASE_PATH with the real project root)
BASE_PATH = "/path/to/project"
PATH_POSITIVE = os.path.join(BASE_PATH, "audios")  # positive (wake-word) clips
PATH_NEGATIVE = os.path.join(BASE_PATH, "audios bad")  # negative / background clips
MODEL_SAVE_PATH = os.path.join(BASE_PATH, "wake_word_model.h5")  # trained Keras model

Recording Audio

Capture live audio using sounddevice.
import sounddevice as sd
import numpy as np

# Record audio. sd.rec() returns immediately with an array of shape
# (frames, channels); sd.wait() blocks until capture completes.
# SAMPLE_RATE / DURATION / CHANNELS are the module-level constants above.
recording = sd.rec(
    int(DURATION * SAMPLE_RATE),  # Total frames
    samplerate=SAMPLE_RATE,        # Sample rate (Hz)
    channels=CHANNELS               # Mono
)
sd.wait()  # Wait until recording is finished

# Convert to 1D array (drops the channel axis for mono input)
audio_data = recording.flatten()
print(f"Recorded {len(audio_data)} samples")
print(f"Duration: {len(audio_data) / SAMPLE_RATE:.2f} seconds")
Parameters:
  • frames (int, required) — Number of audio frames to record (sample_rate × duration)
  • samplerate (int, default: 16000) — Audio sampling rate in Hz (16000 recommended for speech)
  • channels (int, default: 1) — Number of audio channels (1 for mono, 2 for stereo)

Loading Audio Files

Load audio files using librosa with automatic resampling.
import librosa

# Load audio file. librosa.load() decodes, converts to mono float32,
# and resamples to the requested rate in one call.
audio, sr = librosa.load(
    'audio_file.wav',
    sr=SAMPLE_RATE,      # Resample to target rate
    duration=DURATION     # Load only first N seconds
)

# sr echoes the rate actually used after resampling.
print(f"Audio shape: {audio.shape}")
print(f"Sample rate: {sr} Hz")
print(f"Duration: {len(audio) / sr:.2f} seconds")
Parameters:
  • path (string, required) — Path to audio file (.wav, .mp3, .flac, etc.)
  • sr (int, default: 22050) — Target sampling rate (resamples if different from source)
  • duration (float, optional) — Maximum duration to load in seconds (loads entire file if None)

Returns:
  • audio (numpy.ndarray) — Audio time series as 1D array
  • sr (int) — Actual sampling rate of loaded audio

MFCC Feature Extraction

Mel-Frequency Cepstral Coefficients (MFCCs) are the primary audio features used for machine learning.

extract_features()

Extracts MFCC features with automatic length normalization.
audio_data
numpy.ndarray
required
Raw audio signal as 1D numpy array
mfcc
numpy.ndarray
MFCC feature matrix with shape (time_steps, n_mfcc)
  • Automatically normalized to fixed length
  • Transposed for LSTM input format
import librosa
import numpy as np

def extract_features(audio_data, sample_rate=16000, duration=5, n_mfcc=13,
                     n_fft=2048, hop_length=512):
    """Extract MFCC features from a raw audio signal.

    The signal is first padded or truncated to exactly
    ``sample_rate * duration`` samples so every clip produces a feature
    matrix of the same shape.

    Args:
        audio_data: Raw audio signal as a 1D numpy array.
        sample_rate: Sampling rate of ``audio_data`` in Hz.
        duration: Target clip length in seconds.
        n_mfcc: Number of MFCC coefficients per frame (13 is standard).
        n_fft: FFT window size for spectral analysis (librosa default).
        hop_length: Samples between successive frames (librosa default).

    Returns:
        numpy.ndarray of shape (time_steps, n_mfcc), transposed so the
        time axis comes first (the layout LSTM-style models expect).
    """
    target_len = int(sample_rate * duration)

    if len(audio_data) < target_len:
        # Pad with trailing zeros so short clips reach the target length.
        audio_data = np.pad(audio_data, (0, target_len - len(audio_data)))
    else:
        # Truncate long clips to the target length.
        audio_data = audio_data[:target_len]

    # n_fft/hop_length are now explicit parameters (previously the
    # documented values were hard-wired inside librosa's defaults).
    mfcc = librosa.feature.mfcc(
        y=audio_data,
        sr=sample_rate,
        n_mfcc=n_mfcc,
        n_fft=n_fft,
        hop_length=hop_length
    )

    return mfcc.T  # Transpose to (time_steps, n_mfcc)

# Example: default n_mfcc=13 yields a (time_steps, 13) matrix.
audio, sr = librosa.load('sample.wav', sr=16000)
features = extract_features(audio)
print(f"Feature shape: {features.shape}")  # (time_steps, 13)
Parameters:
n_mfcc
int
default:13
Number of MFCC coefficients to extract (13 is standard)
n_fft
int
default:2048
FFT window size for spectral analysis
hop_length
int
default:512
Number of samples between successive frames

Visualization

Visualize MFCC features for analysis and debugging.
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# Load and extract features (same 16 kHz / 13-coefficient setup as training)
audio, sr = librosa.load('sample.wav', sr=16000)
mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

# Create visualization: rows = MFCC coefficients, columns = time frames.
# hop_length must match the extraction hop for a correct time axis.
plt.figure(figsize=(12, 6))
librosa.display.specshow(
    mfcc,
    x_axis='time',
    sr=sr,
    hop_length=512
)
plt.colorbar(format='%+2.0f dB')
plt.title('MFCC Features')
plt.xlabel('Time')
plt.ylabel('MFCC Coefficient')
plt.tight_layout()
plt.savefig('mfcc_visualization.png')

Dataset Preparation

prepare_dataset()

Loads and prepares complete training dataset from audio files.
return
tuple
Returns (X: np.ndarray, y: np.ndarray)
  • X: Feature arrays, shape (n_samples, time_steps, n_mfcc)
  • y: Binary labels (1 for positive, 0 for negative)
import os
import numpy as np
import librosa

def prepare_dataset(
    path_positive='audios',
    path_negative='audios bad',
    sample_rate=16000,
    duration=5,
    n_mfcc=13
):
    """Build a labeled MFCC dataset from two directories of .wav files.

    Args:
        path_positive: Directory of wake-word recordings (label 1).
        path_negative: Directory of background/other recordings (label 0).
        sample_rate: Target sampling rate passed to librosa.load().
        duration: Maximum clip length in seconds to load per file.
        n_mfcc: Number of MFCC coefficients per frame.

    Returns:
        Tuple (X, y):
            X: np.ndarray of shape (n_samples, time_steps, n_mfcc)
            y: np.ndarray of binary labels (1 positive, 0 negative)
    """
    X = []
    y = []

    # Each directory maps to one class label.
    for path, label in [(path_positive, 1), (path_negative, 0)]:
        print(f"Loading {'positive' if label == 1 else 'negative'} samples...")

        if not os.path.exists(path):
            print(f"Warning: {path} does not exist")
            continue

        for filename in os.listdir(path):
            # Accept .wav regardless of filename case (e.g. SAMPLE.WAV).
            if not filename.lower().endswith('.wav'):
                continue

            filepath = os.path.join(path, filename)

            try:
                # Load audio
                audio, _ = librosa.load(
                    filepath,
                    sr=sample_rate,
                    duration=duration
                )

                # Forward the dataset parameters so they are not silently
                # ignored (previously extract_features always ran with its
                # own defaults regardless of what the caller asked for).
                features = extract_features(
                    audio,
                    sample_rate=sample_rate,
                    duration=duration,
                    n_mfcc=n_mfcc
                )

                X.append(features)
                y.append(label)

            except Exception as e:
                # Skip unreadable/corrupt files but report exactly which one
                # failed (the original printed a literal "(unknown)").
                print(f"Error processing {filepath}: {e}")

    return np.array(X), np.array(y)

# Usage: X_train is (n_samples, time_steps, n_mfcc); the unique/counts
# pair shows the class balance between labels 0 and 1.
X_train, y_train = prepare_dataset()
print(f"Dataset shape: {X_train.shape}")
print(f"Labels: {np.unique(y_train, return_counts=True)}")
Directory Structure:
project/
├── audios/              # Positive samples
│   ├── wake_word_01.wav
│   ├── wake_word_02.wav
│   └── ...
├── audios bad/          # Negative samples
│   ├── background_01.wav
│   ├── other_speech_01.wav
│   └── ...
└── wake_word_model.h5   # Trained model

TF-IDF Vectorization

Text feature extraction for intent classification.

TfidfVectorizer Configuration

Configuration used in the intent classification pipeline.
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer used by the intent-classification pipeline. stop_words=None
# keeps every word because the corpus is Spanish (sklearn only ships an
# English stop list).
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),    # Use unigrams and bigrams
    lowercase=True,         # Convert text to lowercase
    stop_words=None,        # Keep all words (Spanish)
    max_features=None,      # No feature limit
    min_df=1,              # Minimum document frequency
    max_df=1.0             # Maximum document frequency
)

# Fit on training data (learns vocabulary + IDF weights)
texts = [
    "abre netflix",
    "busca videos de gatos",
    "apaga la televisión"
]
vectorizer.fit(texts)

# Transform new text into a sparse (1, vocab_size) TF-IDF row
features = vectorizer.transform(["abre youtube"])
print(f"Feature shape: {features.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
ngram_range
tuple
default:"(1, 2)"
Range of n-gram sizes. (1, 2) uses unigrams and bigrams
lowercase
bool
default:true
Convert all text to lowercase before tokenization
max_features
int
Maximum number of features (vocabulary size). None means unlimited
min_df
int|float
default:1
Minimum document frequency. Ignore terms below this threshold

Feature Analysis

Analyze TF-IDF features and vocabulary.
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Create and fit vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
texts = [
    "abre netflix",
    "busca videos en youtube",
    "apaga la televisión"
]
vectorizer.fit(texts)

# Get vocabulary
vocab = vectorizer.vocabulary_
print(f"Vocabulary size: {len(vocab)}")
print("\nTop 10 features:")
for term, idx in sorted(vocab.items(), key=lambda x: x[1])[:10]:
    print(f"  {term}: {idx}")

# Get IDF values
idf_values = vectorizer.idf_
feature_names = vectorizer.get_feature_names_out()

idf_df = pd.DataFrame({
    'feature': feature_names,
    'idf': idf_values
}).sort_values('idf', ascending=False)

print("\nTop 10 IDF scores:")
print(idf_df.head(10))

Text Processing

JSON Dataset Loading

Load and validate intent classification datasets.
import json
import sys

def cargar_datos(ruta_archivo):
    """Read and parse a JSON dataset file.

    Prints a diagnostic and terminates the process with exit status 1
    when the file is missing or is not valid JSON; otherwise returns
    the parsed data structure.
    """
    try:
        with open(ruta_archivo, 'r', encoding='utf-8') as archivo:
            contenido = json.load(archivo)
    except FileNotFoundError:
        print(f"Error: File not found at {ruta_archivo}")
        sys.exit(1)
    except json.JSONDecodeError:
        print("Error: Invalid JSON format")
        sys.exit(1)
    return contenido

# Usage: exits the process if dataset.json is missing or malformed.
data = cargar_datos('dataset.json')
print(f"Loaded {len(data)} examples")
Dataset Format:
[
  {
    "text": "abre netflix",
    "intent": "open_app"
  },
  {
    "text": "pon música",
    "intent": "play_media"
  },
  {
    "text": "apaga la televisión",
    "intent": "power_off"
  }
]

Text Cleaning

Clean and normalize text for audio and display.
import re

def limpiar_texto_para_audio(texto):
    """Normalize raw assistant output so it can be spoken aloud.

    Strips embedded /*...*/ command tags, markdown markers, and URLs,
    then collapses all whitespace runs into single spaces.
    """
    # Drop /*...*/ command tags (non-greedy: one tag at a time).
    limpio = re.sub(r'/\*.*?\*/', '', texto)

    # Remove markdown markers in one pass; underscores become spaces.
    limpio = limpio.translate(str.maketrans({'*': '', '#': '', '_': ' '}))

    # Remove URLs (everything from "http" up to the next whitespace).
    limpio = re.sub(r'http\S+', '', limpio)

    # Collapse internal whitespace and trim the ends.
    return " ".join(limpio.split())

# Example: the command tag and the URL are stripped, whitespace collapsed.
raw_text = "/*app(netflix)*/ Abriendo Netflix para ti, Rosario. https://netflix.com"
cleaned = limpiar_texto_para_audio(raw_text)
print(cleaned)  # "Abriendo Netflix para ti, Rosario."

Performance Tips

Audio Processing

  • Use 16kHz sample rate for speech
  • Keep duration consistent (5 sec)
  • Normalize audio length before feature extraction
  • Use mono audio for efficiency

MFCC Extraction

  • Standard: 13 MFCC coefficients
  • Add delta (Δ) features to capture temporal dynamics
  • Use consistent hop_length (512)
  • Cache extracted features when possible

TF-IDF

  • Use bigrams for better context
  • Set max_features to limit vocabulary
  • Remove rare terms with min_df
  • Enable sublinear_tf for large datasets

Datasets

  • Validate data before training
  • Balance positive/negative samples
  • Use data augmentation for audio
  • Save processed features to disk

Common Patterns

import sounddevice as sd
import numpy as np
import tensorflow as tf

# 1. Record 5 seconds of mono audio at 16 kHz (blocks until done)
audio = sd.rec(
    int(5 * 16000),
    samplerate=16000,
    channels=1
)
sd.wait()

# 2. Extract features — flatten() drops the channel axis first
features = extract_features(audio.flatten())

# 3. Prepare for model: add a batch dimension -> (1, time_steps, n_mfcc)
features = np.expand_dims(features, axis=0)

# 4. Predict — model outputs a single probability per sample
model = tf.keras.models.load_model('model.h5')
prediction = model.predict(features)[0][0]

# 5. Decision: 0.8 threshold trades recall for fewer false triggers
if prediction > 0.8:
    print("Wake word detected!")
import os
import librosa
import numpy as np

def batch_process_audio(directory, output_file):
    """Extract MFCC features for every .wav file in *directory* and
    persist them, together with the source filenames, as an .npz file."""
    features_list = []
    filenames = []

    # Only plain .wav files are processed; everything else is ignored.
    wav_names = [name for name in os.listdir(directory) if name.endswith('.wav')]

    for name in wav_names:
        audio, sr = librosa.load(os.path.join(directory, name), sr=16000)
        features_list.append(extract_features(audio))
        filenames.append(name)

    # Bundle features and filenames into a single archive on disk.
    np.savez(
        output_file,
        features=np.array(features_list),
        filenames=filenames
    )
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# Create pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        ngram_range=(1, 2),
        lowercase=True
    )),
    ('clf', SVC(kernel='linear', probability=True))
])

# Train
texts = ["abre netflix", "busca videos", "apaga tv"]
labels = ["open_app", "search", "power_off"]
pipeline.fit(texts, labels)

# Predict
new_text = ["abre youtube"]
intent = pipeline.predict(new_text)[0]
confidence = max(pipeline.predict_proba(new_text)[0])

print(f"Intent: {intent} ({confidence:.2%})")

Data Normalization

DataNormalizer Class

Transform multiple data formats into standardized text for model training. Source: ~/workspace/source/proyectos/ai creator/kamutini/info.py
Class Definition
import json
import os

class DataNormalizer:
    """
    Transform JSON/CSV data into standardized training format.
    Supports multiple input schemas.
    """
    def __init__(self, eos_token="<|endoftext|>"):
        # Token appended to each normalized entry to mark sequence end.
        self.eos_token = eos_token
eos_token
string
default:"<|endoftext|>"
End-of-sequence token appended to normalized text

normalize_entry()

Convert a data entry to standard format.
Method Signature
def normalize_entry(self, data: dict) -> str:
    """
    Detect input format and convert to:
    "### Humano: <question>### Asistente: <answer>"

    Args:
        data: One raw dataset entry; its schema is auto-detected.

    Returns:
        Normalized text string with EOS token
    """
Supported Formats:
{
  "text": "### Humano: ¿Cómo estás?### Asistente: Bien, gracias."
}

load_specific_datasets()

Load and normalize multiple dataset files.
Usage Example
normalizer = DataNormalizer()

# Load specific files (each is normalized then concatenated)
file_paths = [
    "datasets/conversations.json",
    "datasets/qa_pairs.json"
]

normalized_text = normalizer.load_specific_datasets(file_paths)
# NOTE(review): splitting on the EOS token yields one extra (possibly
# empty) trailing piece, so this count may be off by one — verify.
print(f"Loaded {len(normalized_text.split(normalizer.eos_token))} examples")
file_paths
list[str]
List of JSON file paths to process
Returns: Combined normalized text with entries separated by \n\n
The normalizer handles both single JSON objects and arrays, as well as JSONL format (one JSON object per line).

Integration with Model Training

Used by the PyTorch model trainer to prepare datasets.
Integration Example
from info import DataNormalizer

class LocalOptimizedDataset(Dataset):
    # Dataset base class is presumably torch.utils.data.Dataset — confirm
    # against the trainer's imports.
    def __init__(self, directory_path, block_size):
        """Collect and normalize every dataset file under *directory_path*.

        Args:
            directory_path: Folder containing .json/.csv dataset files.
            block_size: Tokenizer block length (used later; not consumed here).
        """
        # Initialize normalizer
        self.normalizer = DataNormalizer(eos_token="<|endoftext|>")
        
        formatted_lines = []
        files = [f for f in os.listdir(directory_path) 
                if f.endswith(('.json', '.csv'))]
        
        # NOTE(review): .csv files are selected above but parsed with
        # json.load below — real CSV content would raise here; confirm.
        for filename in files:
            filepath = os.path.join(directory_path, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                raw_data = json.load(f)
                # Wrap single objects so both objects and arrays normalize.
                if not isinstance(raw_data, list):
                    raw_data = [raw_data]
                
                for item in raw_data:
                    norm_text = self.normalizer.normalize_entry(item)
                    if norm_text:
                        formatted_lines.append(norm_text)
        
        # Entries are joined with blank lines before tokenization.
        text_data = "\n\n".join(formatted_lines)
        # Continue with tokenization...
Use DataNormalizer when training models on heterogeneous datasets with different JSON schemas. It standardizes the format automatically.

Build docs developers (and LLMs) love