Overview
Data processing utilities for audio analysis, feature extraction, and text vectorization:
Audio Processing - Recording, loading, and preprocessing audio data
MFCC Features - Mel-Frequency Cepstral Coefficients extraction
TF-IDF Vectorization - Text feature extraction for NLP
Dataset Preparation - Loading and validating training data
Audio Processing
Configuration
Standard audio parameters used across the system.
# Core audio settings shared across the system.
import os  # required for the path constants below (missing in the original snippet)

SAMPLE_RATE = 16000  # 16 kHz (standard for speech recognition)
DURATION = 5         # audio clip length in seconds
CHANNELS = 1         # mono audio

# File paths
BASE_PATH = "/path/to/project"
PATH_POSITIVE = os.path.join(BASE_PATH, "audios")       # wake-word (positive) samples
PATH_NEGATIVE = os.path.join(BASE_PATH, "audios bad")   # negative samples
MODEL_SAVE_PATH = os.path.join(BASE_PATH, "wake_word_model.h5")
Recording Audio
Capture live audio using sounddevice.
Real-time Recording
Continuous Recording Loop
import sounddevice as sd
import numpy as np

# Record a fixed-length clip. sd.rec() returns immediately with a buffer
# that fills in the background, so we must wait before reading it.
recording = sd.rec(
    int(DURATION * SAMPLE_RATE),  # total frames to capture
    samplerate=SAMPLE_RATE,       # sampling rate in Hz
    channels=CHANNELS,            # 1 = mono
)
sd.wait()  # block until the recording is finished

# sd.rec() yields shape (frames, channels); flatten to 1-D for mono audio.
audio_data = recording.flatten()
print(f"Recorded {len(audio_data)} samples")
print(f"Duration: {len(audio_data) / SAMPLE_RATE:.2f} seconds")
Number of audio frames to record (sample_rate × duration)
Audio sampling rate in Hz (16000 recommended for speech)
Number of audio channels (1 for mono, 2 for stereo)
Loading Audio Files
Load audio files using librosa with automatic resampling.
Basic Loading
Batch Loading
import librosa

# Load an audio file, resampling to the project rate and truncating
# to the configured clip length.
audio, sr = librosa.load(
    'audio_file.wav',
    sr=SAMPLE_RATE,     # resample to the target rate
    duration=DURATION,  # load only the first N seconds
)
print(f"Audio shape: {audio.shape}")
print(f"Sample rate: {sr} Hz")
print(f"Duration: {len(audio) / sr:.2f} seconds")
Path to audio file (.wav, .mp3, .flac, etc.)
Target sampling rate (resamples if different from source)
Maximum duration to load in seconds (loads entire file if None)
Audio time series as 1D array
Actual sampling rate of loaded audio
Mel-Frequency Cepstral Coefficients (MFCCs) are the primary audio features used for machine learning.
Extracts MFCC features with automatic length normalization.
Raw audio signal as 1D numpy array
MFCC feature matrix with shape (time_steps, n_mfcc)
Automatically normalized to fixed length
Transposed for LSTM input format
Basic Usage
Advanced Configuration
import librosa
import numpy as np


def extract_features(audio_data, sample_rate=16000, duration=5, n_mfcc=13):
    """Extract MFCC features from a raw audio signal.

    Args:
        audio_data: Raw audio signal as a 1-D numpy array.
        sample_rate: Sampling rate of ``audio_data`` in Hz (16 kHz is
            standard for speech).
        duration: Target clip length in seconds; the signal is padded or
            truncated to exactly ``sample_rate * duration`` samples so
            every clip yields the same number of MFCC frames.
        n_mfcc: Number of MFCC coefficients per frame (13 is standard).

    Returns:
        MFCC feature matrix of shape (time_steps, n_mfcc) — transposed
        so the time axis comes first, the layout LSTMs expect.
    """
    # Normalize the signal to a fixed length before feature extraction.
    target_len = int(sample_rate * duration)
    if len(audio_data) < target_len:
        # Zero-pad short clips at the end.
        audio_data = np.pad(audio_data, (0, target_len - len(audio_data)))
    else:
        # Truncate clips that are too long.
        audio_data = audio_data[:target_len]

    mfcc = librosa.feature.mfcc(y=audio_data, sr=sample_rate, n_mfcc=n_mfcc)
    return mfcc.T  # transpose to (time_steps, n_mfcc)


# Example
audio, sr = librosa.load('sample.wav', sr=16000)
features = extract_features(audio)
print(f"Feature shape: {features.shape}")  # (time_steps, 13)
Parameters:
Number of MFCC coefficients to extract (13 is standard)
FFT window size for spectral analysis
Number of samples between successive frames
Visualization
Visualize MFCC features for analysis and debugging.
Plot MFCCs
Compare Audio Samples
import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np

# Load audio and compute MFCCs for inspection.
audio, sr = librosa.load('sample.wav', sr=16000)
mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=13)

# Render the MFCC matrix as a time vs. coefficient heatmap.
plt.figure(figsize=(12, 6))
librosa.display.specshow(
    mfcc,
    x_axis='time',
    sr=sr,
    hop_length=512,  # must match the hop length used during extraction
)
plt.colorbar(format='%+2.0f dB')
plt.title('MFCC Features')
plt.xlabel('Time')
plt.ylabel('MFCC Coefficient')
plt.tight_layout()
plt.savefig('mfcc_visualization.png')
Dataset Preparation
prepare_dataset()
Loads and prepares complete training dataset from audio files.
Returns (X: np.ndarray, y: np.ndarray)
X: Feature arrays, shape (n_samples, time_steps, n_mfcc)
y: Binary labels (1 for positive, 0 for negative)
Implementation
With Data Augmentation
import os
import numpy as np
import librosa


def prepare_dataset(
    path_positive='audios',
    path_negative='audios bad',
    sample_rate=16000,
    duration=5,
    n_mfcc=13,
):
    """Build (X, y) training arrays from directories of .wav files.

    Args:
        path_positive: Directory of wake-word samples (label 1).
        path_negative: Directory of non-wake-word samples (label 0).
        sample_rate: Target sampling rate used for loading and feature
            extraction.
        duration: Clip length in seconds.
        n_mfcc: Number of MFCC coefficients per frame.

    Returns:
        Tuple (X, y): X with shape (n_samples, time_steps, n_mfcc) and
        y with binary labels (1 positive, 0 negative).
    """
    X = []
    y = []
    for path, label in [(path_positive, 1), (path_negative, 0)]:
        print(f"Loading {'positive' if label == 1 else 'negative'} samples...")
        if not os.path.exists(path):
            # Missing directory is non-fatal: report and move on.
            print(f"Warning: {path} does not exist")
            continue
        for filename in os.listdir(path):
            if filename.endswith('.wav'):
                filepath = os.path.join(path, filename)
                try:
                    audio, _ = librosa.load(
                        filepath,
                        sr=sample_rate,
                        duration=duration,
                    )
                    # BUG FIX: forward the configured parameters instead of
                    # silently falling back to extract_features' defaults.
                    features = extract_features(
                        audio,
                        sample_rate=sample_rate,
                        duration=duration,
                        n_mfcc=n_mfcc,
                    )
                    X.append(features)
                    y.append(label)
                except Exception as e:
                    # Skip unreadable/corrupt files but keep processing.
                    print(f"Error processing {filename}: {e}")
    return np.array(X), np.array(y)


# Usage
X_train, y_train = prepare_dataset()
print(f"Dataset shape: {X_train.shape}")
print(f"Labels: {np.unique(y_train, return_counts=True)}")
Directory Structure:
project/
├── audios/ # Positive samples
│ ├── wake_word_01.wav
│ ├── wake_word_02.wav
│ └── ...
├── audios bad/ # Negative samples
│ ├── background_01.wav
│ ├── other_speech_01.wav
│ └── ...
└── wake_word_model.h5 # Trained model
TF-IDF Vectorization
Text feature extraction for intent classification.
TfidfVectorizer Configuration
Configuration used in the intent classification pipeline.
Standard Configuration
Advanced Configuration
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer configuration used by the intent-classification pipeline.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),   # use unigrams and bigrams
    lowercase=True,       # normalize case before tokenizing
    stop_words=None,      # keep all words (Spanish corpus)
    max_features=None,    # unbounded vocabulary
    min_df=1,             # minimum document frequency
    max_df=1.0,           # maximum document frequency
)

# Fit on training data
texts = [
    "abre netflix",
    "busca videos de gatos",
    "apaga la televisión",
]
vectorizer.fit(texts)

# Transform new text
features = vectorizer.transform(["abre youtube"])
print(f"Feature shape: {features.shape}")
print(f"Vocabulary size: {len(vectorizer.vocabulary_)}")
Range of n-gram sizes. (1, 2) uses unigrams and bigrams
Convert all text to lowercase before tokenization
Maximum number of features (vocabulary size). None means unlimited
Minimum document frequency. Ignore terms below this threshold
Feature Analysis
Analyze TF-IDF features and vocabulary.
Inspect Vocabulary
Feature Importance
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Fit a vectorizer on a small corpus so we can inspect its vocabulary.
vectorizer = TfidfVectorizer(ngram_range=(1, 2))
texts = [
    "abre netflix",
    "busca videos en youtube",
    "apaga la televisión",
]
vectorizer.fit(texts)

# vocabulary_ maps each term/ngram to its feature-column index.
vocab = vectorizer.vocabulary_
print(f"Vocabulary size: {len(vocab)}")
print("\nTop 10 features:")
for term, idx in sorted(vocab.items(), key=lambda x: x[1])[:10]:
    print(f"  {term}: {idx}")

# IDF scores: higher means the term is rarer across documents.
idf_values = vectorizer.idf_
feature_names = vectorizer.get_feature_names_out()
idf_df = pd.DataFrame({
    'feature': feature_names,
    'idf': idf_values,
}).sort_values('idf', ascending=False)
print("\nTop 10 IDF scores:")
print(idf_df.head(10))
Text Processing
JSON Dataset Loading
Load and validate intent classification datasets.
Basic Loading
With Validation
import json
import sys
def cargar_datos(ruta_archivo):
    """Load an intent-classification dataset from a JSON file.

    Args:
        ruta_archivo: Path to the JSON file (read as UTF-8).

    Returns:
        The parsed JSON content (typically a list of example dicts).

    Exits the process with status 1 if the file is missing or the JSON
    is malformed, since training cannot proceed without data.
    """
    try:
        with open(ruta_archivo, 'r', encoding='utf-8') as f:
            data = json.load(f)
        return data
    except FileNotFoundError:
        print(f"Error: File not found at {ruta_archivo}")
        sys.exit(1)
    except json.JSONDecodeError:
        print("Error: Invalid JSON format")
        sys.exit(1)
# Usage: load the dataset and report how many examples it contains.
data = cargar_datos('dataset.json')
print(f"Loaded {len(data)} examples")
Dataset Format:
[
{
"text" : "abre netflix" ,
"intent" : "open_app"
},
{
"text" : "pon música" ,
"intent" : "play_media"
},
{
"text" : "apaga la televisión" ,
"intent" : "power_off"
}
]
Text Cleaning
Clean and normalize text for audio and display.
Audio Text Cleaning
Intent Text Preprocessing
import re


def limpiar_texto_para_audio(texto):
    """Clean raw model output so it can be sent to text-to-speech.

    Strips embedded /*command(...)*/ tags, markdown characters, and
    URLs, then collapses whitespace.

    Args:
        texto: Raw text string, possibly containing tags and markup.

    Returns:
        Cleaned, speakable text with single-space-separated words.
    """
    # Remove /*command(...)*/ control tags (non-greedy across the tag body).
    texto = re.sub(r'/\*.*?\*/', '', texto)
    # Strip markdown formatting characters; underscores become spaces.
    texto = texto.replace('*', '').replace('#', '').replace('_', ' ')
    # Remove URLs (http/https up to the next whitespace).
    texto = re.sub(r'http\S+', '', texto)
    # Collapse runs of whitespace into single spaces and trim the ends.
    texto = " ".join(texto.split())
    return texto


# Example
raw_text = "/*app(netflix)*/ Abriendo Netflix para ti, Rosario. https://netflix.com"
cleaned = limpiar_texto_para_audio(raw_text)
print(cleaned)  # "Abriendo Netflix para ti, Rosario."
Audio Processing
Use 16kHz sample rate for speech
Keep duration consistent (5 sec)
Normalize audio length before feature extraction
Use mono audio for efficiency
MFCC Extraction
Standard: 13 MFCC coefficients
Add delta (and delta-delta) features to capture temporal dynamics
Use consistent hop_length (512)
Cache extracted features when possible
TF-IDF
Use bigrams for better context
Set max_features to limit vocabulary
Remove rare terms with min_df
Enable sublinear_tf for large datasets
Datasets
Validate data before training
Balance positive/negative samples
Use data augmentation for audio
Save processed features to disk
Common Patterns
End-to-End Audio Pipeline
import sounddevice as sd
import numpy as np
import tensorflow as tf

# 1. Record a 5-second clip at 16 kHz mono.
audio = sd.rec(int(5 * 16000), samplerate=16000, channels=1)
sd.wait()

# 2. Extract MFCC features from the flattened signal.
features = extract_features(audio.flatten())

# 3. Add a batch dimension: (time_steps, n_mfcc) -> (1, time_steps, n_mfcc).
features = np.expand_dims(features, axis=0)

# 4. Run the wake-word model.
model = tf.keras.models.load_model('model.h5')
prediction = model.predict(features)[0][0]

# 5. Threshold the detection score.
if prediction > 0.8:
    print("Wake word detected!")
import os
import librosa
import numpy as np


def batch_process_audio(directory, output_file):
    """Extract MFCC features for every .wav in a directory and save them.

    Args:
        directory: Folder containing .wav audio files.
        output_file: Destination .npz archive path.

    Side effects:
        Writes an .npz archive containing 'features' (stacked feature
        arrays) and 'filenames' (matching source file names), so the
        expensive extraction step does not need to be repeated.
    """
    features_list = []
    filenames = []
    for filename in os.listdir(directory):
        if filename.endswith('.wav'):
            filepath = os.path.join(directory, filename)
            audio, sr = librosa.load(filepath, sr=16000)
            features_list.append(extract_features(audio))
            filenames.append(filename)
    # Persist to disk for later reuse.
    np.savez(
        output_file,
        features=np.array(features_list),
        filenames=filenames,
    )


# Usage
batch_process_audio('audio_files/', 'features.npz')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline

# TF-IDF features feeding a linear SVM, wrapped in a single pipeline.
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(
        ngram_range=(1, 2),
        lowercase=True,
    )),
    # probability=True enables predict_proba for confidence scores.
    ('clf', SVC(kernel='linear', probability=True)),
])

# Train
texts = ["abre netflix", "busca videos", "apaga tv"]
labels = ["open_app", "search", "power_off"]
pipeline.fit(texts, labels)

# Predict
new_text = ["abre youtube"]
intent = pipeline.predict(new_text)[0]
confidence = max(pipeline.predict_proba(new_text)[0])
print(f"Intent: {intent} ({confidence:.2%})")
Data Normalization
DataNormalizer Class
Transform multiple data formats into standardized text for model training.
Source : ~/workspace/source/proyectos/ai creator/kamutini/info.py
import json
import os
class DataNormalizer:
    """Transform JSON/CSV data into a standardized training format.

    Supports multiple input schemas (pre-formatted text, question/answer,
    prompt/completion) and appends an end-of-sequence token to each
    normalized entry.
    """

    def __init__(self, eos_token="<|endoftext|>"):
        # Token appended to every normalized example so downstream
        # tokenization can delimit training sequences.
        self.eos_token = eos_token
eos_token
string
default: "<|endoftext|>"
End-of-sequence token appended to normalized text
normalize_entry()
Convert a data entry to standard format.
# NOTE(review): documentation excerpt — signature only; the actual
# implementation lives in info.py and is not shown here.
def normalize_entry(self, data: dict) -> str:
    """
    Detect input format and convert to:
    "### Humano: <question>### Asistente: <answer>"
    Returns:
        Normalized text string with EOS token
    """
Supported Formats :
Pre-formatted Text
Question/Answer
Prompt/Completion
{
"text" : "### Humano: ¿Cómo estás?### Asistente: Bien, gracias."
}
{
"idx" : 60 ,
"question" : "¿Qué pasatiempos tienes?" ,
"answer" : "Senderismo en montañas." ,
"label" : "Hobbies"
}
Transforms to: "### Humano: ¿Qué pasatiempos tienes?### Asistente: Senderismo en montañas. <|endoftext|>"
{
"prompt" : "What's the weather?" ,
"completion" : "It's sunny today."
}
load_specific_datasets()
Load and normalize multiple dataset files.
normalizer = DataNormalizer()

# Load and normalize a specific set of dataset files.
file_paths = [
    "datasets/conversations.json",
    "datasets/qa_pairs.json",
]
normalized_text = normalizer.load_specific_datasets(file_paths)

# Entries end with the EOS token, so splitting on it counts examples.
print(f"Loaded {len(normalized_text.split(normalizer.eos_token))} examples")
List of JSON file paths to process
Returns : Combined normalized text with entries separated by \n\n
The normalizer handles both single JSON objects and arrays, as well as JSONL format (one JSON object per line).
Integration with Model Training
Used by the PyTorch model trainer to prepare datasets.
from info import DataNormalizer


class LocalOptimizedDataset(Dataset):
    """PyTorch dataset that normalizes heterogeneous JSON files for training."""

    def __init__(self, directory_path, block_size):
        # Normalizer converts every record to the standard
        # "### Humano: ... ### Asistente: ..." format.
        self.normalizer = DataNormalizer(eos_token="<|endoftext|>")

        formatted_lines = []
        # NOTE(review): .csv files are selected here but parsed with
        # json.load below — confirm that is intentional.
        files = [f for f in os.listdir(directory_path)
                 if f.endswith(('.json', '.csv'))]
        for filename in files:
            filepath = os.path.join(directory_path, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                raw_data = json.load(f)
            # Wrap single objects so the loop handles both shapes.
            if not isinstance(raw_data, list):
                raw_data = [raw_data]
            for item in raw_data:
                norm_text = self.normalizer.normalize_entry(item)
                if norm_text:  # skip entries the normalizer rejected
                    formatted_lines.append(norm_text)

        # BUG FIX: join with "\n\n" (the documented separator),
        # not the garbled " \n\n " literal.
        text_data = "\n\n".join(formatted_lines)
        # Continue with tokenization...
Use DataNormalizer when training models on heterogeneous datasets with different JSON schemas. It standardizes the format automatically.