Overview
Model evaluation is crucial for understanding your language detector’s performance. This guide covers evaluation metrics, confusion matrices, per-language analysis, and visualization techniques.

Evaluation Metrics
We use several metrics to assess model quality:
- Accuracy: Overall correctness rate
- Precision: Correctness of positive predictions per language
- Recall: Coverage of actual instances per language
- F1-Score: Harmonic mean of precision and recall
Complete Evaluation Pipeline
import joblib
import pandas as pd
from sklearn.metrics import (
classification_report,
confusion_matrix,
accuracy_score,
precision_recall_fscore_support
)
# Load saved model and vectorizer
# NOTE(review): assumes these .pkl files were produced by the training guide —
# joblib.load on untrusted files is unsafe (arbitrary code execution); only
# load artifacts you created yourself.
model = joblib.load('language_detector_model.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')
# Load test data
# NOTE(review): expects columns 'texto' (text) and 'idioma' (label code) —
# confirm against the training data schema.
df_test = pd.read_csv('test_data.csv')
# Transform test data with the *fitted* vectorizer (transform, never
# fit_transform) so test features live in the same TF-IDF space the model
# was trained on.
X_test = vectorizer.transform(df_test['texto'])
y_test = df_test['idioma']
# Make predictions
y_pred = model.predict(X_test)
print(f"Test set size: {len(y_test)} samples")
accuracy = accuracy_score(y_test, y_pred)
print(f"\nOverall Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
# Language name mapping: ISO 639-1 code -> Spanish display name.
idiomas = {
    "es": "Español",
    "fr": "Francés",
    "de": "Alemán",
    "it": "Italiano",
    "pt": "Portugués",
    "nl": "Neerlandés",
    "sv": "Sueco"
}
# Generate the classification report.
# Pass `labels=` explicitly: without it, sklearn orders the rows by whatever
# labels actually appear in y_test/y_pred, and `target_names` would silently
# mis-align if any language were missing from the test set.
label_order = sorted(idiomas.keys())
report = classification_report(
    y_test,
    y_pred,
    labels=label_order,
    target_names=[idiomas[lang] for lang in label_order],
    digits=4,
    zero_division=0  # avoid warnings for labels with no predicted samples
)
print("\n=== Classification Report ===")
print(report)
=== Classification Report ===
precision recall f1-score support
Alemán 0.9856 0.9865 0.9860 1018
Español 0.9886 0.9886 0.9886 1042
Francés 0.9905 0.9905 0.9905 1050
Italiano 0.9899 0.9826 0.9862 1089
Neerlandés 0.9809 0.9828 0.9819 1027
Portugués 0.9890 0.9890 0.9890 1086
Sueco 0.9817 0.9827 0.9822 1038
accuracy 0.9872 7350
macro avg 0.9866 0.9861 0.9863 7350
weighted avg 0.9872 0.9872 0.9872 7350
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Compute the confusion matrix with a fixed, explicit label order so the
# axis tick labels below are guaranteed to match the matrix rows/columns.
# Without `labels=`, sklearn uses only the sorted labels present in the
# data, and the heatmap labels would silently mis-align if a language were
# missing from the test set.
label_order = sorted(idiomas.keys())
label_names = [idiomas[lang] for lang in label_order]
cm = confusion_matrix(y_test, y_pred, labels=label_order)
# Normalize each row to a percentage of that true label's support.
# np.maximum guards against a zero-support row (division by zero -> NaN).
row_sums = cm.sum(axis=1)[:, np.newaxis]
cm_percent = cm.astype('float') / np.maximum(row_sums, 1) * 100
# Plot confusion matrix
plt.figure(figsize=(12, 10))
sns.heatmap(
    cm_percent,
    annot=True,
    fmt='.2f',
    cmap='Blues',
    xticklabels=label_names,
    yticklabels=label_names,
    cbar_kws={'label': 'Percentage (%)'}
)
plt.title('Confusion Matrix - Language Detection\n(Percentage of True Labels)',
          fontsize=14, pad=20)
plt.xlabel('Predicted Language', fontsize=12)
plt.ylabel('True Language', fontsize=12)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.tight_layout()
plt.savefig('confusion_matrix.png', dpi=300, bbox_inches='tight')
plt.show()
print("Confusion matrix saved to 'confusion_matrix.png'")
The diagonal represents correct predictions. High off-diagonal values indicate confusion between specific language pairs (e.g., Spanish and Portuguese).
# Find misclassified samples.
# Hoist the mismatch mask: the original evaluated `y_test != y_pred` three
# separate times for the same result.
mask = y_test != y_pred
errors = df_test[mask].copy()
# y_pred is a numpy array -> index with the mask's ndarray form; y_test is a
# pandas Series -> boolean-Series indexing aligns by index as before.
errors['predicted'] = y_pred[mask.to_numpy()]
errors['true_label'] = y_test[mask]
print(f"\nTotal errors: {len(errors)} out of {len(y_test)}")
print(f"Error rate: {len(errors)/len(y_test)*100:.2f}%")
# Show error examples
print("\n=== Sample Errors ===")
for idx, row in errors.head(10).iterrows():
    print(f"\nText: {row['texto']}")
    print(f"True: {idiomas[row['true_label']]} | "
          f"Predicted: {idiomas[row['predicted']]}")
import pandas as pd

# Calculate per-language metrics in a fixed label order.
# Passing `labels=` guarantees the returned metric arrays line up with
# `lang_codes`; without it, ordering follows the labels present in the data
# and the table would silently mis-label rows if a language were missing.
lang_codes = sorted(idiomas.keys())
precision, recall, f1, support = precision_recall_fscore_support(
    y_test, y_pred, labels=lang_codes, average=None, zero_division=0
)
# Create performance DataFrame: one row per language.
perf_df = pd.DataFrame({
    'Language': [idiomas[lang] for lang in lang_codes],
    'Precision': precision,
    'Recall': recall,
    'F1-Score': f1,
    'Support': support
})
print("\n=== Per-Language Performance ===")
print(perf_df.to_string(index=False))
# Visualize the three score columns side by side, grouped per language.
score_cols = ['Precision', 'Recall', 'F1-Score']
perf_df.set_index('Language')[score_cols].plot(
    kind='bar',
    figsize=(12, 6),
    rot=45
)
plt.title('Performance Metrics by Language', fontsize=14)
plt.ylabel('Score', fontsize=12)
# Adapt the lower y-limit to the data: the original hard-coded floor of
# 0.95 would push any under-performing language's bars entirely off the
# chart, hiding exactly the cases this plot exists to reveal.
y_floor = min(0.95, float(perf_df[score_cols].min().min()) - 0.01)
plt.ylim([y_floor, 1.0])
plt.legend(loc='lower right')
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig('per_language_performance.png', dpi=300)
plt.show()
Advanced Evaluation
Cross-Validation
Use k-fold cross-validation for more robust evaluation.

Learning Curves
Visualize how performance improves with more training data.

Feature Importance (for tree-based models)
For Random Forest models, analyze feature importance.

Evaluation Checklist
- Overall accuracy > 95%
- Per-language F1-scores > 0.95
- No language with significantly lower performance
- Confusion matrix shows clear diagonal pattern
- Cross-validation scores are consistent
- No overfitting (train/val curves close)
Benchmarking
Compare your model against baselines.

Next Steps
Training
Go back to improve your training process
Inference
Deploy your model for real-world predictions