Overview
Once your model is trained and evaluated, you can use it to detect languages in new text. This guide covers loading models, making predictions, batch processing, and deployment strategies.

Quick Start

Detect the language of a text in three lines.

Complete Inference Pipeline
import joblib
import numpy as np
from typing import List, Dict, Tuple
class LanguageDetector:
    """Wraps a persisted classifier + vectorizer for language identification."""

    def __init__(self, model_path: str, vectorizer_path: str):
        """Restore the trained estimator and its matching vectorizer from disk.

        Args:
            model_path: Path to saved model file
            vectorizer_path: Path to saved vectorizer file
        """
        self.model = joblib.load(model_path)
        self.vectorizer = joblib.load(vectorizer_path)
        # ISO code -> human-readable name for the supported languages.
        self.language_names = {
            "es": "Spanish",
            "fr": "French",
            "de": "German",
            "it": "Italian",
            "pt": "Portuguese",
            "nl": "Dutch",
            "sv": "Swedish",
        }

    def predict(self, text: str) -> str:
        """Return the predicted language code (e.g. 'es', 'fr') for one text."""
        features = self.vectorizer.transform([text])
        return self.model.predict(features)[0]

    def predict_proba(self, text: str) -> Dict[str, float]:
        """Return {language code: probability}, ordered most likely first.

        Args:
            text: Input text string

        Returns:
            Dictionary mapping language codes to probabilities
        """
        features = self.vectorizer.transform([text])
        scores = self.model.predict_proba(features)[0]
        # Pair each class label with its score, then rank descending.
        ranked = sorted(zip(self.model.classes_, scores),
                        key=lambda pair: pair[1],
                        reverse=True)
        return dict(ranked)

    def predict_batch(self, texts: List[str]) -> List[str]:
        """Vectorize all texts in one pass and return one language code each.

        Args:
            texts: List of input text strings

        Returns:
            List of language codes
        """
        features = self.vectorizer.transform(texts)
        return self.model.predict(features).tolist()

    def get_language_name(self, code: str) -> str:
        """Map a language code to its full name; unknown codes pass through."""
        return self.language_names.get(code, code)
# Paths of the artifacts produced during training.
MODEL_FILE = 'language_detector_model.pkl'
VECTORIZER_FILE = 'tfidf_vectorizer.pkl'

# Build the inference object from the serialized artifacts.
detector = LanguageDetector(model_path=MODEL_FILE,
                            vectorizer_path=VECTORIZER_FILE)
print("Language detector ready!")
# One sample sentence per supported language.
examples = [
    "Hola, ¿cómo estás?",              # Spanish
    "Bonjour, comment allez-vous?",    # French
    "Guten Tag, wie geht es Ihnen?",   # German
    "Ciao, come stai?",                # Italian
    "Olá, como você está?",            # Portuguese
    "Hallo, hoe gaat het met je?",     # Dutch
    "Hej, hur mår du?",                # Swedish
]

print("\n=== Single Predictions ===")
# Classify each sample and show both the code and the readable name.
for sample in examples:
    code = detector.predict(sample)
    name = detector.get_language_name(code)
    print(f"{sample:40} -> {name} ({code})")
=== Single Predictions ===
Hola, ¿cómo estás? -> Spanish (es)
Bonjour, comment allez-vous? -> French (fr)
Guten Tag, wie geht es Ihnen? -> German (de)
Ciao, come stai? -> Italian (it)
Olá, como você está? -> Portuguese (pt)
Hallo, hoe gaat het met je? -> Dutch (nl)
Hej, hur mår du? -> Swedish (sv)
# Inspect the full probability distribution for a single sentence.
text = "Je pense que c'est une bonne idée"
print(f"\nText: {text}")
print("\nLanguage Probabilities:")
# predict_proba yields codes ordered from most to least likely.
for code, prob in detector.predict_proba(text).items():
    lang_name = detector.get_language_name(code)
    print(f" {lang_name:12} ({code}): {prob:.4f} ({prob*100:.2f}%)")
Text: Je pense que c'est une bonne idée
Language Probabilities:
French (fr): 0.9987 (99.87%)
Italian (it): 0.0008 (0.08%)
Spanish (es): 0.0003 (0.03%)
Portuguese (pt): 0.0001 (0.01%)
Dutch (nl): 0.0001 (0.01%)
German (de): 0.0000 (0.00%)
Swedish (sv): 0.0000 (0.00%)
High confidence (>95%) indicates reliable predictions. Lower confidence may suggest mixed languages or ambiguous text.
import time
from collections import Counter

# Benchmark batch inference on a large list of texts.
texts = [
    "El sol brilla en el cielo",
    "La vie est belle",
    "Das Wetter ist schön",
    # ... add more texts
] * 100  # 300+ texts

print(f"\nProcessing {len(texts)} texts...")
# perf_counter is monotonic and high-resolution; time.time() is wall-clock
# and can jump on clock adjustment or have coarse resolution.
start_time = time.perf_counter()
predictions = detector.predict_batch(texts)
elapsed = time.perf_counter() - start_time

print(f"Processed {len(texts)} texts in {elapsed:.2f}s")
# Guard the division: a fast run can finish below timer resolution, and
# len(texts)/elapsed would raise ZeroDivisionError.
rate = len(texts) / elapsed if elapsed > 0 else float("inf")
print(f"Speed: {rate:.0f} texts/second")

# Show how the predictions are distributed across languages.
lang_counts = Counter(predictions)
print("\nLanguage Distribution:")
for lang, count in lang_counts.most_common():
    lang_name = detector.get_language_name(lang)
    print(f" {lang_name:12}: {count:4} ({count/len(texts)*100:.1f}%)")
Advanced Usage
Handle Edge Cases
Deal with unusual inputs.

Create a REST API

Deploy as a web service using Flask.

Command-Line Tool

Create a CLI for quick testing.

Performance Optimization

Caching Results

Cache frequent predictions.

Parallel Processing

Process large batches faster.

Integration Examples
Web Application (Streamlit)
Data Pipeline (pandas)
Next Steps
Training
Improve your model with better training
Evaluation
Assess model performance in production
Troubleshooting

Predictions are slow:
- Use batch processing for multiple texts
- Consider model quantization
- Cache frequent predictions

Predictions seem inaccurate:
- Check input text length (minimum 3-5 characters)
- Verify text is in one of the 7 trained languages
- Review confidence scores

Running out of memory:
- Process in smaller batches
- Use model compression techniques
- Consider deploying on a server with more RAM