For fast, lightweight inference with minimal dependencies:
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the pre-trained model
model = joblib.load('modelos/naive_bayes_alpha_0.5.zip')

# Initialize the vectorizer (must match training configuration)
vectorizer = TfidfVectorizer(
    max_features=10000,
    ngram_range=(1, 3),
    analyzer='char',
)
# Note: You'll need to fit the vectorizer on your training data
# or load a pre-fitted vectorizer if saved separately
Character-level n-grams (1-3) work particularly well for language detection as they capture language-specific patterns like diacritics, common letter combinations, and morphological features.
Here’s a complete working example that combines everything:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle


class LanguageDetector:
    """Language detection using pre-trained BiLSTM model."""

    def __init__(self, model_path, config_path):
        """Initialize the detector with model and configuration.

        Args:
            model_path: Path to the .keras model file
            config_path: Path to the .pkl configuration file
        """
        # Load the serialized Keras model.
        self.model = keras.models.load_model(model_path)

        # Restore the tokenizer and padding length saved at training time.
        # NOTE: pickle.load is only safe on trusted files.
        with open(config_path, 'rb') as f:
            config = pickle.load(f)
        self.tokenizer = config['tokenizer']
        self.max_length = config['max_length']

        # Index order must match the model's output layer.
        self.languages = ['de', 'es', 'fr', 'it', 'nl', 'pt', 'sv']
        self.language_names = {
            'de': 'German',
            'es': 'Spanish',
            'fr': 'French',
            'it': 'Italian',
            'nl': 'Dutch',
            'pt': 'Portuguese',
            'sv': 'Swedish'
        }

    def predict(self, texts):
        """Predict languages for a list of texts.

        Args:
            texts: List of text strings or single string

        Returns:
            List of language codes or single language code
        """
        # Accept either a bare string or a batch of strings.
        is_single = isinstance(texts, str)
        batch = [texts] if is_single else texts

        # Convert text to the padded integer sequences the model expects.
        encoded = self.tokenizer.texts_to_sequences(batch)
        padded = pad_sequences(encoded, maxlen=self.max_length, padding='post')

        # Pick the highest-probability class per input.
        probabilities = self.model.predict(padded, verbose=0)
        codes = [self.languages[i] for i in np.argmax(probabilities, axis=1)]
        return codes[0] if is_single else codes

    def predict_with_confidence(self, text):
        """Predict language with confidence scores.

        Args:
            text: Input text string

        Returns:
            Dictionary with language code, name, and confidence
        """
        # Encode and pad exactly as in predict().
        encoded = self.tokenizer.texts_to_sequences([text])
        padded = pad_sequences(encoded, maxlen=self.max_length, padding='post')

        # Probability vector for the single input.
        scores = self.model.predict(padded, verbose=0)[0]
        best = int(np.argmax(scores))
        lang_code = self.languages[best]

        # Three highest-scoring classes, most confident first.
        ranked = np.argsort(scores)[-3:][::-1]
        top_3 = [
            {
                'language': self.languages[i],
                'name': self.language_names[self.languages[i]],
                'confidence': float(scores[i])
            }
            for i in ranked
        ]

        return {
            'language': lang_code,
            'name': self.language_names[lang_code],
            'confidence': float(scores[best]),
            'top_3': top_3
        }


# Usage example
if __name__ == "__main__":
    # Initialize detector
    detector = LanguageDetector(
        model_path='modelos/mejor_modelo_recurrente.keras',
        config_path='modelos/mejor_modelo_recurrente_config.pkl'
    )

    # Test with various texts
    test_texts = [
        "Hello, how are you today?",
        "Bonjour, comment allez-vous aujourd'hui?",
        "Hola, ¿cómo estás hoy?",
        "Guten Tag, wie geht es Ihnen heute?",
        "Ciao, come stai oggi?",
        "Olá, como você está hoje?",
        "Hej, hur mår du idag?"
    ]

    print("=" * 60)
    print("Language Detection Results")
    print("=" * 60)

    for text in test_texts:
        result = detector.predict_with_confidence(text)
        print(f"\nText: {text}")
        print(f"Detected: {result['name']} ({result['language']})")
        print(f"Confidence: {result['confidence']:.2%}")
        print(f"Top 3 predictions:")
        for pred in result['top_3']:
            print(f"  - {pred['name']}: {pred['confidence']:.2%}")
Make sure the tokenizer configuration matches the one used during training. Loading the configuration from mejor_modelo_recurrente_config.pkl ensures consistency.
Learn about the different models and their trade-offs in the Models Overview section.
2. Train Your Own Models
Follow the Training Guide to train models on your own data or fine-tune existing ones.
3. Integrate into Applications
Check out the Inference Guide for deploying models as web services.
Pro Tip: For production deployments, consider wrapping the detector in a REST API using Flask or FastAPI, or deploying it as a serverless function for scalability.