Language Detection

Language Detection

Detect each document's language so that you can select the right embedding model and support multilingual RAG.

Language Detection

Identify document languages for proper processing and retrieval.

Language Detection

from langdetect import detect, detect_langs

def detect_language(text):
    """Return the language code that langdetect reports for ``text``.

    Args:
        text: The document text to classify.

    Returns:
        The value returned by ``detect(text)``, or ``'unknown'`` when the
        input is blank / not a string or when detection fails.
    """
    # Blank or non-string input has nothing to classify; the detector would
    # only raise on it, so return the sentinel without calling it.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except Exception:
        # Deliberately best-effort: any detector failure maps to the
        # sentinel. ``except Exception`` (rather than a bare ``except:``)
        # still lets KeyboardInterrupt and SystemExit propagate.
        return 'unknown'

def detect_with_confidence(text):
    """Return the top language candidate for ``text`` with its probability.

    Args:
        text: The document text to classify.

    Returns:
        A dict with two keys: ``'language'`` (the ``lang`` of the first
        entry returned by ``detect_langs``, as a string) and
        ``'confidence'`` (that entry's ``prob``). When the input is blank /
        not a string or detection fails, the result is
        ``{'language': 'unknown', 'confidence': 0.0}``.
    """
    # Blank or non-string input has nothing to classify; the detector would
    # only raise on it, so return the fallback without calling it.
    if not isinstance(text, str) or not text.strip():
        return {'language': 'unknown', 'confidence': 0.0}
    try:
        # Only the first candidate is reported.
        # NOTE(review): presumably detect_langs orders by probability --
        # confirm against the langdetect docs.
        best = detect_langs(text)[0]
        return {
            'language': str(best.lang),
            'confidence': best.prob,
        }
    except Exception:
        # Deliberately best-effort: any detector failure (including an empty
        # candidate list) maps to the fallback. ``except Exception`` still
        # lets KeyboardInterrupt and SystemExit propagate, unlike a bare
        # ``except:``.
        return {'language': 'unknown', 'confidence': 0.0}

Multilingual Strategy

def process_multilingual(text):
    """Embed ``text`` with a model chosen from its detected language.

    Args:
        text: The document text to embed.

    Returns:
        A ``(embedding, lang)`` tuple: the result of the selected model's
        ``embed(text)`` call and the code from ``detect_language(text)``.
    """
    detected = detect_language(text)

    # Route by language: English has its own model, French/German/Spanish
    # share the multilingual one, and everything else (including
    # 'unknown') goes to the universal model.
    if detected == 'en':
        model = english_model
    elif detected in ('fr', 'de', 'es'):
        model = multilingual_model
    else:
        model = universal_model

    return model.embed(text), detected

Language-Specific Collections

# Separate collections per language.
# get_or_create_collection is used instead of create_collection so this
# snippet can be run again against a store that already holds these
# collections; create_collection raises when the name already exists.
collections = {
    'en': chroma_client.get_or_create_collection('docs_english'),
    'es': chroma_client.get_or_create_collection('docs_spanish'),
    'fr': chroma_client.get_or_create_collection('docs_french'),
}

def store_document(text):
    """Store ``text`` in the collection that matches its detected language.

    Args:
        text: The document text to store.

    Returns:
        None. The document is added to one of the ``collections`` entries.
    """
    import uuid

    lang = detect_language(text)
    # Languages without a dedicated collection (including 'unknown') fall
    # back to the English collection.
    collection = collections.get(lang, collections['en'])
    # Chroma's add() requires one id per document; a random UUID keeps
    # every stored document distinct.
    collection.add(ids=[uuid.uuid4().hex], documents=[text])

Next: Metadata enrichment.

Subscribe to our newsletter

Get the latest posts delivered right to your inbox.

Subscribe on LinkedIn