
Language Detection
Detect document languages for proper embedding model selection and multilingual RAG.
Language Detection
Identify document languages for proper processing and retrieval.
Language Detection
from langdetect import detect, detect_langs
def detect_language(text):
    """Return the language code detected for ``text``.

    Best-effort wrapper around ``detect``: a detection failure yields
    ``'unknown'`` instead of raising.

    Args:
        text: The document text to classify.

    Returns:
        Whatever ``detect(text)`` returns, or ``'unknown'`` when the input
        is blank, is not a string, or detection fails.
    """
    # Blank or non-string input has nothing to detect; skip the detector.
    if not isinstance(text, str) or not text.strip():
        return 'unknown'
    try:
        return detect(text)
    except Exception:
        # Previously a bare ``except:``, which also swallowed
        # KeyboardInterrupt and SystemExit. Ordinary detection failures
        # still map to 'unknown' so callers keep the best-effort contract.
        return 'unknown'
def detect_with_confidence(text):
    """Return the top detected language for ``text`` with its probability.

    Best-effort wrapper around ``detect_langs``: a detection failure
    yields the ``'unknown'`` / ``0.0`` fallback instead of raising.

    Args:
        text: The document text to classify.

    Returns:
        A dict with keys ``'language'`` (the first candidate's ``lang`` as
        a string) and ``'confidence'`` (that candidate's ``prob``). When
        the input is blank, is not a string, or detection fails, returns
        ``{'language': 'unknown', 'confidence': 0.0}``.
    """
    # Blank or non-string input has nothing to detect; skip the detector.
    if not isinstance(text, str) or not text.strip():
        return {'language': 'unknown', 'confidence': 0.0}
    try:
        probs = detect_langs(text)
        # Only the first candidate returned by detect_langs is reported.
        best = probs[0]
        return {
            'language': str(best.lang),
            'confidence': best.prob,
        }
    except Exception:
        # Previously a bare ``except:``, which also swallowed
        # KeyboardInterrupt and SystemExit. Ordinary failures (including
        # an empty candidate list) still produce the fallback result.
        return {'language': 'unknown', 'confidence': 0.0}
Multilingual Strategy
def process_multilingual(text):
    """Embed ``text`` with a model selected by its detected language.

    English text is embedded with ``english_model``; French, German and
    Spanish text with ``multilingual_model``; every other result of
    ``detect_language`` with ``universal_model``.

    Args:
        text: The document text to embed.

    Returns:
        A ``(embedding, language_code)`` tuple.
    """
    detected = detect_language(text)

    # Choose the model first, then embed once. The conditional expression
    # only evaluates the model name for the branch actually taken.
    encoder = (
        english_model if detected == 'en'
        else multilingual_model if detected in ('fr', 'de', 'es')
        else universal_model
    )
    return encoder.embed(text), detected
Language-Specific Collections
# Separate collections per language, keyed by language code.
# get_or_create_collection keeps this setup safe to re-run: plain
# create_collection fails once a collection with that name already exists.
# NOTE: this name shadows the stdlib ``collections`` module; it is kept
# because store_document looks the mapping up under this name.
collections = {
    'en': chroma_client.get_or_create_collection('docs_english'),
    'es': chroma_client.get_or_create_collection('docs_spanish'),
    'fr': chroma_client.get_or_create_collection('docs_french'),
}
def store_document(text):
    """Add ``text`` to the collection matching its detected language.

    Languages without a dedicated entry in ``collections`` (including the
    ``'unknown'`` result of ``detect_language``) fall back to the English
    collection.

    Args:
        text: The document text to store.
    """
    import uuid  # local import: only needed to mint document ids

    lang = detect_language(text)
    collection = collections.get(lang, collections['en'])
    # Chroma's Collection.add needs one id per document; the original call
    # passed only ``documents``. A random UUID gives each stored document
    # a unique id.
    collection.add(ids=[str(uuid.uuid4())], documents=[text])
Next: Metadata enrichment.