
Metadata Enrichment
Extract and enrich metadata to improve retrieval accuracy and enable advanced filtering.
Metadata Enrichment
Rich metadata enables better filtering and retrieval.
Extract Basic Metadata
def extract_metadata(file_path, content):
    """Collect basic filesystem and size metadata for a document.

    Args:
        file_path: Path of the document on disk. The file must exist,
            because its size and timestamps are read with ``os.stat``.
        content: Text of the document, used for the word and character
            counts.

    Returns:
        A dict with the keys ``filename``, ``extension``, ``size_bytes``,
        ``created_at``, ``modified_at``, ``word_count`` and
        ``character_count``. Both timestamps are the float values reported
        by ``os.stat``.

    Raises:
        OSError: If ``file_path`` cannot be stat'ed.
    """
    # A single stat call yields one consistent snapshot of size and
    # timestamps instead of three separate filesystem lookups.
    stat_result = os.stat(file_path)

    # st_ctime is the inode-change time on Unix, not the creation time.
    # Prefer the real creation time where the platform exposes it and
    # fall back to st_ctime everywhere else.
    try:
        created_at = stat_result.st_birthtime
    except AttributeError:
        created_at = stat_result.st_ctime

    return {
        'filename': os.path.basename(file_path),
        'extension': os.path.splitext(file_path)[1],
        'size_bytes': stat_result.st_size,
        'created_at': created_at,
        'modified_at': stat_result.st_mtime,
        # Words are whitespace-delimited tokens.
        'word_count': len(content.split()),
        'character_count': len(content),
    }
Extract Semantic Metadata
def enrich_metadata(content):
    """Build a dict of semantic metadata for ``content``.

    The work is delegated to the helpers ``extract_dates``,
    ``extract_entities``, ``classify_document`` and ``generate_tags``,
    called in that order.

    Args:
        content: The document text handed to every helper.

    Returns:
        A dict that always contains ``people``, ``organizations``,
        ``locations``, ``category`` and ``tags``. ``dates`` and
        ``latest_date`` are added only when ``extract_dates`` returns a
        truthy result.
    """
    enriched = {}

    # Date fields are optional: they are omitted when nothing was found.
    found_dates = extract_dates(content)
    if found_dates:
        enriched.update(dates=found_dates, latest_date=max(found_dates))

    # Map entity labels onto metadata fields; a missing label yields [].
    # NOTE(review): PERSON/ORG/GPE look like NER label names - confirm
    # against what extract_entities actually returns.
    entities_by_label = extract_entities(content)
    for field, label in (
        ('people', 'PERSON'),
        ('organizations', 'ORG'),
        ('locations', 'GPE'),
    ):
        enriched[field] = entities_by_label.get(label, [])

    # Document-level classification and tagging.
    enriched['category'] = classify_document(content)
    enriched['tags'] = generate_tags(content)
    return enriched
LLM-Based Enrichment
def llm_enrich_metadata(content, max_chars=1000):
    """Ask the LLM to describe a document and parse its reply into metadata.

    Args:
        content: Document text. Only its first ``max_chars`` characters
            are included in the prompt.
        max_chars: Maximum number of characters of ``content`` to send.
            Defaults to 1000.

    Returns:
        Whatever ``parse_llm_metadata`` returns for the model's response.
    """
    # Truncate outside the string literal: a "#" placed inside an f-string
    # is ordinary text and would be sent to the model as part of the prompt.
    excerpt = content[:max_chars]

    prompt = (
        "\n"
        "Analyze this document and extract:\n"
        "1. Main topic/subject\n"
        "2. Document type (report, email, article, etc.)\n"
        "3. Key themes (3-5 keywords)\n"
        "4. Intended audience\n"
        "Document:\n"
        f"{excerpt}\n"
    )
    response = claude.generate(prompt)
    return parse_llm_metadata(response)
Using Metadata for Retrieval
# Filter by metadata: return the 10 nearest results that satisfy every
# condition below.
# NOTE(review): the call shape matches a Chroma collection, whose `where`
# filter accepts exactly one top-level key, so multiple conditions are
# combined with an explicit "$and" - confirm the vector store in use.
# NOTE(review): "$gte" is given a string operand here; verify that the store
# supports string comparison, or store dates as numbers (e.g. epoch seconds)
# and compare numerically.
# NOTE(review): none of the metadata helpers visible in this section set a
# "language" key - confirm it is populated at ingestion time.
results = collection.query(
    query_embeddings=[query_emb],
    where={
        "$and": [
            {"category": "technical"},
            {"latest_date": {"$gte": "2025-01-01"}},
            {"language": "en"},
        ]
    },
    n_results=10,
)
Batch 1 (Modules 1-6) complete! Now continuing with remaining batches.