
Noise Removal
Clean documents by removing headers, footers, boilerplate, and other non-content text.
Noise Removal
Remove boilerplate, headers, footers, and other noise for cleaner embeddings.
Common Noise Patterns
import re
def remove_noise(text, extra_patterns=()):
    """Strip common boilerplate from ``text`` and normalize its whitespace.

    The built-in patterns remove "Page N of M" markers, the
    "Sent from my iPhone" footer, and http/https URLs. Every match is
    deleted, then all runs of whitespace (including newlines) are collapsed
    to a single space and the result is trimmed.

    Args:
        text: The raw document text.
        extra_patterns: Optional iterable of additional regular expressions
            (strings or compiled patterns) whose matches are also deleted.
            They are applied after the built-in patterns and before
            whitespace is collapsed.

    Returns:
        The cleaned, single-line text.
    """
    builtin_patterns = (
        r'Page \d+ of \d+',      # page numbers
        r'Sent from my iPhone',  # mobile email footer
        r'http[s]?://\S+',       # URLs
    )
    for pattern in (*builtin_patterns, *extra_patterns):
        text = re.sub(pattern, '', text)
    # Collapse whitespace last so the gaps left by the removals above do not
    # survive as double spaces or blank lines.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()
Email-Specific Cleaning
def clean_email(email_text):
    """Remove quoted replies and the trailing signature from an email body.

    Three things are removed:

    * reply attribution lines, i.e. a whole line of the form
      "On ... wrote:";
    * quoted lines, i.e. lines starting with ">";
    * everything from the first signature separator onwards. The separator
      is a line consisting only of "--", optionally followed by spaces or
      tabs (this covers the conventional "-- " form).

    Removed lines are emptied rather than deleted, so their line breaks
    remain in the result. No other whitespace normalization is performed.

    Args:
        email_text: The raw email body.

    Returns:
        The email text without quoted material and signature.
    """
    # Anchor the attribution to a full line so ordinary prose that happens
    # to contain "On ... wrote:" mid-sentence is left untouched. The
    # lookahead keeps a trailing "\r" (CRLF input) in place.
    text = re.sub(
        r'^On .* wrote:[ \t]*(?=\r?$)', '', email_text, flags=re.MULTILINE
    )
    text = re.sub(r'^>.*$', '', text, flags=re.MULTILINE)
    # Only a line that is exactly "--" marks a signature. Splitting on any
    # "--" would truncate bodies containing things like "--verbose" or an
    # ASCII dash in the middle of a sentence.
    text = re.split(
        r'^--[ \t]*\r?$', text, maxsplit=1, flags=re.MULTILINE
    )[0]
    return text
PDF Artifacts
def clean_pdf_text(text):
    """Repair common artifacts in text extracted from PDFs.

    Two fixes are applied in order:

    1. De-hyphenation: a hyphen followed by whitespace is removed when it
       sits between two letters, so a word broken across lines
       ("exam-\\nple") is joined back together ("example"). Digits are
       deliberately excluded so numeric ranges such as "10-\\n20" are not
       fused into "1020".
    2. Spacing: a space is inserted wherever a lowercase ASCII letter is
       immediately followed by an uppercase ASCII letter ("helloWorld" ->
       "hello World"). This is a heuristic and also splits intentional
       mixed-case words such as "iPhone".

    Args:
        text: The text extracted from a PDF.

    Returns:
        The text with both fixes applied.
    """
    # Lookarounds keep the neighbouring letters out of the match, so chained
    # breaks ("a-\nb-\nc") are all repaired in a single pass.
    text = re.sub(r'(?<=[^\W\d_])-\s+(?=[^\W\d_])', '', text)
    # Re-insert the space lost between words that were run together.
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)
    return text
Next: Layout normalization.