
Deduplication
Identify and remove duplicate content to improve index quality and reduce costs.
Deduplication
Duplicate content wastes storage, inflates embedding costs, and degrades retrieval by letting near-identical chunks crowd out diverse results. Remove it systematically.
Exact Deduplication
import hashlib
def content_hash(text):
    """Return the hexadecimal SHA-256 digest of *text* (UTF-8 encoded)."""
    digest = hashlib.sha256()
    digest.update(text.encode())
    return digest.hexdigest()
# Module-level registry of content hashes seen so far. Note: this grows
# unbounded for the life of the process; long-running services should pass
# their own bounded/persistent registry (see below).
seen_hashes = set()


def is_duplicate(text, registry=None):
    """Return True if *text* was seen before; otherwise record it and return False.

    Args:
        text: The document/chunk text to check.
        registry: Optional set of previously seen hex digests. Defaults to the
            module-level ``seen_hashes``, preserving the original behavior.
            Passing a dedicated set lets callers deduplicate independent
            corpora without sharing global state.

    Returns:
        bool: True when the exact text (by SHA-256 digest) is already in the
        registry; False on first sight, in which case the digest is recorded.
    """
    if registry is None:
        registry = seen_hashes
    h = content_hash(text)
    if h in registry:
        return True
    registry.add(h)
    return False
Fuzzy Deduplication
from difflib import SequenceMatcher
def similarity(text1, text2):
    """Return a similarity ratio in [0, 1] between the two strings."""
    matcher = SequenceMatcher(None, text1, text2)
    return matcher.ratio()
def is_near_duplicate(new_text, existing_texts, threshold=0.95):
    """Report whether *new_text* nearly matches any previously kept text.

    Returns True as soon as one candidate's similarity ratio strictly
    exceeds *threshold*; False if none does (including when
    *existing_texts* is empty).
    """
    return any(
        similarity(new_text, candidate) > threshold
        for candidate in existing_texts
    )
MinHash for Scale
from datasketch import MinHash, MinHashLSH
# Module-level LSH index over MinHash signatures. threshold=0.9 is the
# estimated Jaccard similarity above which query() reports a candidate match;
# num_perm=128 must match the MinHash objects inserted/queried below.
lsh = MinHashLSH(threshold=0.9, num_perm=128)
def add_document(doc_id, text):
    """Index *text* under *doc_id* in the module-level LSH structure.

    Builds a 128-permutation MinHash over the whitespace-split tokens of
    *text*, then inserts it into ``lsh`` keyed by *doc_id*.
    """
    signature = MinHash(num_perm=128)
    for token in text.split():
        signature.update(token.encode('utf8'))
    lsh.insert(doc_id, signature)
def find_duplicates(text):
    """Return the IDs of indexed documents that are near-duplicates of *text*.

    Hashes *text* the same way ``add_document`` does (128-perm MinHash over
    whitespace tokens) and queries the module-level ``lsh`` index for
    candidates above its configured similarity threshold.
    """
    signature = MinHash(num_perm=128)
    for token in text.split():
        signature.update(token.encode('utf8'))
    return lsh.query(signature)
Next: Noise removal.