Deduplication

Identify and remove duplicate content to improve index quality and reduce costs.

Duplicate content wastes storage and confuses retrieval. Remove it systematically.

Exact Deduplication

import hashlib

def content_hash(text):
    """Return the SHA-256 hex digest of *text* (UTF-8 encoded)."""
    return hashlib.sha256(text.encode()).hexdigest()

# Module-level default registry of hashes already observed.
# NOTE(review): shared mutable state — prefer passing an explicit set via
# the ``seen`` parameter of ``is_duplicate`` in new code.
seen_hashes = set()

def is_duplicate(text, seen=None):
    """Return True if *text* was seen before; otherwise record it and return False.

    Args:
        text: The content to check for exact duplication.
        seen: Optional set of previously seen content hashes. Defaults to the
            module-level ``seen_hashes`` so existing callers are unchanged;
            pass your own set for isolated/testable deduplication.
    """
    if seen is None:
        seen = seen_hashes
    h = content_hash(text)
    if h in seen:
        return True
    seen.add(h)
    return False

Fuzzy Deduplication

from difflib import SequenceMatcher

def similarity(text1, text2):
    """Return the SequenceMatcher similarity ratio of the two texts (0.0-1.0)."""
    matcher = SequenceMatcher(None, text1, text2)
    return matcher.ratio()

def is_near_duplicate(new_text, existing_texts, threshold=0.95):
    """Report whether *new_text* is more than *threshold* similar to any existing text.

    Returns False for an empty *existing_texts* collection.
    """
    return any(
        similarity(new_text, candidate) > threshold
        for candidate in existing_texts
    )

MinHash for Scale

from datasketch import MinHash, MinHashLSH

# LSH index: documents whose estimated Jaccard similarity exceeds 0.9
# are returned as duplicate candidates.
lsh = MinHashLSH(threshold=0.9, num_perm=128)

def _minhash(text):
    """Build a 128-permutation MinHash signature over the whitespace tokens of *text*.

    Shared by ``add_document`` and ``find_duplicates`` so the signature is
    constructed identically on both the indexing and the query path.
    """
    m = MinHash(num_perm=128)
    for word in text.split():
        m.update(word.encode('utf8'))
    return m

def add_document(doc_id, text):
    """Index *text* under *doc_id* for later near-duplicate queries."""
    lsh.insert(doc_id, _minhash(text))

def find_duplicates(text):
    """Return the ids of indexed documents that are near-duplicates of *text*."""
    return lsh.query(_minhash(text))

Next: Noise removal.

Subscribe to our newsletter

Get the latest posts delivered right to your inbox.

Subscribe on LinkedIn