Chunking Text Documents

Chunking Text Documents

Master chunking techniques specifically for text documents to optimize RAG retrieval.

Chunking Strategies

Effective chunking is critical for RAG quality: chunk boundaries determine what context the retriever can return. The sections below walk through three proven strategies.

Fixed-Size Chunking

def chunk_by_tokens(text, chunk_size=500, overlap=50, tokenizer=None):
    """
    Split text into fixed-size token chunks with overlap.

    Args:
        text: The input string to chunk.
        chunk_size: Maximum number of tokens per chunk.
        overlap: Number of tokens shared between consecutive chunks.
            Must be smaller than chunk_size, otherwise the window
            would never advance.
        tokenizer: Optional object exposing encode(str) -> list[int]
            and decode(list[int]) -> str. Defaults to the GPT-2
            tokenizer from HuggingFace transformers.

    Returns:
        A list of dicts with keys 'text', 'start_token', 'end_token',
        and 'token_count'.

    Raises:
        ValueError: If overlap >= chunk_size (would loop forever).
    """
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")

    if tokenizer is None:
        # Imported lazily so the function works with an injected
        # tokenizer even when transformers is not installed.
        from transformers import GPT2Tokenizer
        tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    tokens = tokenizer.encode(text)

    chunks = []
    start = 0

    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunk_tokens = tokens[start:end]
        chunk_text = tokenizer.decode(chunk_tokens)

        chunks.append({
            'text': chunk_text,
            'start_token': start,
            'end_token': end,
            'token_count': len(chunk_tokens)
        })

        # Advance by the stride; the guard above guarantees this is > 0.
        start += (chunk_size - overlap)

    return chunks

Semantic Chunking

def chunk_by_semantics(text):
    """
    Chunk based on semantic boundaries (paragraphs, sections).
    """
    # Split by double newlines (paragraphs)
    paragraphs = text.split('\n\n')
    
    chunks = []
    current_chunk = ""
    
    for para in paragraphs:
        # If adding this paragraph exceeds target size, save current chunk
        if len(current_chunk) + len(para) > 1000:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = para
        else:
            current_chunk += "\n\n" + para
    
    # Add final chunk
    if current_chunk:
        chunks.append(current_chunk.strip())
    
    return chunks

Recursive Chunking

def recursive_chunk(text, max_size=1000, separators=('\n\n', '\n', '. ', ' ')):
    """
    Recursively split text using a hierarchy of separators.

    Mirrors LangChain's RecursiveCharacterTextSplitter approach:
    try the coarsest separator first, pack splits greedily up to
    ``max_size``, and recursively re-split any chunk that is still
    too large. If no separator applies, hard-split by character.

    Args:
        text: The input string.
        max_size: Maximum characters per returned chunk.
        separators: Ordered sequence of separators, coarsest first.
            (A tuple default avoids the shared-mutable-default pitfall
            of the original ``separators=[...]``.)

    Returns:
        A list of chunks, each at most ``max_size`` characters.
    """
    if len(text) <= max_size:
        return [text]

    # Try each separator in order of preference.
    for separator in separators:
        if separator not in text:
            continue

        splits = text.split(separator)

        packed = []
        current_chunk = ""
        for split in splits:
            piece = split + separator
            if len(current_chunk) + len(piece) <= max_size:
                current_chunk += piece
            else:
                if current_chunk:
                    packed.append(current_chunk.strip())
                current_chunk = piece
        if current_chunk:
            packed.append(current_chunk.strip())

        # The original returned here without recursing, so a single
        # oversized split leaked through larger than max_size.
        # Re-split any such chunk with the remaining (finer) machinery;
        # the length guard at the top terminates the recursion and the
        # force-split fallback handles separator-free text.
        result = []
        for chunk in packed:
            if len(chunk) > max_size:
                result.extend(recursive_chunk(chunk, max_size, separators))
            else:
                result.append(chunk)
        return result

    # No separators present at all: force a fixed-width split.
    return [text[i:i + max_size] for i in range(0, len(text), max_size)]

Next: Chunking multimodal content.

Subscribe to our newsletter

Get the latest posts delivered right to your inbox.

Subscribe on LinkedIn