
Chunking Text Documents
Master chunking techniques specifically for text documents to optimize RAG retrieval.
Chunking Strategies
Effective chunking is critical for RAG quality: chunk size and boundaries determine what the retriever can match and how much context reaches the generator. The sections below cover three proven strategies for text documents.
Fixed-Size Chunking
def chunk_by_tokens(text, chunk_size=500, overlap=50):
    """
    Split text into fixed-size chunks with overlap.
    """
    from transformers import GPT2Tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    tokens = tokenizer.encode(text)
    chunks = []
    start = 0
    while start < len(tokens):
        end = min(start + chunk_size, len(tokens))
        chunk_tokens = tokens[start:end]
        chunk_text = tokenizer.decode(chunk_tokens)
        chunks.append({
            'text': chunk_text,
            'start_token': start,
            'end_token': end,
            'token_count': len(chunk_tokens)
        })
        # Stop once the end of the document is reached; otherwise the next
        # iteration would emit a trailing chunk that is a subset of this one.
        if end == len(tokens):
            break
        start += (chunk_size - overlap)
    return chunks
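For reference, here is a minimal usage sketch; the sample text and the smaller chunk_size/overlap values are illustrative only, not recommendations.

# Minimal usage sketch for chunk_by_tokens (illustrative values only).
sample_text = "Retrieval-augmented generation pairs a retriever with a generator. " * 50

chunks = chunk_by_tokens(sample_text, chunk_size=100, overlap=20)

for chunk in chunks[:3]:
    # Each chunk carries its token span, which is useful for deduplication
    # and for mapping retrieved chunks back to the source document.
    print(chunk['start_token'], chunk['end_token'], chunk['token_count'])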
Semantic Chunking
def chunk_by_semantics(text):
    """
    Chunk based on semantic boundaries (paragraphs, sections).
    """
    # Split by double newlines (paragraphs)
    paragraphs = text.split('\n\n')

    chunks = []
    current_chunk = ""
    for para in paragraphs:
        # If adding this paragraph exceeds the target size (~1000 characters),
        # save the current chunk and start a new one.
        if len(current_chunk) + len(para) > 1000:
            if current_chunk:
                chunks.append(current_chunk.strip())
            current_chunk = para
        else:
            current_chunk += "\n\n" + para

    # Add final chunk
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
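A small, hypothetical example shows the grouping behavior: consecutive paragraphs accumulate into one chunk until the roughly 1000-character budget would be exceeded, at which point a new chunk starts.

# Illustrative example: short paragraphs grouped by chunk_by_semantics.
doc = (
    "Paragraph one discusses embeddings.\n\n"
    "Paragraph two discusses vector databases.\n\n"
    "Paragraph three discusses retrieval evaluation."
)

for i, chunk in enumerate(chunk_by_semantics(doc)):
    print(f"Chunk {i}: {len(chunk)} characters")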
Recursive Chunking
def recursive_chunk(text, max_size=1000, separators=('\n\n', '\n', '. ', ' ')):
    """
    Recursively split text using a hierarchy of separators,
    following the approach of LangChain's RecursiveCharacterTextSplitter.
    """
    if len(text) <= max_size:
        return [text]

    # Try each separator in order, from coarsest to finest
    for i, separator in enumerate(separators):
        if separator in text:
            splits = text.split(separator)
            chunks = []
            current_chunk = ""
            for split in splits:
                if len(split) > max_size:
                    # A single split is still too large: flush the current chunk
                    # and recurse on the split with the remaining, finer separators.
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                        current_chunk = ""
                    chunks.extend(recursive_chunk(split, max_size, separators[i + 1:]))
                elif len(current_chunk) + len(split) + len(separator) <= max_size:
                    current_chunk += split + separator
                else:
                    if current_chunk:
                        chunks.append(current_chunk.strip())
                    current_chunk = split + separator
            if current_chunk:
                chunks.append(current_chunk.strip())
            return chunks

    # If no separator is found, force a hard split at the character level
    return [text[i:i + max_size] for i in range(0, len(text), max_size)]
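In production, LangChain's RecursiveCharacterTextSplitter implements this strategy directly. The sketch below shows a roughly equivalent configuration; the import path is an assumption that depends on your LangChain version, so verify it against your installation.

# Roughly equivalent configuration using LangChain's splitter.
# NOTE: the import path below assumes the classic `langchain` package layout;
# newer releases expose the same class from `langchain_text_splitters`.
from langchain.text_splitter import RecursiveCharacterTextSplitter

document_text = "Some long document text. " * 200   # placeholder input

splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,                    # character budget per chunk
    chunk_overlap=0,                    # no overlap, matching recursive_chunk above
    separators=['\n\n', '\n', '. ', ' '],
)
langchain_chunks = splitter.split_text(document_text)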
Next: Chunking multimodal content.