
Page-Level vs Section-Level Parsing
The granularity of document parsing affects retrieval quality. Learn when to parse by page, by section, or by paragraph.
Parsing Granularity Levels
class ParsingGranularity:
    PAGE = "page"            # One chunk per page
    SECTION = "section"      # One chunk per section/chapter
    PARAGRAPH = "paragraph"  # One chunk per paragraph
    SENTENCE = "sentence"    # One chunk per sentence (rare)
Page-Level Parsing
Best for: PDFs where page boundaries are meaningful (slides, forms, catalogs).
import PyPDF2

def parse_by_page(pdf_path):
    """
    Extract content page by page.
    Maintains page-level context and boundaries.
    """
    pages = []
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)
        for page_num, page in enumerate(reader.pages):
            text = page.extract_text()
            # Store each page separately
            pages.append({
                'page_number': page_num + 1,
                'content': text,
                'metadata': {
                    'source': pdf_path,
                    'page': page_num + 1,
                    'total_pages': len(reader.pages)
                }
            })
    return pages
# Example usage for RAG indexing
# embed() and store_in_vector_db() stand in for your embedding
# model and vector store throughout this lesson
def index_by_pages(pdf_path):
    pages = parse_by_page(pdf_path)
    for page_data in pages:
        embedding = embed(page_data['content'])
        store_in_vector_db(
            embedding=embedding,
            content=page_data['content'],
            metadata=page_data['metadata']
        )
Advantages:
- Simple and reliable
- Page numbers preserved for citations
- Good for slide decks and presentations
Disadvantages:
- May split logical sections across pages (see the overlap sketch below)
- Page breaks might be arbitrary
- Context might be incomplete
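A common mitigation, sketched here rather than taken from the lesson's pipeline, is to prefix each page with the tail of the previous one so sentences split across a page break remain retrievable; parse_by_page is the function above, and overlap_chars is an illustrative parameter:

def parse_pages_with_overlap(pdf_path, overlap_chars=300):
    """
    Page-level chunks, each prefixed with the tail of the
    previous page to bridge arbitrary page breaks.
    """
    pages = parse_by_page(pdf_path)
    for i, page_data in enumerate(pages):
        if i > 0:
            # Carry over the last few hundred characters of the prior page
            tail = pages[i - 1]['content'][-overlap_chars:]
            page_data['content'] = tail + '\n' + page_data['content']
    return pages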
Section-Level Parsing
Best for: Long documents with clear section boundaries (reports, books, articles).
def parse_by_section(text):
    """
    Extract content by logical sections.
    More semantically meaningful than pages.
    """
    sections = []
    current_section = None
    lines = text.split('\n')
    for line in lines:
        # Detect section headers
        if is_section_header(line):
            # Save previous section
            if current_section:
                sections.append(current_section)
            # Start new section
            current_section = {
                'title': line.strip(),
                'content': '',
                'level': detect_header_level(line)
            }
        elif current_section:
            current_section['content'] += line + '\n'
    # Add final section
    if current_section:
        sections.append(current_section)
    return sections
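Both parse_by_section and the adaptive heuristic later in this lesson call an is_section_header helper that is never defined. A minimal sketch, assuming headers are short lines that are numbered, all caps, or in title case:

import re

def is_section_header(line):
    """Heuristic check: a short line that looks like a heading."""
    stripped = line.strip()
    if not stripped or len(stripped) > 80:
        return False
    # Numbered headers like "1. Introduction" or "1.1 Background"
    if re.match(r'^\d+(\.\d+)*\.?\s+\S', stripped):
        return True
    # All-caps or title-case lines are often headings
    return stripped.isupper() or stripped.istitle()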
import re

def detect_header_level(line):
    """
    Determine header hierarchy level.
    H1 > H2 > H3, etc.
    """
    # Check numbered headers, most specific pattern first:
    # "1." alone would also match "1.1" and "1.1.1"
    if re.match(r'^\d+\.\d+\.\d+', line):  # "1.1.1 Details"
        return 3
    elif re.match(r'^\d+\.\d+', line):     # "1.1 Background"
        return 2
    elif re.match(r'^\d+\.', line):        # "1. Introduction"
        return 1
    # Check for all caps (often H1)
    if line.isupper():
        return 1
    # Check for title case
    if line.istitle():
        return 2
    return 3  # Default
# Example: Hierarchical section indexing
def index_by_sections(document):
    sections = parse_by_section(document['text'])
    for section in sections:
        # Create rich context by including section hierarchy
        context = f"""
Section: {section['title']}
Level: H{section['level']}
Content:
{section['content']}
"""
        embedding = embed(context)
        store_in_vector_db(
            embedding=embedding,
            content=section['content'],
            metadata={
                'section_title': section['title'],
                'section_level': section['level'],
                'document': document['name']
            }
        )
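The context string above carries only the section's own title. If you want each chunk to include its full parent path (useful when subsection titles like "Details" mean nothing on their own), you can track open ancestors with a stack. This is a sketch; build_breadcrumbs and the path key are illustrative names, assuming sections arrive in document order:

def build_breadcrumbs(sections):
    """
    Attach a parent path like 'Introduction > Background'
    to each section, using header levels to track nesting.
    """
    stack = []  # (level, title) pairs of currently open ancestors
    for section in sections:
        # Close any section at the same or a deeper level
        while stack and stack[-1][0] >= section['level']:
            stack.pop()
        titles = [title for _, title in stack] + [section['title']]
        section['path'] = ' > '.join(titles)
        stack.append((section['level'], section['title']))
    return sections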
Advantages:
- Respects logical document structure
- Better semantic coherence
- Easier to cite ("Section 3.2")
Disadvantages:
- Requires accurate header detection
- Variable chunk sizes
- Complex multi-level hierarchies
Paragraph-Level Parsing
Best for: Dense text documents where each paragraph is self-contained.
def parse_by_paragraph(text):
    """
    Split document into paragraphs.
    Good for articles and reports.
    """
    # Split on double newlines (typical paragraph separator)
    paragraphs = re.split(r'\n\s*\n', text)
    # Filter out very short "paragraphs" (likely noise)
    paragraphs = [p.strip() for p in paragraphs if len(p.strip()) > 50]
    return paragraphs
def index_paragraphs_with_context(document):
    """
    Index paragraphs but include surrounding context.
    """
    paragraphs = parse_by_paragraph(document['text'])
    for i, para in enumerate(paragraphs):
        # Include context from adjacent paragraphs
        context_before = paragraphs[i - 1] if i > 0 else ""
        context_after = paragraphs[i + 1] if i < len(paragraphs) - 1 else ""
        # Create searchable text with context
        searchable = f"{context_before}\n\n{para}\n\n{context_after}"
        # But store only the main paragraph
        embedding = embed(searchable)
        store_in_vector_db(
            embedding=embedding,
            content=para,  # Store only main content
            metadata={
                'paragraph_number': i + 1,
                'has_context': True,
                'document': document['name']
            }
        )
Advantages:
- Fine-grained retrieval
- Self-contained units of meaning
- Flexible chunk sizes
Disadvantages:
- May lose broader context
- More chunks to manage
- Higher storage costs
Intelligent Adaptive Parsing
Choose granularity based on document characteristics.
def adaptive_parse(document):
    """
    Automatically choose the best parsing strategy.
    """
    text = document['text']
    doc_type = document['type']
    # For slide decks: page-level
    if doc_type in ['presentation', 'slides']:
        return parse_by_page(document['path'])
    # For long reports with clear structure: section-level
    elif has_clear_sections(text):
        return parse_by_section(text)
    # For dense articles: paragraph-level
    else:
        return parse_by_paragraph(text)
def has_clear_sections(text):
    """
    Heuristic to detect whether a document has clear section structure.
    """
    lines = text.split('\n')
    potential_headers = sum(1 for line in lines if is_section_header(line))
    # If >5% of lines are headers, treat the document as well-structured
    header_ratio = potential_headers / len(lines) if lines else 0
    return header_ratio > 0.05
Hybrid: Multi-Level Indexing
Index the same document at multiple granularities for different use cases.
def multi_level_index(document):
    """
    Index at both section and paragraph level.
    Allows flexible retrieval.
    """
    # Index by sections for broad queries
    sections = parse_by_section(document['text'])
    for section in sections:
        embed_and_store(
            content=section['content'],
            metadata={'granularity': 'section', 'title': section['title']}
        )
    # Index by paragraphs for specific queries
    paragraphs = parse_by_paragraph(document['text'])
    for i, para in enumerate(paragraphs):
        embed_and_store(
            content=para,
            metadata={'granularity': 'paragraph', 'number': i}
        )
# Query with granularity preference
# vector_db stands in for your vector store client; the filter
# syntax shown here is illustrative
def search_with_granularity(query, prefer='section'):
    """
    Search with a granularity preference.
    """
    if prefer == 'section':
        # Prioritize section-level results
        results = vector_db.query(
            query,
            where={'granularity': 'section'},
            n_results=5
        )
    else:
        # Use paragraph-level
        results = vector_db.query(
            query,
            where={'granularity': 'paragraph'},
            n_results=10
        )
    return results
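As a usage sketch (the queries are invented for illustration), a broad thematic question suits section-level retrieval, while a narrow factual lookup suits paragraph-level:

# Broad question: whole sections give enough context
overview = search_with_granularity(
    "What does the report conclude about market trends?",
    prefer='section'
)
# Narrow lookup: a single paragraph usually suffices
detail = search_with_granularity(
    "What was the Q3 revenue figure?",
    prefer='paragraph'
)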
Best Practices
- Consider document type: Slides → pages, Reports → sections, Articles → paragraphs
- Maintain metadata: Always store page/section numbers for citations
- Test retrieval: Evaluate which granularity gives best results
- Hybrid indexing: When in doubt, index at multiple levels
- Context windows: Ensure chunks fit in the LLM context (typically 500-1000 tokens); see the sketch below for a quick length check
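To verify chunk sizes before indexing, count tokens with a tokenizer that matches your embedding model. A sketch using tiktoken, where max_tokens is an illustrative budget:

import tiktoken

def oversized_chunks(chunks, max_tokens=1000):
    """Return indices of chunks that exceed the token budget."""
    enc = tiktoken.get_encoding("cl100k_base")
    return [
        i for i, chunk in enumerate(chunks)
        if len(enc.encode(chunk)) > max_tokens
    ]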
Next lesson: Table extraction challenges.