
Layout-Aware OCR and Error Handling
Implement layout-aware OCR for complex documents and handle OCR errors gracefully in RAG systems.
Layout-Aware OCR
Preserving document layout during OCR retains the structure (blocks, columns, reading order) that downstream chunking and retrieval depend on. This module covers layout-aware extraction with Tesseract, plus strategies for handling OCR failures and low-confidence results.
Layout-Aware OCR with Tesseract
import pytesseract
from pdf2image import convert_from_path

def layout_aware_ocr(pdf_path):
    """
    Perform OCR while preserving layout structure.
    Uses Tesseract's PSM modes for better results.
    """
    images = convert_from_path(pdf_path, dpi=300)
    pages = []

    for page_num, image in enumerate(images):
        # Use PSM 1 (automatic page segmentation with OSD)
        # or PSM 3 (fully automatic page segmentation, but no OSD)
        text = pytesseract.image_to_string(
            image,
            config='--psm 1'  # detect orientation and script
        )

        # Get detailed layout information (word-level boxes and confidences)
        layout_data = pytesseract.image_to_data(
            image,
            output_type=pytesseract.Output.DICT
        )

        # Organize by blocks (paragraphs/sections)
        blocks = organize_by_blocks(layout_data)

        pages.append({
            'page_number': page_num + 1,
            'text': text,
            'blocks': blocks,
            'layout_preserved': True
        })

    return pages
def organize_by_blocks(layout_data):
    """
    Group OCR results into logical blocks/paragraphs.
    """
    blocks = {}

    for i in range(len(layout_data['text'])):
        if layout_data['text'][i].strip():  # Non-empty text
            block_num = layout_data['block_num'][i]
            if block_num not in blocks:
                blocks[block_num] = {
                    'text': '',
                    'confidence': [],
                    # bbox of the block's first word; a fuller version would
                    # union the boxes of every word in the block
                    'bbox': {
                        'left': layout_data['left'][i],
                        'top': layout_data['top'][i],
                        'width': layout_data['width'][i],
                        'height': layout_data['height'][i]
                    }
                }
            blocks[block_num]['text'] += layout_data['text'][i] + ' '

            # Tesseract reports -1 for non-word entries; depending on the
            # pytesseract version, conf may be a string or a number
            conf = float(layout_data['conf'][i])
            if conf != -1:
                blocks[block_num]['confidence'].append(conf)

    # Calculate average confidence per block
    for block in blocks.values():
        if block['confidence']:
            block['avg_confidence'] = sum(block['confidence']) / len(block['confidence'])
        else:
            block['avg_confidence'] = 0

    return list(blocks.values())
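A quick usage sketch under a couple of assumptions: the file path 'scanned_report.pdf' is illustrative, and the confidence threshold of 70 is on Tesseract's 0-100 word-confidence scale used above.

# Minimal usage sketch (path and threshold are illustrative)
pages = layout_aware_ocr('scanned_report.pdf')

for page in pages:
    # Keep only blocks Tesseract was reasonably confident about (0-100 scale)
    good_blocks = [b for b in page['blocks'] if b['avg_confidence'] >= 70]
    print(f"Page {page['page_number']}: {len(good_blocks)} usable blocks")
    for block in good_blocks:
        print(block['text'][:80], '...')

Filtering at the block level like this lets you drop noisy regions (stamps, handwriting, margin artifacts) without discarding the whole page.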
Multi-Column Layout Handling
def ocr_multi_column(image):
    """
    Handle multi-column layouts (newspapers, academic papers).
    """
    # Step 1: Detect columns
    columns = detect_columns(image)

    # Step 2: OCR each column separately
    results = []
    for col_num, column_region in enumerate(columns):
        # Crop to the column (left, top, right, bottom)
        col_image = image.crop(column_region)

        # OCR the column
        text = pytesseract.image_to_string(col_image)

        results.append({
            'column': col_num + 1,
            'text': text,
            'bbox': column_region
        })

    # Step 3: Combine columns in reading order (left to right)
    full_text = '\n\n'.join(
        r['text'] for r in sorted(results, key=lambda x: x['bbox'][0])
    )
    return full_text
def detect_columns(image):
    """
    Detect column boundaries using image processing.
    Simple approach: find vertical gaps.
    """
    import numpy as np

    # Convert to grayscale and get vertical projection
    gray = np.array(image.convert('L'))
    vertical_projection = np.sum(gray < 200, axis=0)  # Count dark pixels per x

    # Find gaps (low dark-pixel density)
    threshold = np.mean(vertical_projection) * 0.3
    gaps = vertical_projection < threshold

    # Identify column boundaries
    columns = []
    in_gap = False
    start = 0
    for x, is_gap in enumerate(gaps):
        if not in_gap and is_gap:
            # Start of a gap: close the current column if it is wide enough
            in_gap = True
            if x - start > 100:  # Minimum column width in pixels
                columns.append((start, 0, x, image.height))
        elif in_gap and not is_gap:
            # End of a gap: the next column starts here
            in_gap = False
            start = x

    # Add the last column, but only if we are not still inside a trailing gap
    # (in that case it was already appended when the gap started)
    if not in_gap and image.width - start > 100:
        columns.append((start, 0, image.width, image.height))

    return columns
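A short usage sketch (the PDF path is illustrative): convert the first page to an image, run the column-aware pipeline, and compare against Tesseract's own automatic page segmentation, which handles many clean two-column layouts by itself.

# Illustrative usage; 'two_column_paper.pdf' is a placeholder path
images = convert_from_path('two_column_paper.pdf', dpi=300)
page_text = ocr_multi_column(images[0])
print(page_text[:500])

# For comparison: Tesseract's fully automatic segmentation (--psm 3)
baseline = pytesseract.image_to_string(images[0], config='--psm 3')

The projection-based detector above is worth keeping for scans where automatic segmentation interleaves the columns, such as narrow newspaper layouts or skewed scans.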
OCR Error Handling
def robust_ocr(pdf_path, max_retries=3):
    """
    OCR with error handling and fallback strategies.
    ocr_pdf_with_tesseract() and ocr_with_textract() are assumed to be
    defined elsewhere in the pipeline.
    """
    for attempt in range(max_retries):
        try:
            # Try primary OCR method
            result = ocr_pdf_with_tesseract(pdf_path)

            # Validate results
            if validate_ocr_quality(result):
                return result
            else:
                print(f"Low quality OCR on attempt {attempt + 1}")

                # Try with different settings
                if attempt == 1:
                    # Retry with image preprocessing
                    result = ocr_with_preprocessing(pdf_path)
                elif attempt == 2:
                    # Fall back to cloud OCR
                    result = ocr_with_textract(pdf_path)

                if validate_ocr_quality(result):
                    return result

        except Exception as e:
            print(f"OCR attempt {attempt + 1} failed: {e}")
            if attempt == max_retries - 1:
                # Final attempt: return with warning
                return {
                    'status': 'failed',
                    'error': str(e),
                    'text': '',
                    'confidence': 0
                }

    # All attempts completed but none passed validation:
    # return the last (low-confidence) result rather than nothing
    return result
def validate_ocr_quality(ocr_result):
    """
    Check if OCR results meet a quality threshold.
    Assumes each page's 'confidence' is normalized to 0-1
    (e.g. Tesseract's 0-100 word confidences averaged and divided by 100).
    """
    if not ocr_result or not ocr_result.get('pages'):
        return False

    # Check average confidence across all pages
    confidences = [p.get('confidence', 0) for p in ocr_result['pages']]
    avg_confidence = sum(confidences) / len(confidences) if confidences else 0

    # Require >70% confidence
    if avg_confidence < 0.7:
        return False

    # Check that we extracted meaningful text
    total_text = ''.join(p.get('text', '') for p in ocr_result['pages'])
    if len(total_text.strip()) < 100:  # Very little text
        return False

    return True
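The validator above expects each page dict to carry a normalized 'confidence' value, which the earlier extraction functions don't compute. A minimal sketch of one way to derive it from Tesseract's word-level data; the helper name page_confidence is my own, not part of any library.

def page_confidence(image):
    """
    Estimate a 0-1 confidence score for one page image by averaging
    Tesseract's word-level confidences (reported on a 0-100 scale).
    """
    data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    confs = [float(c) for c in data['conf'] if float(c) != -1]
    return (sum(confs) / len(confs)) / 100 if confs else 0.0

When building the per-page dicts (in layout_aware_ocr or ocr_with_preprocessing below), adding 'confidence': page_confidence(image) gives validate_ocr_quality something concrete to check.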
Image Preprocessing for Better OCR
from PIL import Image, ImageEnhance, ImageFilter

def preprocess_for_ocr(image):
    """
    Enhance image quality before OCR.
    Improves accuracy for poor-quality scans.
    """
    # Convert to grayscale
    image = image.convert('L')

    # Increase contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2.0)

    # Sharpen
    image = image.filter(ImageFilter.SHARPEN)

    # Denoise (remove speckles)
    image = image.filter(ImageFilter.MedianFilter(size=3))

    # Binarization (convert to pure black/white)
    threshold = 128
    image = image.point(lambda p: 255 if p > threshold else 0)

    return image
def ocr_with_preprocessing(pdf_path):
    """
    OCR with an image preprocessing pipeline.
    """
    images = convert_from_path(pdf_path, dpi=300)
    pages = []

    for page_num, image in enumerate(images):
        # Preprocess
        enhanced = preprocess_for_ocr(image)

        # OCR
        text = pytesseract.image_to_string(enhanced)

        pages.append({
            'page_number': page_num + 1,
            'text': text,
            'preprocessed': True
        })

    return {'pages': pages}
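The fixed threshold of 128 in preprocess_for_ocr can fail on unevenly lit or shadowed scans. A minimal alternative sketch using OpenCV's adaptive thresholding (assumes opencv-python and numpy are installed); the block size and constant are starting points to tune, not canonical values.

import cv2
import numpy as np

def adaptive_binarize(image):
    """
    Binarize a PIL image with a locally adaptive threshold, which is more
    robust to shadows and uneven lighting than a single global cutoff.
    """
    gray = np.array(image.convert('L'))
    binary = cv2.adaptiveThreshold(
        gray,
        255,                             # value assigned to pixels above the local threshold
        cv2.ADAPTIVE_THRESH_GAUSSIAN_C,  # local threshold from a Gaussian-weighted neighborhood
        cv2.THRESH_BINARY,
        31,                              # neighborhood size in pixels (must be odd)
        10                               # constant subtracted from the local mean
    )
    return Image.fromarray(binary)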
Storing OCR Confidence Scores
def store_ocr_with_confidence(pages, pdf_path):
    """
    Index OCR results with confidence metadata.
    Allows filtering low-confidence extractions.
    """
    for page in pages:
        content = page['text']
        confidence = page.get('confidence', 0)

        # embed() and store_in_vector_db() are assumed to be defined elsewhere
        embedding = embed(content)

        metadata = {
            'source': pdf_path,
            'page': page['page_number'],
            'extraction_method': 'ocr',
            'ocr_confidence': confidence,
            'quality': 'high' if confidence > 0.85 else 'medium' if confidence > 0.7 else 'low'
        }

        # Only index if confidence is acceptable
        if confidence > 0.6:  # Threshold
            store_in_vector_db(
                embedding=embedding,
                content=content,
                metadata=metadata
            )
        else:
            print(f"Skipping page {page['page_number']} - low confidence: {confidence:.2%}")
Module 8 complete! Next: Multimodal preprocessing.