Layout-Aware OCR and Error Handling

Implement layout-aware OCR for complex documents and handle OCR errors gracefully in RAG systems.

Layout-Aware OCR

Preserving document layout during OCR keeps paragraphs, columns, and reading order intact, which gives downstream chunking and retrieval better context. This module also covers strategies for handling OCR errors and low-quality scans gracefully.

Layout-Aware OCR with Tesseract

import pytesseract
from pdf2image import convert_from_path

def layout_aware_ocr(pdf_path):
    """
    Perform OCR while preserving layout structure.
    Uses Tesseract's PSM modes for better results.
    """
    images = convert_from_path(pdf_path, dpi=300)
    
    pages = []
    for page_num, image in enumerate(images):
        # Use PSM 1 (Automatic page segmentation with OSD)
        # or PSM 3 (Fully automatic page segmentation, but no OSD)
        text = pytesseract.image_to_string(
            image,
            config='--psm 1'  # Detect orientation and script
        )
        
        # Get detailed layout information
        layout_data = pytesseract.image_to_data(
            image,
            output_type=pytesseract.Output.DICT
        )
        
        # Organize by blocks (paragraphs/sections)
        blocks = organize_by_blocks(layout_data)
        
        pages.append({
            'page_number': page_num + 1,
            'text': text,
            'blocks': blocks,
            'layout_preserved': True
        })
    
    return pages

def organize_by_blocks(layout_data):
    """
    Group OCR results into logical blocks/paragraphs.
    """
    blocks = {}
    
    for i in range(len(layout_data['text'])):
        if layout_data['text'][i].strip():  # Non-empty text
            block_num = layout_data['block_num'][i]
            
            if block_num not in blocks:
                blocks[block_num] = {
                    'text': '',
                    'confidence': [],
                    # Anchor position: bounding box of the block's first word
                    'bbox': {
                        'left': layout_data['left'][i],
                        'top': layout_data['top'][i],
                        'width': layout_data['width'][i],
                        'height': layout_data['height'][i]
                    }
                }
            
            blocks[block_num]['text'] += layout_data['text'][i] + ' '
            # conf may be a string or a number depending on the pytesseract version;
            # -1 means Tesseract reported no confidence for this element
            conf = int(float(layout_data['conf'][i]))
            if conf != -1:
                blocks[block_num]['confidence'].append(conf)
    
    # Calculate average confidence per block (Tesseract's conf values are on a 0-100 scale)
    for block in blocks.values():
        if block['confidence']:
            block['avg_confidence'] = sum(block['confidence']) / len(block['confidence'])
        else:
            block['avg_confidence'] = 0
    
    return list(blocks.values())
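
A quick usage sketch (the file name is illustrative): run the layout-aware pass, then flag blocks whose average Tesseract confidence is low before sending them downstream.

pages = layout_aware_ocr('scanned_report.pdf')  # hypothetical input file
for page in pages:
    for block in page['blocks']:
        if block['avg_confidence'] < 60:  # Tesseract conf is on a 0-100 scale
            print(f"Low-confidence block on page {page['page_number']}: {block['text'][:60]!r}")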

Multi-Column Layout Handling

def ocr_multi_column(image):
    """
    Handle multi-column layouts (newspapers, academic papers).
    """
    # Step 1: Detect columns
    columns = detect_columns(image)
    
    # Step 2: OCR each column separately
    results = []
    for col_num, column_region in enumerate(columns):
        # Crop to column
        col_image = image.crop(column_region)
        
        # OCR the column
        text = pytesseract.image_to_string(col_image)
        
        results.append({
            'column': col_num + 1,
            'text': text,
            'bbox': column_region
        })
    
    # Step 3: Combine columns in reading order (left to right)
    full_text = '\n\n'.join(r['text'] for r in sorted(results, key=lambda x: x['bbox'][0]))
    
    return full_text

def detect_columns(image):
    """
    Detect column boundaries using image processing.
    Simple approach: find vertical gaps.
    """
    import numpy as np
    
    # Convert to grayscale and get vertical projection
    gray = np.array(image.convert('L'))
    vertical_projection = np.sum(gray < 200, axis=0)  # Count dark pixels
    
    # Find gaps (low pixel density)
    threshold = np.mean(vertical_projection) * 0.3
    gaps = vertical_projection < threshold
    
    # Identify column boundaries
    columns = []
    in_gap = False
    start = 0
    
    for x, is_gap in enumerate(gaps):
        if not in_gap and is_gap:
            # Start of gap
            in_gap = True
            if x - start > 100:  # Minimum column width
                columns.append((start, 0, x, image.height))
        elif in_gap and not is_gap:
            # End of gap
            in_gap = False
            start = x
    
    # Add the last column (unless the page ended inside a gap, in which case it was already added)
    if not in_gap and image.width - start > 100:
        columns.append((start, 0, image.width, image.height))
    
    return columns
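
Putting the pieces together, a sketch (the file name is illustrative): render each PDF page with pdf2image and pass the resulting PIL image to the column-aware routine.

images = convert_from_path('two_column_paper.pdf', dpi=300)  # hypothetical input file
for page_num, image in enumerate(images, start=1):
    text = ocr_multi_column(image)
    print(f"--- Page {page_num} ---")
    print(text[:200])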

OCR Error Handling

def robust_ocr(pdf_path, max_retries=3):
    """
    OCR with error handling and fallback strategies.
    Each attempt escalates: plain Tesseract, then preprocessing, then cloud OCR.
    """
    result = None
    for attempt in range(max_retries):
        try:
            if attempt == 0:
                # Primary OCR method
                result = ocr_pdf_with_tesseract(pdf_path)
            elif attempt == 1:
                # Retry with image preprocessing
                result = ocr_with_preprocessing(pdf_path)
            else:
                # Fall back to cloud OCR
                result = ocr_with_textract(pdf_path)
            
            # Validate results before accepting them
            if validate_ocr_quality(result):
                return result
            print(f"Low quality OCR on attempt {attempt + 1}")
        
        except Exception as e:
            print(f"OCR attempt {attempt + 1} failed: {e}")
            if attempt == max_retries - 1:
                # Final attempt: return with warning
                return {
                    'status': 'failed',
                    'error': str(e),
                    'text': '',
                    'confidence': 0
                }
    
    # All attempts produced low-quality output; return the last result anyway
    return result

def validate_ocr_quality(ocr_result):
    """
    Check if OCR results meet quality threshold.
    """
    if not ocr_result or not ocr_result.get('pages'):
        return False
    
    # Check average confidence across all pages
    # (page 'confidence' is expected on a 0-1 scale; divide Tesseract's 0-100 conf by 100)
    confidences = [p.get('confidence', 0) for p in ocr_result['pages']]
    avg_confidence = sum(confidences) / len(confidences) if confidences else 0
    
    # Require >70% average confidence
    if avg_confidence < 0.7:
        return False
    
    # Check that we extracted meaningful text
    total_text = ''.join(p.get('text', '') for p in ocr_result['pages'])
    if len(total_text.strip()) < 100:  # Very little text
        return False
    
    return True
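
validate_ocr_quality (and store_ocr_with_confidence below) expect each page dict to carry a 0-1 'confidence' value, which layout_aware_ocr does not set directly. One way to bridge the gap, as a sketch (this helper is an assumption, not part of the original pipeline):

def add_page_confidence(pages):
    """
    Derive a 0-1 page-level 'confidence' from the block-level averages
    (Tesseract reports 0-100). Assumed helper; skip if your OCR step
    already reports page confidence.
    """
    for page in pages:
        block_scores = [b['avg_confidence'] for b in page.get('blocks', []) if b['confidence']]
        page['confidence'] = (sum(block_scores) / len(block_scores) / 100) if block_scores else 0
    return pages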

Image Preprocessing for Better OCR

from PIL import Image, ImageEnhance, ImageFilter

def preprocess_for_ocr(image):
    """
    Enhance image quality before OCR.
    Improves accuracy for poor-quality scans.
    """
    # Convert to grayscale
    image = image.convert('L')
    
    # Increase contrast
    enhancer = ImageEnhance.Contrast(image)
    image = enhancer.enhance(2.0)
    
    # Sharpen
    image = image.filter(ImageFilter.SHARPEN)
    
    # Denoise (remove speckles)
    image = image.filter(ImageFilter.MedianFilter(size=3))
    
    # Binarization (convert to pure black/white)
    threshold = 128
    image = image.point(lambda p: 255 if p > threshold else 0)
    
    return image

def ocr_with_preprocessing(pdf_path):
    """
    OCR with image preprocessing pipeline.
    """
    images = convert_from_path(pdf_path, dpi=300)
    
    pages = []
    for page_num, image in enumerate(images):
        # Preprocess
        enhanced = preprocess_for_ocr(image)
        
        # OCR
        text = pytesseract.image_to_string(enhanced)
        
        pages.append({
            'page_number': page_num + 1,
            'text': text,
            'preprocessed': True
        })
    
    return {'pages': pages}
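
A quick sanity check for the preprocessing step, as a sketch (the file name is illustrative): OCR the same page with and without enhancement and compare what comes back.

raw_page = convert_from_path('noisy_scan.pdf', dpi=300)[0]  # hypothetical input file
before = pytesseract.image_to_string(raw_page)
after = pytesseract.image_to_string(preprocess_for_ocr(raw_page))
print(f"Characters extracted: {len(before)} raw vs {len(after)} preprocessed")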

Storing OCR Confidence Scores

def store_ocr_with_confidence(pages, pdf_path):
    """
    Index OCR results with confidence metadata.
    Allows filtering low-confidence extractions.
    """
    for page in pages:
        content = page['text']
        confidence = page.get('confidence', 0)
        
        embedding = embed(content)
        
        metadata = {
            'source': pdf_path,
            'page': page['page_number'],
            'extraction_method': 'ocr',
            'ocr_confidence': confidence,
            'quality': 'high' if confidence > 0.85 else 'medium' if confidence > 0.7 else 'low'
        }
        
        # Only index if confidence is acceptable
        if confidence > 0.6:  # Threshold
            store_in_vector_db(
                embedding=embedding,
                content=content,
                metadata=metadata
            )
        else:
            print(f"Skipping page {page['page_number']} - low confidence {confidence:.2%}")

Module 8 complete! Next: Multimodal preprocessing.
