OCR for Scanned PDFs

OCR (Optical Character Recognition) converts images of text into machine-readable text. It is essential for scanned documents, whose pages are stored as images rather than as extractable text.

Detecting When OCR is Needed

import PyPDF2

def needs_ocr(pdf_path, min_chars=100, max_pages=1):
    """
    Check if a PDF requires OCR by examining its extractable text.

    The first ``max_pages`` pages are sampled and the average number of
    non-whitespace-trimmed characters per sampled page is compared with
    ``min_chars``. With the defaults this inspects only the first page.

    Args:
        pdf_path: Path to PDF file
        min_chars: Minimum average characters per sampled page for the
            PDF to count as native (text-based)
        max_pages: Number of leading pages to sample. Raise this to be
            less sensitive to an image-only cover page or a sparse
            title page.

    Returns:
        True if the PDF is likely scanned/image-based, False if it
        contains enough text (or has no pages at all, in which case
        there is nothing to OCR).

    Raises:
        ValueError: If max_pages is smaller than 1.
    """
    if max_pages < 1:
        raise ValueError("max_pages must be at least 1")

    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        pages_to_check = min(len(reader.pages), max_pages)
        if pages_to_check == 0:
            # Empty document: indexing pages[0] would raise IndexError.
            return False

        total_chars = 0
        for index in range(pages_to_check):
            # Fall back to '' so a page that yields no text still counts.
            text = reader.pages[index].extract_text() or ''
            total_chars += len(text.strip())

        # If very little text was extracted, the pages are likely images.
        return total_chars / pages_to_check < min_chars

# Example usage
# NOTE: perform_ocr and extract_native_text are not defined in this
# snippet; they stand for whichever OCR / native extraction routine you use.
if not needs_ocr('document.pdf'):
    print("This is a native PDF")
    text = extract_native_text('document.pdf')
else:
    print("This PDF needs OCR")
    text = perform_ocr('document.pdf')

OCR with Tesseract

from pdf2image import convert_from_path
import pytesseract
from PIL import Image

def _mean_word_confidence(raw_confidences):
    """Average Tesseract word confidences (0-100), skipping -1 placeholders."""
    scores = []
    for raw in raw_confidences:
        # Depending on the pytesseract version the values arrive as ints,
        # floats or strings, so normalise through float().
        try:
            score = float(raw)
        except (TypeError, ValueError):
            continue
        # Tesseract reports -1 for rows that are not recognised words.
        if score >= 0:
            scores.append(score)
    return sum(scores) / len(scores) if scores else 0


def ocr_pdf_with_tesseract(pdf_path, language='eng', dpi=300):
    """
    Perform OCR on scanned PDF using Tesseract.

    Args:
        pdf_path: Path to PDF file
        language: OCR language (eng, spa, fra, etc.)
        dpi: Resolution used when rendering PDF pages to images

    Returns:
        List of dicts, one per page, with keys 'page_number' (1-based),
        'text', 'confidence' (mean word confidence normalized to 0-1,
        0 when no words were recognised) and 'method'.
    """
    # Convert PDF pages to images; a high DPI improves recognition quality.
    images = convert_from_path(pdf_path, dpi=dpi)

    # Automatic page segmentation with orientation/script detection (OSD).
    ocr_config = '--psm 1'

    pages = []
    for page_number, image in enumerate(images, start=1):
        # Perform OCR on each page
        text = pytesseract.image_to_string(
            image,
            lang=language,
            config=ocr_config,
        )

        # Score the page with the SAME language and config as the text
        # above; otherwise the confidence describes a different OCR run.
        data = pytesseract.image_to_data(
            image,
            lang=language,
            config=ocr_config,
            output_type=pytesseract.Output.DICT,
        )
        avg_confidence = _mean_word_confidence(data['conf'])

        pages.append({
            'page_number': page_number,
            'text': text,
            'confidence': avg_confidence / 100,  # Normalize to 0-1
            'method': 'tesseract_ocr',
        })

    return pages

# Usage: print a confidence summary and a short preview for every page.
pages = ocr_pdf_with_tesseract('scanned_report.pdf')
for page in pages:
    print(
        f"Page {page['page_number']}: {page['confidence']:.2%} confidence",
        page['text'][:200],  # First 200 chars
        sep='\n',
    )

OCR with Cloud Services (AWS Textract)

import boto3

def ocr_with_textract(pdf_path, poll_interval=5, timeout=None):
    """
    Use AWS Textract (asynchronous text detection) to OCR a PDF.

    The file is uploaded to S3, a Textract job is started, the job is
    polled until it finishes, and every page of the paginated result is
    collected.

    Args:
        pdf_path: Path to PDF file
        poll_interval: Seconds to wait between job status checks
        timeout: Maximum seconds to wait for the job, or None to wait
            indefinitely

    Returns:
        List of dicts sorted by page, with keys 'page_number', 'text'
        (detected lines joined by newlines), 'confidence' (mean LINE
        confidence reported by Textract, normalized to 0-1) and 'method'.

    Raises:
        RuntimeError: If Textract reports the job as FAILED.
        TimeoutError: If the job does not finish within ``timeout``.
    """
    import time
    from collections import defaultdict

    textract = boto3.client('textract')

    # Upload PDF to S3 first
    s3_bucket = 'my-documents'
    # NOTE(review): the key embeds pdf_path verbatim, including any
    # directory components - confirm this is the intended key layout.
    s3_key = f'to-ocr/{pdf_path}'
    upload_to_s3(pdf_path, s3_bucket, s3_key)

    # Start Textract job
    response = textract.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3_bucket,
                'Name': s3_key
            }
        }
    )

    job_id = response['JobId']

    # Wait for completion
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        result = textract.get_document_text_detection(JobId=job_id)
        status = result['JobStatus']

        # PARTIAL_SUCCESS is also terminal; without this the loop would
        # poll forever. Whatever blocks were produced are returned.
        if status in ('SUCCEEDED', 'PARTIAL_SUCCESS'):
            break
        if status == 'FAILED':
            detail = result.get('StatusMessage')
            message = "Textract job failed"
            if detail:
                message = f"{message}: {detail}"
            raise RuntimeError(message)
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Textract job {job_id} did not finish within {timeout} seconds"
            )

        time.sleep(poll_interval)

    # Extract text from results. Results are paginated: keep following
    # NextToken, otherwise long documents are silently truncated.
    lines_by_page = defaultdict(list)
    confidences_by_page = defaultdict(list)
    while True:
        for block in result.get('Blocks', []):
            if block['BlockType'] != 'LINE':
                continue
            page_num = block.get('Page', 1)
            lines_by_page[page_num].append(block['Text'])
            if 'Confidence' in block:
                confidences_by_page[page_num].append(block['Confidence'])

        next_token = result.get('NextToken')
        if not next_token:
            break
        result = textract.get_document_text_detection(
            JobId=job_id,
            NextToken=next_token,
        )

    # Combine into page text
    extracted_pages = []
    for page_num in sorted(lines_by_page):
        scores = confidences_by_page[page_num]
        # Textract reports confidence on a 0-100 scale.
        confidence = sum(scores) / len(scores) / 100 if scores else 0.0
        extracted_pages.append({
            'page_number': page_num,
            'text': '\n'.join(lines_by_page[page_num]),
            'confidence': confidence,
            'method': 'aws_textract',
        })

    return extracted_pages

Post-OCR Text Cleaning

def clean_ocr_text(text):
    """
    Clean common OCR errors and artifacts.

    Steps, in order (the line-based steps must run before whitespace is
    collapsed, because collapsing removes the line breaks they rely on):

    1. Expand typographic ligature characters (e.g. U+FB01) to plain letters.
    2. Drop lines that contain only a number (stand-alone page numbers).
    3. Re-join words hyphenated across a line break ("docu-" / "ment").
       A genuinely hyphenated compound split at the hyphen loses its
       hyphen too; that ambiguity cannot be resolved without a dictionary.
    4. Collapse every run of whitespace to a single space.

    Context-dependent confusions such as "rn" vs "m", "|" vs "I" or
    "0" vs "O" are deliberately NOT rewritten: a blind replacement would
    corrupt correct text.

    Args:
        text: Raw OCR output (None or empty yields an empty string)

    Returns:
        The cleaned text with leading/trailing whitespace removed.
    """
    import re

    if not text:
        return ''

    # Ligatures are unambiguous, so they are safe to expand everywhere.
    ligatures = {
        '\ufb00': 'ff',
        '\ufb01': 'fi',
        '\ufb02': 'fl',
        '\ufb03': 'ffi',
        '\ufb04': 'ffl',
    }
    text = text.translate(str.maketrans(ligatures))

    # Remove page numbers that sit alone on a line. [^\S\n] is "any
    # whitespace except newline", so the match never spans lines.
    text = re.sub(r'^[^\S\n]*\d+[^\S\n]*$', '', text, flags=re.MULTILINE)

    # Fix hyphenation at line breaks only, not every "word- word" pair.
    text = re.sub(r'(\w)-[^\S\n]*\n\s*(\w)', r'\1\2', text)

    # Remove excessive whitespace (including the remaining line breaks).
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

Hybrid Approach: Try Native First, OCR if Needed

def extract_text_auto(pdf_path):
    """
    Pick the extraction route for a PDF: native text when available,
    otherwise Tesseract OCR followed by text cleanup.

    Args:
        pdf_path: Path to PDF file

    Returns:
        Whatever extract_native_pdf returns for native PDFs; for PDFs
        that need OCR, the page dicts from ocr_pdf_with_tesseract with
        each 'text' value cleaned in place.
    """
    # Guard clause: native PDFs skip the OCR pipeline entirely.
    if not needs_ocr(pdf_path):
        print(f"Extracting native text from {pdf_path}...")
        return extract_native_pdf(pdf_path)

    print(f"Performing OCR on {pdf_path}...")
    ocr_pages = ocr_pdf_with_tesseract(pdf_path)

    # Clean OCR text
    for entry in ocr_pages:
        entry['text'] = clean_ocr_text(entry['text'])

    return ocr_pages

Next: Layout-aware OCR for better quality.

OCR for Scanned PDFs - When and How

OCR for Scanned PDFs

Detecting When OCR is Needed

OCR with Tesseract

OCR with Cloud Services (AWS Textract)

Post-OCR Text Cleaning

Hybrid Approach: Try Native First, OCR if Needed

Subscribe to our newsletter