OCR for Scanned PDFs - When and How

OCR for Scanned PDFs - When and How

Identify when OCR is needed and implement effective OCR strategies for scanned documents in RAG systems.

OCR for Scanned PDFs

OCR (Optical Character Recognition) converts images of text into machine-readable text. It is essential for scanned documents, whose pages are stored as images and therefore have no text layer to extract.

Detecting When OCR is Needed

import PyPDF2

def needs_ocr(pdf_path, min_chars=100, sample_pages=3):
    """
    Check if PDF requires OCR by examining text content.

    The text layer of the first ``sample_pages`` pages is extracted and the
    average number of characters per sampled page is compared against
    ``min_chars``. Sampling more than one page avoids misclassifying a
    document because of a single atypical page (for example an image-only
    cover in front of a native PDF).

    Args:
        pdf_path: Path to PDF file
        min_chars: Minimum average characters per sampled page for the PDF
            to count as having a usable text layer
        sample_pages: Maximum number of leading pages to inspect (at least
            one page is always inspected when the PDF has pages)

    Returns:
        True if the PDF is likely scanned/image-based, False if it has a
        native text layer or contains no pages at all
    """
    with open(pdf_path, 'rb') as file:
        reader = PyPDF2.PdfReader(file)

        page_count = len(reader.pages)
        if page_count == 0:
            # Nothing to recognise; also avoids an IndexError on pages[0]
            return False

        pages_to_check = min(page_count, max(1, sample_pages))

        total_chars = 0
        for index in range(pages_to_check):
            # Treat a missing/empty extraction result as "no text" rather
            # than failing on .strip()
            text = reader.pages[index].extract_text() or ''
            total_chars += len(text.strip())

        # If very little text extracted, likely scanned
        return total_chars / pages_to_check < min_chars

# Example usage: pick the extraction route based on the OCR check
if not needs_ocr('document.pdf'):
    # A text layer is present, so read it directly
    print("This is a native PDF")
    text = extract_native_text('document.pdf')
else:
    # No usable text layer, so fall back to OCR
    print("This PDF needs OCR")
    text = perform_ocr('document.pdf')

OCR with Tesseract

from pdf2image import convert_from_path
import pytesseract
from PIL import Image

def ocr_pdf_with_tesseract(pdf_path, language='eng', dpi=300):
    """
    Perform OCR on scanned PDF using Tesseract.

    Args:
        pdf_path: Path to PDF file
        language: OCR language (eng, spa, fra, etc.)
        dpi: Resolution used when rendering PDF pages to images

    Returns:
        List of dicts with page number and extracted text. Each dict has
        the keys 'page_number' (1-based), 'text', 'confidence' (mean
        confidence normalized to 0-1, or 0 when nothing was
        recognised) and 'method'.
    """
    # Convert PDF pages to images
    images = convert_from_path(pdf_path, dpi=dpi)  # High DPI for quality

    # Same settings for the text pass and the confidence pass, so the
    # reported confidence describes the text that is actually returned
    config = '--psm 1'  # Automatic page segmentation

    pages = []
    for page_num, image in enumerate(images, start=1):
        # Perform OCR on each page
        text = pytesseract.image_to_string(
            image,
            lang=language,
            config=config
        )

        # Get confidence score
        data = pytesseract.image_to_data(
            image,
            lang=language,
            config=config,
            output_type=pytesseract.Output.DICT
        )

        # Confidence values may arrive as ints, floats or numeric strings
        # depending on the pytesseract/Tesseract version; negative values
        # mark entries without a confidence and are excluded from the mean
        confidences = []
        for raw in data['conf']:
            value = float(raw)
            if value >= 0:
                confidences.append(value)
        avg_confidence = sum(confidences) / len(confidences) if confidences else 0

        pages.append({
            'page_number': page_num,
            'text': text,
            'confidence': avg_confidence / 100,  # Normalize to 0-1
            'method': 'tesseract_ocr'
        })

    return pages

# Usage: OCR a scanned report, then show each page's confidence and a preview
pages = ocr_pdf_with_tesseract('scanned_report.pdf')
for page in pages:
    print(
        f"Page {page['page_number']}: {page['confidence']:.2%} confidence",
        page['text'][:200],  # First 200 chars
        sep='\n',
    )

OCR with Cloud Services (AWS Textract)

import boto3

def ocr_with_textract(pdf_path, s3_bucket='my-documents', poll_interval=5,
                      timeout=None):
    """
    Use AWS Textract for high-quality OCR.
    Better accuracy than Tesseract, especially for complex layouts.

    The PDF is uploaded to S3, an asynchronous text-detection job is
    started, and the job is polled until it reaches a terminal status.
    All result pages are then fetched (following ``NextToken``) and the
    detected LINE blocks are grouped by document page.

    Args:
        pdf_path: Path to the local PDF file
        s3_bucket: Bucket the PDF is uploaded to before processing
        poll_interval: Seconds to wait between job status checks
        timeout: Maximum seconds to wait for the job, or None to wait
            indefinitely

    Returns:
        List of dicts, ordered by page number, with the keys
        'page_number', 'text', 'confidence' (mean LINE confidence
        normalized to 0-1, or 0.0 if none was reported) and 'method'.

    Raises:
        RuntimeError: If Textract reports the job as FAILED
        TimeoutError: If ``timeout`` elapses before the job finishes
    """
    import os
    import time

    textract = boto3.client('textract')

    # Upload PDF to S3 first. Only the file name goes into the key so that
    # local directory structure (absolute paths, backslashes) does not
    # leak into the object name.
    s3_key = f'to-ocr/{os.path.basename(pdf_path)}'
    upload_to_s3(pdf_path, s3_bucket, s3_key)

    # Start Textract job
    response = textract.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3_bucket,
                'Name': s3_key
            }
        }
    )

    job_id = response['JobId']

    # Wait for completion
    deadline = None if timeout is None else time.monotonic() + timeout
    while True:
        result = textract.get_document_text_detection(JobId=job_id)
        status = result['JobStatus']

        # PARTIAL_SUCCESS is also terminal: whatever was recognised is
        # returned instead of polling forever
        if status in ('SUCCEEDED', 'PARTIAL_SUCCESS'):
            break
        if status == 'FAILED':
            detail = result.get('StatusMessage', 'no status message')
            raise RuntimeError(f"Textract job failed: {detail}")
        if deadline is not None and time.monotonic() >= deadline:
            raise TimeoutError(
                f"Textract job {job_id} did not finish within {timeout} seconds"
            )

        time.sleep(poll_interval)

    # Extract text from results. A single response holds only part of the
    # blocks for larger documents, so keep fetching while a NextToken is
    # returned.
    pages = {}
    while True:
        for block in result.get('Blocks', []):
            if block['BlockType'] != 'LINE':
                continue
            entry = pages.setdefault(
                block.get('Page', 1), {'lines': [], 'confidences': []}
            )
            entry['lines'].append(block['Text'])
            if 'Confidence' in block:
                entry['confidences'].append(block['Confidence'])

        next_token = result.get('NextToken')
        if not next_token:
            break
        result = textract.get_document_text_detection(
            JobId=job_id, NextToken=next_token
        )

    # Combine into page text
    extracted_pages = []
    for page_num in sorted(pages):
        entry = pages[page_num]
        scores = entry['confidences']
        # Textract reports confidence on a 0-100 scale; normalize to 0-1
        confidence = sum(scores) / len(scores) / 100 if scores else 0.0
        extracted_pages.append({
            'page_number': page_num,
            'text': '\n'.join(entry['lines']),
            'confidence': confidence,
            'method': 'aws_textract'
        })

    return extracted_pages

Post-OCR Text Cleaning

def clean_ocr_text(text):
    """
    Clean common OCR errors and artifacts.

    Steps, in order:
      1. Expand typographic ligature characters into plain letters.
      2. Drop lines that contain only a number (stand-alone page numbers).
      3. Re-join words that were hyphenated across a line break.
      4. Collapse all remaining whitespace runs into single spaces.

    The line-based steps run before whitespace is collapsed, because they
    depend on the line breaks still being present.

    Args:
        text: Raw OCR output

    Returns:
        The cleaned text as a single line with no leading or trailing
        whitespace
    """
    import re

    # Ligature issues: these expansions are always safe. Context-dependent
    # confusions ("rn" vs "m", "|" vs "I", "0" vs "O") are deliberately not
    # rewritten here, since a blind replacement would corrupt correct text.
    ligatures = {
        '\ufb00': 'ff',
        '\ufb01': 'fi',
        '\ufb02': 'fl',
        '\ufb03': 'ffi',
        '\ufb04': 'ffl',
    }
    text = text.translate(str.maketrans(ligatures))

    # Remove page numbers if they appear separately (a line of digits only)
    text = re.sub(r'^[ \t]*\d+[ \t]*$', '', text, flags=re.MULTILINE)

    # Fix hyphenation at line breaks; a hyphen followed by a plain space
    # on the same line (e.g. "pre- and post-") is left untouched
    text = re.sub(r'(\w)-[ \t]*\n\s*(\w)', r'\1\2', text)

    # Remove excessive whitespace
    text = re.sub(r'\s+', ' ', text)

    return text.strip()

Hybrid Approach: Try Native First, OCR if Needed

def extract_text_auto(pdf_path):
    """
    Pick the extraction route for a PDF and return its pages.

    PDFs that do not need OCR are handed to the native extractor and its
    result is returned unchanged. All other PDFs go through Tesseract OCR,
    and each page's text is cleaned before the pages are returned.
    """
    # Native text layer available: no OCR or cleanup required
    if not needs_ocr(pdf_path):
        print(f"Extracting native text from {pdf_path}...")
        return extract_native_pdf(pdf_path)

    print(f"Performing OCR on {pdf_path}...")
    ocr_pages = ocr_pdf_with_tesseract(pdf_path)

    # Clean OCR text in place, page by page
    for entry in ocr_pages:
        entry['text'] = clean_ocr_text(entry['text'])

    return ocr_pages

Next: Layout-aware OCR for better quality.

Subscribe to our newsletter

Get the latest posts delivered right to your inbox.

Subscribe on LinkedIn