Spreadsheets and CSVs

Tabular data requires specialized processing. Learn to handle spreadsheets effectively.

Processing CSV Files

import pandas as pd

def process_csv(file_path):
    df = pd.read_csv(file_path)
    
    # Convert to searchable format
    rows = []
    for idx, row in df.iterrows():
        row_text = format_row_as_text(row, df.columns)
        rows.append({
            'row_number': idx + 1,
            'content': row_text,
            'data': row.to_dict()
        })
    
    return rows

def format_row_as_text(row, columns):
    """Convert row to natural language"""
    parts = []
    for col in columns:
        parts.append(f"{col}: {row[col]}")
    return ", ".join(parts)

Processing Excel Files

def process_excel(file_path):
    # Read all sheets
    excel_file = pd.ExcelFile(file_path)
    
    all_data = []
    for sheet_name in excel_file.sheet_names:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        
        all_data.append({
            'sheet': sheet_name,
            'data': process_dataframe(df),
            'summary': generate_sheet_summary(df)
        })
    
    return all_data

Table Summarization

def generate_sheet_summary(df):
    """Create searchable summary of table"""
    summary = f"""
Table with {len(df)} rows and {len(df.columns)} columns.
Columns: {', '.join(df.columns)}

Sample data:
{df.head(3).to_string()}

Statistics:
{df.describe().to_string()}
    """
    return summary

Indexing Strategy

Option 1: Row-by-Row

# Index each row separately
for row in df.iterrows():
    row_text = format_row_as_text(row)
    embedding = embed(row_text)
    store(embedding, row_text, metadata={'row': idx})

Option 2: Summary-Based

# Index table summary for discovery
summary = generate_sheet_summary(df)
embedding = embed(summary)
store(embedding, summary, metadata={'file': 'data.xlsx', 'sheet': 'Sales'})

# Then query the actual data when needed

Option 3: Hybrid

# Summary for discovery + rows for detail
summary_emb = embed(generate_sheet_summary(df))
store(summary_emb, summary, metadata={'type': 'table_summary'})

for idx, row in df.iterrows():
    row_emb = embed(format_row_as_text(row))
    store(row_emb, row_text, metadata={'type': 'table_row', 'row': idx})

Querying Tables

def query_table_data(query):
    # First find relevant table
    table_results = search(query, filter={'type': 'table_summary'})
    
    if table_results:
        # Then search within that table
        row_results = search(
            query,
            filter={
                'type': 'table_row',
                'file': table_results[0]['file']
            }
        )
        return row_results

LLM Integration

def answer_table_question(question, df):
    # Convert table to text representation
    table_text = df.to_markdown()
    
    prompt = f"""
Table:
{table_text}

Question: {question}

Answer based on the table data.
    """
    
    response = claude.generate(prompt)
    return response

Next: Structured data (JSON, databases).

Spreadsheets and CSVs

Processing CSV Files

Processing Excel Files

Table Summarization

Indexing Strategy

Option 1: Row-by-Row

Option 2: Summary-Based

Option 3: Hybrid

Querying Tables

LLM Integration

Subscribe to our newsletter