Spreadsheets and CSVs

Spreadsheets and CSVs

Process tabular data from spreadsheets and CSV files for structured RAG queries.

Spreadsheets and CSVs

Tabular data requires specialized processing. Learn to handle spreadsheets effectively.

Processing CSV Files

import pandas as pd

def process_csv(file_path):
    df = pd.read_csv(file_path)
    
    # Convert to searchable format
    rows = []
    for idx, row in df.iterrows():
        row_text = format_row_as_text(row, df.columns)
        rows.append({
            'row_number': idx + 1,
            'content': row_text,
            'data': row.to_dict()
        })
    
    return rows

def format_row_as_text(row, columns):
    """Convert row to natural language"""
    parts = []
    for col in columns:
        parts.append(f"{col}: {row[col]}")
    return ", ".join(parts)

Processing Excel Files

def process_excel(file_path):
    # Read all sheets
    excel_file = pd.ExcelFile(file_path)
    
    all_data = []
    for sheet_name in excel_file.sheet_names:
        df = pd.read_excel(file_path, sheet_name=sheet_name)
        
        all_data.append({
            'sheet': sheet_name,
            'data': process_dataframe(df),
            'summary': generate_sheet_summary(df)
        })
    
    return all_data

Table Summarization

def generate_sheet_summary(df):
    """Create searchable summary of table"""
    summary = f"""
Table with {len(df)} rows and {len(df.columns)} columns.
Columns: {', '.join(df.columns)}

Sample data:
{df.head(3).to_string()}

Statistics:
{df.describe().to_string()}
    """
    return summary

Indexing Strategy

Option 1: Row-by-Row

# Index each row separately
for row in df.iterrows():
    row_text = format_row_as_text(row)
    embedding = embed(row_text)
    store(embedding, row_text, metadata={'row': idx})

Option 2: Summary-Based

# Index table summary for discovery
summary = generate_sheet_summary(df)
embedding = embed(summary)
store(embedding, summary, metadata={'file': 'data.xlsx', 'sheet': 'Sales'})

# Then query the actual data when needed

Option 3: Hybrid

# Summary for discovery + rows for detail
summary_emb = embed(generate_sheet_summary(df))
store(summary_emb, summary, metadata={'type': 'table_summary'})

for idx, row in df.iterrows():
    row_emb = embed(format_row_as_text(row))
    store(row_emb, row_text, metadata={'type': 'table_row', 'row': idx})

Querying Tables

def query_table_data(query):
    # First find relevant table
    table_results = search(query, filter={'type': 'table_summary'})
    
    if table_results:
        # Then search within that table
        row_results = search(
            query,
            filter={
                'type': 'table_row',
                'file': table_results[0]['file']
            }
        )
        return row_results

LLM Integration

def answer_table_question(question, df):
    # Convert table to text representation
    table_text = df.to_markdown()
    
    prompt = f"""
Table:
{table_text}

Question: {question}

Answer based on the table data.
    """
    
    response = claude.generate(prompt)
    return response

Next: Structured data (JSON, databases).

Subscribe to our newsletter

Get the latest posts delivered right to your inbox.

Subscribe on LinkedIn