
Spreadsheets and CSVs
Process tabular data from spreadsheets and CSV files for structured RAG queries.
Spreadsheets and CSVs
Tabular data requires specialized processing. Learn to handle spreadsheets effectively.
Processing CSV Files
import pandas as pd
def process_csv(file_path):
df = pd.read_csv(file_path)
# Convert to searchable format
rows = []
for idx, row in df.iterrows():
row_text = format_row_as_text(row, df.columns)
rows.append({
'row_number': idx + 1,
'content': row_text,
'data': row.to_dict()
})
return rows
def format_row_as_text(row, columns):
"""Convert row to natural language"""
parts = []
for col in columns:
parts.append(f"{col}: {row[col]}")
return ", ".join(parts)
Processing Excel Files
def process_excel(file_path):
# Read all sheets
excel_file = pd.ExcelFile(file_path)
all_data = []
for sheet_name in excel_file.sheet_names:
df = pd.read_excel(file_path, sheet_name=sheet_name)
all_data.append({
'sheet': sheet_name,
'data': process_dataframe(df),
'summary': generate_sheet_summary(df)
})
return all_data
Table Summarization
def generate_sheet_summary(df):
"""Create searchable summary of table"""
summary = f"""
Table with {len(df)} rows and {len(df.columns)} columns.
Columns: {', '.join(df.columns)}
Sample data:
{df.head(3).to_string()}
Statistics:
{df.describe().to_string()}
"""
return summary
Indexing Strategy
Option 1: Row-by-Row
# Index each row separately
for row in df.iterrows():
row_text = format_row_as_text(row)
embedding = embed(row_text)
store(embedding, row_text, metadata={'row': idx})
Option 2: Summary-Based
# Index table summary for discovery
summary = generate_sheet_summary(df)
embedding = embed(summary)
store(embedding, summary, metadata={'file': 'data.xlsx', 'sheet': 'Sales'})
# Then query the actual data when needed
Option 3: Hybrid
# Summary for discovery + rows for detail
summary_emb = embed(generate_sheet_summary(df))
store(summary_emb, summary, metadata={'type': 'table_summary'})
for idx, row in df.iterrows():
row_emb = embed(format_row_as_text(row))
store(row_emb, row_text, metadata={'type': 'table_row', 'row': idx})
Querying Tables
def query_table_data(query):
# First find relevant table
table_results = search(query, filter={'type': 'table_summary'})
if table_results:
# Then search within that table
row_results = search(
query,
filter={
'type': 'table_row',
'file': table_results[0]['file']
}
)
return row_results
LLM Integration
def answer_table_question(question, df):
# Convert table to text representation
table_text = df.to_markdown()
prompt = f"""
Table:
{table_text}
Question: {question}
Answer based on the table data.
"""
response = claude.generate(prompt)
return response
Next: Structured data (JSON, databases).