
# Video (Lectures, Demos)

Extract and index both visual and audio content from video files for comprehensive RAG.

Videos combine audio and visual information: the transcript captures what is said, while frames capture what is shown. Process both for complete coverage.
## Video Processing Strategy

```mermaid
graph TD
    A[Video File] --> B[Extract Audio]
    A --> C[Extract Frames]
    B --> D[Transcription]
    C --> E[Frame Analysis]
    D & E --> F[Align Timeline]
    F --> G[Multimodal Index]
```
## Extract Audio Track

```python
from moviepy.editor import VideoFileClip  # moviepy 1.x; in 2.x use `from moviepy import VideoFileClip`

def extract_audio(video_path):
    """Write the video's audio track out as an MP3 and return its path."""
    video = VideoFileClip(video_path)
    audio_path = video_path.rsplit('.', 1)[0] + '.mp3'  # works for any container, not just .mp4
    video.audio.write_audiofile(audio_path)
    video.close()  # release the underlying file handles
    return audio_path
```
## Extract Key Frames

```python
import cv2

def extract_frames(video_path, interval_seconds=5):
    """Extract one frame every N seconds, tagged with its timestamp."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = max(1, int(fps * interval_seconds))  # guard against fps == 0

    frames = []
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_interval == 0:
            timestamp = frame_count / fps
            frames.append({
                'timestamp': timestamp,
                'frame': frame,
                'frame_number': frame_count
            })
        frame_count += 1

    cap.release()
    return frames
```
## Analyze Frames with Vision Model

```python
import base64

import cv2

def analyze_video_frames(frames):
    """Describe each extracted frame with a vision-capable model."""
    analyzed = []
    for frame_data in frames:
        # Encode the frame as a base64 JPEG for the vision model
        _, buffer = cv2.imencode('.jpg', frame_data['frame'])
        frame_b64 = base64.b64encode(buffer).decode()

        # describe_frame() stands in for your vision-model call
        # (one concrete sketch follows below)
        description = describe_frame(frame_b64)

        analyzed.append({
            'timestamp': frame_data['timestamp'],
            'description': description
        })
    return analyzed
```
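As one concrete option, here is a minimal `describe_frame` sketch using the Anthropic Python SDK's Messages API with a base64 image block. The model name and prompt are assumptions; substitute any vision-capable model you use.

```python
import anthropic

client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment

def describe_frame(frame_b64):
    """Ask a vision-capable Claude model to describe a single video frame."""
    response = client.messages.create(
        model="claude-3-5-sonnet-latest",  # assumption: any vision-capable model works
        max_tokens=300,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image",
                 "source": {"type": "base64",
                            "media_type": "image/jpeg",
                            "data": frame_b64}},
                {"type": "text",
                 "text": "Describe what this video frame shows in one or two sentences."},
            ],
        }],
    )
    return response.content[0].text
```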
## Combine Video + Audio

```python
def process_video(video_path):
    # Extract and transcribe the audio track; transcribe_audio() must
    # return segments with 'start' and 'text' fields (see sketch below)
    audio_path = extract_audio(video_path)
    transcript = transcribe_audio(audio_path)

    # Extract and describe key frames
    frames = extract_frames(video_path, interval_seconds=10)
    frame_descriptions = analyze_video_frames(frames)

    # Align the two streams on the shared timeline
    return align_video_content(transcript, frame_descriptions)

def align_video_content(transcript, frames):
    """Align transcript segments with frame descriptions by timestamp."""
    aligned = []
    for frame in frames:
        # Collect transcript segments spoken near this frame's timestamp
        nearby_text = []
        for segment in transcript['segments']:
            if abs(segment['start'] - frame['timestamp']) < 10:  # within 10s
                nearby_text.append(segment['text'])
        aligned.append({
            'timestamp': frame['timestamp'],
            'visual': frame['description'],
            'audio': ' '.join(nearby_text)
        })
    return aligned
```
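If you don't already have a transcriber from the audio pipeline, a minimal `transcribe_audio` sketch with the open-source `whisper` package looks like this; the model size is an assumption, and Whisper's segments already carry the `start` and `text` fields the alignment step expects.

```python
import whisper

def transcribe_audio(audio_path):
    """Transcribe audio with Whisper; segments include 'start'/'end'/'text'."""
    model = whisper.load_model("base")  # assumption: pick a size to fit your hardware
    return model.transcribe(audio_path)
    # -> {'text': ..., 'segments': [{'start': ..., 'end': ..., 'text': ...}, ...]}
```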
## Indexing Video Content

```python
def index_video(video_path):
    content = process_video(video_path)
    for item in content:
        # Combine visual and audio descriptions into one embeddable chunk
        combined_text = (
            f"Visual: {item['visual']}\n"
            f"Audio: {item['audio']}"
        )
        # embed() and store() are your embedding model and vector store
        # (one concrete sketch follows below)
        embedding = embed(combined_text)
        store(
            embedding=embedding,
            content=combined_text,
            metadata={
                'source': video_path,
                'timestamp': item['timestamp'],
                'type': 'video_segment'
            }
        )
```
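For concreteness, here is one way to back `embed()` and `store()` with sentence-transformers and ChromaDB; both library choices, the embedding model, and the collection name are assumptions.

```python
import chromadb
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumption: any text embedder works
collection = chromadb.Client().create_collection("video_segments")

def embed(text):
    return model.encode(text).tolist()

def store(embedding, content, metadata):
    # ChromaDB requires a unique id per record; derive one from the metadata
    doc_id = f"{metadata['source']}@{metadata['timestamp']}"
    collection.add(
        ids=[doc_id],
        embeddings=[embedding],
        documents=[content],
        metadatas=[metadata],
    )
```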
## Query Response with Video

```json
{
  "answer": "The instructor demonstrates connecting the red wire to terminal A.",
  "source": {
    "file": "installation_tutorial.mp4",
    "timestamp": "3:45",
    "visual_description": "Close-up of circuit board with red wire being connected",
    "transcript": "Now take the red wire and carefully connect it to terminal A as shown"
  }
}
```
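A minimal retrieval sketch against the ChromaDB collection and `embed()` helper sketched above; formatting the stored timestamp as mm:ss reproduces the shape of the response shown.

```python
def query_video_index(question, top_k=1):
    """Retrieve the best-matching video segment and return a citable source."""
    results = collection.query(query_embeddings=[embed(question)], n_results=top_k)
    meta = results['metadatas'][0][0]
    minutes, seconds = divmod(int(meta['timestamp']), 60)
    return {
        'content': results['documents'][0][0],
        'source': {
            'file': meta['source'],
            'timestamp': f"{minutes}:{seconds:02d}",
        },
    }
```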
## Optimization Tips

- **Frame interval**: 5-10 seconds works for most lecture-style content
- **Scene detection**: extract frames on scene changes instead of at a fixed interval (see the sketch below)
- **Selective analysis**: only send visually informative frames to the vision model
- **Caching**: store frame descriptions so repeated runs skip re-analysis
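A simple scene-change detector can replace the fixed interval: compare grayscale histograms of consecutive frames and keep a frame whenever it differs sharply from the last one kept. The threshold here is an assumption to tune per video.

```python
import cv2

def extract_frames_on_scene_change(video_path, threshold=0.4):
    """Keep a frame whenever its histogram diverges from the last kept frame."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frames, prev_hist, frame_count = [], None, 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        hist = cv2.calcHist([gray], [0], None, [64], [0, 256])
        hist = cv2.normalize(hist, hist).flatten()
        # Correlation near 1.0 means "same scene"; a drop signals a cut
        if prev_hist is None or cv2.compareHist(prev_hist, hist, cv2.HISTCMP_CORREL) < 1 - threshold:
            frames.append({'timestamp': frame_count / fps, 'frame': frame,
                           'frame_number': frame_count})
            prev_hist = hist
        frame_count += 1
    cap.release()
    return frames
```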
Next: Spreadsheets and CSVs.