
# Video (Lectures, Demos)

Extract and index both visual and audio content from video files for comprehensive RAG.

Videos combine audio and visual information: the transcript captures what is said, while frames capture what is shown. Process both for complete coverage.
## Video Processing Strategy

```mermaid
graph TD
    A[Video File] --> B[Extract Audio]
    A --> C[Extract Frames]
    B --> D[Transcription]
    C --> E[Frame Analysis]
    D & E --> F[Align Timeline]
    F --> G[Multimodal Index]
```
## Extract Audio Track

```python
from moviepy.editor import VideoFileClip  # moviepy 1.x; in 2.x use `from moviepy import VideoFileClip`

def extract_audio(video_path):
    """Write the video's audio track out as an MP3 and return its path."""
    video = VideoFileClip(video_path)
    audio_path = video_path.rsplit('.', 1)[0] + '.mp3'  # works for any container, not just .mp4
    video.audio.write_audiofile(audio_path)
    video.close()  # release the underlying file handles
    return audio_path
```
## Extract Key Frames

```python
import cv2

def extract_frames(video_path, interval_seconds=5):
    """Extract one frame every N seconds, tagged with its timestamp."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frame_interval = max(1, int(fps * interval_seconds))  # guard against fps == 0

    frames = []
    frame_count = 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        if frame_count % frame_interval == 0:
            timestamp = frame_count / fps
            frames.append({
                'timestamp': timestamp,
                'frame': frame,
                'frame_number': frame_count
            })
        frame_count += 1

    cap.release()
    return frames
```
## Analyze Frames with Vision Model

```python
import base64

import cv2

def analyze_video_frames(frames):
    """Describe each extracted frame with a vision-capable model."""
    analyzed = []
    for frame_data in frames:
        # Encode the frame as a base64 JPEG for the vision model
        _, buffer = cv2.imencode('.jpg', frame_data['frame'])
        frame_b64 = base64.b64encode(buffer).decode()

        # describe_frame() stands in for your vision-model call
        # (one concrete sketch follows below)
        description = describe_frame(frame_b64)

        analyzed.append({
            'timestamp': frame_data['timestamp'],
            'description': description
        })
    return analyzed
```
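As one concrete option, here is a minimal `describe_frame` sketch using the Anthropic Python SDK's Messages API with a base64 image block. The model name and prompt are assumptions; substitute any vision-capable model you use.

```python
import anthropic

client = anthropic.Anthropic()  # reads ANTHROPIC_API_KEY from the environment

def describe_frame(frame_b64):
    """Ask a vision-capable Claude model to describe a single video frame."""
    response = client.messages.create(
        model="claude-3-5-sonnet-latest",  # assumption: any vision-capable model works
        max_tokens=300,
        messages=[{
            "role": "user",
            "content": [
                {"type": "image",
                 "source": {"type": "base64",
                            "media_type": "image/jpeg",
                            "data": frame_b64}},
                {"type": "text",
                 "text": "Describe what this video frame shows in one or two sentences."},
            ],
        }],
    )
    return response.content[0].text
```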
## Combine Video + Audio

```python
def process_video(video_path):
    # Extract and transcribe the audio track; transcribe_audio() must
    # return segments with 'start' and 'text' fields (see sketch below)
    audio_path = extract_audio(video_path)
    transcript = transcribe_audio(audio_path)

    # Extract and describe key frames
    frames = extract_frames(video_path, interval_seconds=10)
    frame_descriptions = analyze_video_frames(frames)

    # Align the two streams on the shared timeline
    return align_video_content(transcript, frame_descriptions)

def align_video_content(transcript, frames):
    """Align transcript segments with frame descriptions by timestamp."""
    aligned = []
    for frame in frames:
        # Collect transcript segments spoken near this frame's timestamp
        nearby_text = []
        for segment in transcript['segments']:
            if abs(segment['start'] - frame['timestamp']) < 10:  # within 10s
                nearby_text.append(segment['text'])
        aligned.append({
            'timestamp': frame['timestamp'],
            'visual': frame['description'],
            'audio': ' '.join(nearby_text)
        })
    return aligned
```
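If you don't already have a transcriber from the audio pipeline, a minimal `transcribe_audio` sketch with the open-source `whisper` package looks like this; the model size is an assumption, and Whisper's segments already carry the `start` and `text` fields the alignment step expects.

```python
import whisper

def transcribe_audio(audio_path):
    """Transcribe audio with Whisper; segments include 'start'/'end'/'text'."""
    model = whisper.load_model("base")  # assumption: pick a size to fit your hardware
    return model.transcribe(audio_path)
    # -> {'text': ..., 'segments': [{'start': ..., 'end': ..., 'text': ...}, ...]}
```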
## Indexing Video Content

```python
def index_video(video_path):
    content = process_video(video_path)
    for item in content:
        # Combine visual and audio descriptions into one embeddable chunk
        combined_text = (
            f"Visual: {item['visual']}\n"
            f"Audio: {item['audio']}"
        )
        # embed() and store() are your embedding model and vector store
        # (one concrete sketch follows below)
        embedding = embed(combined_text)
        store(
            embedding=embedding,
            content=combined_text,
            metadata={
                'source': video_path,
                'timestamp': item['timestamp'],
                'type': 'video_segment'
            }
        )
```
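For concreteness, here is one way to back `embed()` and `store()` with sentence-transformers and ChromaDB; both library choices, the embedding model, and the collection name are assumptions.

```python
import chromadb
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")  # assumption: any text embedder works
collection = chromadb.Client().create_collection("video_segments")

def embed(text):
    return model.encode(text).tolist()

def store(embedding, content, metadata):
    # ChromaDB requires a unique id per record; derive one from the metadata
    doc_id = f"{metadata['source']}@{metadata['timestamp']}"
    collection.add(
        ids=[doc_id],
        embeddings=[embedding],
        documents=[content],
        metadatas=[metadata],
    )
```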
## Query Response with Video

```json
{
  "answer": "The instructor demonstrates connecting the red wire to terminal A.",
  "source": {
    "file": "installation_tutorial.mp4",
    "timestamp": "3:45",
    "visual_description": "Close-up of circuit board with red wire being connected",
    "transcript": "Now take the red wire and carefully connect it to terminal A as shown"
  }
}
```
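A minimal retrieval sketch against the ChromaDB collection and `embed()` helper sketched above; formatting the stored timestamp as mm:ss reproduces the shape of the response shown.

```python
def query_video_index(question, top_k=1):
    """Retrieve the best-matching video segment and return a citable source."""
    results = collection.query(query_embeddings=[embed(question)], n_results=top_k)
    meta = results['metadatas'][0][0]
    minutes, seconds = divmod(int(meta['timestamp']), 60)
    return {
        'content': results['documents'][0][0],
        'source': {
            'file': meta['source'],
            'timestamp': f"{minutes}:{seconds:02d}",
        },
    }
```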
## Optimization Tips

- **Frame interval**: 5-10 seconds works for most lecture-style content
- **Scene detection**: extract frames on scene changes instead of at a fixed interval (see the sketch below)
- **Selective analysis**: only send visually informative frames to the vision model
- **Caching**: store frame descriptions so repeated runs skip re-analysis
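A simple scene-change detector can replace the fixed interval: compare grayscale histograms of consecutive frames and keep a frame whenever it differs sharply from the last one kept. The threshold here is an assumption to tune per video.

```python
import cv2

def extract_frames_on_scene_change(video_path, threshold=0.4):
    """Keep a frame whenever its histogram diverges from the last kept frame."""
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS)
    frames, prev_hist, frame_count = [], None, 0
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        hist = cv2.calcHist([gray], [0], None, [64], [0, 256])
        hist = cv2.normalize(hist, hist).flatten()
        # Correlation near 1.0 means "same scene"; a drop signals a cut
        if prev_hist is None or cv2.compareHist(prev_hist, hist, cv2.HISTCMP_CORREL) < 1 - threshold:
            frames.append({'timestamp': frame_count / fps, 'frame': frame,
                           'frame_number': frame_count})
            prev_hist = hist
        frame_count += 1
    cap.release()
    return frames
```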
Next: Spreadsheets and CSVs.