Video (Lectures, Demos)

Video (Lectures, Demos)

Extract and index both the visual and the audio content of video files so that a retrieval-augmented generation (RAG) system can search everything a video shows and says.

Video (Lectures, Demos)

Videos combine audio and visual information. Process both for complete coverage.

Video Processing Strategy

graph TD
    A[Video File] --> B[Extract Audio]
    A --> C[Extract Frames]
    
    B --> D[Transcription]
    C --> E[Frame Analysis]
    
    D & E --> F[Align Timeline]
    F  --> G[Multimodal Index]

Extract Audio Track

from moviepy.editor import VideoFileClip

def extract_audio(video_path):
    """Write the audio track of a video to a sibling ``.mp3`` file.

    The output path is the input path with its final extension replaced by
    ``.mp3`` (``lecture.mp4`` -> ``lecture.mp3``). Only the extension is
    touched, so non-``.mp4`` inputs such as ``demo.mov`` get their own output
    file instead of resolving to the video's own path.

    Args:
        video_path: Path to the source video file.

    Returns:
        Path of the MP3 file that was written.

    Raises:
        ValueError: If the input already ends in ``.mp3`` (the output would
            overwrite the input) or if the video has no audio track.
    """
    import os

    video_path = os.fspath(video_path)
    stem, extension = os.path.splitext(video_path)
    if extension.lower() == '.mp3':
        raise ValueError(
            f"Refusing to overwrite input file with its own audio: {video_path}"
        )
    audio_path = stem + '.mp3'

    video = VideoFileClip(video_path)
    try:
        if video.audio is None:
            raise ValueError(f"Video has no audio track: {video_path}")
        video.audio.write_audiofile(audio_path)
    finally:
        # Close the clip even when writing fails so the reader is not left open.
        video.close()
    return audio_path

Extract Key Frames

import cv2

def extract_frames(video_path, interval_seconds=5):
    """Extract one frame every N seconds.

    Frames are sampled by frame number: every ``int(fps * interval_seconds)``-th
    frame is kept, with a minimum step of one frame so very short intervals
    never produce a zero step. All sampled frames are held in memory in the
    returned list.

    Args:
        video_path: Path to the video file to read.
        interval_seconds: Seconds between sampled frames; must be positive.

    Returns:
        A list of dicts with keys ``'timestamp'`` (seconds, computed as
        ``frame_number / fps``), ``'frame'`` (the image returned by
        ``cap.read()``) and ``'frame_number'``. The list is empty when the
        video could not be opened.

    Raises:
        ValueError: If ``interval_seconds`` is not positive, or the opened
            video reports a non-positive frame rate.
    """
    if interval_seconds <= 0:
        raise ValueError("interval_seconds must be positive")

    frames = []
    cap = cv2.VideoCapture(video_path)
    try:
        if not cap.isOpened():
            return frames

        fps = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:
            # Without a frame rate neither the sampling step nor the
            # timestamps can be computed.
            raise ValueError(f"Cannot determine frame rate of {video_path}")

        # Clamp to 1 so fps * interval_seconds < 1 does not yield a zero step.
        frame_interval = max(1, int(fps * interval_seconds))
        frame_count = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break

            if frame_count % frame_interval == 0:
                frames.append({
                    'timestamp': frame_count / fps,
                    'frame': frame,
                    'frame_number': frame_count
                })

            frame_count += 1
    finally:
        # Release the capture handle even if reading raised.
        cap.release()

    return frames

Analyze Frames with Vision Model

def analyze_video_frames(frames):
    """Describe each extracted frame with a vision model.

    Every frame is JPEG-encoded, base64-encoded and passed to
    ``claude_vision.describe``.

    NOTE(review): ``claude_vision`` is not defined in this snippet; it is
    assumed to be a vision-model client supplied by the surrounding
    application — confirm the actual client name and call signature.

    Args:
        frames: Iterable of dicts with ``'frame'`` (image to encode) and
            ``'timestamp'`` keys, as produced by ``extract_frames``.

    Returns:
        A list of dicts with ``'timestamp'`` and ``'description'`` keys, one
        per input frame, in input order.

    Raises:
        ValueError: If a frame cannot be JPEG-encoded.
    """
    import base64

    analyzed = []

    for frame_data in frames:
        # Convert frame to base64-encoded JPEG
        encoded_ok, buffer = cv2.imencode('.jpg', frame_data['frame'])
        if not encoded_ok:
            raise ValueError(
                f"Could not JPEG-encode frame at {frame_data['timestamp']}s"
            )
        frame_b64 = base64.b64encode(buffer).decode()

        # Vision model analysis
        description = claude_vision.describe(frame_b64)

        analyzed.append({
            'timestamp': frame_data['timestamp'],
            'description': description
        })

    return analyzed

Combine Video + Audio

def process_video(video_path):
    """Produce timeline-aligned audio and visual content for one video.

    Runs the audio path (extract, then transcribe) followed by the visual
    path (sample a frame every 10 seconds, then describe each frame), and
    returns the result of aligning the two.
    """
    # Audio path: pull out the soundtrack and transcribe it.
    transcript = transcribe_audio(extract_audio(video_path))

    # Visual path: sample frames and describe them.
    frame_descriptions = analyze_video_frames(
        extract_frames(video_path, interval_seconds=10)
    )

    # Merge both paths on the shared timeline.
    return align_video_content(transcript, frame_descriptions)

def align_video_content(transcript, frames, window_seconds=10):
    """Align transcript segments with frame descriptions.

    For each frame, collects the text of every transcript segment whose start
    time lies strictly within ``window_seconds`` of the frame's timestamp. A
    segment can therefore be attached to more than one frame when frames are
    spaced closer together than twice the window.

    Args:
        transcript: Dict with a ``'segments'`` list; each segment has
            ``'start'`` (seconds) and ``'text'`` keys.
        frames: Iterable of dicts with ``'timestamp'`` and ``'description'``
            keys.
        window_seconds: Half-width of the matching window in seconds.
            Defaults to 10, the value that was previously hard-coded.

    Returns:
        A list with one dict per frame, containing ``'timestamp'``,
        ``'visual'`` (the frame description) and ``'audio'`` (matching
        segment texts joined by single spaces; empty string if none match).
    """
    aligned = []

    for frame in frames:
        timestamp = frame['timestamp']
        # Transcript text spoken near this frame's timestamp.
        nearby_text = [
            segment['text']
            for segment in transcript['segments']
            if abs(segment['start'] - timestamp) < window_seconds
        ]

        aligned.append({
            'timestamp': timestamp,
            'visual': frame['description'],
            'audio': ' '.join(nearby_text)
        })

    return aligned

Indexing Video Content

def index_video(video_path):
    """Embed and store every aligned segment of a video.

    Each item returned by ``process_video`` becomes one stored record whose
    text combines the visual description and the nearby transcript, tagged
    with the source path and timestamp.
    """
    for segment in process_video(video_path):
        # The visual and audio text are embedded together as one document.
        # The leading newline and trailing indentation are part of the
        # stored text.
        document = (
            f"\nVisual: {segment['visual']}\n"
            f"Audio: {segment['audio']}\n"
            "        "
        )

        store(
            embedding=embed(document),
            content=document,
            metadata={
                'source': video_path,
                'timestamp': segment['timestamp'],
                'type': 'video_segment',
            },
        )

Query Response with Video

{
    "answer": "The instructor demonstrates connecting the red wire to terminal A.",
    "source": {
        "file": "installation_tutorial.mp4",
        "timestamp": "3:45",
        "visual_description": "Close-up of circuit board with red wire being connected",
        "transcript": "Now take the red wire and carefully connect it to terminal A as shown"
    }
}

Optimization Tips

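  • Frame interval: sample one frame every 5-10 seconds for most content; use shorter intervals for fast-changing visuals such as live coding
  • Scene detection: extract frames at scene changes instead of at a fixed interval, so static stretches are not sampled repeatedly
  • Selective analysis: send only important frames to the vision model, for example by skipping frames that are nearly identical to the previous one
  • Caching: store frame descriptions (keyed by video and timestamp) to avoid re-analyzing frames when a video is re-indexed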

Next: Spreadsheets and CSVs.

Subscribe to our newsletter

Get the latest posts delivered right to your inbox.

Subscribe on LinkedIn