Spaces:

Harshilforworks
/

MumbaiHacks-backend

Sleeping

File size: 4,967 Bytes

b949a69

# pip install yt-dlp

import html
import os
import re
import tempfile
from pathlib import Path

import yt_dlp

# URL shapes that carry a video ID. IDs are matched as exactly 11 characters
# from [A-Za-z0-9_-]; the host name itself is not validated.
_VIDEO_ID_PATTERNS = (
    re.compile(r'[?&]v=([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])'),
    re.compile(r'youtu\.be/([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])'),
    re.compile(r'/(?:shorts|embed|live|v)/([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])'),
)
_BARE_VIDEO_ID = re.compile(r'[A-Za-z0-9_-]{11}')

# Inline subtitle markup such as <c>, </c>, <i>, <v Speaker> or <00:00:01.000>.
_MARKUP_TAG = re.compile(r'<[^>]+>')


def _extract_video_id(video_url):
    """Return the video ID found in *video_url*, or None if there is none."""
    if not isinstance(video_url, str):
        return None
    candidate = video_url.strip()
    if _BARE_VIDEO_ID.fullmatch(candidate):
        # A bare 11-character ID was passed instead of a URL.
        return candidate
    for pattern in _VIDEO_ID_PATTERNS:
        match = pattern.search(candidate)
        if match:
            return match.group(1)
    return None


def _parse_subtitle_text(raw_text):
    """Return the caption text lines contained in WebVTT or SRT content."""
    source_lines = raw_text.splitlines()
    caption_lines = []
    seen_timing_line = False

    for index, raw_line in enumerate(source_lines):
        if '-->' in raw_line:
            seen_timing_line = True
            continue
        if not seen_timing_line:
            # Everything before the first cue is header metadata
            # (e.g. "WEBVTT", "Kind: captions", "Language: en").
            continue

        next_line = source_lines[index + 1] if index + 1 < len(source_lines) else ''
        if '-->' in next_line:
            # The line right above a timing line is a cue identifier or an
            # SRT sequence number, not caption text.
            continue

        text = html.unescape(_MARKUP_TAG.sub('', raw_line)).strip()
        # Auto-generated captions repeat the previous line in each new cue
        # (rolling display), so consecutive duplicates are collapsed. This
        # also merges genuinely repeated consecutive lines.
        if text and (not caption_lines or caption_lines[-1] != text):
            caption_lines.append(text)

    return caption_lines


def get_youtube_transcript_ytdlp(video_url, output_file="transcript.txt"):
    """
    Download a video's auto-generated English subtitles with yt-dlp and
    save them as a plain-text transcript.

    Args:
        video_url: A YouTube URL (watch, youtu.be, shorts, embed, live) or a
            bare 11-character video ID. Playlist parameters are ignored.
        output_file: Path the transcript text is written to (UTF-8).

    Returns:
        The transcript as a single newline-joined string, or None when no
        video ID can be found, no subtitle file is produced, the subtitle
        file contains no text, or any error occurs (the error and its
        traceback are printed).
    """

    print("[*] Starting transcript extraction with yt-dlp...")

    video_id = _extract_video_id(video_url)
    if video_id is None:
        print(f"[ERROR] Could not find a YouTube video ID in: {video_url!r}")
        return None

    print(f"[+] Video ID: {video_id}")

    # Normalize URL to just the video (remove playlist parameters)
    normalized_url = f"https://www.youtube.com/watch?v={video_id}"
    print(f"[+] Normalized URL: {normalized_url}")

    try:
        # A private temporary directory is removed on every exit path and
        # cannot collide with other calls or with a user's own directory.
        with tempfile.TemporaryDirectory(prefix="temp_subs_") as temp_dir:
            ydl_opts = {
                'writeautomaticsub': True,      # Download auto-generated subtitles
                'subtitlesformat': 'vtt',       # Format (can also be 'json3', 'srt', 'ass')
                'skip_download': True,          # Only download subs, not video
                'noplaylist': True,             # Only download the video, not the playlist
                'outtmpl': os.path.join(temp_dir, '%(id)s'),  # Output template
                'quiet': False,                 # Show progress
                'no_warnings': False,
                # 'subtitleslangs' is the YoutubeDL API key; 'sub_langs' is
                # only the command-line spelling and is ignored here.
                'subtitleslangs': ['en'],       # Only English subtitles
            }

            print("[*] Downloading subtitles...")

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.extract_info(normalized_url, download=True)

            # The directory is fresh, so any subtitle file in it is ours.
            subtitle_names = sorted(
                name for name in os.listdir(temp_dir)
                if name.endswith(('.vtt', '.srt'))
            )
            if not subtitle_names:
                print("[ERROR] Subtitle file not found")
                print(f"[DEBUG] Files in {temp_dir}: {os.listdir(temp_dir)}")
                return None

            print("[+] Subtitles downloaded successfully")
            print(f"[+] Found subtitle file: {subtitle_names[0]}")

            print("[*] Parsing subtitle file...")
            subtitle_path = os.path.join(temp_dir, subtitle_names[0])
            # utf-8-sig transparently drops a leading byte-order mark.
            with open(subtitle_path, 'r', encoding='utf-8-sig') as f:
                raw_text = f.read()

        transcript_lines = _parse_subtitle_text(raw_text)
        if not transcript_lines:
            print("[ERROR] No text extracted from subtitle file")
            return None

        # Combine into full transcript
        full_text = "\n".join(transcript_lines)

        # Save to output file
        print(f"[*] Saving transcript to {output_file}...")
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(full_text)

        print("\n✓ SUCCESS!")
        print(f"  File: {output_file}")
        print(f"  Total characters: {len(full_text)}")
        print(f"  Total lines: {len(transcript_lines)}")

        return full_text

    except Exception as e:
        # Boundary handler: callers rely on None (not an exception) on failure.
        print(f"[ERROR] {str(e)}")
        import traceback
        traceback.print_exc()
        return None


# ==================== MAIN ====================

if __name__ == "__main__":
    # Demo run: fetch the transcript of one hard-coded video and print the
    # first part of it, or a failure notice when nothing came back.
    banner = "=" * 70

    print(banner)
    print("YouTube Transcript Extractor - yt-dlp VERSION (WORKS IN INDIA!)")
    print(banner)

    video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    print(f"\nTarget video: {video_url}\n")

    transcript = get_youtube_transcript_ytdlp(video_url)

    if not transcript:
        print("\n[FAILED] Could not extract transcript")
    else:
        print("\n" + banner)
        print("TRANSCRIPT PREVIEW (First 800 characters)")
        print(banner)
        print(transcript[:800])
        print("\n...")