Spaces:
Sleeping
Sleeping
File size: 4,967 Bytes
b949a69 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 |
# pip install yt-dlp
import html
import os
import re
import tempfile
import traceback
from pathlib import Path

import yt_dlp
# Matches the 11-character video ID in the common YouTube URL shapes:
# watch?v=ID, youtu.be/ID, /shorts/ID, /embed/ID and /live/ID.
_VIDEO_ID_RE = re.compile(
    r'(?:[?&]v=|youtu\.be/|/(?:shorts|embed|live)/)'
    r'([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])'
)
# Inline cue markup such as <00:00:01.000>, <c>, </c>, <i>, <b>.
_CUE_TAG_RE = re.compile(r'<[^>]*>')
_SUBTITLE_SUFFIXES = ('.vtt', '.srt')


def extract_video_id(video_url):
    """Return the YouTube video ID found in *video_url*, or None.

    Recognizes ``watch?v=``, ``youtu.be/``, ``/shorts/``, ``/embed/`` and
    ``/live/`` URLs; extra query parameters and fragments are ignored.
    """
    match = _VIDEO_ID_RE.search(video_url)
    return match.group(1) if match else None


def parse_subtitle_lines(raw_lines):
    """Extract caption text from the lines of a WebVTT or SRT file.

    Args:
        raw_lines: Iterable of lines as read from the subtitle file.

    Returns:
        A list of caption text lines with timing lines, file headers,
        cue identifiers / SRT sequence numbers and inline markup removed.
        Consecutive identical lines are collapsed into one, because
        auto-generated captions repeat every line across rolling cues.
    """
    lines = [raw.strip() for raw in raw_lines]
    text_lines = []
    seen_timing = False
    for idx, line in enumerate(lines):
        if not line:
            continue
        if '-->' in line:
            seen_timing = True
            continue
        if not seen_timing:
            # Everything before the first cue is header material
            # (WEBVTT / Kind: / Language:) or the first SRT sequence number.
            continue
        if idx + 1 < len(lines) and '-->' in lines[idx + 1]:
            # A line directly above a timing line is a cue identifier or an
            # SRT sequence number; cue text is always followed by a blank
            # line (or the end of the file) instead.
            continue
        cleaned = html.unescape(_CUE_TAG_RE.sub('', line)).strip()
        if not cleaned:
            continue
        if text_lines and text_lines[-1] == cleaned:
            continue
        text_lines.append(cleaned)
    return text_lines


def get_youtube_transcript_ytdlp(video_url, output_file="transcript.txt"):
    """Download the English captions of a YouTube video as plain text.

    Subtitles are fetched with yt-dlp (no video download) into a private
    temporary directory that is removed on every exit path, then parsed
    into text and written to *output_file*.

    Args:
        video_url: URL of the video. Playlist parameters are dropped when
            a video ID can be recognized; otherwise the URL is handed to
            yt-dlp unchanged.
        output_file: Path of the UTF-8 text file to write.

    Returns:
        The transcript as a single newline-joined string, or None if the
        subtitles could not be downloaded or contained no text. Errors are
        printed (with a traceback) rather than raised.
    """
    print("[*] Starting transcript extraction with yt-dlp...")

    video_id = extract_video_id(video_url)
    if video_id:
        print(f"[+] Video ID: {video_id}")
        # Normalize URL to just the video (remove playlist parameters)
        target_url = f"https://www.youtube.com/watch?v={video_id}"
        print(f"[+] Normalized URL: {target_url}")
    else:
        # Do not invent an ID; let yt-dlp resolve whatever was given.
        target_url = video_url
        print("[!] Could not parse a video ID; passing the URL to yt-dlp unchanged")

    try:
        with tempfile.TemporaryDirectory(prefix="yt_subs_") as temp_dir:
            ydl_opts = {
                'writesubtitles': True,      # Prefer manually authored subtitles
                'writeautomaticsub': True,   # Fall back to auto-generated ones
                'subtitleslangs': ['en'],    # Only English subtitles
                'subtitlesformat': 'vtt',    # Format (can also be 'json3', 'srt', 'ass')
                'skip_download': True,       # Only download subs, not video
                'noplaylist': True,          # Only the video, not the playlist
                'outtmpl': os.path.join(temp_dir, '%(id)s'),
                'quiet': False,              # Show progress
                'no_warnings': False,
            }

            print("[*] Downloading subtitles...")
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(target_url, download=True)

            # The output template is '%(id)s', so the ID reported by yt-dlp
            # is the authoritative file-name prefix.
            resolved_id = (info or {}).get('id') or video_id or ''

            subtitle_file = None
            for file_name in sorted(os.listdir(temp_dir)):
                if resolved_id in file_name and file_name.endswith(_SUBTITLE_SUFFIXES):
                    subtitle_file = os.path.join(temp_dir, file_name)
                    print(f"[+] Found subtitle file: {file_name}")
                    break

            if subtitle_file is None:
                print("[ERROR] Subtitle file not found")
                print(f"[DEBUG] Files in {temp_dir}: {os.listdir(temp_dir)}")
                return None
            print("[+] Subtitles downloaded successfully")

            print("[*] Parsing subtitle file...")
            # utf-8-sig transparently drops a leading byte-order mark.
            with open(subtitle_file, 'r', encoding='utf-8-sig') as f:
                raw_lines = f.readlines()

        transcript_lines = parse_subtitle_lines(raw_lines)
        if not transcript_lines:
            print("[ERROR] No text extracted from subtitle file")
            return None

        full_text = "\n".join(transcript_lines)

        print(f"[*] Saving transcript to {output_file}...")
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(full_text)

        print(f"\n✓ SUCCESS!")
        print(f" File: {output_file}")
        print(f" Total characters: {len(full_text)}")
        print(f" Total lines: {len(transcript_lines)}")
        return full_text
    except Exception as e:
        # Top-level boundary: callers rely on None for any failure.
        print(f"[ERROR] {str(e)}")
        traceback.print_exc()
        return None
# ==================== MAIN ====================
if __name__ == "__main__":
    # Script entry point: fetch the transcript of one hard-coded video and
    # show the beginning of it on the console.
    banner = "=" * 70

    print(banner)
    print("YouTube Transcript Extractor - yt-dlp VERSION (WORKS IN INDIA!)")
    print(banner)

    video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    print(f"\nTarget video: {video_url}\n")

    transcript = get_youtube_transcript_ytdlp(video_url)

    if not transcript:
        # None (or an empty string) means the extraction did not succeed.
        print("\n[FAILED] Could not extract transcript")
    else:
        print("\n" + banner)
        print("TRANSCRIPT PREVIEW (First 800 characters)")
        print(banner)
        preview = transcript[:800]
        print(preview)
        print("\n...")
|