File size: 4,967 Bytes
b949a69
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# pip install yt-dlp

import yt_dlp
import os
import re
from pathlib import Path

def _extract_video_id(video_url):
    """Return the YouTube video ID embedded in ``video_url``, or ``None``."""
    if not video_url:
        return None
    # Watch URLs (?v=ID / &v=ID), youtu.be short links, and the
    # /shorts/, /embed/ and /live/ path forms.
    patterns = (
        r'[?&]v=([\w-]+)',
        r'youtu\.be/([\w-]+)',
        r'/(?:shorts|embed|live)/([\w-]+)',
    )
    for pattern in patterns:
        match = re.search(pattern, video_url)
        if match:
            return match.group(1)
    return None


def _extract_caption_text(lines):
    """Return the spoken-text lines from the lines of a VTT or SRT file.

    Timing lines, the header that precedes the first cue, cue identifiers /
    SRT sequence numbers and inline markup tags are removed. Consecutive
    identical lines are collapsed into one.
    """
    stripped = [line.strip() for line in lines]
    text_lines = []
    seen_timing = False
    for index, line in enumerate(stripped):
        if '-->' in line:
            seen_timing = True
            continue
        # Everything before the first cue is header material (WEBVTT,
        # Kind:, Language:, STYLE blocks), not transcript text.
        if not line or not seen_timing:
            continue
        # A line sitting directly above a timing line is a cue identifier
        # (VTT) or a sequence number (SRT). Checking the position instead of
        # "starts with a digit" keeps caption text such as "2020 was ...".
        following = stripped[index + 1] if index + 1 < len(stripped) else ''
        if '-->' in following:
            continue
        # Drop inline tags such as <c>, <i> or <00:00:01.000> word timings.
        text = re.sub(r'<[^>]+>', '', line).strip()
        # Rolling captions repeat the previous line in the next cue.
        if text and (not text_lines or text_lines[-1] != text):
            text_lines.append(text)
    return text_lines


def get_youtube_transcript_ytdlp(video_url, output_file="transcript.txt"):
    """Download a video's English auto-generated captions and save them as text.

    The subtitles are fetched with yt-dlp into a temporary directory, reduced
    to plain text lines and written to ``output_file``. Progress and errors
    are reported with ``print``.

    Args:
        video_url: URL of the YouTube video (watch, youtu.be, shorts, embed
            or live form). Only the video ID is used; playlist and other
            parameters are discarded.
        output_file: Path of the text file to write the transcript to.

    Returns:
        The transcript as a single newline-joined string, or ``None`` if the
        URL contains no video ID, no subtitle file was produced, no text
        could be extracted, or any other error occurred. Exceptions are
        reported and swallowed; none propagate to the caller.
    """
    # Function-scope imports keep the block self-contained without touching
    # the module's import section.
    import tempfile
    import traceback

    print("[*] Starting transcript extraction with yt-dlp...")

    video_id = _extract_video_id(video_url)
    if video_id is None:
        print(f"[ERROR] Could not find a video ID in URL: {video_url!r}")
        return None
    print(f"[+] Video ID: {video_id}")

    # Normalize URL to just the video (remove playlist parameters)
    normalized_url = f"https://www.youtube.com/watch?v={video_id}"
    print(f"[+] Normalized URL: {normalized_url}")

    try:
        # A fresh, uniquely named directory that is removed on every exit
        # path, so failures leave nothing behind and stale files from an
        # earlier run can never be mistaken for this download.
        with tempfile.TemporaryDirectory(prefix="temp_subs_") as temp_dir:
            ydl_opts = {
                'writeautomaticsub': True,      # Download auto-generated subtitles
                'subtitlesformat': 'vtt',       # Format (can also be 'json3', 'srt', 'ass')
                'skip_download': True,          # Only download subs, not video
                'noplaylist': True,             # Only download the video, not the playlist
                'outtmpl': os.path.join(temp_dir, '%(id)s'),  # Output template
                'quiet': False,                 # Show progress
                'no_warnings': False,
                # The Python API key is 'subtitleslangs' (a list);
                # 'sub_langs' is only the command-line spelling.
                'subtitleslangs': ['en'],       # Only English subtitles
            }

            print("[*] Downloading subtitles...")

            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.extract_info(normalized_url, download=True)

            # Find the subtitle file (sorted for a deterministic pick).
            produced = sorted(os.listdir(temp_dir))
            subtitle_name = next(
                (name for name in produced
                 if video_id in name and name.endswith(('.vtt', '.srt'))),
                None,
            )
            if subtitle_name is None:
                print("[ERROR] Subtitle file not found")
                print(f"[DEBUG] Files in {temp_dir}: {produced}")
                return None

            print("[+] Subtitles downloaded successfully")
            print(f"[+] Found subtitle file: {subtitle_name}")

            # Read and parse the subtitle file
            print("[*] Parsing subtitle file...")
            subtitle_path = os.path.join(temp_dir, subtitle_name)
            with open(subtitle_path, 'r', encoding='utf-8') as f:
                transcript_lines = _extract_caption_text(f)

        if not transcript_lines:
            print("[ERROR] No text extracted from subtitle file")
            return None

        # Combine into full transcript
        full_text = "\n".join(transcript_lines)

        # Save to output file
        print(f"[*] Saving transcript to {output_file}...")
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(full_text)

        print("\n✓ SUCCESS!")
        print(f"  File: {output_file}")
        print(f"  Total characters: {len(full_text)}")
        print(f"  Total lines: {len(transcript_lines)}")

        return full_text

    except Exception as e:
        # Top-level boundary: callers rely on a None result rather than an
        # exception, so report the failure and swallow it.
        print(f"[ERROR] {str(e)}")
        traceback.print_exc()
        return None


# ==================== MAIN ====================

if __name__ == "__main__":
    # Script entry point: fetch the transcript of one sample video and show
    # the beginning of it, or report that extraction failed.
    divider = "=" * 70

    print(divider)
    print("YouTube Transcript Extractor - yt-dlp VERSION (WORKS IN INDIA!)")
    print(divider)

    video_url = "https://www.youtube.com/watch?v=dQw4w9WgXcQ"
    print(f"\nTarget video: {video_url}\n")

    transcript = get_youtube_transcript_ytdlp(video_url)

    if not transcript:
        print("\n[FAILED] Could not extract transcript")
    else:
        # Only the first 800 characters are shown as a preview.
        preview = transcript[:800]
        print("\n" + divider)
        print("TRANSCRIPT PREVIEW (First 800 characters)")
        print(divider)
        print(preview)
        print("\n...")