forked from minhnd/youtube-subtitle-downloader
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathyoutubesub.py
150 lines (127 loc) · 5.39 KB
/
youtubesub.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
# -*- coding: utf-8 -*-
"""
Youtube Subtitle Downloader downloads subtitles from Youtube videos
(if those are present) and converts them to SRT format.

Usage: youtubesub.py [-h] [-l] [--language LANGUAGE] [--filename FILENAME]
                     [--filetype {srt,xml}]
                     url

positional arguments:
  url                   URL of the Youtube video

optional arguments:
  -h, --help            show this help message and exit
  -l, --list            list all available languages
  --language LANGUAGE   the ISO language code
  --filename FILENAME   specify the name of subtitle
  --filetype {srt,xml}  specify the output type of subtitle

Example:
    python youtubesub.py --filename subtitle --language en http://www.youtube.com/watch?v=5MgBikgcWnY

:copyright: (c) 2014 by Nguyen Dang Minh (www.minhnd.com)
:license: BSD, see LICENSE for more details.
"""
import urllib2
import urlparse
import argparse
import sys
import xml.etree.ElementTree as ET
class YoutubeSubDownloader(object):
    """Download the subtitles of a Youtube video and save them as SRT or XML.

    Creating an instance extracts the video ID from the given URL and then
    requests the list of available subtitle languages from the
    ``www.youtube.com/api/timedtext`` endpoint.
    """

    # ID of the video, extracted from the URL passed to the constructor.
    video_id = None
    # File-like HTTP response holding the subtitle XML during a download.
    subtitle = None
    # Maps a language code to its translated language name.
    languages = {}

    def __init__(self, url=None):
        """Resolve the video ID of *url* and load its subtitle languages.

        Raises:
            ValueError: if no video ID can be extracted from *url*.

        Prints a message and exits the process if the video has no subtitles.
        """
        self.video_id = self.extractVideoID(url)
        if not self.video_id:
            # Fail before hitting the network with a meaningless "v=None".
            raise ValueError(
                "Could not find a Youtube video ID in URL: {0}".format(url))
        self.languages = self.getAvailableLanguages()
        if not self.languages:
            print("There's no subtitle")
            sys.exit()

    def extractVideoID(self, url=None):
        """Return the video ID contained in *url*, or None if there is none.

        Examples:
        - http://youtu.be/5MgBikgcWnY
        - http://www.youtube.com/watch?v=5MgBikgcWnY&feature=feed
        - http://www.youtube.com/embed/5MgBikgcWnY
        - http://www.youtube.com/v/5MgBikgcWnY?version=3&hl=en_US
        """
        if not url:
            return None
        url_data = urlparse.urlparse(url)
        host = url_data.hostname
        path = url_data.path
        if host == 'youtu.be':
            return path[1:] or None
        if host in ('www.youtube.com', 'youtube.com', 'm.youtube.com'):
            if path == '/watch':
                # A watch URL without a "v" parameter has no video ID.
                values = urlparse.parse_qs(url_data.query).get('v')
                return values[0] if values else None
            if path.startswith(('/embed/', '/v/')):
                # "/embed/<id>" splits into ['', 'embed', '<id>', ...].
                return path.split('/')[2] or None
        return None

    def download(self, language, filename, filetype):
        """Download the subtitle of *language* and write it to *filename*.

        *filetype* "srt" writes ``<filename>.srt``; any other value writes the
        raw XML to ``<filename>.xml``. Prints a message and exits the process
        if *language* is not among the available languages.
        """
        if language not in self.languages:
            print("Theres's no subtitle in this language")
            sys.exit()
        url = "http://www.youtube.com/api/timedtext?v={0}&lang={1}".format(self.video_id, language)
        self.subtitle = urllib2.urlopen(url)
        try:
            if filetype == "srt":
                self.writeSRTFile(filename)
            else:
                self.writeXMLFile(filename)
        finally:
            # Release the connection even if writing the file fails.
            self.subtitle.close()

    def getAvailableLanguages(self):
        """Return a dict mapping each subtitle language code to its name."""
        url = "http://www.youtube.com/api/timedtext?v=%s&type=list" % self.video_id
        response = urllib2.urlopen(url)
        try:
            root = ET.parse(response).getroot()
        finally:
            response.close()
        return {
            track.attrib["lang_code"]: track.attrib["lang_translated"]
            for track in root
        }

    def list(self):
        """Print every available language as "<code> <name>", one per line."""
        for code, name in self.languages.items():
            print(u"{0} {1}".format(code, name))

    def writeXMLFile(self, filename=None):
        """Copy the downloaded subtitle response to ``<filename>.xml``."""
        # Binary mode: the response is a byte stream and is copied verbatim.
        with open(filename + ".xml", 'wb') as f:
            for line in self.subtitle:
                f.write(line)

    def writeSRTFile(self, filename=None):
        """Parse the downloaded subtitle XML and write ``<filename>.srt``.

        Each child element of the XML root becomes one numbered SRT cue.
        Elements without text are skipped, and a missing ``dur`` attribute is
        treated as a duration of zero.
        """
        import io  # local import: only this method needs it

        root = ET.parse(self.subtitle).getroot()
        # Encode once, at the file boundary, instead of per cue.
        with io.open(filename + ".srt", 'w', encoding='utf-8') as f:
            index = 0
            for child in root:
                if not child.text:
                    continue
                index += 1
                f.write(self.printSRTLine(
                    index,
                    child.attrib["start"],
                    child.attrib.get("dur", "0"),
                    child.text,
                ))

    def formatSRTTime(self, secTime):
        """Convert a time in seconds (in Google's subtitle) to SRT time format.

        *secTime* may be a number or a numeric string, with or without a
        fractional part. The result is ``HH:MM:SS,mmm`` with the fraction
        rounded to whole milliseconds.
        """
        # Work in integer milliseconds so "1.5" yields ",500" rather than ",5"
        # and float noise such as 0.30000000000000004 cannot leak through.
        total_ms = int(round(float(secTime) * 1000))
        total_sec, ms = divmod(total_ms, 1000)
        m, s = divmod(total_sec, 60)
        h, m = divmod(m, 60)
        return "{0:02d}:{1:02d}:{2:02d},{3:03d}".format(h, m, s, ms)

    def printSRTLine(self, line, start, duration, text):
        """Return one SRT cue (index, time range, text, blank line) as text.

        *start* and *duration* are seconds, as numbers or numeric strings.
        *text* may be a text string or UTF-8 encoded bytes.
        """
        if isinstance(text, bytes):
            text = text.decode('utf-8')
        end = self.formatSRTTime(float(start) + float(duration))
        start = self.formatSRTTime(start)
        text = self.convertHTML(text)
        return u"{0}\n{1} --> {2}\n{3}\n\n".format(line, start, end, text)

    def convertHTML(self, text):
        """A few HTML encodings replacements.
        &#39; to '
        """
        return text.replace('&#39;', "'")
def main():
    """Command-line entry point.

    Parses the arguments, builds a downloader for the given URL, prints the
    available languages when ``-l/--list`` is passed, and then downloads the
    subtitle in every case. Any exception raised along the way is caught and
    printed rather than propagated.
    """
    # (flags, keyword settings) for every command-line argument, in order.
    argument_specs = (
        (("url",), dict(help="URL of the Youtube video")),
        (("-l", "--list"), dict(action="store_true", help="list all available languages")),
        (("--language",), dict(default="en", help="the ISO language code")),
        (("--filename",), dict(default="subtitle", help="specify the name of subtitle")),
        (("--filetype",), dict(default="srt", choices=["srt", "xml"],
                               help="specify the output type of subtitle")),
    )
    try:
        parser = argparse.ArgumentParser(description="Youtube Subtitle Downloader")
        for flags, settings in argument_specs:
            parser.add_argument(*flags, **settings)
        options = parser.parse_args()

        downloader = YoutubeSubDownloader(options.url)
        if options.list:
            print("Available languages:")
            downloader.list()
        # The download is not conditional on --list; it always runs.
        downloader.download(options.language, options.filename, options.filetype)
    except Exception as error:
        print(error)
# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main()