#!/usr/bin/env python
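"""Download the AI-class lecture videos published on the "knowitvideos"
YouTube channel, using the GData feed to enumerate them and the
get_video_info service to resolve downloadable streams.

Run with no arguments to fetch the whole channel feed, or pass specific
YouTube video ids to fetch only those. Already-downloaded ids are
recorded in videos.dat under basedir and skipped on later runs.
"""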


# The directory where the files will be downloaded
# ("~" is expanded via os.path.expanduser in the main block)
basedir = "~/video/ai-class/"

# Video formats in order of preference
# (see http://en.wikipedia.org/wiki/YouTube#Quality_and_codecs
#  for a description of these values)
formats = ['22', '34', '18', '43']


################################################################################
from itertools import chain
import string
import re
import urllib
import urlparse
import xml.etree.ElementTree
import sys
from os import mkdir
import os.path


class Video:
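    """One channel video: its title, the unit/topic parsed from the
    title, the YouTube video id parsed from the URL, and (after
    get_download()) a dict of available streams keyed by format code."""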
    _title_pattern = [("Unit", re.compile("[uU]nit[\\s_]+(\\d+)\\D+(\\w+)")),
                      ("Homework", re.compile("[hH]omework[\\s_]+(\\w+)\\D+(\\w+)")),
                      ("Programming Project", re.compile("[pP]rogramming[\\s_][pP]roject[\\s_]+(\\d+)\\D+(\\w+)")),
                      ("Final", re.compile("[fF]inal[\\s_]+(\\d+)\\D+(\\w+)"))]
    _id_pattern = re.compile("v/([\\w_-]+)")


    def __parse_title(self, title):
        self.unit = "unknown"
        self.topic = None

        for (group, pattern) in Video._title_pattern:
            match = pattern.search(title)
            if match:
                self.unit = group + " " + match.group(1)
                self.topic = match.group(2)

    def __parse_url(self, url):
        self.videoid = None
        match = Video._id_pattern.search(url)
        if match:
            self.videoid = match.group(1)

    def get_download(self):
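        """Populate self.download with {format code: parsed stream info}
        from YouTube's get_video_info service; left empty on failure."""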
        self.download = {}
        if self.videoid:
            infourl = ("http://www.youtube.com/get_video_info?video_id="
                       + self.videoid)
            try:
                videoinfo = urlparse.parse_qs(urllib.urlopen(infourl).read())
            except IOError:
                return
            # "fmt_list" is a comma-separated list of "format/resolution/..."
            # entries; keep only the leading numeric format code of each.
            codes = [s.split("/", 1)[0]
                     for s in videoinfo.get("fmt_list", [""])[0].split(",")]
            # "url_encoded_fmt_stream_map" carries one url-encoded stream
            # description per format, in the same order as fmt_list.
            streams = [urlparse.parse_qs(s)
                       for s in videoinfo.get("url_encoded_fmt_stream_map",
                                              [""])[0].split(",")]
            self.download = dict(zip(codes, streams))

    def __init__(self, title, url):
        self.title = title
        self.url = url
        self.download = {}
        self.__parse_title(title)
        self.__parse_url(url)


# Filter out videos whose titles match any of these patterns
ignore_titles = [re.compile(".*[\\s_]ad\\.mp4$"), re.compile(".*[\\s_]ad$")]
def get_page(num, offset):
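    """Fetch one page of `num` entries (page `offset`) from the channel's
    GData feed; return (number of entries fetched, {unit: [Video]})."""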
    videos = {}
    url = ("http://gdata.youtube.com/feeds/api/videos?v=2"
           "&author=knowitvideos"
           "&fields=entry(title,content)&"
           "max-results={0}&"
           "start-index={1}").format(num, num*offset+1)
    rfile = urllib.urlopen(url)
    tree = xml.etree.ElementTree.parse(rfile)
    entries = tree.findall("./{http://www.w3.org/2005/Atom}entry")
    size = len(entries)
    for entry in entries:
        title = entry.findtext("./{http://www.w3.org/2005/Atom}title")
        if any(exp.match(title) for exp in ignore_titles):
            continue
        url = entry.find("./{http://www.w3.org/2005/Atom}content[@src]").get("src")
        video = Video(title, url)
        videos.setdefault(video.unit, []).append(video)
    return (size, videos)

def get_single(video_id):
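    """Fetch a single feed entry by video id; return {unit: [Video]}."""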
    url = ("http://gdata.youtube.com/feeds/api/videos/{0}?v=2").format(video_id)
    rfile = urllib.urlopen(url)
    tree = xml.etree.ElementTree.parse(rfile)
    entry = tree.getroot()
    title = entry.findtext("./{http://www.w3.org/2005/Atom}title")
    url = entry.find("./{http://www.w3.org/2005/Atom}content[@src]").get("src")
    video = Video(title, url)
    return {video.unit: [video]}


def merge_videos(a, b):
    """Merge two {unit: [Video]} dicts, concatenating per-unit lists."""
    return dict([(k, a.get(k, []) + b.get(k, []))
                 for k in frozenset(chain(a, b))])


def get_video_list(video_ids=None):
    """Collect videos grouped by unit: the explicitly requested ids,
    plus every entry in the channel feed."""
    videos = {}
    offset = 0
    per_page = 50

    if video_ids:
        for v in video_ids:
            videos = merge_videos(videos, get_single(v))

    # Page through the channel feed; a short page signals the end.
    while True:
        fetched, new_videos = get_page(per_page, offset)
        offset = offset + 1
        videos = merge_videos(videos, new_videos)
        if fetched < per_page:
            break

    return videos



# Translation table replacing characters that are invalid in filenames
# (plus all ASCII control characters) with "_" or "#"
transtbl = string.maketrans(
    " <>:\"/\\|?*" + "".join(map(chr, range(32))),
    "_##_#___##" + "#" + "_" * 31)

if "__main__" == __name__:
    videos = get_video_list(sys.argv[1:])

    downloaded = open(basedir + "videos.dat", "r+")
    old_videos = frozenset(map(lambda s: s.strip(), downloaded.readlines()))
    new_videos = []

    for unit in videos.keys():
        for v in videos[unit]:
            if not v.videoid in old_videos:
                new_videos.append(v)

    downloaded_videos = []
    for v in new_videos:
        try:
            v.get_download()
            # Pick the first stream available in our order of preference.
            fmt = filter(None, [v.download.get(code) for code in formats])
            if fmt:
                url = fmt[0]["url"][0]
                filename = basedir + "{0}/{1}-{2}.mp4".format(
                    v.unit, v.title.translate(transtbl), v.videoid)
                if not os.path.exists(filename):
                    print "Downloading [{0}] {1} from {2} to {3}".format(
                        v.unit, v.title, url, filename)
                    try:
                        mkdir(basedir + v.unit, 0755)
                    except OSError:
                        pass  # the unit directory already exists
                    urllib.urlretrieve(url, filename)
                    downloaded_videos.append(v)
                else:
                    print >> sys.stderr, \
                        "Not downloading [{0}] {1} ({2}) to {3}: file exists".format(
                            v.unit, v.title, v.videoid, filename)
                    # The file is already on disk, so record it as done.
                    downloaded_videos.append(v)
            else:
                print >> sys.stderr, "No usable format for {0} ({1})".format(
                    v.title, v.videoid)
        except Exception as e:
            print >> sys.stderr, "Error downloading {0} ({1}): {2}".format(
                v.title, v.videoid, e)

    # Record only the ids that are actually on disk, so failed downloads
    # are retried on the next run.
    downloaded.writelines([v.videoid + "\n" for v in downloaded_videos])
    downloaded.close()