Skip to content

Commit

Permalink
standardized the use of unescapeHTML; added clean_html()
Browse files Browse the repository at this point in the history
  • Loading branch information
FiloSottile committed Apr 10, 2012
1 parent ceba827 commit d6a9615
Showing 1 changed file with 15 additions and 17 deletions.
32 changes: 15 additions & 17 deletions youtube_dl/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,18 @@ def htmlentity_transform(matchobj):
return (u'&%s;' % entity)


def clean_html(html):
    """Clean an HTML snippet into a readable string.

    The snippet is processed in four steps, in this order:

    1. Literal newline characters are replaced with single spaces, so
       that line breaks in the result come only from ``<br>`` tags.
    2. ``<br>`` tags (any letter case, optional inner whitespace,
       optional trailing slash) are replaced with a newline.
    3. Every remaining ``<...>`` tag is removed.
    4. Every ``&...;`` sequence is handed to ``htmlentity_transform``,
       whose return value replaces it.

    html -- the HTML snippet to clean (a string).

    Returns the cleaned string.
    """
    # Newline vs <br />: newlines in the markup are only source formatting.
    html = html.replace('\n', ' ')
    # HTML tag names are case-insensitive, hence the inline (?i) flag:
    # without it <BR> would be dropped by the tag stripping below instead
    # of producing a line break.
    html = re.sub(r'(?i)<\s*br\s*/?\s*>', '\n', html)
    # Strip html tags (non-greedy, so each tag is matched on its own).
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities. The pattern has no backslashes, so a plain
    # unicode literal is equivalent to the raw form.
    html = re.sub(u'(?u)&(.+?);', htmlentity_transform, html)
    return html


def sanitize_title(utitle):
"""Sanitizes a video title so it could be used as part of a filename."""
utitle = re.sub(ur'(?u)&(.+?);', htmlentity_transform, utitle)
Expand Down Expand Up @@ -3343,8 +3355,6 @@ def report_config_download(self, showName):
self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()

mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
Expand All @@ -3360,11 +3370,11 @@ def _real_extract(self, url):
return

descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
description = htmlParser.unescape(descMatch.group(1))
description = unescapeHTML(descMatch.group(1))
imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
imgUrl = htmlParser.unescape(imgMatch.group(1))
imgUrl = unescapeHTML(imgMatch.group(1))
playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
playerUrl = unescapeHTML(playerUrlMatch.group(1))
configUrlMatch = re.search('config=(.*)$', playerUrl)
configUrl = urllib2.unquote(configUrlMatch.group(1))

Expand Down Expand Up @@ -3423,8 +3433,6 @@ def report_extraction(self, video_id):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()

mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
Expand Down Expand Up @@ -3495,8 +3503,6 @@ def report_extraction(self, video_id):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()

mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
Expand Down Expand Up @@ -3585,8 +3591,6 @@ def report_extraction(self, video_id):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()

mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
Expand Down Expand Up @@ -3674,8 +3678,6 @@ def report_extraction(self, video_id):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()

mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
Expand Down Expand Up @@ -3909,8 +3911,6 @@ def _real_extract(self, url):
except UnavailableVideoError, err:
self._downloader.trouble(u'\nERROR: unable to download video')
elif mobj.group('course'): # A course page
unescapeHTML = HTMLParser.HTMLParser().unescape

course = mobj.group('course')
info = {
'id': _simplify_title(course),
Expand Down Expand Up @@ -3947,8 +3947,6 @@ def _real_extract(self, url):
assert entry['type'] == 'reference'
self.extract(entry['url'])
else: # Root page
unescapeHTML = HTMLParser.HTMLParser().unescape

info = {
'id': 'Stanford OpenClassroom',
'type': 'playlist',
Expand Down

0 comments on commit d6a9615

Please sign in to comment.