Skip to content

Commit

Permalink
merged unescapeHTML branch; removed lxml dependency
Browse files Browse the repository at this point in the history
  • Loading branch information
FiloSottile committed Apr 10, 2012
2 parents d11d05d + 7a8501e commit 9e6dd23
Show file tree
Hide file tree
Showing 5 changed files with 92 additions and 57 deletions.
5 changes: 0 additions & 5 deletions devscripts/wine-py2exe.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ if [ ! -d wine-py2exe ]; then

axel -a "http://www.python.org/ftp/python/2.7/python-2.7.msi"
axel -a "http://downloads.sourceforge.net/project/py2exe/py2exe/0.6.9/py2exe-0.6.9.win32-py2.7.exe"
axel -a "http://pypi.python.org/packages/2.7/l/lxml/lxml-2.3.win32-py2.7.exe"
#axel -a "http://winetricks.org/winetricks"

# http://appdb.winehq.org/objectManager.php?sClass=version&iId=21957
Expand All @@ -28,13 +27,9 @@ if [ ! -d wine-py2exe ]; then
echo "Follow py2exe setup on screen"
wine py2exe-0.6.9.win32-py2.7.exe

echo "Follow lxml setup on screen"
wine lxml-2.3.win32-py2.7.exe

#echo "Follow Microsoft Visual C++ 2008 Redistributable Package setup on screen"
#bash winetricks vcrun2008

rm lxml-2.3.win32-py2.7.exe
rm py2exe-0.6.9.win32-py2.7.exe
rm python-2.7.msi
#rm winetricks
Expand Down
Binary file modified youtube-dl
Binary file not shown.
Binary file modified youtube-dl.exe
Binary file not shown.
61 changes: 12 additions & 49 deletions youtube_dl/InfoExtractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,11 +24,6 @@
except ImportError:
from cgi import parse_qs

try:
import lxml.etree
except ImportError:
pass # Handled below

try:
import xml.etree.ElementTree
except ImportError: # Python<2.5: Not officially supported, but let it slip
Expand Down Expand Up @@ -193,8 +188,8 @@ def _closed_captions_xml_to_srt(self, xml_string):
end = start + float(dur)
start = "%02i:%02i:%02i,%03i" %(start/(60*60), start/60%60, start%60, start%1*1000)
end = "%02i:%02i:%02i,%03i" %(end/(60*60), end/60%60, end%60, end%1*1000)
caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption)
caption = re.sub(ur'(?u)&(.+?);', htmlentity_transform, caption) # double cycle, inentional
caption = unescapeHTML(caption)
caption = unescapeHTML(caption) # double cycle, inentional
srt += str(n) + '\n'
srt += start + ' --> ' + end + '\n'
srt += caption + '\n\n'
Expand Down Expand Up @@ -364,18 +359,9 @@ def _real_extract(self, url):
pass

# description
try:
lxml.etree
except NameError:
video_description = u'No description available.'
mobj = re.search(r'<meta name="description" content="(.*?)">', video_webpage)
if mobj is not None:
video_description = mobj.group(1).decode('utf-8')
else:
html_parser = lxml.etree.HTMLParser(encoding='utf-8')
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(video_webpage), html_parser)
video_description = u''.join(vwebpage_doc.xpath('id("eow-description")//text()'))
# TODO use another parser
video_description = get_element_by_id("eow-description", video_webpage)
if video_description: video_description = clean_html(video_description.decode('utf8'))
else: video_description = ''

# closed captions
video_subtitles = None
Expand Down Expand Up @@ -992,7 +978,7 @@ def _real_extract(self, url, new_video=True):
self._downloader.trouble(u'ERROR: Unable to extract media URL')
return
video_url = urllib.unquote(mobj.group(1) + mobj.group(2)).decode('utf-8')
video_url = re.sub(r'(?u)&(.+?);', htmlentity_transform, video_url)
video_url = unescapeHTML(video_url)

return [{
'id': video_id.decode('utf-8'),
Expand Down Expand Up @@ -1069,18 +1055,9 @@ def _real_extract(self, url, new_video=True):
video_thumbnail = config["video"]["thumbnail"]

# Extract video description
try:
lxml.etree
except NameError:
video_description = u'No description available.'
mobj = re.search(r'<meta name="description" content="(.*?)" />', webpage, re.MULTILINE)
if mobj is not None:
video_description = mobj.group(1)
else:
html_parser = lxml.etree.HTMLParser()
vwebpage_doc = lxml.etree.parse(StringIO.StringIO(webpage), html_parser)
video_description = u''.join(vwebpage_doc.xpath('id("description")//text()')).strip()
# TODO use another parser
video_description = get_element_by_id("description", webpage)
if video_description: video_description = clean_html(video_description.decode('utf8'))
else: video_description = ''

# Extract upload date
video_upload_date = u'NA'
Expand Down Expand Up @@ -2248,8 +2225,6 @@ def report_config_download(self, showName):
self._downloader.to_screen(u'[escapist] %s: Downloading configuration' % showName)

def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()

mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
Expand All @@ -2265,11 +2240,11 @@ def _real_extract(self, url):
return

descMatch = re.search('<meta name="description" content="([^"]*)"', webPage)
description = htmlParser.unescape(descMatch.group(1))
description = unescapeHTML(descMatch.group(1))
imgMatch = re.search('<meta property="og:image" content="([^"]*)"', webPage)
imgUrl = htmlParser.unescape(imgMatch.group(1))
imgUrl = unescapeHTML(imgMatch.group(1))
playerUrlMatch = re.search('<meta property="og:video" content="([^"]*)"', webPage)
playerUrl = htmlParser.unescape(playerUrlMatch.group(1))
playerUrl = unescapeHTML(playerUrlMatch.group(1))
configUrlMatch = re.search('config=(.*)$', playerUrl)
configUrl = urllib2.unquote(configUrlMatch.group(1))

Expand Down Expand Up @@ -2324,8 +2299,6 @@ def report_extraction(self, video_id):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()

mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
Expand Down Expand Up @@ -2391,8 +2364,6 @@ def report_extraction(self, video_id):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()

mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
Expand Down Expand Up @@ -2475,8 +2446,6 @@ def report_extraction(self, video_id):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()

mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
Expand Down Expand Up @@ -2561,8 +2530,6 @@ def report_extraction(self, video_id):
self._downloader.to_screen(u'[%s] %s: Extracting information' % (self.IE_NAME, video_id))

def _real_extract(self, url):
htmlParser = HTMLParser.HTMLParser()

mobj = re.match(self._VALID_URL, url)
if mobj is None:
self._downloader.trouble(u'ERROR: invalid URL: %s' % url)
Expand Down Expand Up @@ -2782,8 +2749,6 @@ def _real_extract(self, url):
info['format'] = info['ext']
return [info]
elif mobj.group('course'): # A course page
unescapeHTML = HTMLParser.HTMLParser().unescape

course = mobj.group('course')
info = {
'id': simplify_title(course),
Expand Down Expand Up @@ -2822,8 +2787,6 @@ def _real_extract(self, url):
return results

else: # Root page
unescapeHTML = HTMLParser.HTMLParser().unescape

info = {
'id': 'Stanford OpenClassroom',
'type': 'playlist',
Expand Down
83 changes: 80 additions & 3 deletions youtube_dl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,86 @@ def htmlentity_transform(matchobj):
return (u'&%s;' % entity)


class IDParser(HTMLParser.HTMLParser):
    """Modified HTMLParser that isolates a tag with the specified id.

    Usage: build the parser with the wanted id, call loads() with the
    document, then call get_result() to obtain the markup found between
    the opening and the closing tag of the first element carrying that id.
    """

    def __init__(self, id):
        self.id = id
        # None until a matching element is seen; then grows to
        # [tag, content start position, content end position], where the
        # positions are the (line, column) tuples returned by getpos().
        self.result = None
        # True while the parser is inside the matching element.
        self.started = False
        # Number of currently open tags, per tag name, counted from the
        # matching element on.
        self.depth = {}
        self.html = None
        # True between the matching opening tag and the next parser event.
        self.watch_startpos = False
        HTMLParser.HTMLParser.__init__(self)

    def loads(self, html):
        """Parse the whole document, keeping it for get_result()."""
        self.html = html
        self.feed(html)
        self.close()

    def handle_starttag(self, tag, attrs):
        attrs = dict(attrs)
        if self.started:
            self.find_startpos(None)
        # Only the first element with the requested id is tracked; without
        # the "result is None" guard a later (or nested) element reusing
        # the id would discard what has been collected so far.
        if self.result is None and attrs.get('id') == self.id:
            self.result = [tag]
            self.started = True
            self.watch_startpos = True
        if self.started:
            self.depth[tag] = self.depth.get(tag, 0) + 1

    def handle_endtag(self, tag):
        if self.started:
            # The closing tag may be the very first event after the opening
            # tag (empty or self-closing element); record the start position
            # here too so that get_result() yields '' instead of None.
            self.find_startpos(None)
            if tag in self.depth:
                self.depth[tag] -= 1
            if self.depth[self.result[0]] == 0:
                self.started = False
                self.result.append(self.getpos())

    def find_startpos(self, x):
        """Needed to put the start position of the result (self.result[1])
        after the opening tag with the requested id"""
        if self.watch_startpos:
            self.watch_startpos = False
            self.result.append(self.getpos())
    # Every other parser event can be the first one after the opening tag.
    handle_entityref = handle_charref = handle_data = handle_comment = \
    handle_decl = handle_pi = unknown_decl = find_startpos

    def get_result(self):
        """Return the stripped content of the matched element.

        Returns None when no element had the requested id or when its
        closing tag was never reached.
        """
        if self.result is None or len(self.result) != 3:
            return None
        (start_line, start_col), (end_line, end_col) = self.result[1:]
        # getpos() line numbers are 1-based, columns are 0-based.
        lines = self.html.split('\n')[start_line - 1:end_line]
        if len(lines) == 1:
            return lines[0][start_col:end_col].strip()
        lines[0] = lines[0][start_col:]
        lines[-1] = lines[-1][:end_col]
        return '\n'.join(lines).strip()

def get_element_by_id(id, html):
    """Return the content of the tag with the specified id in the passed HTML document"""
    id_parser = IDParser(id)
    try:
        id_parser.loads(html)
    except HTMLParser.HTMLParseError:
        # Best effort: whatever was collected before the parser gave up
        # is still handed to get_result() below.
        pass
    return id_parser.get_result()


def clean_html(html):
    """Clean an HTML snippet into a readable string.

    Literal newlines are turned into spaces, <br> tags (any letter case,
    with or without the closing slash) become newlines, every other tag
    is removed and the remaining text is passed through unescapeHTML().
    """
    # Newline vs <br />
    html = html.replace('\n', ' ')
    # Raw strings keep the \s escapes intact for the regex engine; (?i)
    # makes <BR> / <Br /> count as line breaks as well.
    html = re.sub(r'(?i)\s*<\s*br\s*/?\s*>\s*', '\n', html)
    # Strip html tags
    html = re.sub(r'<.*?>', '', html)
    # Replace html entities
    return unescapeHTML(html)


def sanitize_title(utitle):
    """Sanitizes a video title so it could be used as part of a filename."""
    # Decode HTML entities exactly once: a second pass would over-decode
    # titles such as u'&amp;lt;' (which must become u'&lt;', not u'<').
    utitle = unescapeHTML(utitle)
    # The path separator cannot appear inside a file name.
    return utitle.replace(unicode(os.sep), u'%')


Expand Down Expand Up @@ -133,8 +210,8 @@ def unescapeHTML(s):
"""
assert type(s) == type(u'')

htmlParser = HTMLParser.HTMLParser()
return htmlParser.unescape(s)
result = re.sub(ur'(?u)&(.+?);', htmlentity_transform, s)
return result

def encodeFilename(s):
"""
Expand Down

1 comment on commit 9e6dd23

@remitamine
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is the HTML unescape that ships with Python (http://stackoverflow.com/a/2360639) not used? It can handle two cases that unescapeHTML can't (&apos; and the references that start with &#X), and it was improved in the latest version (it now handles HTML5 named character references).

Please sign in to comment.