[compat] compat_etree_fromstring: also decode the text attribute

Deletes parse_xml from utils, because compat_etree_fromstring now performs the same text decoding.
This commit is contained in:
Jaime Marquínez Ferrándiz 2015-10-26 16:41:24 +01:00
parent 387db16a78
commit f78546272c
5 changed files with 30 additions and 30 deletions

View File

@ -74,10 +74,19 @@ class TestCompat(unittest.TestCase):
self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two']) self.assertEqual(compat_shlex_split('-option "one two"'), ['-option', 'one two'])
def test_compat_etree_fromstring(self): def test_compat_etree_fromstring(self):
xml = '<el foo="bar" spam="中文"></el>' xml = '''
<root foo="bar" spam="中文">
<normal>foo</normal>
<chinese>中文</chinese>
<foo><bar>spam</bar></foo>
</root>
'''
doc = compat_etree_fromstring(xml.encode('utf-8')) doc = compat_etree_fromstring(xml.encode('utf-8'))
self.assertTrue(isinstance(doc.attrib['foo'], compat_str)) self.assertTrue(isinstance(doc.attrib['foo'], compat_str))
self.assertTrue(isinstance(doc.attrib['spam'], compat_str)) self.assertTrue(isinstance(doc.attrib['spam'], compat_str))
self.assertTrue(isinstance(doc.find('normal').text, compat_str))
self.assertTrue(isinstance(doc.find('chinese').text, compat_str))
self.assertTrue(isinstance(doc.find('foo/bar').text, compat_str))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@ -216,9 +216,19 @@ except ImportError: # Python 2.6
if sys.version_info[0] >= 3: if sys.version_info[0] >= 3:
compat_etree_fromstring = xml.etree.ElementTree.fromstring compat_etree_fromstring = xml.etree.ElementTree.fromstring
else: else:
# on python 2.x the the attributes of a node aren't always unicode objects # on python 2.x the attributes and text of a node aren't always unicode
# objects
etree = xml.etree.ElementTree etree = xml.etree.ElementTree
def _etree_iter_fallback(root):
    """Yield root and then all of its descendants, in document order.

    Stand-in for Element.iter() on interpreters where it is unavailable
    (Python <=2.6). Element.iter() starts with the element it is called
    on, so root is yielded here as well; otherwise code that visits every
    node through _etree_iter would skip the root element on those
    interpreters only.
    """
    yield root
    for child in root.findall('*'):
        # No `yield from`: this code path has to run on Python 2.
        for el in _etree_iter_fallback(child):
            yield el


# Prefer the native iterator; the attribute is missing on Python <=2.6.
_etree_iter = getattr(etree.Element, 'iter', _etree_iter_fallback)
# on 2.6 XML doesn't have a parser argument, function copied from CPython # on 2.6 XML doesn't have a parser argument, function copied from CPython
# 2.7 source # 2.7 source
def _XML(text, parser=None): def _XML(text, parser=None):
@ -235,7 +245,11 @@ else:
return el return el
def compat_etree_fromstring(text): def compat_etree_fromstring(text):
return _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory))) doc = _XML(text, parser=etree.XMLParser(target=etree.TreeBuilder(element_factory=_element_factory)))
for el in _etree_iter(doc):
if el.text is not None and isinstance(el.text, bytes):
el.text = el.text.decode('utf-8')
return doc
try: try:
from urllib.parse import parse_qs as compat_parse_qs from urllib.parse import parse_qs as compat_parse_qs

View File

@ -14,8 +14,8 @@ from ..utils import (
parse_duration, parse_duration,
unified_strdate, unified_strdate,
xpath_text, xpath_text,
parse_xml,
) )
from ..compat import compat_etree_fromstring
class ARDMediathekIE(InfoExtractor): class ARDMediathekIE(InfoExtractor):
@ -161,7 +161,7 @@ class ARDMediathekIE(InfoExtractor):
raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True) raise ExtractorError('This program is only suitable for those aged 12 and older. Video %s is therefore only available between 20 pm and 6 am.' % video_id, expected=True)
if re.search(r'[\?&]rss($|[=&])', url): if re.search(r'[\?&]rss($|[=&])', url):
doc = parse_xml(webpage) doc = compat_etree_fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss': if doc.tag == 'rss':
return GenericIE()._extract_rss(url, video_id, doc) return GenericIE()._extract_rss(url, video_id, doc)

View File

@ -9,6 +9,7 @@ import sys
from .common import InfoExtractor from .common import InfoExtractor
from .youtube import YoutubeIE from .youtube import YoutubeIE
from ..compat import ( from ..compat import (
compat_etree_fromstring,
compat_urllib_parse_unquote, compat_urllib_parse_unquote,
compat_urllib_request, compat_urllib_request,
compat_urlparse, compat_urlparse,
@ -21,7 +22,6 @@ from ..utils import (
HEADRequest, HEADRequest,
is_html, is_html,
orderedSet, orderedSet,
parse_xml,
smuggle_url, smuggle_url,
unescapeHTML, unescapeHTML,
unified_strdate, unified_strdate,
@ -1237,7 +1237,7 @@ class GenericIE(InfoExtractor):
# Is it an RSS feed, a SMIL file or a XSPF playlist? # Is it an RSS feed, a SMIL file or a XSPF playlist?
try: try:
doc = parse_xml(webpage) doc = compat_etree_fromstring(webpage.encode('utf-8'))
if doc.tag == 'rss': if doc.tag == 'rss':
return self._extract_rss(url, video_id, doc) return self._extract_rss(url, video_id, doc)
elif re.match(r'^(?:{[^}]+})?smil$', doc.tag): elif re.match(r'^(?:{[^}]+})?smil$', doc.tag):

View File

@ -1652,29 +1652,6 @@ def encode_dict(d, encoding='utf-8'):
return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items()) return dict((k.encode(encoding), v.encode(encoding)) for k, v in d.items())
try:
etree_iter = xml.etree.ElementTree.Element.iter
except AttributeError: # Python <=2.6
etree_iter = lambda n: n.findall('.//*')
def parse_xml(s):
class TreeBuilder(xml.etree.ElementTree.TreeBuilder):
def doctype(self, name, pubid, system):
pass # Ignore doctypes
parser = xml.etree.ElementTree.XMLParser(target=TreeBuilder())
kwargs = {'parser': parser} if sys.version_info >= (2, 7) else {}
tree = xml.etree.ElementTree.XML(s.encode('utf-8'), **kwargs)
# Fix up XML parser in Python 2.x
if sys.version_info < (3, 0):
for n in etree_iter(tree):
if n.text is not None:
if not isinstance(n.text, compat_str):
n.text = n.text.decode('utf-8')
return tree
US_RATINGS = { US_RATINGS = {
'G': 0, 'G': 0,
'PG': 10, 'PG': 10,