# parser.charset = 'ascii'
converted_title = title = parser.title
- if title and (not parser.charset):
+ if title and isinstance(title, bytes) and (not parser.charset):
try:
title.decode("ascii")
except UnicodeDecodeError:
__all__ = ['parse_html']
+import warnings
+
from bs4 import BeautifulSoup
from .bkmk_ph_util import HTMLParser
from compat import string_type
# Install two process-wide "ignore" warning filters at import time.
# The second positional argument of warnings.filterwarnings is a regular
# expression matched against the start of the warning message.
# NOTE(review): both messages look like the ones BeautifulSoup emits when no
# parser is named / when XML is fed to an HTML parser -- confirm against the
# installed bs4 version.
+warnings.filterwarnings(
+ 'ignore', 'No parser was explicitly specified')
+warnings.filterwarnings(
+ 'ignore',
+ "It looks like you're parsing an XML document using an HTML parser.")
+
universal_charset = "utf-8"
DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
# Build a BeautifulSoup tree from html_text; return None if the
# BeautifulSoup constructor raises TypeError.
# Old side ("-"): from_encoding=charset was always passed.
# New side ("+"): from_encoding=charset is passed only when html_text is
# bytes; any other input is handed to BeautifulSoup without an encoding hint.
# NOTE(review): presumably because from_encoding is meaningless for text that
# is already decoded -- confirm against the bs4 documentation.
def _parse_html(html_text, charset):
try:
- return BeautifulSoup(html_text, from_encoding=charset)
+ if isinstance(html_text, bytes):
+ return BeautifulSoup(html_text, from_encoding=charset)
+ else:
+ return BeautifulSoup(html_text)
except TypeError:
return None
def parse_html(html_text, charset=None, log=None):
parser = HTML5Parser()
- html_tree = parser.parse(
- html_text, encoding=charset, parseMeta=bool(charset))
+ if isinstance(html_text, bytes):
+ html_tree = parser.parse(
+ html_text, encoding=charset, parseMeta=bool(charset))
+ else:
+ html_tree = parser.parse(html_text)
html = None
if hasattr(html_tree, 'childNodes'):
__all__ = ['parse_html']
-from HTMLParser import HTMLParseError
+try:
+ from HTMLParser import HTMLParseError
+except ImportError:
+ class HTMLParseError(Exception): pass
from m_lib.net.www.html import HTMLParser as _HTMLParser
__all__ = ['parse_html']
-from lxml.html import fromtring
+import re
+from lxml.html import fromstring
from .bkmk_ph_util import HTMLParser
def parse_html(html_text, charset=None, log=None):
- html_tree = fromtring(html_text)
-
- if html_tree.getroot() is None:
- return None
+ try:
+ html_tree = fromstring(html_text)
+ except ValueError as e:
+ if e.args[0].startswith(
+ 'Unicode strings with encoding declaration are not supported.'
+ ' Please use bytes input'
+ ):
+ if not charset:
+ match = re.search(
+ '<\\?xml version="(\\d|.)+" encoding="([^"]+)"\\?>',
+ html_text, re.U)
+ if match:
+ charset = match.group(2)
+ if charset:
+ html_text = html_text.encode(charset)
+ html_tree = fromstring(html_text)
+ else:
+ return None
+ else:
+ raise
title = html_tree.findtext('head/title')
if title is None:
__all__ = ['HTMLParser']
-from HTMLParser import HTMLParser
+from m_lib.net.www.html import HTMLParser as _HTMLParser
-class HTMLParser(HTMLParser):
+class HTMLParser(_HTMLParser):
def __init__(self, charset, meta_charset, title, refresh, icon):
object.__init__(self)
self.charset = charset