parsers = []
-try:
- import parse_html_html5
-except ImportError:
- pass
-else:
- parsers.append(parse_html_html5.parse_html)
-
try:
import parse_html_beautifulsoup
parse_html_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET
else:
parsers.append(parse_html)
+try:
+ import parse_html_html5
+except ImportError:
+ pass
+else:
+ parsers.append(parse_html_html5.parse_html)
+
import re
from htmlentitydefs import name2codepoint
else:
if log: log("Parser %s.%s failed, trying next one." % (p.__module__, p.__name__))
+ if not parser:
+ return None
+
converted_title = title = parser.title
if title and (not parser.charset):
try:
# Lookup TITLE in the root
title = root.title
- if title is not None:
- if title.string:
- title = title.string.encode(_charset)
- else:
- parts = []
- for part in title:
- if not isinstance(part, basestring):
- part = unicode(part)
- parts.append(part.strip())
- title = ''.join(parts).encode(_charset)
+ if title is None:
+ return None
+
+ if title.string:
+ title = title.string.encode(_charset)
+ else:
+ parts = []
+ for part in title:
+ if not isinstance(part, basestring):
+ part = unicode(part)
+ parts.append(part.strip())
+ title = ''.join(parts)
meta = head.find(_find_contenttype, recursive=False)
if meta:
else:
meta_charset = False
+ if charset or meta_charset:
+ title = title.encode(charset or meta_charset)
+
meta = head.find(_find_refresh, recursive=False)
if meta:
refresh = meta.get("content")
if elem.tag.startswith(XHTML):
elem.tag = elem.tag[len(XHTML):]
+ title = html_tree.findtext('head/title')
+ if title is None:
+ return None
+
meta = html_tree.findall('head/meta')
for m in meta:
if m.get('http-equiv', '').lower() == 'content-type':
meta_content = m.get("content")
if meta_content:
- meta_charset = \
- meta_content.lower().split('charset=')[1].split(';')[0]
- break
+ try:
+ meta_charset = \
+ meta_content.lower().split('charset=')[1].split(';')[0]
+ break
+ except IndexError:
+ meta_charset = False
else:
meta_charset = False
- title = html_tree.findtext('head/title')
- if title and (charset or meta_charset):
+ if charset or meta_charset:
title = title.encode(charset or meta_charset)
for m in meta:
icon = None
if head:
+ for node in head.childNodes:
+ if node.name == 'title':
+ if node.childNodes:
+ title = node.childNodes[0].value
+ break
+ else:
+ title = ''
+
+ if title is None:
+ return None
+
for node in head.childNodes:
if node.name == 'meta' and \
('http-equiv' in node.attributes) and \
else:
break
- for node in head.childNodes:
- if node.name == 'title':
- if node.childNodes:
- title = node.childNodes[0].value
- break
- else:
- title = ''
-
if not charset:
charset = parser.tokenizer.stream.charEncoding[0]
- if title and (charset or meta_charset):
+ if charset or meta_charset:
title = title.encode(charset or meta_charset)
for node in head.childNodes:
except (HTMLParseError, HTMLHeadDone):
pass
+ if parser.title is None:
+ return None
+
return parser