From: Oleg Broytman Date: Thu, 12 Aug 2010 15:09:34 +0000 (+0000) Subject: Try parser in order until the first one finds a title. X-Git-Tag: v4.5.3~106 X-Git-Url: https://git.phdru.name/?a=commitdiff_plain;h=f8c96fe94544f486e9e76640d16fc355daa4db70;p=bookmarks_db.git Try parser in order until the first one finds a title. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@270 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- diff --git a/Robots/parse_html.py b/Robots/parse_html.py index 23f53e4..bc9a8d4 100755 --- a/Robots/parse_html.py +++ b/Robots/parse_html.py @@ -12,13 +12,6 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic parsers = [] -try: - import parse_html_html5 -except ImportError: - pass -else: - parsers.append(parse_html_html5.parse_html) - try: import parse_html_beautifulsoup parse_html_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET @@ -34,6 +27,13 @@ except ImportError: else: parsers.append(parse_html) +try: + import parse_html_html5 +except ImportError: + pass +else: + parsers.append(parse_html_html5.parse_html) + import re from htmlentitydefs import name2codepoint @@ -94,6 +94,9 @@ def parse_html(filename, charset=None, log=None): else: if log: log("Parser %s.%s failed, trying next one." % (p.__module__, p.__name__)) + if not parser: + return None + converted_title = title = parser.title if title and (not parser.charset): try: diff --git a/Robots/parse_html_beautifulsoup.py b/Robots/parse_html_beautifulsoup.py index 8b26d63..a7df16f 100644 --- a/Robots/parse_html_beautifulsoup.py +++ b/Robots/parse_html_beautifulsoup.py @@ -77,16 +77,18 @@ def parse_html(filename, charset=None, log=None): # Lookup TITLE in the root title = root.title - if title is not None: - if title.string: - title = title.string.encode(_charset) - else: - parts = [] - for part in title: - if not isinstance(part, basestring): - part = unicode(part) - parts.append(part.strip()) - title = ''.join(parts).encode(_charset) + if title is None: + return None + + if title.string: + title = title.string.encode(_charset) + else: + parts = [] + for part in title: + if not isinstance(part, basestring): + part = unicode(part) + parts.append(part.strip()) + title = ''.join(parts) meta = head.find(_find_contenttype, recursive=False) if meta: @@ -103,6 +105,9 @@ def parse_html(filename, charset=None, log=None): else: meta_charset = False + if charset or meta_charset: + title = title.encode(charset or meta_charset) + meta = head.find(_find_refresh, recursive=False) if meta: refresh = meta.get("content") diff --git a/Robots/parse_html_etreetidy.py b/Robots/parse_html_etreetidy.py index 65d42ae..5f8bd86 100644 --- a/Robots/parse_html_etreetidy.py +++ b/Robots/parse_html_etreetidy.py @@ -20,19 +20,25 @@ def parse_html(filename, charset=None, log=None): if elem.tag.startswith(XHTML): elem.tag = elem.tag[len(XHTML):] + title = html_tree.findtext('head/title') + if title is None: + return None + meta = html_tree.findall('head/meta') for m in meta: if m.get('http-equiv', '').lower() == 'content-type': meta_content = m.get("content") if meta_content: - meta_charset = \ - meta_content.lower().split('charset=')[1].split(';')[0] - break + try: + meta_charset = \ + meta_content.lower().split('charset=')[1].split(';')[0] + break + except IndexError: + meta_charset = False else: meta_charset = False - title = html_tree.findtext('head/title') - if title and (charset or meta_charset): + if charset or meta_charset: title = title.encode(charset or meta_charset) for m in meta: diff --git a/Robots/parse_html_html5.py b/Robots/parse_html_html5.py index 6255825..2302051 100644 --- a/Robots/parse_html_html5.py +++ b/Robots/parse_html_html5.py @@ -38,6 +38,17 @@ def parse_html(filename, charset=None, log=None): icon = None if head: + for node in head.childNodes: + if node.name == 'title': + if node.childNodes: + title = node.childNodes[0].value + break + else: + title = '' + + if title is None: + return None + for node in head.childNodes: if node.name == 'meta' and \ ('http-equiv' in node.attributes) and \ @@ -52,18 +63,10 @@ def parse_html(filename, charset=None, log=None): else: break - for node in head.childNodes: - if node.name == 'title': - if node.childNodes: - title = node.childNodes[0].value - break - else: - title = '' - if not charset: charset = parser.tokenizer.stream.charEncoding[0] - if title and (charset or meta_charset): + if charset or meta_charset: title = title.encode(charset or meta_charset) for node in head.childNodes: diff --git a/Robots/parse_html_htmlparser.py b/Robots/parse_html_htmlparser.py index df37f75..351fc74 100644 --- a/Robots/parse_html_htmlparser.py +++ b/Robots/parse_html_htmlparser.py @@ -90,4 +90,7 @@ def parse_html(filename, charset=None, log=None): except (HTMLParseError, HTMLHeadDone): pass + if parser.title is None: + return None + return parser