From 80054d3bdcbde67d3b941e0b69b55ca4998f4673 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Sun, 23 Sep 2012 22:54:49 +0000 Subject: [PATCH] Extract html redirect even if there is no title git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@366 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- doc/ANNOUNCE | 2 ++ parse_html/bkmk_ph_beautifulsoup.py | 26 +++++++++++++------------- parse_html/bkmk_ph_etreetidy.py | 6 +++--- parse_html/bkmk_ph_html5.py | 10 +++------- parse_html/bkmk_ph_htmlparser.py | 3 +-- parse_html/bkmk_ph_lxml.py | 6 +++--- 6 files changed, 25 insertions(+), 28 deletions(-) diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE index fd50ef5..57b0f69 100644 --- a/doc/ANNOUNCE +++ b/doc/ANNOUNCE @@ -13,6 +13,8 @@ WHAT'S NEW in version 4.5.2 (2012-??-??). Fixed a bug in handling place: URIs (do not append '//'). + Extract html redirect even if there is no title. + WHAT'S NEW in version 4.5.1 (2011-12-28). diff --git a/parse_html/bkmk_ph_beautifulsoup.py b/parse_html/bkmk_ph_beautifulsoup.py index 437f67b..a0ef6af 100644 --- a/parse_html/bkmk_ph_beautifulsoup.py +++ b/parse_html/bkmk_ph_beautifulsoup.py @@ -84,18 +84,16 @@ def parse_html(filename, charset=None, log=None): # Lookup TITLE in the root title = root.title - if title is None: - return None - - if title.string: - title = title.string - else: - parts = [] - for part in title: - if not isinstance(part, basestring): - part = unicode(part) - parts.append(part.strip()) - title = ''.join(parts) + if title is not None: + if title.string: + title = title.string + else: + parts = [] + for part in title: + if not isinstance(part, basestring): + part = unicode(part) + parts.append(part.strip()) + title = ''.join(parts) meta = head.find(_find_contenttype, recursive=False) if meta: @@ -112,7 +110,7 @@ def parse_html(filename, charset=None, log=None): else: meta_charset = False - if _charset or meta_charset: + if title and (_charset or meta_charset): title = title.encode(_charset or meta_charset) meta = 
head.find(_find_refresh, recursive=False) @@ -127,6 +125,8 @@ def parse_html(filename, charset=None, log=None): else: icon = None + if (title is None) and (refresh is None) and (icon is None): + return None return HTMLParser(_charset, meta_charset, title, refresh, icon) def _find_contenttype(Tag): diff --git a/parse_html/bkmk_ph_etreetidy.py b/parse_html/bkmk_ph_etreetidy.py index f5e794c..c823dfa 100644 --- a/parse_html/bkmk_ph_etreetidy.py +++ b/parse_html/bkmk_ph_etreetidy.py @@ -29,8 +29,6 @@ def parse_html(filename, charset=None, log=None): title = html_tree.findtext('head/title') if title is None: title = html_tree.findtext('title') - if title is None: - return None meta = html_tree.findall('head/meta') for m in meta: @@ -46,7 +44,7 @@ def parse_html(filename, charset=None, log=None): else: meta_charset = False - if charset or meta_charset: + if title and (charset or meta_charset): title = title.encode(charset or meta_charset) for m in meta: @@ -63,4 +61,6 @@ def parse_html(filename, charset=None, log=None): else: icon = None + if (title is None) and (refresh is None) and (icon is None): + return None return HTMLParser(charset, meta_charset, title, refresh, icon) diff --git a/parse_html/bkmk_ph_html5.py b/parse_html/bkmk_ph_html5.py index 0e86477..a490628 100644 --- a/parse_html/bkmk_ph_html5.py +++ b/parse_html/bkmk_ph_html5.py @@ -52,9 +52,6 @@ def parse_html(filename, charset=None, log=None): else: title = '' - if title is None: - return None - for node in head.childNodes: if node.name == 'meta' and \ ('http-equiv' in node.attributes) and \ @@ -72,7 +69,7 @@ def parse_html(filename, charset=None, log=None): if not charset: charset = parser.tokenizer.stream.charEncoding[0] - if charset or meta_charset: + if title and (charset or meta_charset): title = title.encode(charset or meta_charset) for node in head.childNodes: @@ -98,7 +95,6 @@ def parse_html(filename, charset=None, log=None): else: title = '' - if title is None: - return None - + if (title is None) 
and (refresh is None) and (icon is None): + return None return HTMLParser(charset, meta_charset, title, refresh, icon) diff --git a/parse_html/bkmk_ph_htmlparser.py b/parse_html/bkmk_ph_htmlparser.py index 5c0a440..8cdd240 100644 --- a/parse_html/bkmk_ph_htmlparser.py +++ b/parse_html/bkmk_ph_htmlparser.py @@ -93,7 +93,6 @@ def parse_html(filename, charset=None, log=None): except (HTMLParseError, HTMLHeadDone): pass - if parser.title is None: + if (parser.title is None) and (parser.refresh is None) and (parser.icon is None): return None - return parser diff --git a/parse_html/bkmk_ph_lxml.py b/parse_html/bkmk_ph_lxml.py index 26c9268..b14be40 100644 --- a/parse_html/bkmk_ph_lxml.py +++ b/parse_html/bkmk_ph_lxml.py @@ -23,8 +23,6 @@ def parse_html(filename, charset=None, log=None): title = html_tree.findtext('head/title') if title is None: title = html_tree.findtext('title') - if title is None: - return None meta = html_tree.findall('head/meta') for m in meta: @@ -40,7 +38,7 @@ def parse_html(filename, charset=None, log=None): else: meta_charset = False - if charset or meta_charset: + if title and (charset or meta_charset): title = title.encode(charset or meta_charset) for m in meta: @@ -57,4 +55,6 @@ def parse_html(filename, charset=None, log=None): else: icon = None + if (title is None) and (refresh is None) and (icon is None): + return None return HTMLParser(charset, meta_charset, title, refresh, icon) -- 2.39.2