From 276cd9e51625c346acb375fcc1be52ea6bbc11b1 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Tue, 8 Jan 2008 16:01:08 +0000 Subject: [PATCH] Some sites put TITLE in HTML outside of HEAD. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@158 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- Robots/parse_html_beautifulsoup.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Robots/parse_html_beautifulsoup.py b/Robots/parse_html_beautifulsoup.py index 209486c..3d5c44a 100644 --- a/Robots/parse_html_beautifulsoup.py +++ b/Robots/parse_html_beautifulsoup.py @@ -72,6 +72,14 @@ def parse_html(filename, charset=None): except AttributeError: title = '' # HEAD but no TITLE + if not title: + head = root.html # Some sites put TITLE in HTML outside of HEAD + + try: + title = head.title.string.encode(_charset) + except AttributeError: + title = '' # HEAD but no TITLE + meta = head.find(_find_refresh, recursive=False) if meta: refresh = meta.get("content") -- 2.39.5