git.phdru.name Git - bookmarks_db.git/commitdiff
Some sites put TITLE in HTML outside of HEAD.
author Oleg Broytman <phd@phdru.name>
Tue, 8 Jan 2008 16:01:08 +0000 (16:01 +0000)
committer Oleg Broytman <phd@phdru.name>
Tue, 8 Jan 2008 16:01:08 +0000 (16:01 +0000)
git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@158 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23

Robots/parse_html_beautifulsoup.py

index 209486c4054f015a179d4469ccaa5adc3f8bbe8f..3d5c44a3f4464c699bd2b2f6090963d936944ae6 100644 (file)
@@ -72,6 +72,14 @@ def parse_html(filename, charset=None):
    except AttributeError:
       title = '' # HEAD but no TITLE
 
+   if not title:
+      head = root.html # Some sites put TITLE in HTML outside of HEAD
+
+   try:
+      title = head.title.string.encode(_charset)
+   except AttributeError:
+      title = '' # HEAD but no TITLE
+
    meta = head.find(_find_refresh, recursive=False)
    if meta:
       refresh = meta.get("content")