From: Oleg Broytman Date: Mon, 3 Mar 2008 21:23:58 +0000 (+0000) Subject: In the default hierarchy "root > html > head > title" any part can be absent. X-Git-Tag: v4.5.3~184 X-Git-Url: https://git.phdru.name/?a=commitdiff_plain;h=046c380e26f22347d73829114176e0bc30637bf1;p=bookmarks_db.git In the default hierarchy "root > html > head > title" any part can be absent. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@192 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- diff --git a/Robots/parse_html_beautifulsoup.py b/Robots/parse_html_beautifulsoup.py index 25719ca..3c52ce4 100644 --- a/Robots/parse_html_beautifulsoup.py +++ b/Robots/parse_html_beautifulsoup.py @@ -60,13 +60,22 @@ def parse_html(filename, charset=None, log=None): infile.close() try: - head = root.html.head + html = root.html except AttributeError: - if log: log("No HTML in root or no HEAD in HTML") - return None + if log: log("No HTML in root") + html = root + + if html is None: + html = root + + try: + head = html.head + except AttributeError: + if log: log("No HEAD in HTML") + head = html if head is None: - head = root.html # Some sites put TITLE in HTML without HEAD + head = html # Some sites put TITLE in HTML without HEAD _charset = root.originalEncoding if _charset == "windows-1252": # Replace default @@ -77,11 +86,11 @@ def parse_html(filename, charset=None, log=None): except AttributeError: title = '' # HEAD but no TITLE - if (not title) and (head is not root.html): + if (not title) and (head is not html): # Some sites put TITLE in HTML outside of HEAD try: - title = root.html.title.string.encode(_charset) + title = html.title.string.encode(_charset) except AttributeError: title = '' # no TITLE in HTML too