From 046c380e26f22347d73829114176e0bc30637bf1 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Mon, 3 Mar 2008 21:23:58 +0000 Subject: [PATCH] In the default hierarchy "root > html > head > title" any part can be absent. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@192 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- Robots/parse_html_beautifulsoup.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/Robots/parse_html_beautifulsoup.py b/Robots/parse_html_beautifulsoup.py index 25719ca..3c52ce4 100644 --- a/Robots/parse_html_beautifulsoup.py +++ b/Robots/parse_html_beautifulsoup.py @@ -60,13 +60,22 @@ def parse_html(filename, charset=None, log=None): infile.close() try: - head = root.html.head + html = root.html except AttributeError: - if log: log("No HTML in root or no HEAD in HTML") - return None + if log: log("No HTML in root") + html = root + + if html is None: + html = root + + try: + head = html.head + except AttributeError: + if log: log("No HEAD in HTML") + head = html if head is None: - head = root.html # Some sites put TITLE in HTML without HEAD + head = html # Some sites put TITLE in HTML without HEAD _charset = root.originalEncoding if _charset == "windows-1252": # Replace default @@ -77,11 +86,11 @@ def parse_html(filename, charset=None, log=None): except AttributeError: title = '' # HEAD but no TITLE - if (not title) and (head is not root.html): + if (not title) and (head is not html): # Some sites put TITLE in HTML outside of HEAD try: - title = root.html.title.string.encode(_charset) + title = html.title.string.encode(_charset) except AttributeError: title = '' # no TITLE in HTML too -- 2.39.2