X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fparse_html_beautifulsoup.py;h=3408def7162a50deec9fee0f4894fdff5e37c9ea;hb=4c8bc9dc1f441cf9256ce3a933f51213a0c3c3ff;hp=e0129fd1d4cb3c7b34f69f8c772946c04afab645;hpb=66ef97e93d66c741926db216c29dad6047c5d7f4;p=bookmarks_db.git diff --git a/Robots/parse_html_beautifulsoup.py b/Robots/parse_html_beautifulsoup.py index e0129fd..3408def 100644 --- a/Robots/parse_html_beautifulsoup.py +++ b/Robots/parse_html_beautifulsoup.py @@ -20,37 +20,31 @@ class BSoupParser(HTMLParser): def parse_html(filename, charset=None): infile = open(filename, 'r') - root = BeautifulSoup(infile, fromEncoding=charset) + try: + root = BeautifulSoup(infile, fromEncoding=charset) + except TypeError: + return None infile.close() - charset = root.originalEncoding + _charset = root.originalEncoding try: - title = root.html.head.title.string.encode(charset) + title = root.html.head.title.string.encode(_charset) except AttributeError: - title = '' + return None - try: - meta = root.html.head.find(_find_refresh, recursive=False) - except AttributeError: - refresh = None + meta = root.html.head.find(_find_refresh, recursive=False) + if meta: + refresh = meta.get("content") else: - if meta: - refresh = meta.get("content") - else: - refresh = None + refresh = None - try: - meta = root.html.head.find(_find_icon, recursive=False) - except AttributeError: - icon = None + meta = root.html.head.find(_find_icon, recursive=False) + if meta: + icon = meta.get("href") else: - if meta: - icon = meta.get("href") - else: - icon = None + icon = None - parser = BSoupParser(charset, False, title, refresh, icon) - return parser + return BSoupParser(_charset, _charset != charset, title, refresh, icon) def _find_refresh(Tag): return (Tag.name == "meta") and \