X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=Robots%2Fparse_html_beautifulsoup.py;h=8c51d0500b6e78d19475f1b25e6c29c250b3c8c9;hb=fd49fd89d9ad5570a0f98d2ca63178e81d999a4b;hp=4f395a16507e58d5e9a600de00b9056cb2aeb94b;hpb=d2499bf060be42a28feebde2e8bded52504ced95;p=bookmarks_db.git diff --git a/Robots/parse_html_beautifulsoup.py b/Robots/parse_html_beautifulsoup.py index 4f395a1..8c51d05 100644 --- a/Robots/parse_html_beautifulsoup.py +++ b/Robots/parse_html_beautifulsoup.py @@ -4,10 +4,11 @@ Written by BroytMann. Copyright (C) 2007 PhiloSoft Design """ +from HTMLParser import HTMLParser from BeautifulSoup import BeautifulSoup -class DummyParser(object): +class BSoupParser(HTMLParser): def __init__(self, charset, meta, title, refresh, icon): object.__init__(self) self.charset = charset @@ -16,14 +17,15 @@ class DummyParser(object): self.refresh = refresh self.icon = icon + def parse_html(filename, charset=None): infile = open(filename, 'r') root = BeautifulSoup(infile, fromEncoding=charset) infile.close() - charset = root.originalEncoding + _charset = root.originalEncoding try: - title = root.html.head.title.string.encode(charset) + title = root.html.head.title.string.encode(_charset) except AttributeError: title = '' @@ -47,8 +49,7 @@ def parse_html(filename, charset=None): else: icon = None - parser = DummyParser(charset, False, title, refresh, icon) - return parser + return BSoupParser(_charset, _charset != charset, title, refresh, icon) def _find_refresh(Tag): return (Tag.name == "meta") and \