X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=parse_html%2Fbkmk_ph_beautifulsoup.py;h=e1969f3250aa73d6e46e24883dd264f6df7ccd94;hb=9faa13f6f8199790cf01533e857c593520559649;hp=1479f034255c4c5b4e53b872ea5b0c157b4a74c6;hpb=4255ead7de9ed5069f94b90cb134a077387d43c1;p=bookmarks_db.git diff --git a/parse_html/bkmk_ph_beautifulsoup.py b/parse_html/bkmk_ph_beautifulsoup.py index 1479f03..e1969f3 100644 --- a/parse_html/bkmk_ph_beautifulsoup.py +++ b/parse_html/bkmk_ph_beautifulsoup.py @@ -19,6 +19,8 @@ from .bkmk_ph_util import HTMLParser DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic # http://groups.google.com/group/beautifulsoup/browse_thread/thread/69093cb0d3a3cf63 + + class BadDeclParser(BeautifulSoup): def parse_declaration(self, i): """Treat a bogus SGML declaration as raw data. Treat a CDATA @@ -56,6 +58,7 @@ def _parse_html(html_text, charset): except TypeError: return None + def parse_html(html_text, charset=None, log=None): root = _parse_html(html_text, charset) if root is None: @@ -137,17 +140,21 @@ def parse_html(html_text, charset=None, log=None): return None return HTMLParser(_charset, meta_charset, title, refresh, icon) + def _find_contenttype(Tag): return (Tag.name == "meta") and \ (Tag.get("http-equiv", '').lower() == "content-type") + def _find_charset(Tag): return (Tag.name == "meta") and Tag.get("charset", '') + def _find_refresh(Tag): return (Tag.name == "meta") and \ (Tag.get("http-equiv", '').lower() == "refresh") + def _find_icon(Tag): return (Tag.name == "link") and \ (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))