From 11632b7623b2b2e01995f013bc6d8ba01c20cf74 Mon Sep 17 00:00:00 2001
From: Oleg Broytman
Date: Sun, 25 Jun 2017 18:18:31 +0300
Subject: [PATCH] HTML parser based on BeautifulSoup4

---
Review notes (text between "---" and the diffstat is ignored by git am):

* The line structure of this patch was restored from a copy whose
  newlines and indentation had been collapsed. Indentation follows the
  Python syntax of the payload; context lines were re-split on a
  best-effort basis, so verify them against the tree before applying.
* bkmk_ph_beautifulsoup4.py now reads root.original_encoding.
  "originalEncoding" is the BeautifulSoup 3 spelling; on a bs4 object the
  unknown attribute is treated as a child-tag lookup and yields None, so
  the detected encoding was never picked up.
* _find_icon() now joins the "rel" tokens before comparing. bs4 HTML
  builders split rel="shortcut icon" into ['shortcut', 'icon'], so
  testing only the first token could never match 'shortcut icon'.
* Known leftovers, intentionally untouched: "import re" is unused; an
  empty META content sets _charset to False and discards the detected
  encoding; title.encode() may raise LookupError for an unknown charset.
* Blob ids in the "index" lines refer to the original revision.

 doc/TODO                             |   2 -
 parse_html/bkmk_parse_html.py        |  19 +++++
 parse_html/bkmk_ph_beautifulsoup4.py | 114 +++++++++++++++++++++++++++
 3 files changed, 133 insertions(+), 2 deletions(-)
 create mode 100644 parse_html/bkmk_ph_beautifulsoup4.py

diff --git a/doc/TODO b/doc/TODO
index e4fe652..deb79ca 100644
--- a/doc/TODO
+++ b/doc/TODO
@@ -1,5 +1,3 @@
-HTML parser based on BeautifulSoup4. Bs3 for Python 2, bs4 for Py3.
-
 Replace subproc.py with some IPC. Or update for Python 3.
 
 Python 3.
diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py
index 7bc4640..fa72e1f 100644
--- a/parse_html/bkmk_parse_html.py
+++ b/parse_html/bkmk_parse_html.py
@@ -18,6 +18,14 @@ DEFAULT_CHARSET = "cp1251"  # Stupid default for Russian Cyrillic
 
 parsers = []
 
+try:
+    from . import bkmk_ph_beautifulsoup4
+except ImportError:
+    pass
+else:
+    bkmk_ph_beautifulsoup4.DEFAULT_CHARSET = DEFAULT_CHARSET
+    parsers.append(bkmk_ph_beautifulsoup4.parse_html)
+
 try:
     from . import bkmk_ph_beautifulsoup
 except ImportError:
@@ -136,6 +144,17 @@ def parse_html(html_text, charset=None, log=None):
     p, parser = _parsers[0]
     if log: log(" Using %s" % p.__module__)
 
+    title = parser.title
+    if isinstance(title, unicode):
+        if parser.charset:
+            parser.title = title.encode(parser.charset)
+        else:
+            try:
+                parser.title = title.encode('ascii')
+            except UnicodeEncodeError:
+                parser.charset = DEFAULT_CHARSET
+                parser.title = title.encode(DEFAULT_CHARSET)
+
     converted_title = title = parser.title
     if title and (not parser.charset):
         try:
diff --git a/parse_html/bkmk_ph_beautifulsoup4.py b/parse_html/bkmk_ph_beautifulsoup4.py
new file mode 100644
index 0000000..aad3f8f
--- /dev/null
+++ b/parse_html/bkmk_ph_beautifulsoup4.py
@@ -0,0 +1,114 @@
+"""HTML Parser using BeautifulSoup4
+
+This file is a part of Bookmarks database and Internet robot.
+
+"""
+
+__author__ = "Oleg Broytman "
+__copyright__ = "Copyright (C) 2017 PhiloSoft Design"
+__license__ = "GNU GPL"
+
+__all__ = ['parse_html']
+
+
+import re
+from bs4 import BeautifulSoup
+from .bkmk_ph_util import HTMLParser
+
+DEFAULT_CHARSET = "cp1251"  # Stupid default for Russian Cyrillic
+
+def _parse_html(html_text, charset):
+    try:
+        return BeautifulSoup(html_text, from_encoding=charset)
+    except TypeError:
+        return None
+
+def parse_html(html_text, charset=None, log=None):
+    root = _parse_html(html_text, charset)
+    if root is None:
+        return None
+
+    _charset = root.original_encoding
+    html = root.html
+    if html is None:
+        html = root
+
+    head = html.head
+    if head is None:
+        head = html  # Some sites put TITLE in HTML without HEAD
+
+    title = head.title
+    if (title is None) and (html is not head):
+        # Some sites put TITLE in HTML outside of HEAD
+        title = html.title
+
+    if title is None:
+        # Lookup TITLE in the root
+        title = root.title
+
+    if title is not None:
+        if title.string:
+            title = title.string
+        else:
+            parts = []
+            for part in title:
+                if not isinstance(part, basestring):
+                    part = unicode(part)
+                parts.append(part.strip())
+            title = ''.join(parts)
+
+    meta = head.find(_find_contenttype, recursive=False)
+    if meta:
+        try:
+            meta_content = meta.get("content")
+            if meta_content:
+                __charset = meta_content.lower().split('charset=')[1].split(';')[0]
+            else:
+                __charset = False
+        except IndexError:  # No charset in the META Content-Type
+            meta_charset = False
+        else:
+            meta_charset = _charset = __charset
+    else:
+        meta_charset = False
+
+    if not meta_charset:
+        meta = head.find(_find_charset, recursive=False)
+        if meta:
+            meta_content = meta.get("charset")
+            if meta_content:
+                meta_charset = _charset = meta_content.lower()
+
+    if title and (_charset or meta_charset):
+        title = title.encode(_charset or meta_charset)
+
+    meta = head.find(_find_refresh, recursive=False)
+    if meta:
+        refresh = meta.get("content")
+    else:
+        refresh = None
+
+    meta = head.find(_find_icon, recursive=False)
+    if meta:
+        icon = meta.get("href")
+    else:
+        icon = None
+
+    if (title is None) and (refresh is None) and (icon is None):
+        return None
+    return HTMLParser(_charset, meta_charset, title, refresh, icon)
+
+def _find_contenttype(Tag):
+    return (Tag.name == "meta") and \
+        (Tag.get_attribute_list("http-equiv", '')[0].lower() == "content-type")
+
+def _find_charset(Tag):
+    return (Tag.name == "meta") and Tag.get("charset", '')
+
+def _find_refresh(Tag):
+    return (Tag.name == "meta") and \
+        (Tag.get_attribute_list("http-equiv", '')[0].lower() == "refresh")
+
+def _find_icon(Tag):
+    return (Tag.name == "link") and \
+        (' '.join(Tag.get_attribute_list("rel", '')).lower() in ('icon', 'shortcut icon'))
-- 
2.39.2