parse_html/bkmk_ph_beautifulsoup4.py

   1 """HTML Parser using BeautifulSoup4
   2
   3 This file is a part of Bookmarks database and Internet robot.
   4
   5 """
   6
   7 __author__ = "Oleg Broytman <phd@phdru.name>"
   8 __copyright__ = "Copyright (C) 2017-2023 PhiloSoft Design"
   9 __license__ = "GNU GPL"
  10
  11 __all__ = ['parse_html']
  12
  13
  14 import warnings
  15
  16 from bs4 import BeautifulSoup
  17
  18 from .bkmk_ph_util import HTMLParser
  19 from compat import string_type
  20
  21 warnings.filterwarnings(
  22     'ignore', 'No parser was explicitly specified')
  23 warnings.filterwarnings(
  24     'ignore',
  25     "It looks like you're parsing an XML document using an HTML parser.")
  26
  27 universal_charset = "utf-8"
  28 DEFAULT_CHARSET = "cp1251"  # Stupid default for Russian Cyrillic
  29
  30
  31 def _parse_html(html_text, charset):
  32     try:
  33         if isinstance(html_text, bytes):
  34             return BeautifulSoup(html_text, from_encoding=charset)
  35         else:
  36             return BeautifulSoup(html_text)
  37     except TypeError:
  38         return None
  39
  40
  41 def parse_html(html_text, charset=None, log=None):
  42     if not html_text:
  43         return None
  44     root = _parse_html(html_text, charset)
  45     if root is None:
  46         return None
  47
  48     _charset = root.originalEncoding
  49     html = root.html
  50     if html is None:
  51         html = root
  52
  53     head = html.head
  54     if head is None:
  55         head = html  # Some sites put TITLE in HTML without HEAD
  56
  57     title = head.title
  58     if (title is None) and (html is not head):
  59         # Some sites put TITLE in HTML outside of HEAD
  60         title = html.title
  61
  62     if title is None:
  63         # Lookup TITLE in the root
  64         title = root.title
  65
  66     if title is not None:
  67         if title.string:
  68             title = title.string
  69         else:
  70             parts = []
  71             for part in title:
  72                 if not isinstance(part, string_type):
  73                     part = part.decode()
  74                 parts.append(part.strip())
  75             title = ''.join(parts)
  76
  77     meta = head.find(_find_contenttype, recursive=False)
  78     if meta:
  79         try:
  80             meta_content = meta.get("content")
  81             if meta_content:
  82                 __charset = meta_content.lower().split('charset=')[1].\
  83                     split(';')[0]
  84             else:
  85                 __charset = False
  86         except IndexError:  # No charset in the META Content-Type
  87             meta_charset = False
  88         else:
  89             meta_charset = _charset = __charset
  90     else:
  91         meta_charset = False
  92
  93     if not meta_charset:
  94         meta = head.find(_find_charset, recursive=False)
  95         if meta:
  96             meta_content = meta.get("charset")
  97             if meta_content:
  98                 meta_charset = _charset = meta_content.lower()
  99
 100     #if title and (_charset or meta_charset):
 101     #    try:
 102     #        title = title.encode(_charset or meta_charset)
 103     #    except LookupError:
 104     #        title = title.encode(universal_charset)
 105     #        _charset = universal_charset
 106
 107     meta = head.find(_find_refresh, recursive=False)
 108     if meta:
 109         refresh = meta.get("content")
 110     else:
 111         refresh = None
 112
 113     meta = head.find(_find_icon, recursive=False)
 114     if meta:
 115         icon = meta.get("href")
 116     else:
 117         icon = None
 118
 119     if (title is None) and (refresh is None) and (icon is None):
 120         return None
 121     return HTMLParser(_charset, meta_charset, title, refresh, icon)
 122
 123
 124 def _find_contenttype(Tag):
 125     return (Tag.name == "meta") and \
 126        (Tag.get_attribute_list("http-equiv", '')[0].lower() == "content-type")
 127
 128
 129 def _find_charset(Tag):
 130     return (Tag.name == "meta") and Tag.get("charset", '')
 131
 132
 133 def _find_refresh(Tag):
 134     return (Tag.name == "meta") and \
 135        (Tag.get_attribute_list("http-equiv", '')[0].lower() == "refresh")
 136
 137
 138 def _find_icon(Tag):
 139     return (Tag.name == "link") and \
 140        (Tag.get_attribute_list("rel", '')[0].lower()
 141         in ('icon', 'shortcut icon'))