parse_html/bkmk_ph_beautifulsoup4.py

   1 """HTML Parser using BeautifulSoup4
   2
   3 This file is a part of Bookmarks database and Internet robot.
   4
   5 """
   6
   7 __author__ = "Oleg Broytman <phd@phdru.name>"
   8 __copyright__ = "Copyright (C) 2017-2023 PhiloSoft Design"
   9 __license__ = "GNU GPL"
  10
  11 __all__ = ['parse_html']
  12
  13
  14 import warnings
  15
  16 from bs4 import BeautifulSoup
  17
  18 from .bkmk_ph_util import HTMLParser
  19 from compat import string_type
  20
  21 warnings.filterwarnings(
  22     'ignore', 'No parser was explicitly specified')
  23 warnings.filterwarnings(
  24     'ignore',
  25     "It looks like you're parsing an XML document using an HTML parser.")
  26
  27 universal_charset = "utf-8"
  28 DEFAULT_CHARSET = "cp1251"  # Stupid default for Russian Cyrillic
  29
  30
  31 def _parse_html(html_text, charset):
  32     try:
  33         if isinstance(html_text, bytes):
  34             return BeautifulSoup(html_text, from_encoding=charset)
  35         else:
  36             return BeautifulSoup(html_text)
  37     except TypeError:
  38         return None
  39
  40
  41 def parse_html(html_text, charset=None, log=None):
  42     if not html_text:
  43         return None
  44     root = _parse_html(html_text, charset)
  45     if root is None:
  46         return None
  47
  48     _charset = root.originalEncoding
  49     html = root.html
  50     if html is None:
  51         html = root
  52
  53     head = html.head
  54     if head is None:
  55         head = html  # Some sites put TITLE in HTML without HEAD
  56
  57     title = head.title
  58     if (title is None) and (html is not head):
  59         # Some sites put TITLE in HTML outside of HEAD
  60         title = html.title
  61
  62     if title is None:
  63         # Lookup TITLE in the root
  64         title = root.title
  65
  66     if title is not None:
  67         if title.string:
  68             title = title.string
  69         else:
  70             parts = []
  71             for part in title:
  72                 #if not isinstance(part, string_type):
  73                 #    part = part.decode()
  74                 if part.strip:
  75                     parts.append(part.strip())
  76                 else:
  77                     parts.append(' ')  # Skip tags, they're usually `<br>`
  78             title = ''.join(parts)
  79
  80     meta = head.find(_find_contenttype, recursive=False)
  81     if meta:
  82         try:
  83             meta_content = meta.get("content")
  84             if meta_content:
  85                 __charset = meta_content.lower().split('charset=')[1].\
  86                     split(';')[0]
  87             else:
  88                 __charset = False
  89         except IndexError:  # No charset in the META Content-Type
  90             meta_charset = False
  91         else:
  92             meta_charset = _charset = __charset
  93     else:
  94         meta_charset = False
  95
  96     if not meta_charset:
  97         meta = head.find(_find_charset, recursive=False)
  98         if meta:
  99             meta_content = meta.get("charset")
 100             if meta_content:
 101                 meta_charset = _charset = meta_content.lower()
 102
 103     #if title and (_charset or meta_charset):
 104     #    try:
 105     #        title = title.encode(_charset or meta_charset)
 106     #    except LookupError:
 107     #        title = title.encode(universal_charset)
 108     #        _charset = universal_charset
 109
 110     meta = head.find(_find_refresh, recursive=False)
 111     if meta:
 112         refresh = meta.get("content")
 113     else:
 114         refresh = None
 115
 116     meta = head.find(_find_icon, recursive=False)
 117     if meta:
 118         icon = meta.get("href")
 119     else:
 120         icon = None
 121
 122     if (title is None) and (refresh is None) and (icon is None):
 123         return None
 124     return HTMLParser(_charset, meta_charset, title, refresh, icon)
 125
 126
 127 def _find_contenttype(Tag):
 128     return (Tag.name == "meta") and \
 129        (Tag.get_attribute_list("http-equiv", '')[0].lower() == "content-type")
 130
 131
 132 def _find_charset(Tag):
 133     return (Tag.name == "meta") and Tag.get("charset", '')
 134
 135
 136 def _find_refresh(Tag):
 137     return (Tag.name == "meta") and \
 138        (Tag.get_attribute_list("http-equiv", '')[0].lower() == "refresh")
 139
 140
 141 def _find_icon(Tag):
 142     return (Tag.name == "link") and \
 143        (Tag.get_attribute_list("rel", '')[0].lower()
 144         in ('icon', 'shortcut icon'))