parse_html/bkmk_ph_beautifulsoup4.py

   1 """HTML Parser using BeautifulSoup4
   2
   3 This file is a part of Bookmarks database and Internet robot.
   4
   5 """
   6
   7 __author__ = "Oleg Broytman <phd@phdru.name>"
   8 __copyright__ = "Copyright (C) 2017-2023 PhiloSoft Design"
   9 __license__ = "GNU GPL"
  10
  11 __all__ = ['parse_html']
  12
  13
  14 import warnings
  15
  16 from bs4 import BeautifulSoup
  17
  18 from .bkmk_ph_util import HTMLParser
  19 from compat import string_type
  20
  21 warnings.filterwarnings(
  22     'ignore', 'No parser was explicitly specified')
  23 warnings.filterwarnings(
  24     'ignore',
  25     "It looks like you're parsing an XML document using an HTML parser.")
  26
  27 universal_charset = "utf-8"
  28 DEFAULT_CHARSET = "cp1251"  # Stupid default for Russian Cyrillic
  29
  30
  31 def _parse_html(html_text, charset):
  32     try:
  33         if isinstance(html_text, bytes):
  34             return BeautifulSoup(html_text, from_encoding=charset)
  35         else:
  36             return BeautifulSoup(html_text)
  37     except TypeError:
  38         return None
  39
  40
  41 def parse_html(html_text, charset=None, log=None):
  42     root = _parse_html(html_text, charset)
  43     if root is None:
  44         return None
  45
  46     _charset = root.originalEncoding
  47     html = root.html
  48     if html is None:
  49         html = root
  50
  51     head = html.head
  52     if head is None:
  53         head = html  # Some sites put TITLE in HTML without HEAD
  54
  55     title = head.title
  56     if (title is None) and (html is not head):
  57         # Some sites put TITLE in HTML outside of HEAD
  58         title = html.title
  59
  60     if title is None:
  61         # Lookup TITLE in the root
  62         title = root.title
  63
  64     if title is not None:
  65         if title.string:
  66             title = title.string
  67         else:
  68             parts = []
  69             for part in title:
  70                 if not isinstance(part, string_type):
  71                     part = part.decode()
  72                 parts.append(part.strip())
  73             title = ''.join(parts)
  74
  75     meta = head.find(_find_contenttype, recursive=False)
  76     if meta:
  77         try:
  78             meta_content = meta.get("content")
  79             if meta_content:
  80                 __charset = meta_content.lower().split('charset=')[1].\
  81                     split(';')[0]
  82             else:
  83                 __charset = False
  84         except IndexError:  # No charset in the META Content-Type
  85             meta_charset = False
  86         else:
  87             meta_charset = _charset = __charset
  88     else:
  89         meta_charset = False
  90
  91     if not meta_charset:
  92         meta = head.find(_find_charset, recursive=False)
  93         if meta:
  94             meta_content = meta.get("charset")
  95             if meta_content:
  96                 meta_charset = _charset = meta_content.lower()
  97
  98     #if title and (_charset or meta_charset):
  99     #    try:
 100     #        title = title.encode(_charset or meta_charset)
 101     #    except LookupError:
 102     #        title = title.encode(universal_charset)
 103     #        _charset = universal_charset
 104
 105     meta = head.find(_find_refresh, recursive=False)
 106     if meta:
 107         refresh = meta.get("content")
 108     else:
 109         refresh = None
 110
 111     meta = head.find(_find_icon, recursive=False)
 112     if meta:
 113         icon = meta.get("href")
 114     else:
 115         icon = None
 116
 117     if (title is None) and (refresh is None) and (icon is None):
 118         return None
 119     return HTMLParser(_charset, meta_charset, title, refresh, icon)
 120
 121
 122 def _find_contenttype(Tag):
 123     return (Tag.name == "meta") and \
 124        (Tag.get_attribute_list("http-equiv", '')[0].lower() == "content-type")
 125
 126
 127 def _find_charset(Tag):
 128     return (Tag.name == "meta") and Tag.get("charset", '')
 129
 130
 131 def _find_refresh(Tag):
 132     return (Tag.name == "meta") and \
 133        (Tag.get_attribute_list("http-equiv", '')[0].lower() == "refresh")
 134
 135
 136 def _find_icon(Tag):
 137     return (Tag.name == "link") and \
 138        (Tag.get_attribute_list("rel", '')[0].lower()
 139         in ('icon', 'shortcut icon'))