parse_html/bkmk_ph_beautifulsoup.py

   1 """HTML Parser using BeautifulSoup
   2
   3 This file is a part of Bookmarks database and Internet robot.
   4
   5 """
   6
   7 __author__ = "Oleg Broytman <phd@phdru.name>"
   8 __copyright__ = "Copyright (C) 2007-2023 PhiloSoft Design"
   9 __license__ = "GNU GPL"
  10
  11 __all__ = ['parse_html']
  12
  13
  14 import re
  15 from sgmllib import SGMLParser, SGMLParseError
  16 from BeautifulSoup import BeautifulSoup, CData
  17
  18 from .bkmk_ph_util import HTMLParser
  19 from compat import string_type
  20
  21 DEFAULT_CHARSET = "cp1251"  # Stupid default for Russian Cyrillic
  22
  23 # http://groups.google.com/group/beautifulsoup/browse_thread/thread/69093cb0d3a3cf63
  24
  25
  26 class BadDeclParser(BeautifulSoup):
  27     def parse_declaration(self, i):
  28         """Treat a bogus SGML declaration as raw data. Treat a CDATA
  29         declaration as a CData object."""
  30         j = None
  31         if self.rawdata[i:i+9] == '<![CDATA[':
  32             k = self.rawdata.find(']]>', i)
  33             if k == -1:
  34                 k = len(self.rawdata)
  35             data = self.rawdata[i+9:k]
  36             j = k+3
  37             self._toStringSubclass(data, CData)
  38         else:
  39             try:
  40                 j = SGMLParser.parse_declaration(self, i)
  41             except SGMLParseError:
  42                 # Could not parse the DOCTYPE declaration
  43                 # Try to just skip the actual declaration
  44                 match = re.search(
  45                     r'<!DOCTYPE([^>]*?)>', self.rawdata[i:],
  46                     re.MULTILINE|re.IGNORECASE)  # noqa: E227
  47                 #           missing whitespace around bitwise or shift operator
  48                 if match:
  49                     toHandle = self.rawdata[i:match.end()]
  50                 else:
  51                     toHandle = self.rawdata[i:]
  52                 self.handle_data(toHandle)
  53                 j = i + len(toHandle)
  54         return j
  55
  56
  57 def _parse_html(html_text, charset):
  58     try:
  59         return BadDeclParser(html_text, fromEncoding=charset)
  60     except TypeError:
  61         return None
  62
  63
  64 def parse_html(html_text, charset=None, log=None):
  65     if not html_text:
  66         return None
  67     root = _parse_html(html_text, charset)
  68     if root is None:
  69         return None
  70
  71     _charset = root.originalEncoding
  72     if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"):
  73         # Replace with default and re-parse
  74         _charset = DEFAULT_CHARSET
  75         root = _parse_html(html_text, _charset)
  76         if root is None:
  77             return None
  78
  79     html = root.html
  80     if html is None:
  81         html = root
  82
  83     head = html.head
  84     if head is None:
  85         head = html  # Some sites put TITLE in HTML without HEAD
  86
  87     title = head.title
  88     if (title is None) and (html is not head):
  89         # Some sites put TITLE in HTML outside of HEAD
  90         title = html.title
  91
  92     if title is None:
  93         # Lookup TITLE in the root
  94         title = root.title
  95
  96     if title is not None:
  97         if title.string:
  98             title = title.string
  99         else:
 100             parts = []
 101             for part in title:
 102                 if not isinstance(part, string_type):
 103                     part = part.decode()
 104                 parts.append(part.strip())
 105             title = ''.join(parts)
 106
 107     meta = head.find(_find_contenttype, recursive=False)
 108     if meta:
 109         try:
 110             meta_content = meta.get("content")
 111             if meta_content:
 112                 __charset = meta_content.lower().split('charset=')[1].\
 113                     split(';')[0]
 114             else:
 115                 __charset = False
 116         except IndexError:  # No charset in the META Content-Type
 117             meta_charset = False
 118         else:
 119             meta_charset = _charset == __charset
 120     else:
 121         meta_charset = False
 122
 123     if not meta_charset:
 124         meta = head.find(_find_charset, recursive=False)
 125         if meta:
 126             meta_content = meta.get("charset")
 127             if meta_content:
 128                 meta_charset = _charset = meta_content.lower()
 129
 130     #if title and (_charset or meta_charset):
 131     #    title = title.encode(_charset or meta_charset)
 132
 133     meta = head.find(_find_refresh, recursive=False)
 134     if meta:
 135         refresh = meta.get("content")
 136     else:
 137         refresh = None
 138
 139     meta = head.find(_find_icon, recursive=False)
 140     if meta:
 141         icon = meta.get("href")
 142     else:
 143         icon = None
 144
 145     if (title is None) and (refresh is None) and (icon is None):
 146         return None
 147     return HTMLParser(_charset, meta_charset, title, refresh, icon)
 148
 149
 150 def _find_contenttype(Tag):
 151     return (Tag.name == "meta") and \
 152        (Tag.get("http-equiv", '').lower() == "content-type")
 153
 154
 155 def _find_charset(Tag):
 156     return (Tag.name == "meta") and Tag.get("charset", '')
 157
 158
 159 def _find_refresh(Tag):
 160     return (Tag.name == "meta") and \
 161        (Tag.get("http-equiv", '').lower() == "refresh")
 162
 163
 164 def _find_icon(Tag):
 165     return (Tag.name == "link") and \
 166        (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))