parse_html/bkmk_ph_beautifulsoup.py

   1 """HTML Parser using BeautifulSoup
   2
   3 This file is a part of Bookmarks database and Internet robot.
   4
   5 """
   6
   7 __author__ = "Oleg Broytman <phd@phdru.name>"
   8 __copyright__ = "Copyright (C) 2007-2023 PhiloSoft Design"
   9 __license__ = "GNU GPL"
  10
  11 __all__ = ['parse_html']
  12
  13
  14 import re
  15 from sgmllib import SGMLParser, SGMLParseError
  16 from BeautifulSoup import BeautifulSoup, CData
  17 from .bkmk_ph_util import HTMLParser
  18
  19 DEFAULT_CHARSET = "cp1251"  # Stupid default for Russian Cyrillic
  20
  21 # http://groups.google.com/group/beautifulsoup/browse_thread/thread/69093cb0d3a3cf63
  22 class BadDeclParser(BeautifulSoup):
  23     def parse_declaration(self, i):
  24         """Treat a bogus SGML declaration as raw data. Treat a CDATA
  25         declaration as a CData object."""
  26         j = None
  27         if self.rawdata[i:i+9] == '<![CDATA[':
  28             k = self.rawdata.find(']]>', i)
  29             if k == -1:
  30                 k = len(self.rawdata)
  31             data = self.rawdata[i+9:k]
  32             j = k+3
  33             self._toStringSubclass(data, CData)
  34         else:
  35             try:
  36                 j = SGMLParser.parse_declaration(self, i)
  37             except SGMLParseError:
  38                 # Could not parse the DOCTYPE declaration
  39                 # Try to just skip the actual declaration
  40                 match = re.search(
  41                     r'<!DOCTYPE([^>]*?)>', self.rawdata[i:],
  42                     re.MULTILINE|re.IGNORECASE)  # noqa: E227
  43                 #           missing whitespace around bitwise or shift operator
  44                 if match:
  45                     toHandle = self.rawdata[i:match.end()]
  46                 else:
  47                     toHandle = self.rawdata[i:]
  48                 self.handle_data(toHandle)
  49                 j = i + len(toHandle)
  50         return j
  51
  52
  53 def _parse_html(html_text, charset):
  54     try:
  55         return BadDeclParser(html_text, fromEncoding=charset)
  56     except TypeError:
  57         return None
  58
  59 def parse_html(html_text, charset=None, log=None):
  60     root = _parse_html(html_text, charset)
  61     if root is None:
  62         return None
  63
  64     _charset = root.originalEncoding
  65     if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"):  # Replace default
  66         _charset = DEFAULT_CHARSET
  67         root = _parse_html(html_text, _charset)
  68         if root is None:
  69             return None
  70
  71     html = root.html
  72     if html is None:
  73         html = root
  74
  75     head = html.head
  76     if head is None:
  77         head = html  # Some sites put TITLE in HTML without HEAD
  78
  79     title = head.title
  80     if (title is None) and (html is not head):
  81         # Some sites put TITLE in HTML outside of HEAD
  82         title = html.title
  83
  84     if title is None:
  85         # Lookup TITLE in the root
  86         title = root.title
  87
  88     if title is not None:
  89         if title.string:
  90             title = title.string
  91         else:
  92             parts = []
  93             for part in title:
  94                 if not isinstance(part, basestring):
  95                     part = unicode(part)
  96                 parts.append(part.strip())
  97             title = ''.join(parts)
  98
  99     meta = head.find(_find_contenttype, recursive=False)
 100     if meta:
 101         try:
 102             meta_content = meta.get("content")
 103             if meta_content:
 104                 __charset = meta_content.lower().split('charset=')[1].split(';')[0]
 105             else:
 106                 __charset = False
 107         except IndexError:  # No charset in the META Content-Type
 108             meta_charset = False
 109         else:
 110             meta_charset = _charset == __charset
 111     else:
 112         meta_charset = False
 113
 114     if not meta_charset:
 115         meta = head.find(_find_charset, recursive=False)
 116         if meta:
 117             meta_content = meta.get("charset")
 118             if meta_content:
 119                 meta_charset = _charset = meta_content.lower()
 120
 121     if title and (_charset or meta_charset):
 122         title = title.encode(_charset or meta_charset)
 123
 124     meta = head.find(_find_refresh, recursive=False)
 125     if meta:
 126         refresh = meta.get("content")
 127     else:
 128         refresh = None
 129
 130     meta = head.find(_find_icon, recursive=False)
 131     if meta:
 132         icon = meta.get("href")
 133     else:
 134         icon = None
 135
 136     if (title is None) and (refresh is None) and (icon is None):
 137         return None
 138     return HTMLParser(_charset, meta_charset, title, refresh, icon)
 139
 140 def _find_contenttype(Tag):
 141     return (Tag.name == "meta") and \
 142        (Tag.get("http-equiv", '').lower() == "content-type")
 143
 144 def _find_charset(Tag):
 145     return (Tag.name == "meta") and Tag.get("charset", '')
 146
 147 def _find_refresh(Tag):
 148     return (Tag.name == "meta") and \
 149        (Tag.get("http-equiv", '').lower() == "refresh")
 150
 151 def _find_icon(Tag):
 152     return (Tag.name == "link") and \
 153        (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))