X-Git-Url: https://git.phdru.name/?p=bookmarks_db.git;a=blobdiff_plain;f=parse_html%2Fbkmk_ph_beautifulsoup.py;h=f796744a406812e92cbd208b8b132adf88f47643;hp=a2f57157db0347d71592d6732259b380972e8001;hb=c88cb7a7;hpb=a04eaa0346e8aa5ad86a195f8f4d36487ebfe09c

diff --git a/parse_html/bkmk_ph_beautifulsoup.py b/parse_html/bkmk_ph_beautifulsoup.py
index a2f5715..f796744 100644
--- a/parse_html/bkmk_ph_beautifulsoup.py
+++ b/parse_html/bkmk_ph_beautifulsoup.py
@@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2007-2014 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2007-2017 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['parse_html']
@@ -21,130 +21,130 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
 # http://groups.google.com/group/beautifulsoup/browse_thread/thread/69093cb0d3a3cf63
 class BadDeclParser(BeautifulSoup):
     def parse_declaration(self, i):
-        """Treat a bogus SGML declaration as raw data. Treat a CDATA
-        declaration as a CData object."""
-        j = None
-        if self.rawdata[i:i+9] == '<![CDATA[':
-            k = self.rawdata.find(']]>', i)
-            if k == -1:
-                k = len(self.rawdata)
-            data = self.rawdata[i+9:k]
-            j = k+3
-            self._toStringSubclass(data, CData)
-        else:
-            try:
-                j = SGMLParser.parse_declaration(self, i)
-            except SGMLParseError:
-                # Could not parse the DOCTYPE declaration
-                # Try to just skip the actual declaration
-                match = re.search(r'<!DOCTYPE([^>]*?)>', self.rawdata[i:], re.MULTILINE|re.IGNORECASE)
-                if match:
-                    toHandle = self.rawdata[i:match.end()]
-                else:
-                    toHandle = self.rawdata[i:]
-                self.handle_data(toHandle)
-                j = i + len(toHandle)
-        return j
+        """Treat a bogus SGML declaration as raw data. Treat a CDATA
+        declaration as a CData object."""
+        j = None
+        if self.rawdata[i:i+9] == '<![CDATA[':
+            k = self.rawdata.find(']]>', i)
+            if k == -1:
+                k = len(self.rawdata)
+            data = self.rawdata[i+9:k]
+            j = k+3
+            self._toStringSubclass(data, CData)
+        else:
+            try:
+                j = SGMLParser.parse_declaration(self, i)
+            except SGMLParseError:
+                # Could not parse the DOCTYPE declaration
+                # Try to just skip the actual declaration
+                match = re.search(r'<!DOCTYPE([^>]*?)>', self.rawdata[i:], re.MULTILINE|re.IGNORECASE)
+                if match:
+                    toHandle = self.rawdata[i:match.end()]
+                else:
+                    toHandle = self.rawdata[i:]
+                self.handle_data(toHandle)
+                j = i + len(toHandle)
+        return j
 
 def _parse_html(html_text, charset):
-    try:
-        return BadDeclParser(html_text, fromEncoding=charset)
-    except TypeError:
-        return None
+    try:
+        return BadDeclParser(html_text, fromEncoding=charset)
+    except TypeError:
+        return None
 
 def parse_html(html_text, charset=None, log=None):
-    root = _parse_html(html_text, charset)
-    if root is None:
-        return None
-
-    _charset = root.originalEncoding
-    if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"): # Replace default
-        _charset = DEFAULT_CHARSET
-        root = _parse_html(html_text, _charset)
-        if root is None:
-            return None
-
-    html = root.html
-    if html is None:
-        html = root
-
-    head = html.head
-    if head is None:
-        head = html # Some sites put TITLE in HTML without HEAD
-
-    title = head.title
-    if (title is None) and (html is not head):
-        # Some sites put TITLE in HTML outside of HEAD
-        title = html.title
-
-    if title is None:
-        # Lookup TITLE in the root
-        title = root.title
-
-    if title is not None:
-        if title.string:
-            title = title.string
-        else:
-            parts = []
-            for part in title:
-                if not isinstance(part, basestring):
-                    part = unicode(part)
-                parts.append(part.strip())
-            title = ''.join(parts)
-
-    meta = head.find(_find_contenttype, recursive=False)
-    if meta:
-        try:
-            meta_content = meta.get("content")
-            if meta_content:
-                __charset = meta_content.lower().split('charset=')[1].split(';')[0]
-            else:
-                __charset = False
-        except IndexError: # No charset in the META Content-Type
-            meta_charset = False
-        else:
-            meta_charset = _charset == __charset
-    else:
-        meta_charset = False
-
-    if not meta_charset:
-        meta = head.find(_find_charset, recursive=False)
-        if meta:
-            meta_content = meta.get("charset")
-            if meta_content:
-                meta_charset = _charset = meta_content.lower()
-
-    if title and (_charset or meta_charset):
-        title = title.encode(_charset or meta_charset)
-
-    meta = head.find(_find_refresh, recursive=False)
-    if meta:
-        refresh = meta.get("content")
-    else:
-        refresh = None
-
-    meta = head.find(_find_icon, recursive=False)
-    if meta:
-        icon = meta.get("href")
-    else:
-        icon = None
-
-    if (title is None) and (refresh is None) and (icon is None):
-        return None
-    return HTMLParser(_charset, meta_charset, title, refresh, icon)
+    root = _parse_html(html_text, charset)
+    if root is None:
+        return None
+
+    _charset = root.originalEncoding
+    if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"): # Replace default
+        _charset = DEFAULT_CHARSET
+        root = _parse_html(html_text, _charset)
+        if root is None:
+            return None
+
+    html = root.html
+    if html is None:
+        html = root
+
+    head = html.head
+    if head is None:
+        head = html # Some sites put TITLE in HTML without HEAD
+
+    title = head.title
+    if (title is None) and (html is not head):
+        # Some sites put TITLE in HTML outside of HEAD
+        title = html.title
+
+    if title is None:
+        # Lookup TITLE in the root
+        title = root.title
+
+    if title is not None:
+        if title.string:
+            title = title.string
+        else:
+            parts = []
+            for part in title:
+                if not isinstance(part, basestring):
+                    part = unicode(part)
+                parts.append(part.strip())
+            title = ''.join(parts)
+
+    meta = head.find(_find_contenttype, recursive=False)
+    if meta:
+        try:
+            meta_content = meta.get("content")
+            if meta_content:
+                __charset = meta_content.lower().split('charset=')[1].split(';')[0]
+            else:
+                __charset = False
+        except IndexError: # No charset in the META Content-Type
+            meta_charset = False
+        else:
+            meta_charset = _charset == __charset
+    else:
+        meta_charset = False
+
+    if not meta_charset:
+        meta = head.find(_find_charset, recursive=False)
+        if meta:
+            meta_content = meta.get("charset")
+            if meta_content:
+                meta_charset = _charset = meta_content.lower()
+
+    if title and (_charset or meta_charset):
+        title = title.encode(_charset or meta_charset)
+
+    meta = head.find(_find_refresh, recursive=False)
+    if meta:
+        refresh = meta.get("content")
+    else:
+        refresh = None
+
+    meta = head.find(_find_icon, recursive=False)
+    if meta:
+        icon = meta.get("href")
+    else:
+        icon = None
+
+    if (title is None) and (refresh is None) and (icon is None):
+        return None
+    return HTMLParser(_charset, meta_charset, title, refresh, icon)
 
 def _find_contenttype(Tag):
-    return (Tag.name == "meta") and \
-        (Tag.get("http-equiv", '').lower() == "content-type")
+    return (Tag.name == "meta") and \
+        (Tag.get("http-equiv", '').lower() == "content-type")
 
 def _find_charset(Tag):
-    return (Tag.name == "meta") and Tag.get("charset", '')
+    return (Tag.name == "meta") and Tag.get("charset", '')
 
 def _find_refresh(Tag):
-    return (Tag.name == "meta") and \
-        (Tag.get("http-equiv", '').lower() == "refresh")
+    return (Tag.name == "meta") and \
+        (Tag.get("http-equiv", '').lower() == "refresh")
 
 def _find_icon(Tag):
-    return (Tag.name == "link") and \
-        (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))
+    return (Tag.name == "link") and \
+        (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))
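
A minimal usage sketch (not part of the diff above). It assumes Python 2 with BeautifulSoup 3 installed and that the module is importable as parse_html.bkmk_ph_beautifulsoup; the attribute names charset, meta_charset, title, refresh and icon on the returned bkmk_ph_util.HTMLParser object are guesses derived from the HTMLParser(_charset, meta_charset, title, refresh, icon) call in the diff, not confirmed by the source shown here.

# -*- coding: utf-8 -*-
# Hypothetical example; module path and result attribute names are assumptions.
from parse_html.bkmk_ph_beautifulsoup import parse_html

html_text = """<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<title>Example page</title>
<link rel="shortcut icon" href="/favicon.ico">
</head><body>Hello, world.</body></html>"""

# The charset argument is only a hint; BeautifulSoup may detect another encoding.
parser = parse_html(html_text, charset="utf-8")
if parser is None:
    print "Nothing useful found (no title, refresh or icon)"
else:
    print parser.title   # assumed attribute name
    print parser.icon    # assumed attribute name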