This file is a part of Bookmarks database and Internet robot.
"""
# Module metadata.
__author__ = "Oleg Broytman <phd@phdru.name>"
__copyright__ = "Copyright (C) 2007-2013 PhiloSoft Design"
__license__ = "GNU GPL"

# Public API of this module.
__all__ = ['parse_html']
from BeautifulSoup import BeautifulSoup, CData
from .bkmk_ph_util import HTMLParser
# Default charset: cp1251 (Windows-1251), a crude default for Russian Cyrillic.
DEFAULT_CHARSET = "cp1251"
# http://groups.google.com/group/beautifulsoup/browse_thread/thread/69093cb0d3a3cf63
class BadDeclParser(BeautifulSoup):
# Lookup TITLE in the root
title = root.title
- if title is None:
- return None
-
- if title.string:
- title = title.string
- else:
- parts = []
- for part in title:
- if not isinstance(part, basestring):
- part = unicode(part)
- parts.append(part.strip())
- title = ''.join(parts)
+ if title is not None:
+ if title.string:
+ title = title.string
+ else:
+ parts = []
+ for part in title:
+ if not isinstance(part, basestring):
+ part = unicode(part)
+ parts.append(part.strip())
+ title = ''.join(parts)
meta = head.find(_find_contenttype, recursive=False)
if meta:
else:
meta_charset = False
- if _charset or meta_charset:
+ if not meta_charset:
+ meta = head.find(_find_charset, recursive=False)
+ if meta:
+ meta_content = meta.get("charset")
+ if meta_content:
+ meta_charset = _charset = meta_content.lower()
+
+ if title and (_charset or meta_charset):
title = title.encode(_charset or meta_charset)
meta = head.find(_find_refresh, recursive=False)
else:
icon = None
+ if (title is None) and (refresh is None) and (icon is None):
+ return None
return HTMLParser(_charset, meta_charset, title, refresh, icon)
def _find_contenttype(Tag):
    """Tell whether *Tag* is a ``<meta http-equiv="content-type">`` element.

    Intended as a predicate for ``find()``.  The ``http-equiv`` value is
    compared case-insensitively; a tag without that attribute does not match.

    :param Tag: an object with a ``name`` attribute and a dict-like ``get()``.
    :returns: ``True`` on a match, ``False`` otherwise.
    """
    return (Tag.name == "meta") and \
        (Tag.get("http-equiv", '').lower() == "content-type")
def _find_charset(Tag):
    """Tell whether *Tag* is a ``<meta charset="...">`` element.

    Intended as a predicate for ``find()``.  The result is meant to be used
    for its truthiness: for a ``meta`` tag it is the value of the ``charset``
    attribute (an empty string when the attribute is absent), and ``False``
    for any other tag.

    :param Tag: an object with a ``name`` attribute and a dict-like ``get()``.
    :returns: the ``charset`` attribute value, ``''`` or ``False``.
    """
    return (Tag.name == "meta") and Tag.get("charset", '')

def _find_refresh(Tag):
    """Tell whether *Tag* is a ``<meta http-equiv="refresh">`` element.

    Intended as a predicate for ``find()``.  The ``http-equiv`` value is
    compared case-insensitively; a tag without that attribute does not match.

    :param Tag: an object with a ``name`` attribute and a dict-like ``get()``.
    :returns: ``True`` on a match, ``False`` otherwise.
    """
    return (Tag.name == "meta") and \
        (Tag.get("http-equiv", '').lower() == "refresh")