From: Oleg Broytman Date: Wed, 4 Dec 2013 15:35:38 +0000 (+0400) Subject: Parse X-Git-Tag: v4.5.5~1 X-Git-Url: https://git.phdru.name/?a=commitdiff_plain;h=1249f2d538e9d679421d7bbb59dfac33fad537f4;p=bookmarks_db.git Parse --- diff --git a/doc/TODO b/doc/TODO index 2bbe37d..5705e21 100644 --- a/doc/TODO +++ b/doc/TODO @@ -1,6 +1,3 @@ -Parse - - Switch simple robot to urllib2. A new robot based on PycURL. diff --git a/parse_html/bkmk_ph_beautifulsoup.py b/parse_html/bkmk_ph_beautifulsoup.py index a0ef6af..225cb27 100644 --- a/parse_html/bkmk_ph_beautifulsoup.py +++ b/parse_html/bkmk_ph_beautifulsoup.py @@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2007-2012 PhiloSoft Design" +__copyright__ = "Copyright (C) 2007-2013 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -110,6 +110,13 @@ def parse_html(filename, charset=None, log=None): else: meta_charset = False + if not meta_charset: + meta = head.find(_find_charset, recursive=False) + if meta: + meta_content = meta.get("charset") + if meta_content: + meta_charset = _charset = meta_content.lower() + if title and (_charset or meta_charset): title = title.encode(_charset or meta_charset) @@ -133,6 +140,9 @@ def _find_contenttype(Tag): return (Tag.name == "meta") and \ (Tag.get("http-equiv", '').lower() == "content-type") +def _find_charset(Tag): + return (Tag.name == "meta") and Tag.get("charset", '') + def _find_refresh(Tag): return (Tag.name == "meta") and \ (Tag.get("http-equiv", '').lower() == "refresh") diff --git a/parse_html/bkmk_ph_etreetidy.py b/parse_html/bkmk_ph_etreetidy.py index c823dfa..b85ae2a 100644 --- a/parse_html/bkmk_ph_etreetidy.py +++ b/parse_html/bkmk_ph_etreetidy.py @@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2010-2012 PhiloSoft Design" +__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -41,6 +41,9 @@ def parse_html(filename, charset=None, log=None): break except IndexError: meta_charset = False + elif m.get('charset', ''): + meta_charset = m.get('charset').lower() + break else: meta_charset = False diff --git a/parse_html/bkmk_ph_html5.py b/parse_html/bkmk_ph_html5.py index a490628..53109be 100644 --- a/parse_html/bkmk_ph_html5.py +++ b/parse_html/bkmk_ph_html5.py @@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2010-2012 PhiloSoft Design" +__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -53,7 +53,7 @@ def parse_html(filename, charset=None, log=None): title = '' for node in head.childNodes: - if node.name == 'meta' and \ + if (node.name == 'meta') and \ ('http-equiv' in node.attributes) and \ (node.attributes['http-equiv'] == 'content-type'): meta_content = node.attributes['content'] @@ -65,6 +65,9 @@ def parse_html(filename, charset=None, log=None): meta_charset = False else: break + elif (node.name == 'meta') and ('charset' in node.attributes): + meta_charset = node.attributes['charset'].lower() + break if not charset: charset = parser.tokenizer.stream.charEncoding[0] diff --git a/parse_html/bkmk_ph_htmlparser.py b/parse_html/bkmk_ph_htmlparser.py index 8cdd240..d7020b0 100644 --- a/parse_html/bkmk_ph_htmlparser.py +++ b/parse_html/bkmk_ph_htmlparser.py @@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 1997-2012 PhiloSoft Design" +__copyright__ = "Copyright (C) 1997-2013 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -40,6 +40,9 @@ class HTMLParser(_HTMLParser): http_equiv = value.lower() elif attrname == 'content': content = value + elif (attrname == 'charset') and (not self.charset): + self.charset = value.lower() + self.meta_charset = 1 if (not self.charset) and (http_equiv == "content-type"): try: diff --git a/parse_html/bkmk_ph_lxml.py b/parse_html/bkmk_ph_lxml.py index b14be40..222f116 100644 --- a/parse_html/bkmk_ph_lxml.py +++ b/parse_html/bkmk_ph_lxml.py @@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2010-2012 PhiloSoft Design" +__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -35,6 +35,9 @@ def parse_html(filename, charset=None, log=None): break except IndexError: meta_charset = False + elif m.get('charset', ''): + meta_charset = m.get('charset').lower() + break else: meta_charset = False