From: Oleg Broytman <phd@phdru.name>
Date: Wed, 4 Dec 2013 15:35:38 +0000 (+0400)
Subject: Parse <meta charset="...">
X-Git-Tag: v4.5.5~1
X-Git-Url: https://git.phdru.name/?a=commitdiff_plain;h=1249f2d538e9d679421d7bbb59dfac33fad537f4;p=bookmarks_db.git

Parse <meta charset="...">
---

diff --git a/doc/TODO b/doc/TODO
index 2bbe37d..5705e21 100644
--- a/doc/TODO
+++ b/doc/TODO
@@ -1,6 +1,3 @@
-Parse <meta charset="...">
-
-
 Switch simple robot to urllib2.
 
 A new robot based on PycURL.
diff --git a/parse_html/bkmk_ph_beautifulsoup.py b/parse_html/bkmk_ph_beautifulsoup.py
index a0ef6af..225cb27 100644
--- a/parse_html/bkmk_ph_beautifulsoup.py
+++ b/parse_html/bkmk_ph_beautifulsoup.py
@@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2007-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2007-2013 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['parse_html']
@@ -110,6 +110,13 @@ def parse_html(filename, charset=None, log=None):
    else:
       meta_charset = False
 
+   if not meta_charset:
+      meta = head.find(_find_charset, recursive=False)
+      if meta:
+         meta_content = meta.get("charset")
+         if meta_content:
+            meta_charset = _charset = meta_content.lower()
+
    if title and (_charset or meta_charset):
       title = title.encode(_charset or meta_charset)
 
@@ -133,6 +140,9 @@ def _find_contenttype(Tag):
    return (Tag.name == "meta") and \
       (Tag.get("http-equiv", '').lower() == "content-type")
 
+def _find_charset(Tag):  # find() predicate: <meta> with a non-empty charset
+   return Tag.get("charset", '') if Tag.name == "meta" else False
+
 def _find_refresh(Tag):
    return (Tag.name == "meta") and \
       (Tag.get("http-equiv", '').lower() == "refresh")
diff --git a/parse_html/bkmk_ph_etreetidy.py b/parse_html/bkmk_ph_etreetidy.py
index c823dfa..b85ae2a 100644
--- a/parse_html/bkmk_ph_etreetidy.py
+++ b/parse_html/bkmk_ph_etreetidy.py
@@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['parse_html']
@@ -41,6 +41,9 @@ def parse_html(filename, charset=None, log=None):
                     break
                 except IndexError:
                     meta_charset = False
+        elif m.get('charset', ''):
+           meta_charset = m.get('charset').lower()
+           break
     else:
         meta_charset = False
 
diff --git a/parse_html/bkmk_ph_html5.py b/parse_html/bkmk_ph_html5.py
index a490628..53109be 100644
--- a/parse_html/bkmk_ph_html5.py
+++ b/parse_html/bkmk_ph_html5.py
@@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['parse_html']
@@ -53,7 +53,7 @@ def parse_html(filename, charset=None, log=None):
                     title = ''
 
         for node in head.childNodes:
-            if node.name == 'meta' and \
+            if (node.name == 'meta') and \
                     ('http-equiv' in node.attributes) and \
                     (node.attributes['http-equiv'] == 'content-type'):
                 meta_content = node.attributes['content']
@@ -65,6 +65,9 @@ def parse_html(filename, charset=None, log=None):
                         meta_charset = False
                     else:
                         break
+            elif (node.name == 'meta') and ('charset' in node.attributes):
+                meta_charset = node.attributes['charset'].lower()
+                break
 
         if not charset:
             charset = parser.tokenizer.stream.charEncoding[0]
diff --git a/parse_html/bkmk_ph_htmlparser.py b/parse_html/bkmk_ph_htmlparser.py
index 8cdd240..d7020b0 100644
--- a/parse_html/bkmk_ph_htmlparser.py
+++ b/parse_html/bkmk_ph_htmlparser.py
@@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 1997-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 1997-2013 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['parse_html']
@@ -40,6 +40,9 @@ class HTMLParser(_HTMLParser):
                http_equiv = value.lower()
             elif attrname == 'content':
                content = value
+            elif (attrname == 'charset') and (not self.charset):
+               self.charset = value.lower()
+               self.meta_charset = 1
 
       if (not self.charset) and (http_equiv == "content-type"):
          try:
diff --git a/parse_html/bkmk_ph_lxml.py b/parse_html/bkmk_ph_lxml.py
index b14be40..222f116 100644
--- a/parse_html/bkmk_ph_lxml.py
+++ b/parse_html/bkmk_ph_lxml.py
@@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot.
 """
 
 __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design"
 __license__ = "GNU GPL"
 
 __all__ = ['parse_html']
@@ -35,6 +35,9 @@ def parse_html(filename, charset=None, log=None):
                     break
                 except IndexError:
                     meta_charset = False
+        elif m.get('charset', ''):
+           meta_charset = m.get('charset').lower()
+           break
     else:
         meta_charset = False