Parse <meta charset="...">

author Oleg Broytman <phd@phdru.name>

Wed, 4 Dec 2013 15:35:38 +0000 (19:35 +0400)

committer Oleg Broytman <phd@phdru.name>

Wed, 4 Dec 2013 15:35:38 +0000 (19:35 +0400)
author Oleg Broytman <phd@phdru.name>
Wed, 4 Dec 2013 15:35:38 +0000 (19:35 +0400)
committer Oleg Broytman <phd@phdru.name>
Wed, 4 Dec 2013 15:35:38 +0000 (19:35 +0400)
diff --git a/doc/TODO b/doc/TODO

index 2bbe37ddef6f54fb674f86b75f4a08fe5f116b28..5705e2139cc941f8b1d3a53ae3d0d15fd1c75120 100644 (file)
--- a/doc/TODO
+++ b/doc/TODO
@@ -1,6 +1,3 @@
-Parse <meta charset="...">
-
-
  Switch simple robot to urllib2.
  
  A new robot based on PycURL.
diff --git a/parse_html/bkmk_ph_beautifulsoup.py b/parse_html/bkmk_ph_beautifulsoup.py

index a0ef6af76a5c78dc061d24acbf95498b44912cec..225cb2720f352ae6695a5f1d05e0910e0b118ff0 100644 (file)
--- a/parse_html/bkmk_ph_beautifulsoup.py
+++ b/parse_html/bkmk_ph_beautifulsoup.py
@@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot.
  """
  
  __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2007-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2007-2013 PhiloSoft Design"
  __license__ = "GNU GPL"
  
  __all__ = ['parse_html']
@@ -110,6 +110,13 @@ def parse_html(filename, charset=None, log=None):
     else:
        meta_charset = False
  
+   if not meta_charset:
+      meta = head.find(_find_charset, recursive=False)
+      if meta:
+         meta_content = meta.get("charset")
+         if meta_content:
+            meta_charset = _charset = meta_content.lower()
+
     if title and (_charset or meta_charset):
        title = title.encode(_charset or meta_charset)
  
@@ -133,6 +140,9 @@ def _find_contenttype(Tag):
     return (Tag.name == "meta") and \
        (Tag.get("http-equiv", '').lower() == "content-type")
  
+def _find_charset(Tag):
+   return (Tag.name == "meta") and Tag.get("charset", '')
+
  def _find_refresh(Tag):
     return (Tag.name == "meta") and \
        (Tag.get("http-equiv", '').lower() == "refresh")
diff --git a/parse_html/bkmk_ph_etreetidy.py b/parse_html/bkmk_ph_etreetidy.py

index c823dfac4a9929524e2388b3015716f98791c6a7..b85ae2ae209c37decf8ebce34e649d25dcc7012b 100644 (file)
--- a/parse_html/bkmk_ph_etreetidy.py
+++ b/parse_html/bkmk_ph_etreetidy.py
@@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot.
  """
  
  __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design"
  __license__ = "GNU GPL"
  
  __all__ = ['parse_html']
@@ -41,6 +41,9 @@ def parse_html(filename, charset=None, log=None):
                      break
                  except IndexError:
                      meta_charset = False
+        elif m.get('charset', ''):
+           meta_charset = m.get('charset').lower()
+           break
      else:
          meta_charset = False
  
diff --git a/parse_html/bkmk_ph_html5.py b/parse_html/bkmk_ph_html5.py

index a4906288af13fe5da0cf3f24f39e30a010f7cbe7..53109be72fd761f874a92723f0eb4d606c47b11f 100644 (file)
--- a/parse_html/bkmk_ph_html5.py
+++ b/parse_html/bkmk_ph_html5.py
@@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot.
  """
  
  __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design"
  __license__ = "GNU GPL"
  
  __all__ = ['parse_html']
@@ -53,7 +53,7 @@ def parse_html(filename, charset=None, log=None):
                      title = ''
  
          for node in head.childNodes:
-            if node.name == 'meta' and \
+            if (node.name == 'meta') and \
                      ('http-equiv' in node.attributes) and \
                      (node.attributes['http-equiv'] == 'content-type'):
                  meta_content = node.attributes['content']
@@ -65,6 +65,9 @@ def parse_html(filename, charset=None, log=None):
                          meta_charset = False
                      else:
                          break
+            elif (node.name == 'meta') and ('charset' in node.attributes):
+                meta_charset = node.attributes['charset'].lower()
+                break
  
          if not charset:
              charset = parser.tokenizer.stream.charEncoding[0]
diff --git a/parse_html/bkmk_ph_htmlparser.py b/parse_html/bkmk_ph_htmlparser.py

index 8cdd240a06c7fb3514e79eb9df571dcf1b67279c..d7020b03e933a9d822c541fe9c3936a5e0310ee1 100644 (file)
--- a/parse_html/bkmk_ph_htmlparser.py
+++ b/parse_html/bkmk_ph_htmlparser.py
@@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot.
  """
  
  __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 1997-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 1997-2013 PhiloSoft Design"
  __license__ = "GNU GPL"
  
  __all__ = ['parse_html']
@@ -40,6 +40,9 @@ class HTMLParser(_HTMLParser):
                 http_equiv = value.lower()
              elif attrname == 'content':
                 content = value
+            elif (attrname == 'charset') and (not self.charset):
+               self.charset = value.lower()
+               self.meta_charset = 1
  
        if (not self.charset) and (http_equiv == "content-type"):
           try:
diff --git a/parse_html/bkmk_ph_lxml.py b/parse_html/bkmk_ph_lxml.py

index b14be408571ba53e0cfa5f60997394658ad2061b..222f11697340f32381f846ab55cf753ffa15444a 100644 (file)
--- a/parse_html/bkmk_ph_lxml.py
+++ b/parse_html/bkmk_ph_lxml.py
@@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot.
  """
  
  __author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design"
  __license__ = "GNU GPL"
  
  __all__ = ['parse_html']
@@ -35,6 +35,9 @@ def parse_html(filename, charset=None, log=None):
                      break
                  except IndexError:
                      meta_charset = False
+        elif m.get('charset', ''):
+           meta_charset = m.get('charset').lower()
+           break
      else:
          meta_charset = False
author	Oleg Broytman <phd@phdru.name>
	Wed, 4 Dec 2013 15:35:38 +0000 (19:35 +0400)
committer	Oleg Broytman <phd@phdru.name>
	Wed, 4 Dec 2013 15:35:38 +0000 (19:35 +0400)
doc/TODO		patch | blob | history
parse_html/bkmk_ph_beautifulsoup.py		patch | blob | history
parse_html/bkmk_ph_etreetidy.py		patch | blob | history
parse_html/bkmk_ph_html5.py		patch | blob | history
parse_html/bkmk_ph_htmlparser.py		patch | blob | history
parse_html/bkmk_ph_lxml.py		patch | blob | history