Extract charset from "text/html; foo; charset=UTF-8, bar; baz;"

author Oleg Broytman <phd@phdru.name>

Tue, 4 Mar 2008 12:07:52 +0000 (12:07 +0000)

committer Oleg Broytman <phd@phdru.name>

Tue, 4 Mar 2008 12:07:52 +0000 (12:07 +0000)
author Oleg Broytman <phd@phdru.name>
Tue, 4 Mar 2008 12:07:52 +0000 (12:07 +0000)
committer Oleg Broytman <phd@phdru.name>
Tue, 4 Mar 2008 12:07:52 +0000 (12:07 +0000)
diff --git a/Robots/bkmk_rsimple.py b/Robots/bkmk_rsimple.py

index 0dee51e8837ae3db9922704d78bd3e08b7325df7..f481d66e1bce396d46387571211d010eeea40a52 100644 (file)
--- a/Robots/bkmk_rsimple.py
+++ b/Robots/bkmk_rsimple.py
@@ -1,7 +1,7 @@
  """
     Simple, strightforward robot
  
-   Written by Oleg BroytMann. Copyright (C) 2000-2007 PhiloSoft Design.
+   Written by Oleg BroytMann. Copyright (C) 2000-2008 PhiloSoft Design.
  """
  
  import sys, os
@@ -157,9 +157,10 @@ class robot_simple(Robot):
              try:
                 content_type = headers["Content-Type"]
                 try:
+                  # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
                    content_type, charset = content_type.split(';', 1)
                    content_type = content_type.strip()
-                  charset = charset.split('=')[1].strip()
+                  charset = charset.split('=')[1].strip().split(',')[0]
                    self.log("   HTTP charset   : %s" % charset)
                 except (ValueError, IndexError):
                    charset = None
diff --git a/Robots/parse_html_htmlparser.py b/Robots/parse_html_htmlparser.py

index 30911dda7012b0ad9aa273692cc766d3eb5cd775..cccfe8c7a4b5adab468accbed54287f009f1df72 100644 (file)
--- a/Robots/parse_html_htmlparser.py
+++ b/Robots/parse_html_htmlparser.py
@@ -38,8 +38,8 @@ class HTMLParser(_HTMLParser):
  
        if (not self.charset) and (http_equiv == "content-type"):
           try:
-            # extract charset from "text/html; foo; charset=UTF-8; bar;"
-            self.charset = content.lower().split('charset=')[1].split(';')[0]
+            # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
+            self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0]
              self.meta_charset = 1 # Remember that the charset was retrieved from
                                    # META tag, not from the Content-Type header
           except IndexError:
author	Oleg Broytman <phd@phdru.name>
	Tue, 4 Mar 2008 12:07:52 +0000 (12:07 +0000)
committer	Oleg Broytman <phd@phdru.name>
	Tue, 4 Mar 2008 12:07:52 +0000 (12:07 +0000)
Robots/bkmk_rsimple.py		patch | blob | history
Robots/parse_html_htmlparser.py		patch | blob | history