From 38f3645cce7a5875128d788df6631069c761b987 Mon Sep 17 00:00:00 2001
From: Oleg Broytman <phd@phdru.name>
Date: Tue, 4 Mar 2008 12:07:52 +0000
Subject: [PATCH] Extract charset from "text/html; foo; charset=UTF-8, bar;
 baz;" ("bar" is in case there are several Content-Type headers.)

git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@203 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23
---
 Robots/bkmk_rsimple.py          | 5 +++--
 Robots/parse_html_htmlparser.py | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/Robots/bkmk_rsimple.py b/Robots/bkmk_rsimple.py
index 0dee51e..f481d66 100644
--- a/Robots/bkmk_rsimple.py
+++ b/Robots/bkmk_rsimple.py
@@ -1,7 +1,7 @@
 """
    Simple, strightforward robot
 
-   Written by Oleg BroytMann. Copyright (C) 2000-2007 PhiloSoft Design.
+   Written by Oleg BroytMann. Copyright (C) 2000-2008 PhiloSoft Design.
 """
 
 import sys, os
@@ -157,9 +157,10 @@ class robot_simple(Robot):
             try:
                content_type = headers["Content-Type"]
                try:
+                  # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
                   content_type, charset = content_type.split(';', 1)
                   content_type = content_type.strip()
-                  charset = charset.split('=')[1].strip()
+                  charset = charset.split('=')[1].strip().split(',')[0]
                   self.log("   HTTP charset   : %s" % charset)
                except (ValueError, IndexError):
                   charset = None
diff --git a/Robots/parse_html_htmlparser.py b/Robots/parse_html_htmlparser.py
index 30911dd..cccfe8c 100644
--- a/Robots/parse_html_htmlparser.py
+++ b/Robots/parse_html_htmlparser.py
@@ -38,8 +38,8 @@ class HTMLParser(_HTMLParser):
 
       if (not self.charset) and (http_equiv == "content-type"):
          try:
-            # extract charset from "text/html; foo; charset=UTF-8; bar;"
-            self.charset = content.lower().split('charset=')[1].split(';')[0]
+            # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
+            self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0]
             self.meta_charset = 1 # Remember that the charset was retrieved from
                                   # META tag, not from the Content-Type header
          except IndexError:
-- 
2.39.2