From 38f3645cce7a5875128d788df6631069c761b987 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Tue, 4 Mar 2008 12:07:52 +0000 Subject: [PATCH] Extract charset from "text/html; foo; charset=UTF-8, bar; baz;" ("bar" is in case there are several Content-Type headers.) git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@203 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- Robots/bkmk_rsimple.py | 5 +++-- Robots/parse_html_htmlparser.py | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/Robots/bkmk_rsimple.py b/Robots/bkmk_rsimple.py index 0dee51e..f481d66 100644 --- a/Robots/bkmk_rsimple.py +++ b/Robots/bkmk_rsimple.py @@ -1,7 +1,7 @@ """ Simple, strightforward robot - Written by Oleg BroytMann. Copyright (C) 2000-2007 PhiloSoft Design. + Written by Oleg BroytMann. Copyright (C) 2000-2008 PhiloSoft Design. """ import sys, os @@ -157,9 +157,10 @@ class robot_simple(Robot): try: content_type = headers["Content-Type"] try: + # extract charset from "text/html; foo; charset=UTF-8, bar; baz;" content_type, charset = content_type.split(';', 1) content_type = content_type.strip() - charset = charset.split('=')[1].strip() + charset = charset.split('=')[1].strip().split(',')[0] self.log(" HTTP charset : %s" % charset) except (ValueError, IndexError): charset = None diff --git a/Robots/parse_html_htmlparser.py b/Robots/parse_html_htmlparser.py index 30911dd..cccfe8c 100644 --- a/Robots/parse_html_htmlparser.py +++ b/Robots/parse_html_htmlparser.py @@ -38,8 +38,8 @@ class HTMLParser(_HTMLParser): if (not self.charset) and (http_equiv == "content-type"): try: - # extract charset from "text/html; foo; charset=UTF-8; bar;" - self.charset = content.lower().split('charset=')[1].split(';')[0] + # extract charset from "text/html; foo; charset=UTF-8, bar; baz;" + self.charset = content.lower().split('charset=')[1].split(';')[0].split(',')[0] self.meta_charset = 1 # Remember that the charset was retrieved from # META tag, not from the Content-Type header except
IndexError: -- 2.39.2