From beae1a1df44dc95377bbb0a28e1eb2eac827c8b5 Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Wed, 10 Oct 2007 16:08:03 +0000 Subject: [PATCH] In case of unknown charset try charset from HTML. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@80 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- Robots/parse_html.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Robots/parse_html.py b/Robots/parse_html.py index d2ca656..6ec6336 100755 --- a/Robots/parse_html.py +++ b/Robots/parse_html.py @@ -104,6 +104,12 @@ def recode_entities(title, charset): def parse_html(filename, charset=None, log=None): + if charset: + try: + codecs.lookup(charset) # In case of unknown charset... + except (ValueError, LookupError): + charset = None # ...try charset from HTML + infile = open(filename, 'r') parser = HTMLParser(charset) -- 2.39.5