From 83354a5dc4b5b3fd4e2c3e8a1b43487254b4bcde Mon Sep 17 00:00:00 2001 From: Oleg Broytman Date: Sat, 23 Feb 2008 22:34:24 +0000 Subject: [PATCH] Try a list of charsets, including the universal (utf-8) and the default ones. git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@178 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23 --- Robots/parse_html.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/Robots/parse_html.py b/Robots/parse_html.py index c7acb64..6fe1df9 100755 --- a/Robots/parse_html.py +++ b/Robots/parse_html.py @@ -56,7 +56,15 @@ def parse_html(filename, charset=None, log=None): charset = None # ...try charset from HTML for p in parsers: - parser = p(filename, charset) + charsets = [universal_charset, DEFAULT_CHARSET] + if charset not in charsets: + charsets.insert(0, charset) + parser = None + for c in charsets: + try: + parser = p(filename, c) + except UnicodeEncodeError: + pass if parser: break else: -- 2.39.2