git.phdru.name Git - bookmarks_db.git/commitdiff
Try a list of charsets, including the universal (utf-8) and the default ones.
author: Oleg Broytman <phd@phdru.name>
Sat, 23 Feb 2008 22:34:24 +0000 (22:34 +0000)
committer: Oleg Broytman <phd@phdru.name>
Sat, 23 Feb 2008 22:34:24 +0000 (22:34 +0000)
git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@178 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23

Robots/parse_html.py

index c7acb6426e7dfbf386702ffb18fe5f12b3bed1ce..6fe1df954236f855e2f08ae1ca048b4f275a704a 100755 (executable)
@@ -56,7 +56,15 @@ def parse_html(filename, charset=None, log=None):
          charset = None         # ...try charset from HTML
 
    for p in parsers:
-      parser = p(filename, charset)
+      charsets = [universal_charset, DEFAULT_CHARSET]
+      if charset not in charsets:
+         charsets.insert(0, charset)
+      parser = None
+      for c in charsets:
+         try:
+            parser = p(filename, c)
+         except UnicodeEncodeError:
+            pass
       if parser:
          break
       else: