git.phdru.name Git - bookmarks_db.git/commitdiff
In case of unknown charset try charset from HTML.
author Oleg Broytman <phd@phdru.name>
Wed, 10 Oct 2007 16:08:03 +0000 (16:08 +0000)
committer Oleg Broytman <phd@phdru.name>
Wed, 10 Oct 2007 16:08:03 +0000 (16:08 +0000)
git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@80 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23

Robots/parse_html.py

index d2ca6567fe319a4866b348c81ca5e13a65b3d0ee..6ec63369065035ca61308d3ad5cd65738dabd554 100755 (executable)
@@ -104,6 +104,12 @@ def recode_entities(title, charset):
 
 
 def parse_html(filename, charset=None, log=None):
+   if charset:
+      try:
+         codecs.lookup(charset) # In case of unknown charset...
+      except (ValueError, LookupError):
+         charset = None         # ...try charset from HTML
+
    infile = open(filename, 'r')
    parser = HTMLParser(charset)