git.phdru.name Git - bookmarks_db.git/commitdiff
Added code to collect statistics on parsers;
author: Oleg Broytman <phd@phdru.name>
Thu, 6 Jan 2011 19:00:27 +0000 (19:00 +0000)
committer: Oleg Broytman <phd@phdru.name>
Thu, 6 Jan 2011 19:00:27 +0000 (19:00 +0000)
sorted parsers according to the statistics; commented out the statistics code.

git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@322 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23

parse_html/bkmk_parse_html.py

index f42dab8d4de79b040b4638b5d8d7806aa519b5d2..42cb5ce1575abf2f0e1eb073943ca62240bcccb2 100644 (file)
@@ -20,6 +20,12 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
 
 parsers = []
 
+# Statistics by parser - successfully parsed HTML pages:
+# 4358 beautifulsoup
+# 4310 htmlparser
+# 4307 html5
+# 4250 lxml
+
 try:
    from . import bkmk_ph_beautifulsoup
 except ImportError:
@@ -29,25 +35,25 @@ else:
    parsers.append(bkmk_ph_beautifulsoup.parse_html)
 
 try:
-   from . import bkmk_ph_lxml
+   from . import bkmk_ph_htmlparser
 except ImportError:
    pass
 else:
-    parsers.append(bkmk_ph_lxml.parse_html)
+    parsers.append(bkmk_ph_htmlparser.parse_html)
 
 try:
-   from . import bkmk_ph_htmlparser
+   from . import bkmk_ph_html5
 except ImportError:
    pass
 else:
-    parsers.append(bkmk_ph_htmlparser.parse_html)
+   parsers.append(bkmk_ph_html5.parse_html)
 
 try:
-   from . import bkmk_ph_html5
+   from . import bkmk_ph_lxml
 except ImportError:
    pass
 else:
-   parsers.append(bkmk_ph_html5.parse_html)
+    parsers.append(bkmk_ph_lxml.parse_html)
 
 # ElementTidy often segfaults
 #try:
@@ -103,25 +109,31 @@ def parse_html(filename, charset=None, log=None):
          charsets.remove(charset)
       charsets.insert(0, charset)
 
+   #_parsers = []
    for p in parsers:
       parser = None
       for c in charsets:
          try:
             parser = p(filename, c, log)
-         except UnicodeEncodeError:
+         except UnicodeError:
             pass
          else:
-            break
+            if parser:
+               if log: log("   Parser %s: ok" % p.__module__)
+               #_parsers.append(parser)
+               break
+      else:
+         if log: log("   Parser %s: fail" % p.__module__)
       if parser:
          break
-      else:
-         if log: log("Parser %s.%s failed, trying next one." % (p.__module__, p.__name__))
 
+   #if not _parsers:
    if not parser:
-       if log: log("All parser has failed.")
+       if log: log("   All parser has failed")
        return None
 
-   if log: log("   Using %s" % p.__module__)
+   #parser = _parsers[0]
+   if log: log("   Using %s" % parser.__module__)
 
    converted_title = title = parser.title
    if title and (not parser.charset):