X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=parse_html%2F__init__.py;fp=Robots%2Fparse_html.py;h=c67f2e7ef30a0ff18233d604a1b8b1f6c54487a9;hb=9989e73be9690cf0fccab901c9db81711cb9a9e7;hp=cbb45d612f9b33699b6f2d0bbe397b11a3562fe1;hpb=f79d81fcd336e913bf0f42f0a6fbdb582de0f3e3;p=bookmarks_db.git diff --git a/Robots/parse_html.py b/parse_html/__init__.py old mode 100755 new mode 100644 similarity index 91% rename from Robots/parse_html.py rename to parse_html/__init__.py index cbb45d6..c67f2e7 --- a/Robots/parse_html.py +++ b/parse_html/__init__.py @@ -1,4 +1,3 @@ -#! /usr/bin/env python """ HTML Parsers wrapper @@ -13,41 +12,41 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic parsers = [] try: - import parse_html_beautifulsoup - parse_html_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET + from . import beautifulsoup except ImportError: pass else: - parsers.append(parse_html_beautifulsoup.parse_html) + beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET + parsers.append(beautifulsoup.parse_html) try: - from parse_html_lxml import parse_html + from .lxml import parse_html except ImportError: pass else: parsers.append(parse_html) try: - from parse_html_htmlparser import parse_html + from .htmlparser import parse_html except ImportError: pass else: parsers.append(parse_html) try: - import parse_html_html5 + from . import html5 except ImportError: pass else: - parsers.append(parse_html_html5.parse_html) + parsers.append(html5.parse_html) # ElementTidy often segfaults #try: -# import parse_html_etreetidy +# from . import etreetidy #except ImportError: # pass #else: -# parsers.append(parse_html_etreetidy.parse_html) +# parsers.append(etreetidy.parse_html) import re from htmlentitydefs import name2codepoint @@ -100,9 +99,10 @@ def parse_html(filename, charset=None, log=None): for c in charsets: try: parser = p(filename, c, log) - break except UnicodeEncodeError: pass + else: + break if parser: break else: @@ -161,7 +161,7 @@ def parse_html(filename, charset=None, log=None): return parser -if __name__ == '__main__': +def test(): import sys l = len(sys.argv)