git.phdru.name Git - bookmarks_db.git/commitdiff
Moved parse_html.py and its submodules to a separate parse_html module.
author Oleg Broytman <phd@phdru.name>
Mon, 3 Jan 2011 19:35:08 +0000 (19:35 +0000)
committer Oleg Broytman <phd@phdru.name>
Mon, 3 Jan 2011 19:35:08 +0000 (19:35 +0000)
git-svn-id: file:///home/phd/archive/SVN/bookmarks_db/trunk@311 fdd5c36f-1aea-0310-aeeb-c58d7e2b6c23

Robots/bkmk_rsimple_tos.py
parse_html/__init__.py [moved from Robots/parse_html.py with 91% similarity, mode: 0644]
parse_html/beautifulsoup.py [moved from Robots/parse_html_beautifulsoup.py with 97% similarity]
parse_html/etreetidy.py [moved from Robots/parse_html_etreetidy.py with 94% similarity]
parse_html/html5.py [moved from Robots/parse_html_html5.py with 96% similarity]
parse_html/htmlparser.py [moved from Robots/parse_html_htmlparser.py with 97% similarity]
parse_html/lxml.py [moved from Robots/parse_html_lxml.py with 93% similarity]
parse_html/util.py [moved from Robots/parse_html_util.py with 100% similarity]

index 0be22b843f421c5bd58505ce42bea75c3df4f4b3..ad42f09a73077370e68bcd1accf076d6230a78e5 100644 (file)
@@ -1,13 +1,13 @@
 """
    Simple robot with socket's timeout
 
-   Written by Broytman. Copyright (C) 2000-2010 PhiloSoft Design
+   Written by Broytman. Copyright (C) 2000-2011 PhiloSoft Design
 """
 
 import socket
 socket.setdefaulttimeout(900)
 
-from bkmk_rsimple import robot_simple, get_error
+from .bkmk_rsimple import robot_simple, get_error
 
 
 class robot_simple_tos(robot_simple):
old mode 100755 (executable)
new mode 100644 (file)
similarity index 91%
rename from Robots/parse_html.py
rename to parse_html/__init__.py
index cbb45d6..c67f2e7
@@ -1,4 +1,3 @@
-#! /usr/bin/env python
 """
    HTML Parsers wrapper
 
@@ -13,41 +12,41 @@ DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
 parsers = []
 
 try:
-   import parse_html_beautifulsoup
-   parse_html_beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET
+   from . import beautifulsoup
 except ImportError:
    pass
 else:
-   parsers.append(parse_html_beautifulsoup.parse_html)
+   beautifulsoup.DEFAULT_CHARSET = DEFAULT_CHARSET
+   parsers.append(beautifulsoup.parse_html)
 
 try:
-   from parse_html_lxml import parse_html
+   from .lxml import parse_html
 except ImportError:
    pass
 else:
     parsers.append(parse_html)
 
 try:
-   from parse_html_htmlparser import parse_html
+   from .htmlparser import parse_html
 except ImportError:
    pass
 else:
     parsers.append(parse_html)
 
 try:
-   import parse_html_html5
+   from . import html5
 except ImportError:
    pass
 else:
-   parsers.append(parse_html_html5.parse_html)
+   parsers.append(html5.parse_html)
 
 # ElementTidy often segfaults
 #try:
-#   import parse_html_etreetidy
+#   from . import etreetidy
 #except ImportError:
 #   pass
 #else:
-#   parsers.append(parse_html_etreetidy.parse_html)
+#   parsers.append(etreetidy.parse_html)
 
 import re
 from htmlentitydefs import name2codepoint
@@ -100,9 +99,10 @@ def parse_html(filename, charset=None, log=None):
       for c in charsets:
          try:
             parser = p(filename, c, log)
-            break
          except UnicodeEncodeError:
             pass
+         else:
+            break
       if parser:
          break
       else:
@@ -161,7 +161,7 @@ def parse_html(filename, charset=None, log=None):
    return parser
 
 
-if __name__ == '__main__':
+def test():
    import sys
 
    l = len(sys.argv)
similarity index 97%
rename from Robots/parse_html_beautifulsoup.py
rename to parse_html/beautifulsoup.py
index e03dfce99faa55b980e21a75e38249790d50c0a7..86b8c1bee9a74257b1a809ea36c1d02061ec1445 100644 (file)
@@ -1,13 +1,13 @@
 """
    HTML Parser using BeautifulSoup
 
-   Written by Broytman. Copyright (C) 2007-2010 PhiloSoft Design
+   Written by Broytman. Copyright (C) 2007-2011 PhiloSoft Design
 """
 
 import re
 from sgmllib import SGMLParser, SGMLParseError
 from BeautifulSoup import BeautifulSoup, CData
-from parse_html_util import HTMLParser
+from .util import HTMLParser
 
 
 # http://groups.google.com/group/beautifulsoup/browse_thread/thread/69093cb0d3a3cf63
similarity index 94%
rename from Robots/parse_html_etreetidy.py
rename to parse_html/etreetidy.py
index 7149c2286332a55c3655bddda4632e26865279d7..d55aa57fc850b900a248e0e2daef3c0251e72ffd 100644 (file)
@@ -1,11 +1,11 @@
 """
     HTML Parser using ElementTree+TidyLib.
 
-    Written by Broytman. Copyright (C) 2010 PhiloSoft Design
+    Written by Broytman. Copyright (C) 2010, 2011 PhiloSoft Design
 """
 
 from elementtidy import TidyHTMLTreeBuilder
-from parse_html_util import HTMLParser
+from .util import HTMLParser
 
 
 def parse_html(filename, charset=None, log=None):
similarity index 96%
rename from Robots/parse_html_html5.py
rename to parse_html/html5.py
index 43e8d74ba957f658ffef276cadb2782ac27e84e5..52bd57654ae6a475a40937dff1d9818ac4d9160b 100644 (file)
@@ -1,11 +1,11 @@
 """
     HTML Parser using html5.
 
-    Written by Broytman. Copyright (C) 2010 PhiloSoft Design
+    Written by Broytman. Copyright (C) 2010, 2011 PhiloSoft Design
 """
 
 from html5lib import HTMLParser as HTML5Parser
-from parse_html_util import HTMLParser
+from .util import HTMLParser
 
 
 def parse_html(filename, charset=None, log=None):
similarity index 97%
rename from Robots/parse_html_htmlparser.py
rename to parse_html/htmlparser.py
index 77021624ff9543a82ebe26cc170398a49f13a6bb..5f885b2a9b853d1f216879481392e099272fb482 100644 (file)
@@ -1,7 +1,7 @@
 """
    HTML Parser
 
-   Written by Broytman. Copyright (C) 1997-2010 PhiloSoft Design
+   Written by Broytman. Copyright (C) 1997-2011 PhiloSoft Design
 """
 
 from HTMLParser import HTMLParseError
similarity index 93%
rename from Robots/parse_html_lxml.py
rename to parse_html/lxml.py
index 8f658e1a026d9d16789f69bf43c5cf8a8d45fd83..f39939772e5032c308bb9b1a2fbebc2ac9714fae 100644 (file)
@@ -1,11 +1,11 @@
 """
     HTML Parser using lxml.html.
 
-    Written by Broytman. Copyright (C) 2010 PhiloSoft Design
+    Written by Broytman. Copyright (C) 2010, 2011 PhiloSoft Design
 """
 
 from lxml.html import parse
-from parse_html_util import HTMLParser
+from .util import HTMLParser
 
 
 def parse_html(filename, charset=None, log=None):
similarity index 100%
rename from Robots/parse_html_util.py
rename to parse_html/util.py