"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2000-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2000-2014 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['robot_simple', 'get_error']
from m_lib.md5wrapper import md5wrapper
from bkmk_objects import Robot
-from parse_html import parse_html
+from parse_html import parse_filename
class RedirectException(Exception):
else:
html = False
if html:
- parser = parse_html(fname, charset, self.log)
+ parser = parse_filename(fname, charset, self.log)
if parser:
bookmark.real_title = parser.title
icon = parser.icon
WHAT'S NEW in version 4.5.7
+ Change parse_html to parse strings, not files.
+
Add ChangeLog.
-Change parse_html to parse strings, not files.
-
Split simple robot: separate network operations and URL handling.
Allow parameters in BKMK_ROBOT; for example, 'forking:urllib'.
"""HTML Parsers
This file is a part of Bookmarks database and Internet robot.
+
"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 1997-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 1997-2014 PhiloSoft Design"
__license__ = "GNU GPL"
-__all__ = ['parse_html', 'main']
+__all__ = ['parse_html', 'parse_filename', 'main']
-from .bkmk_parse_html import parse_html
+from .bkmk_parse_html import parse_html, parse_filename
def main():
else:
sys.exit("Usage: main filename [charset]")
- parser = parse_html(filename, charset, log=lambda s: sys.stdout.write(s + '\n'))
+ parser = parse_filename(filename, charset, log=lambda s: sys.stdout.write(s + '\n'))
print " refresh:", parser.refresh
print " icon :", parser.icon
"""HTML Parsers
This file is a part of Bookmarks database and Internet robot.
+
"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 1997-2012 PhiloSoft Design"
+__copyright__ = "Copyright (C) 1997-2014 PhiloSoft Design"
__license__ = "GNU GPL"
-__all__ = ['parse_html', 'universal_charset']
+__all__ = ['parse_html', 'parse_filename', 'universal_charset']
import codecs
import os
BKMK_DEBUG_HTML_PARSERS = os.environ.get("BKMK_DEBUG_HTML_PARSERS")
-def parse_html(filename, charset=None, log=None):
+def parse_html(html_text, charset=None, log=None):
if not parsers:
return None
parser = None
for c in charsets:
try:
- parser = p(filename, c, log)
+ parser = p(html_text, c, log)
except UnicodeError:
pass
else:
if parser.charset:
parser.icon = icon.encode(parser.charset)
return parser
+
+def parse_filename(filename, charset=None, log=None):
+ fp = open(filename, 'r')
+ try:
+ parser = parse_html(fp.read(), charset=charset, log=log)
+ finally:
+ fp.close()
+ return parser
"""HTML Parser using BeautifulSoup
This file is a part of Bookmarks database and Internet robot.
+
"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2007-2013 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2007-2014 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['parse_html']
return j
-def _parse_html(filename, charset):
- infile = open(filename, 'r')
+def _parse_html(html_text, charset):
try:
- return BadDeclParser(infile, fromEncoding=charset)
+ return BadDeclParser(html_text, fromEncoding=charset)
except TypeError:
return None
- finally:
- infile.close()
-def parse_html(filename, charset=None, log=None):
- root = _parse_html(filename, charset)
+def parse_html(html_text, charset=None, log=None):
+ root = _parse_html(html_text, charset)
if root is None:
return None
_charset = root.originalEncoding
if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"): # Replace default
_charset = DEFAULT_CHARSET
- root = _parse_html(filename, _charset)
+ root = _parse_html(html_text, _charset)
if root is None:
return None
"""HTML Parser using ElementTree+TidyLib.
This file is a part of Bookmarks database and Internet robot.
+
"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2014 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['parse_html']
from .bkmk_ph_util import HTMLParser
-def parse_html(filename, charset=None, log=None):
+def parse_html(html_text, charset=None, log=None):
try:
- html_tree = TidyHTMLTreeBuilder.parse(filename)
+ html_tree = TidyHTMLTreeBuilder.parseString(html_text)
except:
return None
"""HTML Parser using html5
This file is a part of Bookmarks database and Internet robot.
+
"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2014 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['parse_html']
from .bkmk_ph_util import HTMLParser
-def parse_html(filename, charset=None, log=None):
+def parse_html(html_text, charset=None, log=None):
parser = HTML5Parser()
- fp = open(filename)
- parser._parse(fp, encoding=charset, parseMeta=bool(charset))
- fp.close()
- html_tree = parser.tree.getDocument()
+ html_tree = parser.parse(html_text, encoding=charset, parseMeta=bool(charset))
for node in html_tree.childNodes:
if (node.name == 'html') and (node.type != 3): # Skip DocType element
"""HTML Parser using Pythons' HTMLParser
This file is a part of Bookmarks database and Internet robot.
+
"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 1997-2013 PhiloSoft Design"
+__copyright__ = "Copyright (C) 1997-2014 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['parse_html']
self.icon = href
-def parse_html(filename, charset=None, log=None):
- infile = open(filename, 'r')
+def parse_html(html_text, charset=None, log=None):
parser = HTMLParser(charset)
- for line in infile:
- try:
- parser.feed(line)
- except (HTMLParseError, HTMLHeadDone):
- break
-
- infile.close()
+ try:
+ parser.feed(html_text)
+ except (HTMLParseError, HTMLHeadDone):
+ pass
try:
parser.close()
"""HTML Parser using lxml.html
This file is a part of Bookmarks database and Internet robot.
+
"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2013 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2014 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['parse_html']
-from lxml.html import parse
+from lxml.html import fromstring
from .bkmk_ph_util import HTMLParser
-def parse_html(filename, charset=None, log=None):
- html_tree = parse(filename)
+def parse_html(html_text, charset=None, log=None):
+ html_tree = fromstring(html_text)
if html_tree.getroot() is None:
return None