X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=bkmk_parser.py;h=8f44f47f13bd1459a0f910dfbfd517e997ba6f46;hb=71900f3630cb51580964038b78100d60e3671981;hp=c156cb76cc1b8fab12dcd48d012817a62ac73cbe;hpb=387f77d110986aa12967c9cd788ab0e4f41f2be2;p=bookmarks_db.git diff --git a/bkmk_parser.py b/bkmk_parser.py old mode 100755 new mode 100644 index c156cb7..8f44f47 --- a/bkmk_parser.py +++ b/bkmk_parser.py @@ -1,25 +1,33 @@ -""" - Parser for Netscape Navigator's bookmarks.html +"""Parser for Netscape Navigator's and Mozilla's bookmarks.html - Written by BroytMann, Jun 1997 - Jul 2003. Copyright (C) 1997-2003 PhiloSoft Design +This file is a part of Bookmarks database and Internet robot. """ +__author__ = "Oleg Broytman " +__copyright__ = "Copyright (C) 1997-2017 PhiloSoft Design" +__license__ = "GNU GPL" + +__all__ = ['BkmkParser'] + -import string +import os +from m_lib.defenc import default_encoding from m_lib.net.www.html import HTMLParser from bkmk_objects import Folder, Bookmark, Ruler -if __debug__: +DEBUG = os.environ.has_key("BKMK_DEBUG") + +if DEBUG: def debug(note): - print note + print(note) def dump_names(folder_stack): l = [] for object in folder_stack: if object.isFolder: l.append(object.name) - return "'" + string.join(l, "' '") + "'" + return "'%s'" % "' '".join(l) else: def debug(note): @@ -34,16 +42,14 @@ class BkmkParser(HTMLParser): self.urls = 0 self.objects = 0 - self.charset = "" + self.charset = None self.recode = None - def handle_data(self, data): if data: - if self.charset: - data = unicode(data, self.charset).encode() - self.accumulator = "%s%s" % (self.accumulator, data) - + if self.charset and default_encoding: + data = unicode(data, self.charset, "replace").encode(default_encoding, "xmlcharrefreplace") + self.accumulator += data # Mozilla - get charset def do_meta(self, attrs): @@ -51,7 +57,7 @@ class BkmkParser(HTMLParser): content = "" for attrname, value in attrs: - value = string.strip(value) + value = value.strip() if attrname == 'http-equiv': http_equiv = value.lower() elif attrname == 'content': @@ -64,13 +70,13 @@ class BkmkParser(HTMLParser): except IndexError: pass - def start_title(self, attrs): - self.accumulator = "%s" % self.accumulator + if default_encoding: + self.accumulator += '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=%s">\n' % default_encoding + self.accumulator += "<TITLE>" def end_title(self): - self.accumulator = "%s" % self.accumulator - + self.accumulator += "" # Start root folder def start_h1(self, attrs): @@ -80,7 +86,7 @@ class BkmkParser(HTMLParser): self.current_folder = root_folder self.folder_stack = [root_folder] - self.root_folder.header = self.accumulator + self.root_folder.header = self.accumulator.strip() self.accumulator = '' def end_h1(self): @@ -90,21 +96,23 @@ class BkmkParser(HTMLParser): debug("Root folder name: `%s'" % accumulator) self.root_folder.name = accumulator - - # Start next folder + # Start a folder def start_h3(self, attrs): + last_modified = None for attrname, value in attrs: - value = string.strip(value) + value = value.strip() if attrname == 'add_date': add_date = value + elif attrname == 'last_modified': + last_modified = value debug("New folder...") - folder = Folder(add_date) + folder = Folder(add_date, last_modified=last_modified) self.current_object = folder self.current_folder.append(folder) self.folder_stack.append(folder) # push new folder self.current_folder = folder - self.objects = self.objects + 1 + self.objects += 1 def end_h3(self): accumulator = self.accumulator @@ -113,29 +121,40 @@ class BkmkParser(HTMLParser): debug("Folder name: `%s'" % accumulator) self.current_folder.name = accumulator - - # Start bookmark + # Start a bookmark def start_a(self, attrs): + add_date = None last_visit = None last_modified = None + keyword = '' + icon = None + charset = None for attrname, value in attrs: - value = string.strip(value) - if attrname == 'href': + value = value.strip() + if attrname == "href": href = value - if attrname == 'add_date': + elif attrname == "add_date": add_date = value - if attrname == 'last_visit': + elif attrname == "last_visit": last_visit = value - if attrname == 'last_modified': + elif attrname == "last_modified": last_modified = value + elif attrname == "shortcuturl": + keyword = value + elif attrname == "icon": + icon = value + elif attrname == "last_charset": + charset = value debug("Bookmark points to: `%s'" % href) - bookmark = Bookmark(href, add_date, last_visit, last_modified) + bookmark = Bookmark(href, add_date, last_visit, last_modified, + keyword=keyword, icon=icon, + charset=charset, parser_charset=self.charset or default_encoding) self.current_object = bookmark self.current_folder.append(bookmark) - self.urls = self.urls + 1 - self.objects = self.objects + 1 + self.urls += 1 + self.objects += 1 def end_a(self): accumulator = self.accumulator @@ -145,7 +164,6 @@ class BkmkParser(HTMLParser): bookmark = self.current_folder[-1] bookmark.name = accumulator - def flush(self): accumulator = self.accumulator @@ -153,16 +171,15 @@ class BkmkParser(HTMLParser): self.accumulator = '' current_object = self.current_object - current_object.comment = current_object.comment + accumulator - debug("Comment: `%s'" % current_object.comment) - + if current_object: + current_object.comment += accumulator.strip() + debug("Comment: `%s'" % current_object.comment) def start_dl(self, attrs): self.flush() do_dt = start_dl - # End of folder def end_dl(self): self.flush() @@ -178,37 +195,31 @@ class BkmkParser(HTMLParser): debug("FOLDER STACK is EMPTY!!! (2)") self.current_object = None - def close(self): HTMLParser.close(self) if self.folder_stack: - raise ValueError, "wrong folder stack: %s" % self.folder_stack - + raise ValueError("wrong folder stack: %s" % self.folder_stack) def do_dd(self, attrs): pass do_p = do_dd - # Start ruler def do_hr(self, attrs): self.flush() debug("Ruler") self.current_folder.append(Ruler()) self.current_object = None - self.objects = self.objects + 1 - + self.objects += 1 # BR in comment def do_br(self, attrs): - self.accumulator = "%s
" % self.accumulator - + self.accumulator += "
" # Allow < in the text def unknown_starttag(self, tag, attrs): - self.accumulator = "%s<%s>" % (self.accumulator, tag) - + self.accumulator += "<%s>" % tag # Do not allow unknow end tags def unknown_endtag(self, tag):