X-Git-Url: https://git.phdru.name/?a=blobdiff_plain;f=bkmk_parser.py;h=02116ccf025c9edacd119362331b64174ce89767;hb=54576c927fe40509d146212dce65c555148a22c5;hp=c3ca1b5baf0a2ac8275a06ee26696d31e3b6ed68;hpb=2e82a937f80392639176d9a414b55ffb8164ebca;p=bookmarks_db.git diff --git a/bkmk_parser.py b/bkmk_parser.py old mode 100755 new mode 100644 index c3ca1b5..02116cc --- a/bkmk_parser.py +++ b/bkmk_parser.py @@ -1,321 +1,226 @@ -""" - Bookmarks parsers +"""Parser for Netscape Navigator's and Mozilla's bookmarks.html - Written by BroytMann, Mar 1997 - Feb 2000. Copyright (C) 1997-2000 PhiloSoft Design +This file is a part of Bookmarks database and Internet robot. """ +__author__ = "Oleg Broytman " +__copyright__ = "Copyright (C) 1997-2012 PhiloSoft Design" +__license__ = "GNU GPL" -import os, string, shutil -from htmllib import HTMLParser - - -class BookmarksParser(HTMLParser): # Parser for Navigator's bookmarks (abstract class) - def __init__(self, formatter, verbose=0): - HTMLParser.__init__(self, formatter, verbose) - self.urls_no = 0 # cross-reference counter - self.record_no = 1 # record counter - self.outfile = None # output file - self.level = 0 # Indentation level - self.flag_out = 0 # Is it time to flush? - self.saved_data = '' - self.saved_anchor = None - self.saved_folder = None - self.saved_ruler = None +__all__ = ['BkmkParser'] - def flush(self): - if not self.outfile: - return - - record_flushed = 0 +import os +from m_lib.defenc import default_encoding +from m_lib.net.www.html import HTMLParser +from bkmk_objects import Folder, Bookmark, Ruler - if self.saved_anchor: - name, href, add_date, last_visit, last_modified, comment = self.saved_anchor - self.saved_anchor = (name, href, add_date, last_visit, last_modified, comment + self.saved_data) - self.flush_anchor() - self.saved_data = '' - record_flushed = 1 - self.saved_anchor = None - if self.saved_folder: - name, add_date, comment = self.saved_folder - self.saved_folder = (name, add_date, comment + self.saved_data) - self.flush_folder() - self.saved_data = '' - record_flushed = 1 - self.saved_folder = None +DEBUG = os.environ.has_key("BKMK_DEBUG") - if self.saved_ruler: - self.flush_ruler() - record_flushed = 1 - self.saved_ruler = None +if DEBUG: + def debug(note): + print note - if record_flushed: - self.record_no = self.record_no + 1 + def dump_names(folder_stack): + l = [] + for object in folder_stack: + if object.isFolder: + l.append(object.name) + return "'%s'" % "' '".join(l) - if self.saved_data <> '': # This may occur after ampersand - self.flag_out = 0 +else: + def debug(note): + pass + dump_names = debug +class BkmkParser(HTMLParser): + def __init__(self): + HTMLParser.__init__(self) + self.urls = 0 + self.objects = 0 - def close(self): - HTMLParser.close(self) - - if self.outfile: - self.outfile.close() - - if self.level <> 0: - print "Bad HTML:
and
mismatch; level=%d" % self.level - + self.charset = None + self.recode = None def handle_data(self, data): - if not self.outfile: - return - - if data and (data[0] == '&'): # Ampersand parsed by SGMLlib - self.flag_out = 0 - - if self.flag_out == 2: # Process comment after
or
- if self.saved_anchor: - name, href, add_date, last_visit, last_modified, comment = self.saved_anchor - self.saved_anchor = (name, href, add_date, last_visit, last_modified, comment + data) - data = '' # Used - - if self.saved_folder: - name, add_date, comment = self.saved_folder - self.saved_folder = (name, add_date, comment + data) - data = '' # Used - - self.flag_out = 0 - - if self.flag_out == 1: - self.flush() - - if data and (data[0] <> '&') and (self.flag_out == 0): - self.flag_out = 1 # Set flag (to flush data on next call) - if data: - self.saved_data = self.saved_data + data + if self.charset and default_encoding: + data = unicode(data, self.charset, "replace").encode(default_encoding, "xmlcharrefreplace") + self.accumulator += data + # Mozilla - get charset + def do_meta(self, attrs): + http_equiv = "" + content = "" - def anchor_bgn(self, href, add_date, last_visit, last_modified): - self.flush() - self.anchor = (href, add_date, last_visit, last_modified) - + for attrname, value in attrs: + value = value.strip() + if attrname == 'http-equiv': + http_equiv = value.lower() + elif attrname == 'content': + content = value + + if http_equiv == "content-type": + try: + # extract charset from "text/html; charset=UTF-8" + self.charset = content.split('=')[1] + except IndexError: + pass + + def start_title(self, attrs): + if default_encoding: + self.accumulator += '\n' % default_encoding + self.accumulator += "" + + def end_title(self): + self.accumulator += "" + + # Start root folder + def start_h1(self, attrs): + root_folder = Folder() + self.current_object = root_folder + self.root_folder = root_folder + self.current_folder = root_folder + self.folder_stack = [root_folder] + + self.root_folder.header = self.accumulator.strip() + self.accumulator = '' + + def end_h1(self): + accumulator = self.accumulator + self.accumulator = '' + + debug("Root folder name: `%s'" % accumulator) + self.root_folder.name = accumulator + + # Start a folder + def start_h3(self, attrs): + last_modified = None + for attrname, value in attrs: + value = value.strip() + if attrname == 'add_date': + add_date = value + elif attrname == 'last_modified': + last_modified = value - def anchor_end(self): - if self.anchor: - href, add_date, last_visit, last_modified = self.anchor - self.anchor = None - self.urls_no = self.urls_no + 1 + debug("New folder...") + folder = Folder(add_date, last_modified=last_modified) + self.current_object = folder + self.current_folder.append(folder) + self.folder_stack.append(folder) # push new folder + self.current_folder = folder + self.objects += 1 - self.saved_anchor = (self.saved_data, href, add_date, last_visit, last_modified, '') - self.saved_data = '' # Used + def end_h3(self): + accumulator = self.accumulator + self.accumulator = '' + debug("Folder name: `%s'" % accumulator) + self.current_folder.name = accumulator + # Start a bookmark def start_a(self, attrs): - href = '' - add_date = '' - last_visit = '' - last_modified = '' + add_date = None + last_visit = None + last_modified = None + keyword = '' + icon = None + charset = None for attrname, value in attrs: - value = string.strip(value) - if attrname == 'href': + value = value.strip() + if attrname == "href": href = value - if attrname == 'add_date': + elif attrname == "add_date": add_date = value - if attrname == 'last_visit': + elif attrname == "last_visit": last_visit = value - if attrname == 'last_modified': + elif attrname == "last_modified": last_modified = value + elif attrname == "shortcuturl": + keyword = value + elif attrname == "icon": + icon = value + elif attrname == "last_charset": + charset = value + + debug("Bookmark points to: `%s'" % href) + bookmark = Bookmark(href, add_date, last_visit, last_modified, + keyword=keyword, icon=icon, + charset=charset, parser_charset=self.charset or default_encoding) + self.current_object = bookmark + self.current_folder.append(bookmark) + self.urls += 1 + self.objects += 1 + + def end_a(self): + accumulator = self.accumulator + self.accumulator = '' + + debug("Bookmark name: `%s'" % accumulator) + bookmark = self.current_folder[-1] + bookmark.name = accumulator - self.anchor_bgn(href, add_date, last_visit, last_modified) - - - def start_h3(self, attrs): # Navigator marks folders with

tags - self.flush() - add_date = '' - - for attrname, value in attrs: - value = string.strip(value) - if attrname == 'add_date': - add_date = value - - self.saved_folder = ('', add_date, '') - self.flag_out = 0 - + def flush(self): + accumulator = self.accumulator - def end_h3(self): # End of folder - name, add_date, comment = self.saved_folder - self.saved_folder = (name + self.saved_data, add_date, comment) - self.saved_data = '' # Used + if accumulator: + self.accumulator = '' + current_object = self.current_object + if current_object: + current_object.comment += accumulator.strip() + debug("Comment: `%s'" % current_object.comment) def start_dl(self, attrs): self.flush() - if not self.outfile: # We are starting output after 1st
tag to skip header - self.open_outfile() - - self.level = self.level + 1 - + do_dt = start_dl + # End of folder def end_dl(self): self.flush() - self.level = self.level - 1 + debug("End folder") + debug("Folder stack: %s" % dump_names(self.folder_stack)) + if self.folder_stack: + del self.folder_stack[-1] # pop last folder + if self.folder_stack: + self.current_folder = self.folder_stack[-1] + else: + debug("FOLDER STACK is EMPTY!!! (1)") + else: + debug("FOLDER STACK is EMPTY!!! (2)") + self.current_object = None + def close(self): + HTMLParser.close(self) + if self.folder_stack: + raise ValueError, "wrong folder stack: %s" % self.folder_stack def do_dd(self, attrs): - if self.outfile: - self.flag_out = 2 # Set flag to signal "comment starting" - - - def do_br(self, attrs): - if self.outfile: - self.saved_data = self.saved_data + "
" # Add
... - self.flag_out = 0 # ...and next line of comment to saved comment + pass + do_p = do_dd + # Start ruler def do_hr(self, attrs): - if self.outfile: - self.flush() - self.saved_ruler = 1 - - - def handle_charref(self, name): - if self.outfile: - self.flag_out = 0 - self.saved_data = "%s&%c" % (self.saved_data, chr(name)) - - - def handle_entityref(self, name): - if self.outfile: - self.flag_out = 0 - if self.entitydefs.has_key(name): # If it is one of the standard SGML entities - close it with semicolon - x = ';' - else: - x = '' - self.saved_data = "%s&%s%s" % (self.saved_data, name, x) - - - def open_outfile(self): - self.outfile = open("bookmarks.tmp", 'w') - - -class Bookmarks2Text(BookmarksParser): - def flush_anchor(self): - self.outfile.write(" "*(self.level-1) + str(self.saved_anchor) + '\n') - - - def flush_folder(self): - self.outfile.write(" "*(self.level-1) + str(self.saved_folder) + '\n') - - - def flush_ruler(self): - self.outfile.write(" "*(self.level-1) + "----------\n") - - - def __del__(self): - shutil.copy("bookmarks.tmp", "bookmarks.txt") - os.unlink("bookmarks.tmp") - - -class Bookmarks2Flad(BookmarksParser): - def __init__(self, formatter, verbose=0): - BookmarksParser.__init__(self, formatter, verbose) - self.flush_record = 0 - + self.flush() + debug("Ruler") + self.current_folder.append(Ruler()) + self.current_object = None + self.objects += 1 - def flush(self): - if not self.outfile: - return + # BR in comment + def do_br(self, attrs): + self.accumulator += "
" - record_flushed = 0 + # Allow < in the text + def unknown_starttag(self, tag, attrs): + self.accumulator += "<%s>" % tag - if self.saved_anchor or self.saved_folder or self.saved_ruler or self.saved_data: - if self.flush_record: - self.outfile.write('\n') - else: - self.flush_record = 1 - - BookmarksParser.flush(self) - - - def flush_anchor(self): - name, href, add_date, last_visit, last_modified, comment = self.saved_anchor - self.outfile.write("""Level: %d -Title: %s -URL: %s -AddDate: %s -LastVisit: %s -LastModified: %s -Comment: %s -""" % (self.level, name, href, add_date, last_visit, last_modified, comment)) - - def flush_folder(self): - name, add_date, comment = self.saved_folder - self.outfile.write("""Level: %d -Folder: %s -AddDate: %s -Comment: %s -""" % (self.level, name, add_date, comment)) - - def flush_ruler(self): - self.outfile.write("Level: %s\nRuler: YES\n" % self.level) - - - def __del__(self): - shutil.copy("bookmarks.tmp", "bookmarks.db") - os.unlink("bookmarks.tmp") - - -class Bookmarks2Gadfly(BookmarksParser): - def open_outfile(self): - import gadfly - connection = gadfly.gadfly() - connection.startup("bookmarks", ".") - self.connection = connection - - cursor = connection.cursor() - cursor.execute("""create table bookmarks ( - rec_no integer, - level integer, - title varchar, - DATA varchar, - add_date integer, - last_visit integer, - last_modified integer, - comment varchar - )""") - self.outfile = cursor - - self.template = """insert into bookmarks - (rec_no, level, title, DATA, add_date, last_visit, last_modified, comment) - values (?, ?, ?, ?, ?, ?, ?, ?)""" - - - def __del__(self): - self.connection.commit() - - - def flush_anchor(self): - name, href, add_date, last_visit, last_modified, comment = self.saved_anchor - self.outfile.execute(self.template, - (self.record_no, self.level, name, href, - add_date, last_visit, last_modified, comment) - ) - - def flush_folder(self): - name, add_date, comment = self.saved_folder - self.outfile.execute(self.template, - (self.record_no, self.level, name, "Folder", - add_date, '', '', comment) - ) - - def flush_ruler(self): - self.outfile.execute(self.template, - (self.record_no, self.level, '', "Ruler", - '', '', '', '') - ) + # Do not allow unknow end tags + def unknown_endtag(self, tag): + raise NotImplementedError("Unknow end tag `%s'" % tag)