"""
- Bookmarks parsers
+ Parser for Netscape Navigator's and Mozilla's bookmarks.html
- Written by BroytMann, Mar 1997 - Feb 2000. Copyright (C) 1997-2000 PhiloSoft Design
+ Written by BroytMann. Copyright (C) 1997-2003 PhiloSoft Design
"""
-import os, string, shutil
-from htmllib import HTMLParser
+import os
+from m_lib.net.www.html import HTMLParser
+from bkmk_objects import Folder, Bookmark, Ruler
-class BookmarksParser(HTMLParser): # Parser for Navigator's bookmarks (abstract class)
- def __init__(self, formatter, verbose=0):
- HTMLParser.__init__(self, formatter, verbose)
- self.urls_no = 0 # cross-reference counter
- self.record_no = 1 # record counter
- self.outfile = None # output file
- self.level = 0 # Indentation level
- self.flag_out = 0 # Is it time to flush?
- self.saved_data = ''
- self.saved_anchor = None
- self.saved_folder = None
- self.saved_ruler = None
+DEBUG = os.environ.has_key("BKMK_DEBUG")
+if DEBUG:
+ def debug(note):
+ print note
- def flush(self):
- if not self.outfile:
- return
-
- record_flushed = 0
+ def dump_names(folder_stack):
+ l = []
+ for object in folder_stack:
+ if object.isFolder:
+ l.append(object.name)
+ return "'%s'" % "' '".join(l)
- if self.saved_anchor:
- name, href, add_date, last_visit, last_modified, comment = self.saved_anchor
- self.saved_anchor = (name, href, add_date, last_visit, last_modified, comment + self.saved_data)
- self.flush_anchor()
- self.saved_data = ''
- record_flushed = 1
- self.saved_anchor = None
+else:
+ def debug(note):
+ pass
+ dump_names = debug
- if self.saved_folder:
- name, add_date, comment = self.saved_folder
- self.saved_folder = (name, add_date, comment + self.saved_data)
- self.flush_folder()
- self.saved_data = ''
- record_flushed = 1
- self.saved_folder = None
- if self.saved_ruler:
- self.flush_ruler()
- record_flushed = 1
- self.saved_ruler = None
+class BkmkParser(HTMLParser):
+ def __init__(self):
+ HTMLParser.__init__(self)
- if record_flushed:
- self.record_no = self.record_no + 1
+ self.urls = 0
+ self.objects = 0
- if self.saved_data <> '': # This may occur after ampersand
- self.flag_out = 0
+ self.charset = ""
+ self.recode = None
+ def handle_data(self, data):
+ if data:
+ if self.charset:
+ data = unicode(data, self.charset).encode()
+ self.accumulator += data
- def close(self):
- HTMLParser.close(self)
+ # Mozilla - get charset
+ def do_meta(self, attrs):
+ http_equiv = ""
+ content = ""
- if self.outfile:
- self.outfile.close()
+ for attrname, value in attrs:
+ value = value.strip()
+ if attrname == 'http-equiv':
+ http_equiv = value.lower()
+ elif attrname == 'content':
+ content = value
- if self.level <> 0:
- print "Bad HTML: <DL> and </DL> mismatch; level=%d" % self.level
+ if http_equiv == "content-type":
+ try:
+ # extract charset from "text/html; charset=UTF-8"
+ self.charset = content.split('=')[1]
+ except IndexError:
+ pass
- def handle_data(self, data):
- if not self.outfile:
- return
+ def start_title(self, attrs):
+ self.accumulator += "<TITLE>"
- if data and (data[0] == '&'): # Ampersand parsed by SGMLlib
- self.flag_out = 0
+ def end_title(self):
+ self.accumulator += "</TITLE>"
- if self.flag_out == 2: # Process comment after <DD> or <HR>
- if self.saved_anchor:
- name, href, add_date, last_visit, last_modified, comment = self.saved_anchor
- self.saved_anchor = (name, href, add_date, last_visit, last_modified, comment + data)
- data = '' # Used
- if self.saved_folder:
- name, add_date, comment = self.saved_folder
- self.saved_folder = (name, add_date, comment + data)
- data = '' # Used
+ # Start root folder
+ def start_h1(self, attrs):
+ root_folder = Folder()
+ self.current_object = root_folder
+ self.root_folder = root_folder
+ self.current_folder = root_folder
+ self.folder_stack = [root_folder]
- self.flag_out = 0
+ self.root_folder.header = self.accumulator.strip()
+ self.accumulator = ''
- if self.flag_out == 1:
- self.flush()
+ def end_h1(self):
+ accumulator = self.accumulator
+ self.accumulator = ''
- if data and (data[0] <> '&') and (self.flag_out == 0):
- self.flag_out = 1 # Set flag (to flush data on next call)
+ debug("Root folder name: `%s'" % accumulator)
+ self.root_folder.name = accumulator
- if data:
- self.saved_data = self.saved_data + data
+ # Start next folder
+ def start_h3(self, attrs):
+ for attrname, value in attrs:
+ value = value.strip()
+ if attrname == 'add_date':
+ add_date = value
- def anchor_bgn(self, href, add_date, last_visit, last_modified):
- self.flush()
- self.anchor = (href, add_date, last_visit, last_modified)
-
+ debug("New folder...")
+ folder = Folder(add_date)
+ self.current_object = folder
+ self.current_folder.append(folder)
+ self.folder_stack.append(folder) # push new folder
+ self.current_folder = folder
+ self.objects += 1
- def anchor_end(self):
- if self.anchor:
- href, add_date, last_visit, last_modified = self.anchor
- self.anchor = None
- self.urls_no = self.urls_no + 1
+ def end_h3(self):
+ accumulator = self.accumulator
+ self.accumulator = ''
- self.saved_anchor = (self.saved_data, href, add_date, last_visit, last_modified, '')
- self.saved_data = '' # Used
+ debug("Folder name: `%s'" % accumulator)
+ self.current_folder.name = accumulator
+ # Start bookmark
def start_a(self, attrs):
- href = ''
- add_date = ''
- last_visit = ''
- last_modified = ''
+ last_visit = None
+ last_modified = None
for attrname, value in attrs:
- value = string.strip(value)
+ value = value.strip()
if attrname == 'href':
href = value
if attrname == 'add_date':
if attrname == 'last_modified':
last_modified = value
- self.anchor_bgn(href, add_date, last_visit, last_modified)
+ debug("Bookmark points to: `%s'" % href)
+ bookmark = Bookmark(href, add_date, last_visit, last_modified)
+ self.current_object = bookmark
+ self.current_folder.append(bookmark)
+ self.urls += 1
+ self.objects += 1
+ def end_a(self):
+ accumulator = self.accumulator
+ self.accumulator = ''
- def start_h3(self, attrs): # Navigator marks folders with <H3> tags
- self.flush()
- add_date = ''
+ debug("Bookmark name: `%s'" % accumulator)
+ bookmark = self.current_folder[-1]
+ bookmark.name = accumulator
- for attrname, value in attrs:
- value = string.strip(value)
- if attrname == 'add_date':
- add_date = value
- self.saved_folder = ('', add_date, '')
- self.flag_out = 0
+ def flush(self):
+ accumulator = self.accumulator
+ if accumulator:
+ self.accumulator = ''
- def end_h3(self): # End of folder
- name, add_date, comment = self.saved_folder
- self.saved_folder = (name + self.saved_data, add_date, comment)
- self.saved_data = '' # Used
+ current_object = self.current_object
+ if current_object:
+ current_object.comment += accumulator.strip()
+ debug("Comment: `%s'" % current_object.comment)
def start_dl(self, attrs):
self.flush()
- if not self.outfile: # We are starting output after 1st <DL> tag to skip header
- self.open_outfile()
-
- self.level = self.level + 1
+ do_dt = start_dl
+ # End of folder
def end_dl(self):
self.flush()
- self.level = self.level - 1
-
-
- def do_dd(self, attrs):
- if self.outfile:
- self.flag_out = 2 # Set flag to signal "comment starting"
-
-
- def do_br(self, attrs):
- if self.outfile:
- self.saved_data = self.saved_data + "<BR>" # Add <BR>...
- self.flag_out = 0 # ...and next line of comment to saved comment
-
-
- def do_hr(self, attrs):
- if self.outfile:
- self.flush()
- self.saved_ruler = 1
-
-
- def handle_charref(self, name):
- if self.outfile:
- self.flag_out = 0
- self.saved_data = "%s&%c" % (self.saved_data, chr(name))
-
-
- def handle_entityref(self, name):
- if self.outfile:
- self.flag_out = 0
- if self.entitydefs.has_key(name): # If it is one of the standard SGML entities - close it with semicolon
- x = ';'
+ debug("End folder")
+ debug("Folder stack: %s" % dump_names(self.folder_stack))
+ if self.folder_stack:
+ del self.folder_stack[-1] # pop last folder
+ if self.folder_stack:
+ self.current_folder = self.folder_stack[-1]
else:
- x = ''
- self.saved_data = "%s&%s%s" % (self.saved_data, name, x)
+ debug("FOLDER STACK is EMPTY!!! (1)")
+ else:
+ debug("FOLDER STACK is EMPTY!!! (2)")
+ self.current_object = None
- def open_outfile(self):
- self.outfile = open("bookmarks.tmp", 'w')
-
-
-class Bookmarks2Text(BookmarksParser):
- def flush_anchor(self):
- self.outfile.write(" "*(self.level-1) + str(self.saved_anchor) + '\n')
-
+ def close(self):
+ HTMLParser.close(self)
+ if self.folder_stack:
+ raise ValueError, "wrong folder stack: %s" % self.folder_stack
- def flush_folder(self):
- self.outfile.write(" "*(self.level-1) + str(self.saved_folder) + '\n')
+ def do_dd(self, attrs):
+ pass
- def flush_ruler(self):
- self.outfile.write(" "*(self.level-1) + "----------\n")
+ do_p = do_dd
- def __del__(self):
- shutil.copy("bookmarks.tmp", "bookmarks.txt")
- os.unlink("bookmarks.tmp")
+ # Start ruler
+ def do_hr(self, attrs):
+ self.flush()
+ debug("Ruler")
+ self.current_folder.append(Ruler())
+ self.current_object = None
+ self.objects += 1
-class Bookmarks2Flad(BookmarksParser):
- def __init__(self, formatter, verbose=0):
- BookmarksParser.__init__(self, formatter, verbose)
- self.flush_record = 0
+ # BR in comment
+ def do_br(self, attrs):
+ self.accumulator += "<BR>"
- def flush(self):
- if not self.outfile:
- return
+ # Allow < in the text
+ def unknown_starttag(self, tag, attrs):
+ self.accumulator += "<%s>" % tag
- record_flushed = 0
- if self.saved_anchor or self.saved_folder or self.saved_ruler or self.saved_data:
- if self.flush_record:
- self.outfile.write('\n')
- else:
- self.flush_record = 1
-
- BookmarksParser.flush(self)
-
-
- def flush_anchor(self):
- name, href, add_date, last_visit, last_modified, comment = self.saved_anchor
- self.outfile.write("""Level: %d
-Title: %s
-URL: %s
-AddDate: %s
-LastVisit: %s
-LastModified: %s
-Comment: %s
-""" % (self.level, name, href, add_date, last_visit, last_modified, comment))
-
- def flush_folder(self):
- name, add_date, comment = self.saved_folder
- self.outfile.write("""Level: %d
-Folder: %s
-AddDate: %s
-Comment: %s
-""" % (self.level, name, add_date, comment))
-
- def flush_ruler(self):
- self.outfile.write("Level: %s\nRuler: YES\n" % self.level)
-
-
- def __del__(self):
- shutil.copy("bookmarks.tmp", "bookmarks.db")
- os.unlink("bookmarks.tmp")
-
-
-class Bookmarks2Gadfly(BookmarksParser):
- def open_outfile(self):
- import gadfly
- connection = gadfly.gadfly()
- connection.startup("bookmarks", ".")
- self.connection = connection
-
- cursor = connection.cursor()
- cursor.execute("""create table bookmarks (
- rec_no integer,
- level integer,
- title varchar,
- DATA varchar,
- add_date integer,
- last_visit integer,
- last_modified integer,
- comment varchar
- )""")
- self.outfile = cursor
-
- self.template = """insert into bookmarks
- (rec_no, level, title, DATA, add_date, last_visit, last_modified, comment)
- values (?, ?, ?, ?, ?, ?, ?, ?)"""
-
-
- def __del__(self):
- self.connection.commit()
-
-
- def flush_anchor(self):
- name, href, add_date, last_visit, last_modified, comment = self.saved_anchor
- self.outfile.execute(self.template,
- (self.record_no, self.level, name, href,
- add_date, last_visit, last_modified, comment)
- )
-
- def flush_folder(self):
- name, add_date, comment = self.saved_folder
- self.outfile.execute(self.template,
- (self.record_no, self.level, name, "Folder",
- add_date, '', '', comment)
- )
-
- def flush_ruler(self):
- self.outfile.execute(self.template,
- (self.record_no, self.level, '', "Ruler",
- '', '', '', '')
- )
+ # Do not allow unknown end tags
+ def unknown_endtag(self, tag):
+ raise NotImplementedError("Unknow end tag `%s'" % tag)