git.phdru.name Git - bookmarks_db.git/blobdiff - bkmk_parser.py
Version 3.3.1.
[bookmarks_db.git] / bkmk_parser.py
index c3ca1b5baf0a2ac8275a06ee26696d31e3b6ed68..f396e2e5aa9be64cd40e3db47ec8d19b8927bc3d 100755 (executable)
 """
-   Bookmarks parsers
+   Parser for Netscape Navigator's bookmarks.html
 
-   Written by BroytMann, Mar 1997 - Feb 2000. Copyright (C) 1997-2000 PhiloSoft Design
+   Written by BroytMann, Jun 1997 - Jun 2002. Copyright (C) 1997-2002 PhiloSoft Design
 """
 
 
-import os, string, shutil
-from htmllib import HTMLParser
+import string
+from m_lib.www.html import HTMLParser
+from bkmk_objects import Folder, Bookmark, Ruler
 
 
-class BookmarksParser(HTMLParser): # Parser for Navigator's bookmarks (abstract class)
-   def __init__(self, formatter, verbose=0):
-      HTMLParser.__init__(self, formatter, verbose)
-      self.urls_no = 0   # cross-reference counter
-      self.record_no = 1  # record counter
-      self.outfile = None # output file
-      self.level = 0      # Indentation level
-      self.flag_out = 0   # Is it time to flush?
-      self.saved_data = ''
-      self.saved_anchor = None
-      self.saved_folder = None
-      self.saved_ruler = None
+if __debug__:
+   def debug(note):
+      print note
 
+   def dump_names(folder_stack):
+      l = []
+      for object in folder_stack:
+         if object.isFolder:
+            l.append(object.name)
+      return "'" + string.join(l, "' '") + "'"
 
-   def flush(self):
-      if not self.outfile:
-         return
-
-      record_flushed = 0
+else:
+   def debug(note):
+      pass
+   dump_names = debug
 
-      if self.saved_anchor:
-         name, href, add_date, last_visit, last_modified, comment = self.saved_anchor
-         self.saved_anchor = (name, href, add_date, last_visit, last_modified, comment + self.saved_data)
-         self.flush_anchor()
-         self.saved_data = ''
-         record_flushed = 1
-         self.saved_anchor = None
 
-      if self.saved_folder:
-         name, add_date, comment = self.saved_folder
-         self.saved_folder = (name, add_date, comment + self.saved_data)
-         self.flush_folder()
-         self.saved_data = ''
-         record_flushed = 1
-         self.saved_folder = None
+class BkmkParser(HTMLParser):
+   def __init__(self):
+      HTMLParser.__init__(self)
 
-      if self.saved_ruler:
-         self.flush_ruler()
-         record_flushed = 1
-         self.saved_ruler = None
+      self.urls = 0
+      self.objects = 0
 
-      if record_flushed:
-         self.record_no = self.record_no + 1
-
-      if self.saved_data <> '': # This may occur after ampersand
-         self.flag_out = 0
+      self.charset = ""
+      self.recode = None
 
 
+   def handle_data(self, data):
+      if data:
+         if self.charset:
+            data = unicode(data, self.charset).encode()
+         self.accumulator = "%s%s" % (self.accumulator, data)
 
 
-   def close(self):
-      HTMLParser.close(self)
+   # Mozilla - get charset
+   def do_meta(self, attrs):
+      http_equiv = ""
+      content = ""
 
-      if self.outfile:
-         self.outfile.close()
-
-      if self.level <> 0:
-         print "Bad HTML: <DL> and </DL> mismatch; level=%d" % self.level
+      for attrname, value in attrs:
+         value = string.strip(value)
+         if attrname == 'http-equiv':
+            http_equiv = value.lower()
+         elif attrname == 'content':
+            content = value
 
+      if http_equiv == "content-type":
+         try:
+            # extract charset from "text/html; charset=UTF-8"
+            self.charset = content.split('=')[1]
+         except IndexError:
+            pass
 
-   def handle_data(self, data):
-      if not self.outfile:
-         return
 
-      if data and (data[0] == '&'): # Ampersand parsed by SGMLlib
-         self.flag_out = 0
+   def start_title(self, attrs):
+      self.accumulator = "%s<TITLE>" % self.accumulator
 
-      if self.flag_out == 2: # Process comment after <DD> or <HR>
-         if self.saved_anchor:
-            name, href, add_date, last_visit, last_modified, comment = self.saved_anchor
-            self.saved_anchor = (name, href, add_date, last_visit, last_modified, comment + data)
-            data = '' # Used
+   def end_title(self):
+      self.accumulator = "%s</TITLE>" % self.accumulator
 
-         if self.saved_folder:
-            name, add_date, comment = self.saved_folder
-            self.saved_folder = (name, add_date, comment + data)
-            data = '' # Used
 
-         self.flag_out = 0
+   # Start root folder
+   def start_h1(self, attrs):
+      root_folder = Folder()
+      self.current_object = root_folder
+      self.root_folder = root_folder
+      self.current_folder = root_folder
+      self.folder_stack = [root_folder]
 
-      if self.flag_out == 1:
-         self.flush()
+      self.root_folder.header = self.accumulator
+      self.accumulator = ''
 
-      if data and (data[0] <> '&') and (self.flag_out == 0):
-         self.flag_out = 1 # Set flag (to flush data on next call)
+   def end_h1(self):
+      accumulator = self.accumulator
+      self.accumulator = ''
 
-      if data:
-         self.saved_data = self.saved_data + data
+      debug("Root folder name: `%s'" % accumulator)
+      self.root_folder.name = accumulator
 
 
-   def anchor_bgn(self, href, add_date, last_visit, last_modified):
-      self.flush()
-      self.anchor = (href, add_date, last_visit, last_modified)
+   # Start next folder
+   def start_h3(self, attrs):
+      """Open a new folder for an <H3> tag and make it the current folder.
+
+      attrs is a sequence of (name, value) pairs; only 'add_date' is used.
+      The new Folder is appended to the current folder, pushed onto
+      folder_stack, and counted in self.objects.
+      """
+      # Default to an empty string: an <H3> without an ADD_DATE attribute
+      # would otherwise leave add_date unbound and make Folder(add_date)
+      # below fail with UnboundLocalError.
+      add_date = ''
+
+      for attrname, value in attrs:
+         value = string.strip(value)
+         if attrname == 'add_date':
+            add_date = value
 
+      debug("New folder...")
+      folder = Folder(add_date)
+      self.current_object = folder
+      self.current_folder.append(folder)
+      self.folder_stack.append(folder) # push new folder
+      self.current_folder = folder
+      self.objects = self.objects + 1
 
-   def anchor_end(self):
-      if self.anchor:
-         href, add_date, last_visit, last_modified = self.anchor
-         self.anchor = None
-         self.urls_no = self.urls_no + 1
+   def end_h3(self):
+      accumulator = self.accumulator
+      self.accumulator = ''
 
-         self.saved_anchor = (self.saved_data, href, add_date, last_visit, last_modified, '')
-         self.saved_data = '' # Used
+      debug("Folder name: `%s'" % accumulator)
+      self.current_folder.name = accumulator
 
 
+   # Start bookmark
    def start_a(self, attrs):
-      href = ''
-      add_date = ''
-      last_visit = ''
-      last_modified = ''
-
       for attrname, value in attrs:
          value = string.strip(value)
          if attrname == 'href':
@@ -131,191 +127,86 @@ class BookmarksParser(HTMLParser): # Parser for Navigator's bookmarks (abstract
          if attrname == 'last_modified':
             last_modified = value
 
-      self.anchor_bgn(href, add_date, last_visit, last_modified)
+      debug("Bookmark points to: `%s'" % href)
+      bookmark = Bookmark(href, add_date, last_visit, last_modified)
+      self.current_object = bookmark
+      self.current_folder.append(bookmark)
+      self.urls = self.urls + 1
+      self.objects = self.objects + 1
 
+   def end_a(self):
+      accumulator = self.accumulator
+      self.accumulator = ''
 
-   def start_h3(self, attrs): # Navigator marks folders with <H3> tags
-      self.flush()
-      add_date = ''
+      debug("Bookmark name: `%s'" % accumulator)
+      bookmark = self.current_folder[-1]
+      bookmark.name = accumulator
 
-      for attrname, value in attrs:
-         value = string.strip(value)
-         if attrname == 'add_date':
-            add_date = value
 
-      self.saved_folder = ('', add_date, '')
-      self.flag_out = 0
+   def flush(self):
+      accumulator = self.accumulator
 
+      if accumulator:
+         self.accumulator = ''
 
-   def end_h3(self): # End of folder
-      name, add_date, comment = self.saved_folder
-      self.saved_folder = (name + self.saved_data, add_date, comment)
-      self.saved_data = '' # Used
+         current_object = self.current_object
+         current_object.comment = current_object.comment + accumulator
+         debug("Comment: `%s'" % current_object.comment)
 
 
    def start_dl(self, attrs):
       self.flush()
 
-      if not self.outfile: # We are starting output after 1st <DL> tag to skip header
-         self.open_outfile()
-
-      self.level = self.level + 1
+   do_dt = start_dl
 
 
+   # End of folder
    def end_dl(self):
       self.flush()
-      self.level = self.level - 1
-
-
-   def do_dd(self, attrs):
-      if self.outfile:
-         self.flag_out = 2 # Set flag to signal "comment starting"
-
-
-   def do_br(self, attrs):
-      if self.outfile:
-         self.saved_data = self.saved_data + "<BR>" # Add <BR>...
-         self.flag_out = 0 # ...and next line of comment to saved comment
-
-
-   def do_hr(self, attrs):
-      if self.outfile:
-         self.flush()
-         self.saved_ruler = 1
-
-
-   def handle_charref(self, name):
-      if self.outfile:
-         self.flag_out = 0
-         self.saved_data = "%s&%c" % (self.saved_data, chr(name))
-
-
-   def handle_entityref(self, name):
-      if self.outfile:
-         self.flag_out = 0
-         if self.entitydefs.has_key(name): # If it is one of the standard SGML entities - close it with semicolon
-            x = ';'
+      debug("End folder")
+      debug("Folder stack: %s" % dump_names(self.folder_stack))
+      if self.folder_stack:
+         del self.folder_stack[-1] # pop last folder
+         if self.folder_stack:
+            self.current_folder = self.folder_stack[-1]
          else:
-            x = ''
-         self.saved_data = "%s&%s%s" % (self.saved_data, name, x)
-
-
-   def open_outfile(self):
-      self.outfile = open("bookmarks.tmp", 'w')
+            debug("FOLDER STACK is EMPTY!!! (1)")
+      else:
+         debug("FOLDER STACK is EMPTY!!! (2)")
+      self.current_object = None
 
 
-class Bookmarks2Text(BookmarksParser):
-   def flush_anchor(self):
-      self.outfile.write("   "*(self.level-1) + str(self.saved_anchor) + '\n')
-
+   def close(self):
+      HTMLParser.close(self)
+      if self.folder_stack:
+         raise ValueError, "wrong folder stack: %s" % self.folder_stack
 
-   def flush_folder(self):
-      self.outfile.write("   "*(self.level-1) + str(self.saved_folder) + '\n')
 
+   def do_dd(self, attrs):
+      pass
 
-   def flush_ruler(self):
-      self.outfile.write("   "*(self.level-1) + "----------\n")
+   do_p = do_dd
 
 
-   def __del__(self):
-      shutil.copy("bookmarks.tmp", "bookmarks.txt")
-      os.unlink("bookmarks.tmp")
+   # Start ruler
+   def do_hr(self, attrs):
+      self.flush()
+      debug("Ruler")
+      self.current_folder.append(Ruler())
+      self.current_object = None
+      self.objects = self.objects + 1
 
 
-class Bookmarks2Flad(BookmarksParser):
-   def __init__(self, formatter, verbose=0):
-      BookmarksParser.__init__(self, formatter, verbose)
-      self.flush_record = 0
+   # BR in comment
+   def do_br(self, attrs):
+      self.accumulator = "%s<BR>" % self.accumulator
 
 
-   def flush(self):
-      if not self.outfile:
-         return
+   # Allow < in the text
+   def unknown_starttag(self, tag, attrs):
+      self.accumulator = "%s<%s>" % (self.accumulator, tag)
 
-      record_flushed = 0
 
-      if self.saved_anchor or self.saved_folder or self.saved_ruler or self.saved_data:
-         if self.flush_record:
-            self.outfile.write('\n')
-         else:
-            self.flush_record = 1
-
-      BookmarksParser.flush(self)
-
-
-   def flush_anchor(self):
-      name, href, add_date, last_visit, last_modified, comment = self.saved_anchor
-      self.outfile.write("""Level: %d
-Title: %s
-URL: %s
-AddDate: %s
-LastVisit: %s
-LastModified: %s
-Comment: %s
-""" % (self.level, name, href, add_date, last_visit, last_modified, comment))
-
-   def flush_folder(self):
-      name, add_date, comment = self.saved_folder
-      self.outfile.write("""Level: %d
-Folder: %s
-AddDate: %s
-Comment: %s
-""" % (self.level, name, add_date, comment))
-
-   def flush_ruler(self):
-      self.outfile.write("Level: %s\nRuler: YES\n" % self.level)
-
-
-   def __del__(self):
-      shutil.copy("bookmarks.tmp", "bookmarks.db")
-      os.unlink("bookmarks.tmp")
-
-
-class Bookmarks2Gadfly(BookmarksParser):
-   def open_outfile(self):
-      import gadfly
-      connection = gadfly.gadfly()
-      connection.startup("bookmarks", ".")
-      self.connection = connection
-
-      cursor = connection.cursor()
-      cursor.execute("""create table bookmarks (
-         rec_no integer,
-         level integer,
-         title varchar,
-         DATA varchar,
-         add_date integer,
-         last_visit integer,
-         last_modified integer,
-         comment varchar
-      )""")
-      self.outfile = cursor
-
-      self.template = """insert into bookmarks
-         (rec_no, level, title, DATA, add_date, last_visit, last_modified, comment)
-         values (?, ?, ?, ?, ?, ?, ?, ?)"""
-
-
-   def __del__(self):
-      self.connection.commit()
-
-
-   def flush_anchor(self):
-      name, href, add_date, last_visit, last_modified, comment = self.saved_anchor
-      self.outfile.execute(self.template,
-         (self.record_no, self.level, name, href,
-         add_date, last_visit, last_modified, comment)
-      )
-
-   def flush_folder(self):
-      name, add_date, comment = self.saved_folder
-      self.outfile.execute(self.template,
-         (self.record_no, self.level, name, "Folder",
-         add_date, '', '', comment)
-      )
-
-   def flush_ruler(self):
-      self.outfile.execute(self.template,
-         (self.record_no, self.level, '', "Ruler",
-         '', '', '', '')
-      )
+   # Do not allow unknown end tags
+   def unknown_endtag(self, tag):
+      raise NotImplementedError("Unknow end tag `%s'" % tag)