"""Parser for Netscape Navigator's and Mozilla's bookmarks.html This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " __copyright__ = "Copyright (C) 1997-2023 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['BkmkParser'] import os from m_lib.defenc import default_encoding from m_lib.net.www.html import HTMLParser from bkmk_objects import Folder, Bookmark, Ruler DEBUG = "BKMK_DEBUG" in os.environ if DEBUG: def debug(note): print(note) def dump_names(folder_stack): _l = [] for object in folder_stack: if object.isFolder: _l.append(object.name) return "'%s'" % "' '".join(_l) else: def debug(note): pass dump_names = debug class BkmkParser(HTMLParser): def __init__(self): HTMLParser.__init__(self) self.urls = 0 self.objects = 0 self.charset = None self.recode = None def handle_data(self, data): if data: #if self.charset and default_encoding: # data = data.decode(self.charset, "replace").\ # encode(default_encoding, "xmlcharrefreplace") self.accumulator += data # Mozilla - get charset def do_meta(self, attrs): http_equiv = "" content = "" for attrname, value in attrs: value = value.strip() if attrname == 'http-equiv': http_equiv = value.lower() elif attrname == 'content': content = value if http_equiv == "content-type": try: # extract charset from "text/html; charset=UTF-8" self.charset = content.split('=')[1] except IndexError: pass def start_title(self, attrs): if default_encoding: self.accumulator += '\n' % default_encoding self.accumulator += "" def end_title(self): self.accumulator += "" # Start root folder def start_h1(self, attrs): root_folder = Folder() self.current_object = root_folder self.root_folder = root_folder self.current_folder = root_folder self.folder_stack = [root_folder] self.root_folder.header = self.accumulator.strip() self.accumulator = '' def end_h1(self): accumulator = self.accumulator self.accumulator = '' debug("Root folder name: `%s'" % accumulator) self.root_folder.name = accumulator # Start a folder def start_h3(self, attrs): last_modified = None for attrname, value in attrs: value = value.strip() if attrname == 'add_date': add_date = value elif attrname == 'last_modified': last_modified = value debug("New folder...") folder = Folder(add_date, last_modified=last_modified) self.current_object = folder self.current_folder.append(folder) self.folder_stack.append(folder) # push new folder self.current_folder = folder self.objects += 1 def end_h3(self): accumulator = self.accumulator self.accumulator = '' debug("Folder name: `%s'" % accumulator) self.current_folder.name = accumulator # Start a bookmark def start_a(self, attrs): add_date = None last_visit = None last_modified = None keyword = '' icon = None charset = None for attrname, value in attrs: value = value.strip() if attrname == "href": href = value elif attrname == "add_date": add_date = value elif attrname == "last_visit": last_visit = value elif attrname == "last_modified": last_modified = value elif attrname == "shortcuturl": keyword = value elif attrname == "icon": icon = value elif attrname == "last_charset": charset = value debug("Bookmark points to: `%s'" % href) bookmark = Bookmark(href, add_date, last_visit, last_modified, keyword=keyword, icon=icon, charset=charset, parser_charset=self.charset or default_encoding) self.current_object = bookmark self.current_folder.append(bookmark) self.urls += 1 self.objects += 1 def end_a(self): accumulator = self.accumulator self.accumulator = '' debug("Bookmark name: `%s'" % accumulator) bookmark = self.current_folder[-1] bookmark.name = accumulator def flush(self): accumulator = self.accumulator if accumulator: self.accumulator = '' current_object = self.current_object if current_object: current_object.comment += accumulator.strip() debug("Comment: `%s'" % current_object.comment) def start_dl(self, attrs): self.flush() do_dt = start_dl # End of folder def end_dl(self): self.flush() debug("End folder") debug("Folder stack: %s" % dump_names(self.folder_stack)) if self.folder_stack: del self.folder_stack[-1] # pop last folder if self.folder_stack: self.current_folder = self.folder_stack[-1] else: debug("FOLDER STACK is EMPTY!!! (1)") else: debug("FOLDER STACK is EMPTY!!! (2)") self.current_object = None def close(self): HTMLParser.close(self) if self.folder_stack: raise ValueError("wrong folder stack: %s" % self.folder_stack) def do_dd(self, attrs): pass do_p = do_dd # Start ruler def do_hr(self, attrs): self.flush() debug("Ruler") self.current_folder.append(Ruler()) self.current_object = None self.objects += 1 # BR in comment def do_br(self, attrs): self.accumulator += "
" # Allow < in the text def unknown_starttag(self, tag, attrs): self.accumulator += "<%s>" % tag # Do not allow unknow end tags def unknown_endtag(self, tag): raise NotImplementedError("Unknow end tag `%s'" % tag)