X-Git-Url: https://git.phdru.name/?p=bookmarks_db.git;a=blobdiff_plain;f=bkmk_parser.py;h=6a69433ddf9ca3286cd3410284435b199c73252c;hp=8f44f47f13bd1459a0f910dfbfd517e997ba6f46;hb=HEAD;hpb=71900f3630cb51580964038b78100d60e3671981 diff --git a/bkmk_parser.py b/bkmk_parser.py index 8f44f47..997728f 100644 --- a/bkmk_parser.py +++ b/bkmk_parser.py @@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 1997-2017 PhiloSoft Design" +__copyright__ = "Copyright (C) 1997-2023 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['BkmkParser'] @@ -16,211 +16,213 @@ from m_lib.net.www.html import HTMLParser from bkmk_objects import Folder, Bookmark, Ruler -DEBUG = os.environ.has_key("BKMK_DEBUG") +DEBUG = "BKMK_DEBUG" in os.environ if DEBUG: - def debug(note): - print(note) + def debug(note): + print(note) - def dump_names(folder_stack): - l = [] - for object in folder_stack: - if object.isFolder: - l.append(object.name) - return "'%s'" % "' '".join(l) + def dump_names(folder_stack): + _l = [] + for object in folder_stack: + if object.isFolder: + _l.append(object.name) + return "'%s'" % "' '".join(_l) else: - def debug(note): - pass - dump_names = debug + def debug(note): + pass + dump_names = debug class BkmkParser(HTMLParser): - def __init__(self): - HTMLParser.__init__(self) - - self.urls = 0 - self.objects = 0 - - self.charset = None - self.recode = None - - def handle_data(self, data): - if data: - if self.charset and default_encoding: - data = unicode(data, self.charset, "replace").encode(default_encoding, "xmlcharrefreplace") - self.accumulator += data - - # Mozilla - get charset - def do_meta(self, attrs): - http_equiv = "" - content = "" - - for attrname, value in attrs: - value = value.strip() - if attrname == 'http-equiv': - http_equiv = value.lower() - elif attrname == 'content': - content = value - - if http_equiv == "content-type": - try: - # extract charset from "text/html; charset=UTF-8" - self.charset = content.split('=')[1] - except IndexError: - pass - - def start_title(self, attrs): - if default_encoding: - self.accumulator += '\n' % default_encoding - self.accumulator += "" - - def end_title(self): - self.accumulator += "" - - # Start root folder - def start_h1(self, attrs): - root_folder = Folder() - self.current_object = root_folder - self.root_folder = root_folder - self.current_folder = root_folder - self.folder_stack = [root_folder] - - self.root_folder.header = self.accumulator.strip() - self.accumulator = '' - - def end_h1(self): - accumulator = self.accumulator - self.accumulator = '' - - debug("Root folder name: `%s'" % accumulator) - self.root_folder.name = accumulator - - # Start a folder - def start_h3(self, attrs): - last_modified = None - for attrname, value in attrs: - value = value.strip() - if attrname == 'add_date': - add_date = value - elif attrname == 'last_modified': - last_modified = value - - debug("New folder...") - folder = Folder(add_date, last_modified=last_modified) - self.current_object = folder - self.current_folder.append(folder) - self.folder_stack.append(folder) # push new folder - self.current_folder = folder - self.objects += 1 - - def end_h3(self): - accumulator = self.accumulator - self.accumulator = '' - - debug("Folder name: `%s'" % accumulator) - self.current_folder.name = accumulator - - # Start a bookmark - def start_a(self, attrs): - add_date = None - last_visit = None - last_modified = None - keyword = '' - icon = None - charset = None - - for attrname, value in attrs: - value = value.strip() - if attrname == "href": - href = value - elif attrname == "add_date": - add_date = value - elif attrname == "last_visit": - last_visit = value - elif attrname == "last_modified": - last_modified = value - elif attrname == "shortcuturl": - keyword = value - elif attrname == "icon": - icon = value - elif attrname == "last_charset": - charset = value - - debug("Bookmark points to: `%s'" % href) - bookmark = Bookmark(href, add_date, last_visit, last_modified, - keyword=keyword, icon=icon, - charset=charset, parser_charset=self.charset or default_encoding) - self.current_object = bookmark - self.current_folder.append(bookmark) - self.urls += 1 - self.objects += 1 - - def end_a(self): - accumulator = self.accumulator - self.accumulator = '' - - debug("Bookmark name: `%s'" % accumulator) - bookmark = self.current_folder[-1] - bookmark.name = accumulator - - def flush(self): - accumulator = self.accumulator - - if accumulator: - self.accumulator = '' - - current_object = self.current_object - if current_object: - current_object.comment += accumulator.strip() - debug("Comment: `%s'" % current_object.comment) - - def start_dl(self, attrs): - self.flush() - - do_dt = start_dl - - # End of folder - def end_dl(self): - self.flush() - debug("End folder") - debug("Folder stack: %s" % dump_names(self.folder_stack)) - if self.folder_stack: - del self.folder_stack[-1] # pop last folder - if self.folder_stack: - self.current_folder = self.folder_stack[-1] - else: - debug("FOLDER STACK is EMPTY!!! (1)") - else: - debug("FOLDER STACK is EMPTY!!! (2)") - self.current_object = None - - def close(self): - HTMLParser.close(self) - if self.folder_stack: - raise ValueError("wrong folder stack: %s" % self.folder_stack) - - def do_dd(self, attrs): - pass - - do_p = do_dd - - # Start ruler - def do_hr(self, attrs): - self.flush() - debug("Ruler") - self.current_folder.append(Ruler()) - self.current_object = None - self.objects += 1 - - # BR in comment - def do_br(self, attrs): - self.accumulator += "
" - - # Allow < in the text - def unknown_starttag(self, tag, attrs): - self.accumulator += "<%s>" % tag - - # Do not allow unknow end tags - def unknown_endtag(self, tag): - raise NotImplementedError("Unknow end tag `%s'" % tag) + def __init__(self): + HTMLParser.__init__(self) + + self.urls = 0 + self.objects = 0 + + self.charset = None + self.recode = None + + def handle_data(self, data): + if data: + #if self.charset and default_encoding: + # data = data.decode(self.charset, "replace").\ + # encode(default_encoding, "xmlcharrefreplace") + self.accumulator += data + + # Mozilla - get charset + def do_meta(self, attrs): + http_equiv = "" + content = "" + + for attrname, value in attrs: + value = value.strip() + if attrname == 'http-equiv': + http_equiv = value.lower() + elif attrname == 'content': + content = value + + if http_equiv == "content-type": + try: + # extract charset from "text/html; charset=UTF-8" + self.charset = content.split('=')[1] + except IndexError: + pass + + def start_title(self, attrs): + if default_encoding: + self.accumulator += '\n' % default_encoding + self.accumulator += "" + + def end_title(self): + self.accumulator += "" + + # Start root folder + def start_h1(self, attrs): + root_folder = Folder() + self.current_object = root_folder + self.root_folder = root_folder + self.current_folder = root_folder + self.folder_stack = [root_folder] + + self.root_folder.header = self.accumulator.strip() + self.accumulator = '' + + def end_h1(self): + accumulator = self.accumulator + self.accumulator = '' + + debug("Root folder name: `%s'" % accumulator) + self.root_folder.name = accumulator + + # Start a folder + def start_h3(self, attrs): + last_modified = None + for attrname, value in attrs: + value = value.strip() + if attrname == 'add_date': + add_date = value + elif attrname == 'last_modified': + last_modified = value + + debug("New folder...") + folder = Folder(add_date, last_modified=last_modified) + self.current_object = folder + self.current_folder.append(folder) + self.folder_stack.append(folder) # push new folder + self.current_folder = folder + self.objects += 1 + + def end_h3(self): + accumulator = self.accumulator + self.accumulator = '' + + debug("Folder name: `%s'" % accumulator) + self.current_folder.name = accumulator + + # Start a bookmark + def start_a(self, attrs): + add_date = None + last_visit = None + last_modified = None + keyword = '' + icon = None + charset = None + + for attrname, value in attrs: + value = value.strip() + if attrname == "href": + href = value + elif attrname == "add_date": + add_date = value + elif attrname == "last_visit": + last_visit = value + elif attrname == "last_modified": + last_modified = value + elif attrname == "shortcuturl": + keyword = value + elif attrname == "icon": + icon = value + elif attrname == "last_charset": + charset = value + + debug("Bookmark points to: `%s'" % href) + bookmark = Bookmark(href, add_date, last_visit, last_modified, + keyword=keyword, icon=icon, charset=charset, + parser_charset=self.charset or default_encoding) + self.current_object = bookmark + self.current_folder.append(bookmark) + self.urls += 1 + self.objects += 1 + + def end_a(self): + accumulator = self.accumulator + self.accumulator = '' + + debug("Bookmark name: `%s'" % accumulator) + bookmark = self.current_folder[-1] + bookmark.name = accumulator + + def flush(self): + accumulator = self.accumulator + + if accumulator: + self.accumulator = '' + + current_object = self.current_object + if current_object: + current_object.comment += accumulator.strip() + debug("Comment: `%s'" % current_object.comment) + + def start_dl(self, attrs): + self.flush() + + do_dt = start_dl + + # End of folder + def end_dl(self): + self.flush() + debug("End folder") + debug("Folder stack: %s" % dump_names(self.folder_stack)) + if self.folder_stack: + del self.folder_stack[-1] # pop last folder + if self.folder_stack: + self.current_folder = self.folder_stack[-1] + else: + debug("FOLDER STACK is EMPTY!!! (1)") + else: + debug("FOLDER STACK is EMPTY!!! (2)") + self.current_object = None + + def close(self): + HTMLParser.close(self) + if self.folder_stack: + raise ValueError("wrong folder stack: %s" % self.folder_stack) + + def do_dd(self, attrs): + pass + + do_p = do_dd + + # Start ruler + def do_hr(self, attrs): + self.flush() + debug("Ruler") + self.current_folder.append(Ruler()) + self.current_object = None + self.objects += 1 + + # BR in comment + def do_br(self, attrs): + self.accumulator += "
" + + # Allow < in the text + def unknown_starttag(self, tag, attrs): + self.accumulator += "<%s>" % tag + + # Do not allow unknow end tags + def unknown_endtag(self, tag): + raise NotImplementedError("Unknow end tag `%s'" % tag)