2 Parser for Netscape Navigator's and Mozilla's bookmarks.html
4 Written by BroytMann. Copyright (C) 1997-2007 PhiloSoft Design
9 from m_lib.net.www.html import HTMLParser
10 from bkmk_objects import Folder, Bookmark, Ruler
# Debug tracing is enabled whenever BKMK_DEBUG is present in the environment,
# regardless of its value (an empty string still counts).
# The "in" test replaces dict.has_key(), which no longer exists in Python 3.
DEBUG = "BKMK_DEBUG" in os.environ
19 def dump_names(folder_stack):
21 for object in folder_stack:
24 return "'%s'" % "' '".join(l)
# Charset that parsed text is re-encoded into and that is written into the
# generated Content-Type META tag.  It stays None until it is assigned
# elsewhere in this module (through "global DEFAULT_CHARSET") from
# sys.getdefaultencoding() or locale.getpreferredencoding().
DEFAULT_CHARSET = None
34 class BkmkParser(HTMLParser):
36 HTMLParser.__init__(self)
45 def handle_data(self, data):
48 data = unicode(data, self.charset, "replace").encode(DEFAULT_CHARSET, "replace")
49 self.accumulator += data
52 # Mozilla - get charset
53 def do_meta(self, attrs):
57 for attrname, value in attrs:
59 if attrname == 'http-equiv':
60 http_equiv = value.lower()
61 elif attrname == 'content':
64 if http_equiv == "content-type":
66 # extract charset from "text/html; charset=UTF-8"
67 self.charset = content.split('=')[1]
71 global DEFAULT_CHARSET
72 DEFAULT_CHARSET = sys.getdefaultencoding()
73 if DEFAULT_CHARSET == "ascii":
79 DEFAULT_CHARSET = locale.getpreferredencoding()
82 def start_title(self, attrs):
84 self.accumulator += '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=%s">\n' % DEFAULT_CHARSET
85 self.accumulator += "<TITLE>"
88 self.accumulator += "</TITLE>"
def start_h1(self, attrs):
    """Handle <H1>: create the root folder and make it the current context.

    A fresh Folder becomes the current object, the root folder, the
    current folder and the only entry of the folder stack.  Whatever text
    was accumulated before the tag is stripped of surrounding whitespace
    and stored as the root folder's header; the accumulator is then reset.
    ``attrs`` is not used.
    """
    root = Folder()
    # Chained targets are bound left to right, one after another.
    self.current_object = self.root_folder = self.current_folder = root
    self.folder_stack = [root]

    root.header = self.accumulator.strip()
    self.accumulator = ''
103 accumulator = self.accumulator
104 self.accumulator = ''
106 debug("Root folder name: `%s'" % accumulator)
107 self.root_folder.name = accumulator
111 def start_h3(self, attrs):
113 for attrname, value in attrs:
114 value = value.strip()
115 if attrname == 'add_date':
117 elif attrname == 'last_modified':
118 last_modified = value
120 debug("New folder...")
121 folder = Folder(add_date, last_modified=last_modified)
122 self.current_object = folder
123 self.current_folder.append(folder)
124 self.folder_stack.append(folder) # push new folder
125 self.current_folder = folder
129 accumulator = self.accumulator
130 self.accumulator = ''
132 debug("Folder name: `%s'" % accumulator)
133 self.current_folder.name = accumulator
137 def start_a(self, attrs):
144 for attrname, value in attrs:
145 value = value.strip()
146 if attrname == "href":
148 elif attrname == "add_date":
150 elif attrname == "last_visit":
152 elif attrname == "last_modified":
153 last_modified = value
154 elif attrname == "shortcuturl":
156 elif attrname == "icon":
158 elif attrname == "last_charset":
161 debug("Bookmark points to: `%s'" % href)
162 bookmark = Bookmark(href, add_date, last_visit, last_modified,
163 keyword or '', '', icon, charset)
164 self.current_object = bookmark
165 self.current_folder.append(bookmark)
170 accumulator = self.accumulator
171 self.accumulator = ''
173 debug("Bookmark name: `%s'" % accumulator)
174 bookmark = self.current_folder[-1]
175 bookmark.name = accumulator
179 accumulator = self.accumulator
182 self.accumulator = ''
184 current_object = self.current_object
186 current_object.comment += accumulator.strip()
187 debug("Comment: `%s'" % current_object.comment)
190 def start_dl(self, attrs):
200 debug("Folder stack: %s" % dump_names(self.folder_stack))
201 if self.folder_stack:
202 del self.folder_stack[-1] # pop last folder
203 if self.folder_stack:
204 self.current_folder = self.folder_stack[-1]
206 debug("FOLDER STACK is EMPTY!!! (1)")
208 debug("FOLDER STACK is EMPTY!!! (2)")
209 self.current_object = None
213 HTMLParser.close(self)
214 if self.folder_stack:
215 raise ValueError, "wrong folder stack: %s" % self.folder_stack
218 def do_dd(self, attrs):
225 def do_hr(self, attrs):
228 self.current_folder.append(Ruler())
229 self.current_object = None
def do_br(self, attrs):
    """Handle <BR>: keep the line break inside the text being collected.

    Appends the literal "<BR>" to the accumulator; ``attrs`` is ignored.
    """
    self.accumulator = self.accumulator + "<BR>"
# Allow "<" in the text: a start tag handled here is kept as literal text
def unknown_starttag(self, tag, attrs):
    """Append the tag, rendered as "<tag>", to the accumulated text.

    Only the tag name is written back; ``attrs`` is not used.
    """
    markup = "<%s>" % tag
    self.accumulator = self.accumulator + markup
# Do not allow unknown end tags
def unknown_endtag(self, tag):
    """Reject an end tag by always raising an error that names it.

    NOTE(review): presumably the base HTMLParser calls this for end tags
    that have no handler of their own -- confirm against m_lib.

    Raises:
        NotImplementedError: always; the message contains ``tag``.
    """
    # The one-element tuple keeps the formatting correct for any tag value.
    raise NotImplementedError("Unknown end tag `%s'" % (tag,))