1 """Parser for Netscape Navigator's and Mozilla's bookmarks.html
3 This file is a part of Bookmarks database and Internet robot.
6 __author__ = "Oleg Broytman <phd@phdru.name>"
7 __copyright__ = "Copyright (C) 1997-2017 PhiloSoft Design"
8 __license__ = "GNU GPL"
10 __all__ = ['BkmkParser']
14 from m_lib.defenc import default_encoding
15 from m_lib.net.www.html import HTMLParser
16 from bkmk_objects import Folder, Bookmark, Ruler
# Debug tracing is enabled by the mere presence of BKMK_DEBUG in the
# environment; its value is not inspected.  The `in` operator replaces
# dict.has_key(), which no longer exists in Python 3 and is equivalent
# on Python 2.
DEBUG = "BKMK_DEBUG" in os.environ
# Format names for debug output: returns the items of the list `l`, each
# wrapped in single quotes and separated by spaces.
# NOTE(review): the embedded numbering jumps 25 -> 27 -> 30, so the lines
# that create `l` and fill it while iterating over folder_stack are not
# visible in this excerpt -- confirm which objects contribute a name.
25 def dump_names(folder_stack):
27 for object in folder_stack:
30 return "'%s'" % "' '".join(l)
38 class BkmkParser(HTMLParser):
40 HTMLParser.__init__(self)
# Collect character data from the document into self.accumulator.
# When both self.charset and default_encoding are truthy, the data is first
# decoded from self.charset (undecodable bytes are replaced) and re-encoded
# to default_encoding, with unencodable characters written as XML character
# references.
# NOTE(review): embedded line 49 is missing from this excerpt, so an
# enclosing condition may exist that is not shown -- confirm.
48 def handle_data(self, data):
50 if self.charset and default_encoding:
# `unicode` is the Python 2 text type; this line does not run on Python 3.
51 data = unicode(data, self.charset, "replace").encode(default_encoding, "xmlcharrefreplace")
52 self.accumulator += data
54 # Mozilla - get charset
# Handle a META tag: scan its attributes and, when http-equiv is
# "content-type", store the charset taken from the content value in
# self.charset.
# NOTE(review): the embedded numbering skips lines 56-58, 60, 64-65 and 67,
# so the initial values of http_equiv/content, the body of the 'content'
# branch and any guard around the split are not visible -- confirm.
55 def do_meta(self, attrs):
59 for attrname, value in attrs:
61 if attrname == 'http-equiv':
# Lower-cased so the comparison with "content-type" below ignores case.
62 http_equiv = value.lower()
63 elif attrname == 'content':
66 if http_equiv == "content-type":
68 # extract charset from "text/html; charset=UTF-8"
# Index [1] is the piece following the first '='; a content value without
# '=' makes this indexing raise IndexError unless an unseen line handles it.
69 self.charset = content.split('=')[1]
# Handle the opening TITLE tag: append a META Content-Type line that names
# default_encoding, then the literal "<TITLE>", to self.accumulator.
# NOTE(review): embedded line 74 is missing from this excerpt, so the META
# line may be conditional -- confirm against the full file.
73 def start_title(self, attrs):
75 self.accumulator += '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=%s">\n' % default_encoding
76 self.accumulator += "<TITLE>"
79 self.accumulator += "</TITLE>"
# Handle the opening H1 tag: create the root Folder and make it the current
# object, the root folder, the current folder and the only entry of
# folder_stack.
# NOTE(review): the statements that follow this run in the excerpt may also
# belong to this method; its end is not determinable from here.
82 def start_h1(self, attrs):
83 root_folder = Folder()
84 self.current_object = root_folder
85 self.root_folder = root_folder
86 self.current_folder = root_folder
87 self.folder_stack = [root_folder]
89 self.root_folder.header = self.accumulator.strip()
93 accumulator = self.accumulator
96 debug("Root folder name: `%s'" % accumulator)
97 self.root_folder.name = accumulator
# Handle the opening H3 tag: start a new sub-folder.
# Attribute values are stripped of surrounding whitespace; a Folder is built
# from add_date and last_modified, appended to the current folder, pushed on
# folder_stack and made both the current object and the current folder.
# NOTE(review): embedded lines 101 and 105 are missing, so the initial
# values of add_date/last_modified and the body of the 'add_date' branch
# are not visible -- confirm.
100 def start_h3(self, attrs):
102 for attrname, value in attrs:
103 value = value.strip()
104 if attrname == 'add_date':
106 elif attrname == 'last_modified':
107 last_modified = value
109 debug("New folder...")
110 folder = Folder(add_date, last_modified=last_modified)
111 self.current_object = folder
# The new folder becomes a child of the folder that was current so far.
112 self.current_folder.append(folder)
113 self.folder_stack.append(folder) # push new folder
114 self.current_folder = folder
118 accumulator = self.accumulator
119 self.accumulator = ''
121 debug("Folder name: `%s'" % accumulator)
122 self.current_folder.name = accumulator
# Handle the opening A tag: build a Bookmark from the tag's attributes,
# make it the current object and append it to the current folder.
# NOTE(review): most branch bodies and the initialisation of the locals are
# missing from this excerpt (the embedded numbering skips 126-132, 136, 138,
# 140, 144, 146, 148-149).  Only the last_modified assignment is visible;
# which attribute feeds href, add_date, last_visit, keyword, icon and
# charset is presumed from the names -- confirm against the full file.
125 def start_a(self, attrs):
133 for attrname, value in attrs:
134 value = value.strip()
135 if attrname == "href":
137 elif attrname == "add_date":
139 elif attrname == "last_visit":
141 elif attrname == "last_modified":
142 last_modified = value
143 elif attrname == "shortcuturl":
145 elif attrname == "icon":
147 elif attrname == "last_charset":
150 debug("Bookmark points to: `%s'" % href)
# parser_charset falls back to default_encoding when self.charset is falsy.
151 bookmark = Bookmark(href, add_date, last_visit, last_modified,
152 keyword=keyword, icon=icon,
153 charset=charset, parser_charset=self.charset or default_encoding)
154 self.current_object = bookmark
155 self.current_folder.append(bookmark)
160 accumulator = self.accumulator
161 self.accumulator = ''
163 debug("Bookmark name: `%s'" % accumulator)
164 bookmark = self.current_folder[-1]
165 bookmark.name = accumulator
168 accumulator = self.accumulator
171 self.accumulator = ''
173 current_object = self.current_object
175 current_object.comment += accumulator.strip()
176 debug("Comment: `%s'" % current_object.comment)
178 def start_dl(self, attrs):
187 debug("Folder stack: %s" % dump_names(self.folder_stack))
188 if self.folder_stack:
189 del self.folder_stack[-1] # pop last folder
190 if self.folder_stack:
191 self.current_folder = self.folder_stack[-1]
193 debug("FOLDER STACK is EMPTY!!! (1)")
195 debug("FOLDER STACK is EMPTY!!! (2)")
196 self.current_object = None
199 HTMLParser.close(self)
200 if self.folder_stack:
201 raise ValueError("wrong folder stack: %s" % self.folder_stack)
203 def do_dd(self, attrs):
# Handle an HR tag: append a Ruler to the current folder and clear
# current_object.
# NOTE(review): embedded lines 210-211 are missing from this excerpt, so
# the method may do more before the append -- confirm.
209 def do_hr(self, attrs):
212 self.current_folder.append(Ruler())
213 self.current_object = None
def do_br(self, attrs):
    """Record a BR tag by adding the literal text ``<BR>`` to the accumulator.

    ``attrs`` is accepted for the handler signature but is not used.
    """
    self.accumulator = self.accumulator + "<BR>"
# Allow '<' in the text: the tag is written back into the accumulated text
def unknown_starttag(self, tag, attrs):
    """Append ``<tag>`` to the accumulator as literal text.

    Only the tag name is written back; ``attrs`` is not used.
    """
    literal = "<%s>" % tag
    self.accumulator = self.accumulator + literal
# Do not allow unknown end tags
def unknown_endtag(self, tag):
    """Reject the end tag ``tag``.

    Raises:
        NotImplementedError: always; the message names the offending tag.
    """
    # Spelling of the message corrected ("Unknow" -> "Unknown"); the
    # exception type is unchanged.
    raise NotImplementedError("Unknown end tag `%s'" % tag)