]> git.phdru.name Git - bookmarks_db.git/blob - bkmk_parser.py
Fix(Robot): Stop splitting and un-splitting URLs
[bookmarks_db.git] / bkmk_parser.py
1 """Parser for Netscape Navigator's and Mozilla's bookmarks.html
2
3 This file is a part of Bookmarks database and Internet robot.
4 """
5
6 __author__ = "Oleg Broytman <phd@phdru.name>"
7 __copyright__ = "Copyright (C) 1997-2017 PhiloSoft Design"
8 __license__ = "GNU GPL"
9
10 __all__ = ['BkmkParser']
11
12
13 import os
14 from m_lib.defenc import default_encoding
15 from m_lib.net.www.html import HTMLParser
16 from bkmk_objects import Folder, Bookmark, Ruler
17
18
# Tracing is switched on by the mere presence of BKMK_DEBUG in the
# environment; its value is not examined.  The "in" test works on both
# Python 2 and Python 3, unlike dict.has_key().
DEBUG = "BKMK_DEBUG" in os.environ

if DEBUG:
    def debug(note):
        """Print a tracing note."""
        print(note)

    def dump_names(folder_stack):
        """Return the names of the folders in folder_stack.

        Every name is wrapped in single quotes and the names are
        separated by spaces; items whose isFolder is false are skipped.
        """
        names = [item.name for item in folder_stack if item.isFolder]
        return "'%s'" % "' '".join(names)

else:
    def debug(note):
        """Discard the note: tracing is off."""
        pass
    # With tracing off the formatted stack is thrown away anyway,
    # so the no-op doubles as dump_names.
    dump_names = debug
36
37
class BkmkParser(HTMLParser):
    """Build a tree of Folder/Bookmark/Ruler objects from bookmarks.html.

    The tag handlers collect character data in self.accumulator and turn
    it into names and comments of the objects being built.  <H1> creates
    the root folder (self.root_folder), <H3> opens a nested folder, <A>
    creates a bookmark, <HR> creates a ruler and </DL> closes the current
    folder.  self.urls counts bookmarks; self.objects counts folders
    (the root excluded), bookmarks and rulers.

    NOTE(review): self.accumulator is never initialised in this class -
    presumably the base HTMLParser sets it up; confirm.
    """

    def __init__(self):
        """Initialise the counters and the charset state."""
        HTMLParser.__init__(self)

        self.urls = 0     # bookmarks created so far
        self.objects = 0  # folders, bookmarks and rulers created so far

        self.charset = None  # charset announced by the file's <META> tag
        self.recode = None   # not referenced anywhere else in this class

    def handle_data(self, data):
        """Append character data to the accumulator.

        When both the file's charset and default_encoding are known, the
        data is first recoded from the former to the latter; characters
        the target encoding cannot represent become XML character
        references.
        """
        if data:
            if self.charset and default_encoding:
                data = unicode(data, self.charset, "replace").encode(default_encoding, "xmlcharrefreplace")
            self.accumulator += data

    # Mozilla - get charset
    def do_meta(self, attrs):
        """Remember the charset from <META HTTP-EQUIV="Content-Type" ...>.

        CONTENT looks like "text/html; charset=UTF-8".  Only the
        parameter actually named "charset" is used, so additional or
        unrelated parameters cannot leak into the charset name.
        self.charset is left untouched when no charset is given.
        """
        http_equiv = ""
        content = ""

        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'http-equiv':
                http_equiv = value.lower()
            elif attrname == 'content':
                content = value

        if http_equiv != "content-type":
            return

        # The media type itself ("text/html") has no '=' and is skipped.
        for param in content.split(';'):
            name, sep, value = param.partition('=')
            if sep and name.strip().lower() == 'charset':
                value = value.strip().strip('\'"')
                if value:
                    self.charset = value
                break

    def start_title(self, attrs):
        """Emit a <META> declaring default_encoding, then open <TITLE>.

        The text lands in the accumulator, which start_h1 later stores
        as the root folder's header.
        """
        if default_encoding:
            self.accumulator += '<META HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=%s">\n' % default_encoding
        self.accumulator += "<TITLE>"

    def end_title(self):
        """Close the <TITLE> element in the accumulated header text."""
        self.accumulator += "</TITLE>"

    # Start root folder
    def start_h1(self, attrs):
        """Create the root folder and start the folder stack with it.

        Everything accumulated so far (stripped) becomes the root
        folder's header.
        """
        root_folder = Folder()
        self.current_object = root_folder
        self.root_folder = root_folder
        self.current_folder = root_folder
        self.folder_stack = [root_folder]

        self.root_folder.header = self.accumulator.strip()
        self.accumulator = ''

    def end_h1(self):
        """Use the text inside <H1> as the root folder's name."""
        accumulator = self.accumulator
        self.accumulator = ''

        debug("Root folder name: `%s'" % accumulator)
        self.root_folder.name = accumulator

    # Start a folder
    def start_h3(self, attrs):
        """Create a folder inside the current one and make it current.

        ADD_DATE and LAST_MODIFIED are both optional; a missing one is
        passed to Folder as None.
        """
        # Default add_date too: an <H3> without ADD_DATE used to fail
        # with UnboundLocalError.
        add_date = None
        last_modified = None
        for attrname, value in attrs:
            value = value.strip()
            if attrname == 'add_date':
                add_date = value
            elif attrname == 'last_modified':
                last_modified = value

        debug("New folder...")
        folder = Folder(add_date, last_modified=last_modified)
        self.current_object = folder
        self.current_folder.append(folder)
        self.folder_stack.append(folder)  # push new folder
        self.current_folder = folder
        self.objects += 1

    def end_h3(self):
        """Use the text inside <H3> as the current folder's name."""
        accumulator = self.accumulator
        self.accumulator = ''

        debug("Folder name: `%s'" % accumulator)
        self.current_folder.name = accumulator

    # Start a bookmark
    def start_a(self, attrs):
        """Create a bookmark from the <A> attributes in the current folder.

        Recognised attributes: href, add_date, last_visit, last_modified,
        shortcuturl (the keyword), icon and last_charset.  All of them
        are optional.
        """
        # Default href too: an <A> without HREF used to fail with
        # UnboundLocalError.
        # NOTE(review): how Bookmark treats an empty URL is not visible
        # from here - confirm.
        href = ''
        add_date = None
        last_visit = None
        last_modified = None
        keyword = ''
        icon = None
        charset = None

        for attrname, value in attrs:
            value = value.strip()
            if attrname == "href":
                href = value
            elif attrname == "add_date":
                add_date = value
            elif attrname == "last_visit":
                last_visit = value
            elif attrname == "last_modified":
                last_modified = value
            elif attrname == "shortcuturl":
                keyword = value
            elif attrname == "icon":
                icon = value
            elif attrname == "last_charset":
                charset = value

        debug("Bookmark points to: `%s'" % href)
        bookmark = Bookmark(href, add_date, last_visit, last_modified,
           keyword=keyword, icon=icon,
           charset=charset, parser_charset=self.charset or default_encoding)
        self.current_object = bookmark
        self.current_folder.append(bookmark)
        self.urls += 1
        self.objects += 1

    def end_a(self):
        """Use the text inside <A> as the name of the newest bookmark."""
        accumulator = self.accumulator
        self.accumulator = ''

        debug("Bookmark name: `%s'" % accumulator)
        # start_a appended the bookmark, so it is the folder's last item.
        bookmark = self.current_folder[-1]
        bookmark.name = accumulator

    def flush(self):
        """Move pending accumulated text into the current object's comment.

        The accumulator is emptied even when there is no current object,
        in which case the text is dropped.
        """
        accumulator = self.accumulator

        if accumulator:
            self.accumulator = ''

            current_object = self.current_object
            if current_object:
                current_object.comment += accumulator.strip()
                debug("Comment: `%s'" % current_object.comment)

    def start_dl(self, attrs):
        """Flush the pending comment when a <DL> list opens."""
        self.flush()

    # <DT> introduces the next item, so it flushes just like <DL>.
    do_dt = start_dl

    # End of folder
    def end_dl(self):
        """Close the current folder and return to its parent.

        Pops the folder stack; an already empty stack, or one that
        becomes empty, is only reported through debug().
        """
        self.flush()
        debug("End folder")
        debug("Folder stack: %s" % dump_names(self.folder_stack))
        if self.folder_stack:
            del self.folder_stack[-1]  # pop last folder
            if self.folder_stack:
                self.current_folder = self.folder_stack[-1]
            else:
                debug("FOLDER STACK is EMPTY!!! (1)")
        else:
            debug("FOLDER STACK is EMPTY!!! (2)")
        self.current_object = None

    def close(self):
        """Finish parsing and verify that every folder has been closed.

        Raises ValueError when folders remain on the stack.
        """
        HTMLParser.close(self)
        if self.folder_stack:
            raise ValueError("wrong folder stack: %s" % self.folder_stack)

    def do_dd(self, attrs):
        """Ignore <DD>; the text that follows is collected by handle_data."""
        pass

    # <P> is ignored in the same way.
    do_p = do_dd

    # Start ruler
    def do_hr(self, attrs):
        """Append a Ruler to the current folder.

        The pending comment is flushed first; afterwards there is no
        current object to attach comments to.
        """
        self.flush()
        debug("Ruler")
        self.current_folder.append(Ruler())
        self.current_object = None
        self.objects += 1

    # BR in comment
    def do_br(self, attrs):
        """Keep a line break inside accumulated text as a literal <BR>."""
        self.accumulator += "<BR>"

    # Allow < in the text
    def unknown_starttag(self, tag, attrs):
        """Put an unrecognised start tag back into the text.

        Only the tag name is restored; its attributes are dropped.
        """
        self.accumulator += "<%s>" % tag

    # Do not allow unknown end tags
    def unknown_endtag(self, tag):
        """Reject any end tag that has no handler.

        Raises NotImplementedError naming the tag.
        """
        raise NotImplementedError("Unknow end tag `%s'" % tag)