]> git.phdru.name Git - bookmarks_db.git/blob - Robots/bkmk_robot_base.py
Do not assign icon errors to bookmark.error
[bookmarks_db.git] / Robots / bkmk_robot_base.py
1 """Base class for robots
2
3 This file is a part of Bookmarks database and Internet robot.
4
5 """
6
7 __author__ = "Oleg Broytman <phd@phdru.name>"
8 __copyright__ = "Copyright (C) 2000-2014 PhiloSoft Design"
9 __license__ = "GNU GPL"
10
11 __all__ = ['robot_base', 'get_error']
12
13
14 import sys
15 import time, urllib
16 from base64 import b64encode
17 from urlparse import urljoin
18
19 from m_lib.net.www.util import parse_time
20 from m_lib.md5wrapper import md5wrapper
21
22 from bkmk_objects import Robot
23 from parse_html import parse_html
24
25
class RedirectException(Exception):
   """Signal that a checked URL points to a different location.

   The exception message has the form "(<kind>) to <newurl>", where <kind>
   is a short label for the redirect type.  The redirect target is also
   available as the ``url`` attribute.
   """

   # Short labels for the known redirect kinds.  The integer keys are HTTP
   # status codes; "html" is used by check_url() for a refresh directive
   # found inside the page itself.
   reloc_dict = {
      301: "perm.",
      302: "temp2.",
      303: "temp3.",
      307: "temp7.",
      "html": "html"
   }

   def __init__(self, errcode, newurl):
      """Build the exception for redirect kind `errcode` pointing to `newurl`.

      errcode -- a key of reloc_dict; any other value is shown verbatim
      newurl  -- the redirect target, stored in self.url
      """
      # Fall back to the raw code for kinds that have no label (e.g. 308),
      # so that constructing the exception never fails with a KeyError
      # that would hide the redirect itself.
      kind = self.reloc_dict.get(errcode, errcode)
      Exception.__init__(self, "(%s) to %s" % (kind, newurl))
      self.url = newurl
37
38
def get_error(msg):
   """Format an error value as a single-line string.

   msg -- either a string, which is returned unchanged, or an iterable of
          items (for example exception arguments).

   For an iterable every item is converted with str(), newlines are
   replaced by the two characters backslash + "n", each item is wrapped in
   single quotes and the items are joined with spaces inside parentheses,
   e.g. (110, "timed out") -> "('110' 'timed out')".

   A value that is neither a string nor iterable is returned as str(msg).
   """
   # type(u"") is `unicode` on Python 2 and `str` on Python 3; without it
   # a unicode message would be split into separate characters below.
   if isinstance(msg, (str, type(u""))):
      return msg

   try:
      items = iter(msg)
   except TypeError:
      # Not iterable: an error formatter must not raise on its own.
      return str(msg)

   quoted = ["'%s'" % str(item).replace('\n', "\\n") for item in items]
   return "(%s)" % ' '.join(quoted)
48
49
# Module-level icon cache, filled by robot_base.check_url().
# Maps an icon URL to a tuple (content type, "data:" URI holding the
# base64-encoded icon), or to None if no usable icon was found at that URL.
icons = {}
52
class robot_base(Robot):
   """URL-checking logic shared by the robots.

   The class relies on self.urlretrieve(), self.get_ftp_welcome(),
   self.cleanup() and self.log(), none of which is defined here --
   presumably they come from Robot or from concrete subclasses.
   """

   def check_url(self, bookmark):
      """Fetch bookmark.href and record the results on `bookmark`.

      On a successful fetch sets bookmark.size, bookmark.last_modified and
      bookmark.md5; for HTML pages also bookmark.real_title and, when an
      icon is found, bookmark.icon / bookmark.icon_href.  A fetch error
      reported by urlretrieve() is stored in bookmark.error, a redirect in
      bookmark.moved.  finish_check_url() is always called at the end.

      Returns 1 if the bookmark was tested (whatever the outcome) and 0 if
      the check was interrupted from the keyboard.
      """
      try:
         self.start = int(time.time())
         bookmark.icon = None

         # Rebuild the URL without the "#fragment" part.
         url_type, url_rest = urllib.splittype(bookmark.href)
         url_host, url_path = urllib.splithost(url_rest)
         url_path = urllib.splittag(url_path)[0]

         url = "%s://%s%s" % (url_type, url_host, url_path)
         # NOTE(review): the meaning of the third argument is defined by
         # urlretrieve(), which is not visible in this file.
         headers, content, error = self.urlretrieve(bookmark, url, True)

         if error:
            bookmark.error = error

         if content is None:
            return 1

         size = 0
         last_modified = None

         if headers:
            try:
               size = headers["Content-Length"]
            except KeyError:
               size = len(content)

            try:
               last_modified = headers["Last-Modified"]
            except KeyError:
               pass

            if last_modified:
               last_modified = parse_time(last_modified)
         else:
            size = len(content)

         if last_modified:
            last_modified = str(int(last_modified))
         else:
            # No usable Last-Modified: fall back to the last visit time.
            last_modified = bookmark.last_visit

         bookmark.size = size
         bookmark.last_modified = last_modified

         md5 = md5wrapper()
         if urllib._urlopener.type == "ftp": # Pass welcome message through MD5
            md5.update(self.get_ftp_welcome())

         md5.update(content)
         bookmark.md5 = str(md5)

         if headers:
            try:
               self._check_content(bookmark, url, headers, content)
            except KeyError as key:
               self.log("   no header: %s" % key)

      except EOFError:
         bookmark.error = "Unexpected EOF (FTP server closed connection)"
         self.log('   EOF: %s' % bookmark.error)

      except RedirectException as msg:
         bookmark.moved = str(msg)
         self.log('   Moved: %s' % bookmark.moved)

      except KeyboardInterrupt:
         self.log("Keyboard interrupt (^C)")
         return 0

      except Exception:
         # Last-resort boundary: report the failure and carry on.
         # SystemExit is deliberately not caught here.
         import traceback
         traceback.print_exc()
         bookmark.error = "Exception!"
         self.log('   Exception: %s' % bookmark.error)

      finally:
         self.finish_check_url(bookmark)

      # Tested
      return 1

   def _check_content(self, bookmark, url, headers, content):
      """Inspect the Content-Type and, for HTML pages, title, icon, refresh.

      Raises KeyError if there is no Content-Type header (handled by
      check_url) and RedirectException if the page has a refresh directive.
      """
      content_type = headers["Content-Type"]
      self.log("   Content-Type: %s" % content_type)
      try:
         # extract charset from "text/html; foo; charset=UTF-8, bar; baz;"
         content_type, charset = content_type.split(';', 1)
         content_type = content_type.strip()
         charset = charset.split('=')[1].strip().split(',')[0]
         self.log("   HTTP charset   : %s" % charset)
      except (ValueError, IndexError):
         charset = None
         self.log("   no charset in Content-Type header")

      if not content_type.startswith(("text/html", "application/xhtml+xml")):
         return

      parser = parse_html(content, charset, self.log)
      if parser:
         bookmark.real_title = parser.title
         icon = parser.icon
      else:
         icon = None

      # With no icon named by the page try the conventional location.
      self._check_icon(bookmark, urljoin(url, icon or "/favicon.ico"))

      if parser and parser.refresh:
         self._check_refresh(parser.refresh)

   def _check_icon(self, bookmark, icon_url):
      """Attach the icon at `icon_url` to `bookmark`, using the icon cache.

      Problems with the icon are only logged and remembered in the cache;
      they are never stored in bookmark.error.
      """
      self.log("   looking for icon at: %s" % icon_url)

      if icon_url in icons:
         cached = icons[icon_url]
         if cached:
            bookmark.icon_href = icon_url
            content_type, bookmark.icon = cached
            self.log("   cached icon: %s" % content_type)
         else:
            self.log("   cached icon: no icon")
         return

      try:
         icon_headers, icon_data = self._fetch_icon(bookmark, icon_url)
      except Exception as exc:
         # `except Exception`, not a bare except: KeyboardInterrupt must
         # reach check_url() instead of being cached as "no icon".
         self.log("   no icon        : %s %s" % (exc.__class__, exc))
         icons[icon_url] = None
         return

      try:
         content_type = icon_headers["Content-Type"]
      except (KeyError, TypeError):
         # No headers at all or no Content-Type: treat as a bad type
         # rather than letting the error escape to check_url().
         content_type = ""

      if content_type.startswith(("application/", "image/", "text/plain")):
         bookmark.icon_href = icon_url
         self.log("   got icon       : %s" % content_type)
         if content_type.startswith(("application/", "text/plain")):
            self.log("   non-image content type, assume x-icon")
            content_type = 'image/x-icon'
         bookmark.icon = "data:%s;base64,%s" % (content_type, b64encode(icon_data))
         icons[icon_url] = (content_type, bookmark.icon)
      else:
         self.log("   no icon        : bad content type '%s'" % content_type)
         icons[icon_url] = None

   def _fetch_icon(self, bookmark, icon_url):
      """Download an icon, following at most 8 redirects.

      Returns the tuple (headers, data).  Raises IOError if the server
      returned no data or if there were too many redirects.
      """
      for _ in range(8):
         try:
            icon_headers, icon_data, _error = self.urlretrieve(bookmark, icon_url)
         except RedirectException as e:
            icon_url = e.url
            self.log("   redirect to : %s" % icon_url)
         else:
            if icon_data is None:
               raise IOError("No icon")
            return icon_headers, icon_data
      raise IOError("Too many redirects")

   def _check_refresh(self, refresh):
      """Report the page's refresh directive by raising RedirectException.

      `refresh` is expected to look like "<seconds>; url=<target>".
      Always raises.
      """
      try:
         url = refresh.split('=', 1)[1]
      except IndexError:
         url = "self"

      delay = refresh.split(';')[0]
      try:
         timeout = float(delay)
      except ValueError:
         raise RedirectException("html", "Bad redirect to %s (%s)" % (url, refresh))

      try:
         # Show whole seconds without a fractional part.
         timeout = int(delay)
      except ValueError:
         pass # float timeout
      raise RedirectException("html", "%s (%s sec)" % (url, timeout))

   def finish_check_url(self, bookmark):
      """Record when the check started and how long it took, then clean up.

      Sets bookmark.last_tested to the start time saved by check_url() and
      bookmark.test_time to the elapsed whole seconds, both as strings,
      and calls self.cleanup().
      """
      start = self.start
      bookmark.last_tested = str(start)

      now = int(time.time())
      bookmark.test_time = str(now - start)

      self.cleanup()
228       bookmark.test_time = str(now - start)
229
230       self.cleanup()