# Source: git.phdru.name Git - bookmarks_db.git / Robots / bkmk_robot_base.py (blob view)
# Object id: 210d092b6509603c60832cc38b40cbfdcec058cc
1 """Base class for robots
2
3 This file is a part of Bookmarks database and Internet robot.
4
5 """
6
7 __author__ = "Oleg Broytman <phd@phdru.name>"
8 __copyright__ = "Copyright (C) 2000-2014 PhiloSoft Design"
9 __license__ = "GNU GPL"
10
11 __all__ = ['robot_base', 'get_error']
12
13
14 import sys
15 import time, urllib
16 from base64 import b64encode
17 from urlparse import urljoin
18
19 from m_lib.net.www.util import parse_time
20 from m_lib.md5wrapper import md5wrapper
21
22 from bkmk_objects import Robot
23 from parse_html import parse_html
24
25
# Maps a redirect code to the short label that set_redirect() puts into
# bookmark.moved.  The integer keys are HTTP status codes; the "html" key is
# the pseudo-code check_url() passes for a refresh found by the HTML parser.
reloc_dict = {
    301: "perm.",
    302: "temp2.",
    303: "temp3.",
    307: "temp7.",
    "html": "html",
}
33
34
try:
    _text_types = (str, unicode)
except NameError:  # Python 3 has no separate unicode type
    _text_types = (str,)


def get_error(msg):
    """Return a one-line, printable form of an error message.

    A string is returned unchanged.  Any other iterable (for example the
    argument tuple of an exception) is rendered as ``('item1' 'item2')``:
    every item is converted with str(), its newlines are replaced by a
    literal backslash-n, and the quoted items are joined with spaces.
    """
    # Text must be checked first: a string is itself iterable and would
    # otherwise be split into single characters.
    if isinstance(msg, _text_types):
        return msg

    quoted = ["'%s'" % str(item).replace('\n', "\\n") for item in msg]
    return "(%s)" % ' '.join(quoted)
44
45
# Icon cache shared by all robot instances.  Maps an icon URL to a tuple
# (content type, "data:" URI holding the base64-encoded icon), or to None
# if no usable icon was found at that URL.
icons = {}
48
class robot_base(Robot):
    """Base class for robots that check bookmarks.

    The methods here call ``self.get()`` and ``self.log()`` (and, for FTP
    URLs, ``self.get_ftp_welcome()``), none of which are defined in this
    class; they are expected to be provided by ``Robot`` or by a subclass.
    ``self.get()`` must return the 5-tuple
    ``(error, redirect_code, redirect_to, headers, content)``.
    """

    def check_url(self, bookmark):
        """Fetch ``bookmark.href`` and record the outcome on ``bookmark``.

        * A fetch error is stored in ``bookmark.error``.
        * A redirect is stored in ``bookmark.moved`` (see set_redirect).
        * Otherwise ``size``, ``last_modified`` and ``md5`` are set; for
          HTML/XHTML documents the title (``real_title``), the site icon
          (``icon_href``/``icon``) and a parser-reported refresh are
          recorded as well.

        ``finish_check_url()`` is always called before returning.

        Returns 0 if the check was interrupted with ^C, 1 otherwise.
        """
        try:
            self.start = int(time.time())
            bookmark.icon = None

            # Rebuild the URL without its #fragment part.
            url_type, url_rest = urllib.splittype(bookmark.href)
            url_host, url_path = urllib.splithost(url_rest)
            url_path = urllib.splittag(url_path)[0]

            url = "%s://%s%s" % (url_type, url_host, url_path)
            # NOTE(review): the meaning of the third argument (True) is
            # defined by the get() implementation -- confirm there.
            error, redirect_code, redirect_to, headers, content = \
                self.get(bookmark, url, True)

            if error:
                bookmark.error = error
                return 1

            if redirect_code:
                self.set_redirect(bookmark, redirect_code, redirect_to)
                return 1

            last_modified = None

            if headers:
                try:
                    size = headers["Content-Length"]
                except KeyError:
                    size = len(content)

                try:
                    last_modified = headers["Last-Modified"]
                except KeyError:
                    pass

                if last_modified:
                    last_modified = parse_time(last_modified)
            else:
                size = len(content)

            if last_modified:
                last_modified = str(int(last_modified))
            else:
                # No usable Last-Modified: fall back to the last visit time.
                last_modified = bookmark.last_visit

            bookmark.size = size
            bookmark.last_modified = last_modified

            md5 = md5wrapper()
            if url_type == "ftp":  # Pass welcome message through MD5
                md5.update(self.get_ftp_welcome())

            md5.update(content)
            bookmark.md5 = str(md5)

            if headers:
                try:
                    content_type = headers["Content-Type"]
                    self.log("   Content-Type: %s" % content_type)
                    try:
                        # extract charset from
                        # "text/html; foo; charset=UTF-8, bar; baz;"
                        content_type, charset = content_type.split(';', 1)
                        content_type = content_type.strip()
                        charset = charset.split('=')[1].strip().split(',')[0]
                        self.log("   HTTP charset   : %s" % charset)
                    except (ValueError, IndexError):
                        charset = None
                        self.log("   no charset in Content-Type header")

                    if content_type.startswith(
                            ("text/html", "application/xhtml+xml")):
                        parser = parse_html(content, charset, self.log)
                        if parser:
                            bookmark.real_title = parser.title
                            icon = parser.icon
                        else:
                            icon = None
                        if not icon:
                            # The page names no icon: try the default one.
                            icon = "/favicon.ico"
                        icon_url = urljoin(url, icon)
                        self.log("   looking for icon at: %s" % icon_url)
                        self._fetch_icon(bookmark, icon_url)

                        if parser and parser.refresh:
                            self._record_refresh(bookmark, parser.refresh)

                except KeyError as key:
                    self.log("   no header: %s" % key)

        except EOFError:
            bookmark.error = "Unexpected EOF (FTP server closed connection)"
            self.log('   EOF: %s' % bookmark.error)

        except KeyboardInterrupt:
            self.log("Keyboard interrupt (^C)")
            return 0

        except:
            # Deliberate catch-all: any failure while checking one bookmark
            # is reported and recorded on the bookmark instead of
            # propagating to the caller.
            import traceback
            traceback.print_exc()
            bookmark.error = "Exception!"
            self.log('   Exception: %s' % bookmark.error)

        finally:
            self.finish_check_url(bookmark)

        # Tested
        return 1

    def _fetch_icon(self, bookmark, icon_url):
        """Attach the icon found at ``icon_url`` to ``bookmark``.

        Results (including failures) are remembered in the module-level
        ``icons`` cache so each icon URL is downloaded at most once.
        """
        if icon_url in icons:
            cached = icons[icon_url]
            if cached:
                bookmark.icon_href = icon_url
                content_type, bookmark.icon = cached
                self.log("   cached icon: %s" % content_type)
            else:
                self.log("   cached icon: no icon")
            return

        try:
            fetch_url = icon_url
            for _attempt in range(8):  # follow at most 8 redirects
                error, redirect_code, redirect_to, icon_headers, icon_data = \
                    self.get(bookmark, fetch_url)
                if redirect_code:
                    fetch_url = redirect_to
                    self.log("   redirect to : %s" % fetch_url)
                else:
                    if icon_data is None:
                        raise IOError("No icon")
                    break
            else:
                raise IOError("Too many redirects")
            # Looked up inside the try so that missing headers or a missing
            # Content-Type mean "no icon" instead of aborting the rest of
            # the bookmark check.
            content_type = icon_headers["Content-Type"]
        except Exception:
            # Not a bare except: KeyboardInterrupt must reach check_url().
            etype, emsg = sys.exc_info()[:2]
            self.log("   no icon        : %s %s" % (etype, emsg))
            icons[icon_url] = None
            return

        if content_type.startswith(("application/", "image/", "text/plain")):
            bookmark.icon_href = icon_url
            self.log("   got icon       : %s" % content_type)
            if content_type.startswith(("application/", "text/plain")):
                self.log("   non-image content type, assume x-icon")
                content_type = 'image/x-icon'
            bookmark.icon = "data:%s;base64,%s" % (
                content_type, b64encode(icon_data))
            icons[icon_url] = (content_type, bookmark.icon)
        else:
            self.log("   no icon        : bad content type '%s'"
                     % content_type)
            icons[icon_url] = None

    def _record_refresh(self, bookmark, refresh):
        """Record a refresh value reported by the HTML parser as a redirect.

        ``refresh`` is expected to look like ``"<seconds>; url=<target>"``;
        the target is whatever follows the first ``=`` ("self" if there is
        none) and the delay is whatever precedes the first ``;``.
        """
        try:
            target = refresh.split('=', 1)[1]
        except IndexError:
            target = "self"

        delay = refresh.split(';')[0]
        try:
            timeout = float(delay)
        except ValueError:
            self.set_redirect(bookmark, "html",
                              "Bad redirect to %s (%s)" % (target, refresh))
            return

        try:
            timeout = int(delay)  # show whole seconds without ".0"
        except ValueError:
            pass  # float timeout
        self.set_redirect(bookmark, "html",
                          "%s (%s sec)" % (target, timeout))

    def set_redirect(self, bookmark, errcode, newurl):
        """Store a "moved" note on ``bookmark`` and log it.

        ``errcode`` is normally a key of ``reloc_dict``; a code that is not
        listed there is shown as is instead of raising KeyError.
        """
        bookmark.moved = "(%s) to %s" % (
            reloc_dict.get(errcode, errcode), newurl)
        self.log('   Moved: %s' % bookmark.moved)

    def finish_check_url(self, bookmark):
        """Stamp ``bookmark`` with the check's start time and duration.

        Uses ``self.start`` as set by check_url(); both values are stored
        as strings holding whole seconds.
        """
        start = self.start
        bookmark.last_tested = str(start)
        now = int(time.time())
        bookmark.test_time = str(now - start)