Robots/bkmk_robot_base.py

   1 """Base class for robots
   2
   3 This file is a part of Bookmarks database and Internet robot.
   4
   5 """
   6
   7 __author__ = "Oleg Broytman <phd@phdru.name>"
   8 __copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design"
   9 __license__ = "GNU GPL"
  10
  11 __all__ = ['robot_base', 'get_error']
  12
  13
  14 from base64 import b64encode
  15 from urllib.parse import urlsplit, urljoin
  16 import sys
  17 import socket
  18 import time
  19
  20 from m_lib.md5wrapper import md5wrapper
  21 from m_lib.net.www.util import parse_time
  22
  23 from bkmk_objects import Robot
  24 from parse_html import parse_html
  25
  26
  27 # Fake headers to pretend this is a real browser
  28 _user_agent = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en)"
  29 " Gecko/20001221 Firefox/2.0.0"
  30 _x_user_agent = "bookmarks_db (Python %d.%d.%d)" % sys.version_info[:3]
  31
  32 request_headers = {
  33     'Accept': '*/*',
  34     'Accept-Language': 'ru,en',
  35     'Cache-Control': 'max-age=300',
  36     'Connection': 'close',
  37     'Referer': '/',
  38     'User-Agent': _user_agent,
  39     'X-User-Agent': _x_user_agent,
  40 }
  41
  42
  43 reloc_dict = {
  44   301: "perm1.",
  45   302: "temp2.",
  46   303: "temp3.",
  47   307: "temp7.",
  48   308: "temp8.",
  49   "html": "html"
  50 }
  51
  52
  53 def get_error(e):
  54     if isinstance(e, str):
  55         return e
  56
  57     else:
  58         s = []
  59         for i in e:
  60             s.append("'%s'" % str(i).replace('\n', "\\n"))
  61         return "(%s)" % ' '.join(s)
  62
  63
# Icon cache shared by all robots in the process; filled in by
# robot_base.check_url().  Maps an icon URL to a tuple
# (content type, icon as a "data:" URI string)
# or to None if there is no usable icon at that URL.
icons = {}
  67
  68
  69 class robot_base(Robot):
  70     timeout = 60
  71
  72     def __init__(self, *args, **kw):
  73         Robot.__init__(self, *args, **kw)
  74         socket.setdefaulttimeout(int(self.timeout))
  75
  76     def check_url(self, bookmark):
  77         try:
  78             self.start = int(time.time())
  79             bookmark.icon = None
  80
  81             split_results = urlsplit(bookmark.href)
  82             url_type, netloc, url_path, query, url_tag = split_results
  83             url_host = split_results.hostname
  84
  85             url = "%s://%s%s" % (url_type, url_host, url_path)
  86             error, redirect_code, redirect_to, headers, content = \
  87                 self.get(bookmark, url, True)
  88
  89             if error:
  90                 bookmark.error = error
  91                 return 1
  92
  93             if redirect_code:
  94                 self.set_redirect(bookmark, redirect_code, redirect_to)
  95                 return 1
  96
  97             size = 0
  98             last_modified = None
  99
 100             if headers:
 101                 try:
 102                     size = headers["Content-Length"]
 103                 except KeyError:
 104                     pass
 105
 106                 try:
 107                     last_modified = headers["Last-Modified"]
 108                 except KeyError:
 109                     pass
 110
 111                 if last_modified:
 112                     last_modified = parse_time(last_modified)
 113
 114             if not size:  # Could be None from headers
 115                 size = len(content)
 116
 117             if last_modified:
 118                 last_modified = str(int(last_modified))
 119             else:
 120                 last_modified = bookmark.last_visit
 121
 122             bookmark.size = size
 123             bookmark.last_modified = last_modified
 124
 125             charset = None
 126             if headers:
 127                 try:
 128                     content_type = headers["Content-Type"]
 129                     self.log("   Content-Type   : %s" % content_type)
 130                     if content_type is None:
 131                         if b'html' in content.lower():
 132                             content_type = 'text/html'
 133                         else:
 134                             content_type = 'text/plain'
 135                         self.log("   Set Content-Type to: %s"
 136                                  % content_type)
 137                     try:
 138                         # extract charset from
 139                         # "text/html; charset=UTF-8, foo; bar"
 140                         content_type, charset = content_type.split(';', 1)
 141                         content_type = content_type.strip()
 142                         charset = charset.split('=')[1].strip().split(',')[0]
 143                         self.log("   HTTP charset   : %s" % charset)
 144                     except (ValueError, IndexError):
 145                         charset = None
 146                         self.log("   no charset in Content-Type header")
 147                     is_html = False
 148                     for ctype in ("text/html", "application/xhtml+xml"):
 149                         if content_type.startswith(ctype):
 150                             is_html = True
 151                             break
 152                     content_stripped = content.strip()
 153                     if content_stripped and charset:
 154                         try:
 155                             content_stripped = content_stripped.decode(
 156                                 charset, 'replace')
 157                         except LookupError:
 158                             charset = None
 159                             self.log("   unknown charset "
 160                                      "in Content-Type header")
 161                     if content_stripped and is_html:
 162                         parser = parse_html(
 163                             content_stripped, charset, self.log)
 164                         if charset:
 165                             bookmark.charset = charset
 166                         elif parser and parser.meta_charset:
 167                             bookmark.charset = parser.meta_charset
 168                         if parser:
 169                             bookmark.real_title = parser.title
 170                             icon = parser.icon
 171                         else:
 172                             icon = None
 173                         if not icon:
 174                             icon = "/favicon.ico"
 175                         icon_url = urljoin(
 176                             "%s://%s%s" % (url_type, url_host, url_path), icon)
 177                         self.log("   looking for icon at: %s" % icon_url)
 178                         if icon_url in icons:
 179                             if icons[icon_url]:
 180                                 bookmark.icon_href = icon_url
 181                                 content_type, bookmark.icon = icons[icon_url]
 182                                 self.log("   cached icon: %s" % content_type)
 183                             else:
 184                                 self.log("   cached icon: no icon")
 185                         else:
 186                             try:
 187                                 _icon_url = icon_url
 188                                 for i in range(8):
 189                                     error, icon_redirect_code, \
 190                                         icon_redirect_to, icon_headers, \
 191                                         icon_data = \
 192                                         self.get(bookmark, _icon_url)
 193                                     if icon_redirect_code:
 194                                         _icon_url = icon_redirect_to
 195                                         self.log("   redirect to : %s"
 196                                                  % _icon_url)
 197                                     else:
 198                                         if icon_data is None:
 199                                             raise IOError("No icon")
 200                                         break
 201                                 else:
 202                                     raise IOError("Too many redirects")
 203                             except:
 204                                 etype, emsg, _ = sys.exc_info()
 205                                 self.log("   no icon        : %s %s"
 206                                          % (etype, emsg))
 207                                 etype = emsg = _ = None
 208                                 icons[icon_url] = None
 209                             else:
 210                                 content_type = icon_headers["Content-Type"]
 211                                 if content_type and (
 212                                     content_type.startswith("application/")
 213                                     or content_type.startswith("image/")
 214                                     or content_type.startswith("text/plain")
 215                                 ):
 216                                     bookmark.icon_href = icon_url
 217                                     self.log("   got icon       : %s"
 218                                              % content_type)
 219                                     if (
 220                                         content_type.startswith("application/")
 221                                         or content_type.startswith(
 222                                             "text/plain")
 223                                     ):
 224                                         self.log("   non-image content type,"
 225                                                  " assume x-icon")
 226                                         content_type = 'image/x-icon'
 227                                     if not isinstance(icon_data, bytes):
 228                                         icon_data = icon_data.encode('latin1')
 229                                     bookmark.icon = "data:%s;base64,%s" \
 230                                         % (content_type, b64encode(icon_data))
 231                                     icons[icon_url] = (content_type,
 232                                                        bookmark.icon
 233                                                        )
 234                                 else:
 235                                     self.log("   no icon        : "
 236                                              "bad content type '%s'"
 237                                              % content_type
 238                                              )
 239                                     icons[icon_url] = None
 240                         if parser and parser.refresh:
 241                             refresh = parser.refresh
 242                             try:
 243                                 url = refresh.split('=', 1)[1]
 244                             except IndexError:
 245                                 url = "self"
 246                             try:
 247                                 timeout = float(refresh.split(';')[0])
 248                             except (IndexError, ValueError):
 249                                 self.set_redirect(bookmark, "html",
 250                                                   "Bad redirect to %s (%s)"
 251                                                   % (url, refresh)
 252                                                   )
 253                             else:
 254                                 try:
 255                                     timeout = int(refresh.split(';')[0])
 256                                 except ValueError:
 257                                     pass  # float timeout
 258                                 self.set_redirect(bookmark, "html",
 259                                                   "%s (%s sec)"
 260                                                   % (url, timeout)
 261                                                   )
 262                     elif charset:
 263                         bookmark.charset = charset
 264
 265                     if not content_stripped:
 266                         self.log("   empty response, no content")
 267                     if not is_html:
 268                         self.log("   not html")
 269                 except KeyError as key:
 270                     self.log("   no header: %s" % key)
 271
 272             md5 = md5wrapper()
 273             if url_type == "ftp":  # Pass welcome message through MD5
 274                 ftp_welcome = self.get_ftp_welcome()
 275                 if not isinstance(ftp_welcome, bytes):
 276                     ftp_welcome = ftp_welcome.encode(charset or 'utf-8')
 277                 md5.update(ftp_welcome)
 278
 279             if isinstance(content, bytes):
 280                 md5.update(content)
 281             else:
 282                 md5.update(content.encode(charset or 'utf-8'))
 283             bookmark.md5 = str(md5)
 284
 285         except EOFError:
 286             bookmark.error = "Unexpected EOF (FTP server closed connection)"
 287             self.log('   EOF: %s' % bookmark.error)
 288
 289         except KeyboardInterrupt:
 290             self.log("Keyboard interrupt (^C)")
 291             return 0
 292
 293         except socket.error as e:
 294             bookmark.error = get_error(e)
 295             self.log(bookmark.error)
 296
 297         except:
 298             import traceback
 299             traceback.print_exc()
 300             bookmark.error = "Exception!"
 301             self.log('   Exception: %s' % bookmark.error)
 302
 303         finally:
 304             self.finish_check_url(bookmark)
 305
 306         # Tested
 307         return 1
 308
 309     def set_redirect(self, bookmark, errcode, newurl):
 310         bookmark.moved = moved = "(%s) to %s" % (reloc_dict[errcode], newurl)
 311         try:
 312             moved.encode('ascii')
 313         except UnicodeEncodeError:
 314             try:
 315                 moved = moved.encode(bookmark.charset)
 316             except (LookupError, TypeError, UnicodeEncodeError):
 317                 moved = moved.encode('utf-8')
 318         self.log('   Moved: %s' % moved)
 319
 320     def finish_check_url(self, bookmark):
 321         start = self.start
 322         bookmark.last_tested = str(start)
 323         now = int(time.time())
 324         bookmark.test_time = str(now - start)