Robots/bkmk_robot_base.py

   1 """Base class for robots
   2
   3 This file is a part of Bookmarks database and Internet robot.
   4
   5 """
   6
   7 __author__ = "Oleg Broytman <phd@phdru.name>"
   8 __copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
   9 __license__ = "GNU GPL"
  10
  11 __all__ = ['robot_base', 'get_error']
  12
  13
  14 from base64 import b64encode
  15 from urllib.parse import urlsplit, urljoin
  16 import sys
  17 import socket
  18 import time
  19
  20 from m_lib.md5wrapper import md5wrapper
  21 from m_lib.net.www.util import parse_time
  22
  23 from bkmk_objects import Robot
  24 from parse_html import parse_html
  25
  26
  27 reloc_dict = {
  28   301: "perm1.",
  29   302: "temp2.",
  30   303: "temp3.",
  31   307: "temp7.",
  32   308: "temp8.",
  33   "html": "html"
  34 }
  35
  36
  37 def get_error(e):
  38     if isinstance(e, str):
  39         return e
  40
  41     else:
  42         s = []
  43         for i in e:
  44             s.append("'%s'" % str(i).replace('\n', "\\n"))
  45         return "(%s)" % ' '.join(s)
  46
  47
  48 # Icon cache; maps URL to a tuple (content type, data)
  49 # or None if there is no icon.
  50 icons = {}
  51
  52
  53 class robot_base(Robot):
  54     timeout = 60
  55
  56     def __init__(self, *args, **kw):
  57         Robot.__init__(self, *args, **kw)
  58         socket.setdefaulttimeout(int(self.timeout))
  59
  60     def check_url(self, bookmark):
  61         try:
  62             self.start = int(time.time())
  63             bookmark.icon = None
  64
  65             split_results = urlsplit(bookmark.href)
  66             url_type, netloc, url_path, query, url_tag = split_results
  67             url_host = split_results.hostname
  68
  69             url = "%s://%s%s" % (url_type, url_host, url_path)
  70             error, redirect_code, redirect_to, headers, content = \
  71                 self.get(bookmark, url, True)
  72
  73             if error:
  74                 bookmark.error = error
  75                 return 1
  76
  77             if redirect_code:
  78                 self.set_redirect(bookmark, redirect_code, redirect_to)
  79                 return 1
  80
  81             size = 0
  82             last_modified = None
  83
  84             if headers:
  85                 try:
  86                     size = headers["Content-Length"]
  87                 except KeyError:
  88                     pass
  89
  90                 try:
  91                     last_modified = headers["Last-Modified"]
  92                 except KeyError:
  93                     pass
  94
  95                 if last_modified:
  96                     last_modified = parse_time(last_modified)
  97
  98             if not size:  # Could be None from headers
  99                 size = len(content)
 100
 101             if last_modified:
 102                 last_modified = str(int(last_modified))
 103             else:
 104                 last_modified = bookmark.last_visit
 105
 106             bookmark.size = size
 107             bookmark.last_modified = last_modified
 108
 109             charset = None
 110             if headers:
 111                 try:
 112                     content_type = headers["Content-Type"]
 113                     self.log("   Content-Type   : %s" % content_type)
 114                     if content_type is None:
 115                         if b'html' in content.lower():
 116                             content_type = 'text/html'
 117                         else:
 118                             content_type = 'text/plain'
 119                         self.log("   Set Content-Type to: %s"
 120                                  % content_type)
 121                     try:
 122                         # extract charset from
 123                         # "text/html; charset=UTF-8, foo; bar"
 124                         content_type, charset = content_type.split(';', 1)
 125                         content_type = content_type.strip()
 126                         charset = charset.split('=')[1].strip().split(',')[0]
 127                         self.log("   HTTP charset   : %s" % charset)
 128                     except (ValueError, IndexError):
 129                         charset = None
 130                         self.log("   no charset in Content-Type header")
 131                     is_html = False
 132                     for ctype in ("text/html", "application/xhtml+xml"):
 133                         if content_type.startswith(ctype):
 134                             is_html = True
 135                             break
 136                     content_stripped = content.strip()
 137                     if content_stripped and charset:
 138                         content_stripped = content_stripped.decode(
 139                             charset, 'replace')
 140                     if content_stripped and is_html:
 141                         parser = parse_html(
 142                             content_stripped, charset, self.log)
 143                         if charset:
 144                             bookmark.charset = charset
 145                         elif parser and parser.meta_charset:
 146                             bookmark.charset = parser.meta_charset
 147                         if parser:
 148                             bookmark.real_title = parser.title
 149                             icon = parser.icon
 150                         else:
 151                             icon = None
 152                         if not icon:
 153                             icon = "/favicon.ico"
 154                         icon_url = urljoin(
 155                             "%s://%s%s" % (url_type, url_host, url_path), icon)
 156                         self.log("   looking for icon at: %s" % icon_url)
 157                         if icon_url in icons:
 158                             if icons[icon_url]:
 159                                 bookmark.icon_href = icon_url
 160                                 content_type, bookmark.icon = icons[icon_url]
 161                                 self.log("   cached icon: %s" % content_type)
 162                             else:
 163                                 self.log("   cached icon: no icon")
 164                         else:
 165                             try:
 166                                 _icon_url = icon_url
 167                                 for i in range(8):
 168                                     error, icon_redirect_code, \
 169                                         icon_redirect_to, icon_headers, \
 170                                         icon_data = \
 171                                         self.get(bookmark, _icon_url)
 172                                     if icon_redirect_code:
 173                                         _icon_url = icon_redirect_to
 174                                         self.log("   redirect to : %s"
 175                                                  % _icon_url)
 176                                     else:
 177                                         if icon_data is None:
 178                                             raise IOError("No icon")
 179                                         break
 180                                 else:
 181                                     raise IOError("Too many redirects")
 182                             except:
 183                                 etype, emsg, _ = sys.exc_info()
 184                                 self.log("   no icon        : %s %s"
 185                                          % (etype, emsg))
 186                                 etype = emsg = _ = None
 187                                 icons[icon_url] = None
 188                             else:
 189                                 content_type = icon_headers["Content-Type"]
 190                                 if content_type and (
 191                                     content_type.startswith("application/")
 192                                     or content_type.startswith("image/")
 193                                     or content_type.startswith("text/plain")
 194                                 ):
 195                                     bookmark.icon_href = icon_url
 196                                     self.log("   got icon       : %s"
 197                                              % content_type)
 198                                     if (
 199                                         content_type.startswith("application/")
 200                                         or content_type.startswith(
 201                                             "text/plain")
 202                                     ):
 203                                         self.log("   non-image content type,"
 204                                                  " assume x-icon")
 205                                         content_type = 'image/x-icon'
 206                                     if not isinstance(icon_data, bytes):
 207                                         icon_data = icon_data.encode('latin1')
 208                                     bookmark.icon = "data:%s;base64,%s" \
 209                                         % (content_type, b64encode(icon_data))
 210                                     icons[icon_url] = (content_type,
 211                                                        bookmark.icon
 212                                                        )
 213                                 else:
 214                                     self.log("   no icon        :"
 215                                              "bad content type '%s'"
 216                                              % content_type
 217                                              )
 218                                     icons[icon_url] = None
 219                         if parser and parser.refresh:
 220                             refresh = parser.refresh
 221                             try:
 222                                 url = refresh.split('=', 1)[1]
 223                             except IndexError:
 224                                 url = "self"
 225                             try:
 226                                 timeout = float(refresh.split(';')[0])
 227                             except (IndexError, ValueError):
 228                                 self.set_redirect(bookmark, "html",
 229                                                   "Bad redirect to %s (%s)"
 230                                                   % (url, refresh)
 231                                                   )
 232                             else:
 233                                 try:
 234                                     timeout = int(refresh.split(';')[0])
 235                                 except ValueError:
 236                                     pass  # float timeout
 237                                 self.set_redirect(bookmark, "html",
 238                                                   "%s (%s sec)"
 239                                                   % (url, timeout)
 240                                                   )
 241                     elif charset:
 242                         bookmark.charset = charset
 243
 244                     if not content_stripped:
 245                         self.log("   empty response, no content")
 246                     if not is_html:
 247                         self.log("   not html")
 248                 except KeyError as key:
 249                     self.log("   no header: %s" % key)
 250
 251             md5 = md5wrapper()
 252             if url_type == "ftp":  # Pass welcome message through MD5
 253                 ftp_welcome = self.get_ftp_welcome()
 254                 if not isinstance(ftp_welcome, bytes):
 255                     ftp_welcome = ftp_welcome.encode(charset or 'utf-8')
 256                 md5.update(ftp_welcome)
 257
 258             if isinstance(content, bytes):
 259                 md5.update(content)
 260             else:
 261                 md5.update(content.encode(charset or 'utf-8'))
 262             bookmark.md5 = str(md5)
 263
 264         except EOFError:
 265             bookmark.error = "Unexpected EOF (FTP server closed connection)"
 266             self.log('   EOF: %s' % bookmark.error)
 267
 268         except KeyboardInterrupt:
 269             self.log("Keyboard interrupt (^C)")
 270             return 0
 271
 272         except socket.error as e:
 273             bookmark.error = get_error(e)
 274             self.log(bookmark.error)
 275
 276         except:
 277             import traceback
 278             traceback.print_exc()
 279             bookmark.error = "Exception!"
 280             self.log('   Exception: %s' % bookmark.error)
 281
 282         finally:
 283             self.finish_check_url(bookmark)
 284
 285         # Tested
 286         return 1
 287
 288     def set_redirect(self, bookmark, errcode, newurl):
 289         bookmark.moved = moved = "(%s) to %s" % (reloc_dict[errcode], newurl)
 290         try:
 291             moved.encode('ascii')
 292         except UnicodeEncodeError:
 293             try:
 294                 moved = moved.encode(bookmark.charset)
 295             except (LookupError, TypeError, UnicodeEncodeError):
 296                 moved = moved.encode('utf-8')
 297         self.log('   Moved: %s' % moved)
 298
 299     def finish_check_url(self, bookmark):
 300         start = self.start
 301         bookmark.last_tested = str(start)
 302         now = int(time.time())
 303         bookmark.test_time = str(now - start)