"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['robot_base', 'get_error']
from base64 import b64encode
+from urllib.parse import urlsplit, urljoin
import sys
import socket
import time
-try:
- from urllib.parse import splittype, splithost, splittag, urljoin
-except ImportError:
- from urllib import splittype, splithost, splittag
- from urlparse import urljoin
from m_lib.md5wrapper import md5wrapper
from m_lib.net.www.util import parse_time
from parse_html import parse_html
+# Fake headers to pretend this is a real browser
+_user_agent = "Mozilla/5.0 (X11; U; Linux 2.6 i686; en)"
+" Gecko/20001221 Firefox/2.0.0"
+_x_user_agent = "bookmarks_db (Python %d.%d.%d)" % sys.version_info[:3]
+
+request_headers = {
+ 'Accept': '*/*',
+ 'Accept-Language': 'ru,en',
+ 'Cache-Control': 'max-age=300',
+ 'Connection': 'close',
+ 'Referer': '/',
+ 'User-Agent': _user_agent,
+ 'X-User-Agent': _x_user_agent,
+}
+
+
reloc_dict = {
301: "perm1.",
302: "temp2.",
self.start = int(time.time())
bookmark.icon = None
- url_type, url_rest = splittype(bookmark.href)
- url_host, url_path = splithost(url_rest)
- url_path, url_tag = splittag(url_path) # noqa: E221
- # multiple spaces before operator
+ split_results = urlsplit(bookmark.href)
+ url_type, netloc, url_path, query, url_tag = split_results
+ url_host = split_results.hostname
url = "%s://%s%s" % (url_type, url_host, url_path)
error, redirect_code, redirect_to, headers, content = \
bookmark.size = size
bookmark.last_modified = last_modified
- md5 = md5wrapper()
- if url_type == "ftp": # Pass welcome message through MD5
- ftp_welcome = self.get_ftp_welcome()
- if not isinstance(ftp_welcome, bytes):
- ftp_welcome = ftp_welcome.encode('utf-8')
- md5.update(ftp_welcome)
-
- if isinstance(content, bytes):
- md5.update(content)
- else:
- md5.update(content.encode('utf-8'))
- bookmark.md5 = str(md5)
-
+ charset = None
if headers:
try:
content_type = headers["Content-Type"]
self.log(" Content-Type : %s" % content_type)
if content_type is None:
- if 'html' in content.lower():
+ if b'html' in content.lower():
content_type = 'text/html'
else:
content_type = 'text/plain'
% content_type)
try:
# extract charset from
- # "text/html; foo; charset=UTF-8, bar; baz;"
+ # "text/html; charset=UTF-8, foo; bar"
content_type, charset = content_type.split(';', 1)
content_type = content_type.strip()
charset = charset.split('=')[1].strip().split(',')[0]
is_html = True
break
content_stripped = content.strip()
+ if content_stripped and charset:
+ try:
+ content_stripped = content_stripped.decode(
+ charset, 'replace')
+ except LookupError:
+ charset = None
+ self.log(" unknown charset "
+ "in Content-Type header")
if content_stripped and is_html:
- parser = parse_html(content_stripped, charset, self.log)
+ parser = parse_html(
+ content_stripped, charset, self.log)
if charset:
bookmark.charset = charset
elif parser and parser.meta_charset:
if icons[icon_url]:
bookmark.icon_href = icon_url
content_type, bookmark.icon = icons[icon_url]
- self.log(" cached icon: %s" % content_type)
+ self.log(" cached icon : %s"
+ % content_type)
else:
- self.log(" cached icon: no icon")
+ self.log(" cached icon : no icon")
+ elif icon_url.startswith('data:'):
+ content_type, icon_data = \
+ icon_url[len('data:'):].split(',', 1)
+ bookmark.icon_href = bookmark.icon = icon_url
+ self.log(" got data icon : %s" % content_type)
+ icons[icon_url] = (content_type, icon_url)
else:
try:
_icon_url = icon_url
" assume x-icon")
content_type = 'image/x-icon'
if not isinstance(icon_data, bytes):
- icon_data = icon_data.encode('utf-8')
+ icon_data = icon_data.encode('latin1')
bookmark.icon = "data:%s;base64,%s" \
% (content_type, b64encode(icon_data))
icons[icon_url] = (content_type,
bookmark.icon
)
else:
- self.log(" no icon :"
+ self.log(" no icon : "
"bad content type '%s'"
% content_type
)
"%s (%s sec)"
% (url, timeout)
)
+ elif charset:
+ bookmark.charset = charset
if not content_stripped:
self.log(" empty response, no content")
except KeyError as key:
self.log(" no header: %s" % key)
+ md5 = md5wrapper()
+ if url_type == "ftp": # Pass welcome message through MD5
+ ftp_welcome = self.get_ftp_welcome()
+ if not isinstance(ftp_welcome, bytes):
+ ftp_welcome = ftp_welcome.encode(charset or 'utf-8')
+ md5.update(ftp_welcome)
+
+ if isinstance(content, bytes):
+ md5.update(content)
+ else:
+ md5.update(content.encode(charset or 'utf-8'))
+ bookmark.md5 = str(md5)
+
except EOFError:
bookmark.error = "Unexpected EOF (FTP server closed connection)"
self.log(' EOF: %s' % bookmark.error)