From: Oleg Broytman Date: Thu, 8 Aug 2024 04:45:58 +0000 (+0300) Subject: Feat: Dropped support for Python 2 X-Git-Tag: 5.6.0~1 X-Git-Url: https://git.phdru.name/?a=commitdiff_plain;h=dbae974f58b0fb5695dab72f56e4dfd80d854b1a;p=bookmarks_db.git Feat: Dropped support for Python 2 --- diff --git a/Storage/bkmk_stjson.py b/Storage/bkmk_stjson.py index 1d32fdc..9daf1bb 100644 --- a/Storage/bkmk_stjson.py +++ b/Storage/bkmk_stjson.py @@ -5,17 +5,13 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2010-2023 PhiloSoft Design" +__copyright__ = "Copyright (C) 2010-2024 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['storage_json'] -try: - import json -except ImportError: - import simplejson as json - +import json from bkmk_objects import Folder, Bookmark, Ruler, Walker diff --git a/Writers/bkmk_wflad.py b/Writers/bkmk_wflad.py index 3b33de6..e273d34 100644 --- a/Writers/bkmk_wflad.py +++ b/Writers/bkmk_wflad.py @@ -4,7 +4,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design" +__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['writer_flad'] @@ -12,7 +12,6 @@ __all__ = ['writer_flad'] import time from bkmk_objects import Writer, bkmk_attrs -from compat import unicode def strftime(s): @@ -59,8 +58,6 @@ Comment: %s""" % ( for attr, title in bkmk_attrs.items(): if hasattr(b, attr): value = getattr(b, attr) - #if isinstance(value, unicode): - # value = value.encode('utf-8') self.outfile.write("\n%s: %s" % (title, value)) if hasattr(b, "last_tested"): diff --git a/Writers/bkmk_whtml.py b/Writers/bkmk_whtml.py index 05d9847..a9a02b5 100644 --- a/Writers/bkmk_whtml.py +++ b/Writers/bkmk_whtml.py @@ -4,14 +4,13 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design" +__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['writer_html'] from bkmk_objects import Writer, BKMK_FORMAT, quote_title -from compat import unicode def dump_comment(comment): @@ -62,8 +61,6 @@ class writer_html(Writer): if b.keyword: self.outfile.write(' SHORTCUTURL="%s"' % b.keyword) if b.icon_href: value = b.icon_href - #if isinstance(value, unicode): - # value = value.encode('utf-8') self.outfile.write(' ICON_URI="%s"' % value) if b.icon: self.outfile.write(' ICON="%s"' % b.icon) if b.charset: self.outfile.write(' LAST_CHARSET="%s"' % b.charset) diff --git a/bkmk_parser.py b/bkmk_parser.py index 997728f..24dc7c6 100644 --- a/bkmk_parser.py +++ b/bkmk_parser.py @@ -47,9 +47,6 @@ class BkmkParser(HTMLParser): def handle_data(self, data): if data: - #if self.charset and default_encoding: - # data = data.decode(self.charset, "replace").\ - # encode(default_encoding, "xmlcharrefreplace") self.accumulator += data # Mozilla - get charset diff --git a/compat.py b/compat.py deleted file mode 100644 index b197445..0000000 --- a/compat.py +++ /dev/null @@ -1,13 +0,0 @@ -import sys - -# Compatability definitions (inspired by six) -PY2 = sys.version_info[0] < 3 -if PY2: - # disable flake8 checks on python 3 - string_type = basestring # noqa - unicode = unicode # noqa - unichr = unichr # noqa -else: - string_type = str - unicode = str - unichr = chr diff --git a/doc/ANNOUNCE b/doc/ANNOUNCE index 5948b9c..7161011 100644 --- a/doc/ANNOUNCE +++ b/doc/ANNOUNCE @@ -11,6 +11,8 @@ Version 5.6.0 (2024-??-??) Removed urllib-based robots. + Dropped support for Python 2. + Default list of robots is now curl,requests,aiohttp. diff --git a/doc/ChangeLog b/doc/ChangeLog index e4a5013..38ecb85 100644 --- a/doc/ChangeLog +++ b/doc/ChangeLog @@ -2,6 +2,8 @@ Version 5.6.0 (2024-??-??) Removed urllib-based robots. + Dropped support for Python 2. + Default list of robots is now curl,requests,aiohttp. Version 5.5.1 (2024-08-??) diff --git a/parse_html/bkmk_parse_html.py b/parse_html/bkmk_parse_html.py index 922c745..d8168bd 100644 --- a/parse_html/bkmk_parse_html.py +++ b/parse_html/bkmk_parse_html.py @@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 1997-2023 PhiloSoft Design" +__copyright__ = "Copyright (C) 1997-2024 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html', 'parse_filename', 'universal_charset'] @@ -14,12 +14,7 @@ __all__ = ['parse_html', 'parse_filename', 'universal_charset'] import codecs import os import re -try: - from html.entities import name2codepoint -except ImportError: - from htmlentitydefs import name2codepoint - -from compat import unicode, unichr +from html.entities import name2codepoint DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic parsers = [] @@ -66,7 +61,7 @@ def recode_entities(title, charset): entity_re.match(part): _part = name2codepoint.get(part[1:-1], None) if _part is not None: - part = unichr(_part) + part = chr(_part) output.append(part) title = ''.join(output) @@ -74,7 +69,7 @@ def recode_entities(title, charset): for part in num_entity_re.split(title): if num_entity_re.match(part): try: - part = unichr(int(part[2:-1])) + part = chr(int(part[2:-1])) except UnicodeEncodeError: pass # Leave the entity as is output.append(part) @@ -134,24 +129,6 @@ def parse_html(html_text, charset=None, log=None): p, parser = _parsers[0] if log: log(" Using %s" % p.__module__) - #title = parser.title - #if isinstance(title, unicode): - # if parser.charset: - # parser.title = title.encode(parser.charset) - # else: - # try: - # parser.title = title.encode('ascii') - # except UnicodeEncodeError: - # try: - # parser.title = title.encode(DEFAULT_CHARSET) - # except UnicodeEncodeError: - # parser.title = title.encode(universal_charset) - # parser.charset = universal_charset - # else: - # parser.charset = DEFAULT_CHARSET - # else: - # parser.charset = 'ascii' - converted_title = title = parser.title if title and isinstance(title, bytes) and (not parser.charset): try: @@ -170,23 +147,7 @@ def parse_html(html_text, charset=None, log=None): if log: log(" META charset : %s" % parser.charset) elif (not charset) or (charset != parser.charset): if log: log(" guessed charset: %s" % parser.charset) - # if log: log(" current charset: %s" % universal_charset) if log: log(" title : %s" % title) - #if parser.charset != universal_charset: - # try: - # converted_title = title.decode(parser.charset).\ - # encode(universal_charset) - # except UnicodeError: - # if log: - # log(" incorrect conversion from %s," - # "converting from %s" - # % (parser.charset, DEFAULT_CHARSET)) - # converted_title = \ - # title.decode(DEFAULT_CHARSET, "replace").\ - # encode(universal_charset, "replace") - # parser.charset = DEFAULT_CHARSET - #if log and (converted_title != title): - # log(" converted title: %s" % converted_title) except LookupError: if log: log(" unknown charset: '%s'" % parser.charset) else: @@ -200,13 +161,6 @@ def parse_html(html_text, charset=None, log=None): log(" final title : %s" % final_title) parser.title = final_title - #icon = parser.icon - #if isinstance(icon, unicode): - # try: - # parser.icon = icon.encode('ascii') - # except UnicodeEncodeError: - # if parser.charset: - # parser.icon = icon.encode(parser.charset) return parser diff --git a/parse_html/bkmk_ph_beautifulsoup.py b/parse_html/bkmk_ph_beautifulsoup.py deleted file mode 100644 index 0aad3dd..0000000 --- a/parse_html/bkmk_ph_beautifulsoup.py +++ /dev/null @@ -1,166 +0,0 @@ -"""HTML Parser using BeautifulSoup - -This file is a part of Bookmarks database and Internet robot. - -""" - -__author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2007-2023 PhiloSoft Design" -__license__ = "GNU GPL" - -__all__ = ['parse_html'] - - -import re -from sgmllib import SGMLParser, SGMLParseError -from BeautifulSoup import BeautifulSoup, CData - -from .bkmk_ph_util import HTMLParser -from compat import string_type - -DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic - -# http://groups.google.com/group/beautifulsoup/browse_thread/thread/69093cb0d3a3cf63 - - -class BadDeclParser(BeautifulSoup): - def parse_declaration(self, i): - """Treat a bogus SGML declaration as raw data. Treat a CDATA - declaration as a CData object.""" - j = None - if self.rawdata[i:i+9] == '', i) - if k == -1: - k = len(self.rawdata) - data = self.rawdata[i+9:k] - j = k+3 - self._toStringSubclass(data, CData) - else: - try: - j = SGMLParser.parse_declaration(self, i) - except SGMLParseError: - # Could not parse the DOCTYPE declaration - # Try to just skip the actual declaration - match = re.search( - r']*?)>', self.rawdata[i:], - re.MULTILINE|re.IGNORECASE) # noqa: E227 - # missing whitespace around bitwise or shift operator - if match: - toHandle = self.rawdata[i:match.end()] - else: - toHandle = self.rawdata[i:] - self.handle_data(toHandle) - j = i + len(toHandle) - return j - - -def _parse_html(html_text, charset): - try: - return BadDeclParser(html_text, fromEncoding=charset) - except TypeError: - return None - - -def parse_html(html_text, charset=None, log=None): - if not html_text: - return None - root = _parse_html(html_text, charset) - if root is None: - return None - - _charset = root.originalEncoding - if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"): - # Replace with default and re-parse - _charset = DEFAULT_CHARSET - root = _parse_html(html_text, _charset) - if root is None: - return None - - html = root.html - if html is None: - html = root - - head = html.head - if head is None: - head = html # Some sites put TITLE in HTML without HEAD - - title = head.title - if (title is None) and (html is not head): - # Some sites put TITLE in HTML outside of HEAD - title = html.title - - if title is None: - # Lookup TITLE in the root - title = root.title - - if title is not None: - if title.string: - title = title.string - else: - parts = [] - for part in title: - if not isinstance(part, string_type): - part = part.decode() - parts.append(part.strip()) - title = ''.join(parts) - - meta = head.find(_find_contenttype, recursive=False) - if meta: - try: - meta_content = meta.get("content") - if meta_content: - __charset = meta_content.lower().split('charset=')[1].\ - split(';')[0] - else: - __charset = False - except IndexError: # No charset in the META Content-Type - meta_charset = False - else: - meta_charset = _charset == __charset - else: - meta_charset = False - - if not meta_charset: - meta = head.find(_find_charset, recursive=False) - if meta: - meta_content = meta.get("charset") - if meta_content: - meta_charset = _charset = meta_content.lower() - - #if title and (_charset or meta_charset): - # title = title.encode(_charset or meta_charset) - - meta = head.find(_find_refresh, recursive=False) - if meta: - refresh = meta.get("content") - else: - refresh = None - - meta = head.find(_find_icon, recursive=False) - if meta: - icon = meta.get("href") - else: - icon = None - - if (title is None) and (refresh is None) and (icon is None): - return None - return HTMLParser(_charset, meta_charset, title, refresh, icon) - - -def _find_contenttype(Tag): - return (Tag.name == "meta") and \ - (Tag.get("http-equiv", '').lower() == "content-type") - - -def _find_charset(Tag): - return (Tag.name == "meta") and Tag.get("charset", '') - - -def _find_refresh(Tag): - return (Tag.name == "meta") and \ - (Tag.get("http-equiv", '').lower() == "refresh") - - -def _find_icon(Tag): - return (Tag.name == "link") and \ - (Tag.get("rel", '').lower() in ('icon', 'shortcut icon')) diff --git a/parse_html/bkmk_ph_beautifulsoup4.py b/parse_html/bkmk_ph_beautifulsoup4.py index 060f078..148a6f7 100644 --- a/parse_html/bkmk_ph_beautifulsoup4.py +++ b/parse_html/bkmk_ph_beautifulsoup4.py @@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2017-2023 PhiloSoft Design" +__copyright__ = "Copyright (C) 2017-2024 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -16,7 +16,6 @@ import warnings from bs4 import BeautifulSoup from .bkmk_ph_util import HTMLParser -from compat import string_type warnings.filterwarnings( 'ignore', 'No parser was explicitly specified') @@ -69,8 +68,6 @@ def parse_html(html_text, charset=None, log=None): else: parts = [] for part in title: - #if not isinstance(part, string_type): - # part = part.decode() if part.strip: parts.append(part.strip()) else: @@ -100,13 +97,6 @@ def parse_html(html_text, charset=None, log=None): if meta_content: meta_charset = _charset = meta_content.lower() - #if title and (_charset or meta_charset): - # try: - # title = title.encode(_charset or meta_charset) - # except LookupError: - # title = title.encode(universal_charset) - # _charset = universal_charset - meta = head.find(_find_refresh, recursive=False) if meta: refresh = meta.get("content") diff --git a/parse_html/bkmk_ph_htmlparser.py b/parse_html/bkmk_ph_htmlparser.py index d11a2ff..b3da8e7 100644 --- a/parse_html/bkmk_ph_htmlparser.py +++ b/parse_html/bkmk_ph_htmlparser.py @@ -5,19 +5,16 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 1997-2023 PhiloSoft Design" +__copyright__ = "Copyright (C) 1997-2024 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] -try: - from HTMLParser import HTMLParseError -except ImportError: - class HTMLParseError(Exception): pass from m_lib.net.www.html import HTMLParser as _HTMLParser +class HTMLParseError(Exception): pass class HTMLHeadDone(Exception): pass diff --git a/parse_html/bkmk_ph_lxml.py b/parse_html/bkmk_ph_lxml.py index 2427482..7af98e7 100644 --- a/parse_html/bkmk_ph_lxml.py +++ b/parse_html/bkmk_ph_lxml.py @@ -5,7 +5,7 @@ This file is a part of Bookmarks database and Internet robot. """ __author__ = "Oleg Broytman " -__copyright__ = "Copyright (C) 2010-2023 PhiloSoft Design" +__copyright__ = "Copyright (C) 2010-2024 PhiloSoft Design" __license__ = "GNU GPL" __all__ = ['parse_html'] @@ -61,9 +61,6 @@ def parse_html(html_text, charset=None, log=None): else: meta_charset = False - #if title and (charset or meta_charset): - # title = title.encode(charset or meta_charset) - for m in meta: if m.get('http-equiv', '').lower() == 'refresh': refresh = m.get("content") diff --git a/setup.py b/setup.py index 638e946..4a629e4 100755 --- a/setup.py +++ b/setup.py @@ -20,9 +20,8 @@ setup( 'Intended Audience :: End Users/Desktop', 'License :: OSI Approved :: GNU General Public License (GPL)', 'Operating System :: OS Independent', - 'Programming Language :: Python :: 2', - 'Programming Language :: Python :: 2.7', 'Programming Language :: Python :: 3', + 'Programming Language :: Python :: 3 :: Only', 'Programming Language :: Python :: 3.4', 'Programming Language :: Python :: 3.5', 'Programming Language :: Python :: 3.6', @@ -33,7 +32,7 @@ setup( 'Programming Language :: Python :: 3.11', 'Programming Language :: Python :: 3.12', ], - python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*', + python_requires='>=3.4', install_requires=[ 'm_lib.full>=1.0', ], @@ -41,6 +40,6 @@ setup( 'html': ['beautifulsoup4', 'lxml'], 'requests': ['requests[socks]', 'requests-ftp'], 'curl': ['pycurl', 'certifi'], - 'aiohttp:python_version>="3.4"': ['aiohttp', 'aioftp'], + 'aiohttp': ['aiohttp', 'aioftp'], }, )