"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2024 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['storage_json']
-try:
- import json
-except ImportError:
- import simplejson as json
-
+import json
from bkmk_objects import Folder, Bookmark, Ruler, Walker
"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['writer_flad']
import time
from bkmk_objects import Writer, bkmk_attrs
-from compat import unicode
def strftime(s):
for attr, title in bkmk_attrs.items():
if hasattr(b, attr):
value = getattr(b, attr)
- #if isinstance(value, unicode):
- # value = value.encode('utf-8')
self.outfile.write("\n%s: %s" % (title, value))
if hasattr(b, "last_tested"):
"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2000-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2000-2024 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['writer_html']
from bkmk_objects import Writer, BKMK_FORMAT, quote_title
-from compat import unicode
def dump_comment(comment):
if b.keyword: self.outfile.write(' SHORTCUTURL="%s"' % b.keyword)
if b.icon_href:
value = b.icon_href
- #if isinstance(value, unicode):
- # value = value.encode('utf-8')
self.outfile.write(' ICON_URI="%s"' % value)
if b.icon: self.outfile.write(' ICON="%s"' % b.icon)
if b.charset: self.outfile.write(' LAST_CHARSET="%s"' % b.charset)
def handle_data(self, data):
if data:
- #if self.charset and default_encoding:
- # data = data.decode(self.charset, "replace").\
- # encode(default_encoding, "xmlcharrefreplace")
self.accumulator += data
# Mozilla - get charset
+++ /dev/null
-import sys
-
-# Compatability definitions (inspired by six)
-PY2 = sys.version_info[0] < 3
-if PY2:
- # disable flake8 checks on python 3
- string_type = basestring # noqa
- unicode = unicode # noqa
- unichr = unichr # noqa
-else:
- string_type = str
- unicode = str
- unichr = chr
Removed urllib-based robots.
+ Dropped support for Python 2.
+
Default list of robots is now curl,requests,aiohttp.
Removed urllib-based robots.
+ Dropped support for Python 2.
+
Default list of robots is now curl,requests,aiohttp.
Version 5.5.1 (2024-08-??)
"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 1997-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 1997-2024 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['parse_html', 'parse_filename', 'universal_charset']
import codecs
import os
import re
-try:
- from html.entities import name2codepoint
-except ImportError:
- from htmlentitydefs import name2codepoint
-
-from compat import unicode, unichr
+from html.entities import name2codepoint
DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
parsers = []
entity_re.match(part):
_part = name2codepoint.get(part[1:-1], None)
if _part is not None:
- part = unichr(_part)
+ part = chr(_part)
output.append(part)
title = ''.join(output)
for part in num_entity_re.split(title):
if num_entity_re.match(part):
try:
- part = unichr(int(part[2:-1]))
+ part = chr(int(part[2:-1]))
except UnicodeEncodeError:
pass # Leave the entity as is
output.append(part)
p, parser = _parsers[0]
if log: log(" Using %s" % p.__module__)
- #title = parser.title
- #if isinstance(title, unicode):
- # if parser.charset:
- # parser.title = title.encode(parser.charset)
- # else:
- # try:
- # parser.title = title.encode('ascii')
- # except UnicodeEncodeError:
- # try:
- # parser.title = title.encode(DEFAULT_CHARSET)
- # except UnicodeEncodeError:
- # parser.title = title.encode(universal_charset)
- # parser.charset = universal_charset
- # else:
- # parser.charset = DEFAULT_CHARSET
- # else:
- # parser.charset = 'ascii'
-
converted_title = title = parser.title
if title and isinstance(title, bytes) and (not parser.charset):
try:
if log: log(" META charset : %s" % parser.charset)
elif (not charset) or (charset != parser.charset):
if log: log(" guessed charset: %s" % parser.charset)
- # if log: log(" current charset: %s" % universal_charset)
if log: log(" title : %s" % title)
- #if parser.charset != universal_charset:
- # try:
- # converted_title = title.decode(parser.charset).\
- # encode(universal_charset)
- # except UnicodeError:
- # if log:
- # log(" incorrect conversion from %s,"
- # "converting from %s"
- # % (parser.charset, DEFAULT_CHARSET))
- # converted_title = \
- # title.decode(DEFAULT_CHARSET, "replace").\
- # encode(universal_charset, "replace")
- # parser.charset = DEFAULT_CHARSET
- #if log and (converted_title != title):
- # log(" converted title: %s" % converted_title)
except LookupError:
if log: log(" unknown charset: '%s'" % parser.charset)
else:
log(" final title : %s" % final_title)
parser.title = final_title
- #icon = parser.icon
- #if isinstance(icon, unicode):
- # try:
- # parser.icon = icon.encode('ascii')
- # except UnicodeEncodeError:
- # if parser.charset:
- # parser.icon = icon.encode(parser.charset)
return parser
+++ /dev/null
-"""HTML Parser using BeautifulSoup
-
-This file is a part of Bookmarks database and Internet robot.
-
-"""
-
-__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2007-2023 PhiloSoft Design"
-__license__ = "GNU GPL"
-
-__all__ = ['parse_html']
-
-
-import re
-from sgmllib import SGMLParser, SGMLParseError
-from BeautifulSoup import BeautifulSoup, CData
-
-from .bkmk_ph_util import HTMLParser
-from compat import string_type
-
-DEFAULT_CHARSET = "cp1251" # Stupid default for Russian Cyrillic
-
-# http://groups.google.com/group/beautifulsoup/browse_thread/thread/69093cb0d3a3cf63
-
-
-class BadDeclParser(BeautifulSoup):
- def parse_declaration(self, i):
- """Treat a bogus SGML declaration as raw data. Treat a CDATA
- declaration as a CData object."""
- j = None
- if self.rawdata[i:i+9] == '<![CDATA[':
- k = self.rawdata.find(']]>', i)
- if k == -1:
- k = len(self.rawdata)
- data = self.rawdata[i+9:k]
- j = k+3
- self._toStringSubclass(data, CData)
- else:
- try:
- j = SGMLParser.parse_declaration(self, i)
- except SGMLParseError:
- # Could not parse the DOCTYPE declaration
- # Try to just skip the actual declaration
- match = re.search(
- r'<!DOCTYPE([^>]*?)>', self.rawdata[i:],
- re.MULTILINE|re.IGNORECASE) # noqa: E227
- # missing whitespace around bitwise or shift operator
- if match:
- toHandle = self.rawdata[i:match.end()]
- else:
- toHandle = self.rawdata[i:]
- self.handle_data(toHandle)
- j = i + len(toHandle)
- return j
-
-
-def _parse_html(html_text, charset):
- try:
- return BadDeclParser(html_text, fromEncoding=charset)
- except TypeError:
- return None
-
-
-def parse_html(html_text, charset=None, log=None):
- if not html_text:
- return None
- root = _parse_html(html_text, charset)
- if root is None:
- return None
-
- _charset = root.originalEncoding
- if _charset in ("ISO-8859-2", "windows-1252", "MacCyrillic"):
- # Replace with default and re-parse
- _charset = DEFAULT_CHARSET
- root = _parse_html(html_text, _charset)
- if root is None:
- return None
-
- html = root.html
- if html is None:
- html = root
-
- head = html.head
- if head is None:
- head = html # Some sites put TITLE in HTML without HEAD
-
- title = head.title
- if (title is None) and (html is not head):
- # Some sites put TITLE in HTML outside of HEAD
- title = html.title
-
- if title is None:
- # Lookup TITLE in the root
- title = root.title
-
- if title is not None:
- if title.string:
- title = title.string
- else:
- parts = []
- for part in title:
- if not isinstance(part, string_type):
- part = part.decode()
- parts.append(part.strip())
- title = ''.join(parts)
-
- meta = head.find(_find_contenttype, recursive=False)
- if meta:
- try:
- meta_content = meta.get("content")
- if meta_content:
- __charset = meta_content.lower().split('charset=')[1].\
- split(';')[0]
- else:
- __charset = False
- except IndexError: # No charset in the META Content-Type
- meta_charset = False
- else:
- meta_charset = _charset == __charset
- else:
- meta_charset = False
-
- if not meta_charset:
- meta = head.find(_find_charset, recursive=False)
- if meta:
- meta_content = meta.get("charset")
- if meta_content:
- meta_charset = _charset = meta_content.lower()
-
- #if title and (_charset or meta_charset):
- # title = title.encode(_charset or meta_charset)
-
- meta = head.find(_find_refresh, recursive=False)
- if meta:
- refresh = meta.get("content")
- else:
- refresh = None
-
- meta = head.find(_find_icon, recursive=False)
- if meta:
- icon = meta.get("href")
- else:
- icon = None
-
- if (title is None) and (refresh is None) and (icon is None):
- return None
- return HTMLParser(_charset, meta_charset, title, refresh, icon)
-
-
-def _find_contenttype(Tag):
- return (Tag.name == "meta") and \
- (Tag.get("http-equiv", '').lower() == "content-type")
-
-
-def _find_charset(Tag):
- return (Tag.name == "meta") and Tag.get("charset", '')
-
-
-def _find_refresh(Tag):
- return (Tag.name == "meta") and \
- (Tag.get("http-equiv", '').lower() == "refresh")
-
-
-def _find_icon(Tag):
- return (Tag.name == "link") and \
- (Tag.get("rel", '').lower() in ('icon', 'shortcut icon'))
"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2017-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2017-2024 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['parse_html']
from bs4 import BeautifulSoup
from .bkmk_ph_util import HTMLParser
-from compat import string_type
warnings.filterwarnings(
'ignore', 'No parser was explicitly specified')
else:
parts = []
for part in title:
- #if not isinstance(part, string_type):
- # part = part.decode()
if part.strip:
parts.append(part.strip())
else:
if meta_content:
meta_charset = _charset = meta_content.lower()
- #if title and (_charset or meta_charset):
- # try:
- # title = title.encode(_charset or meta_charset)
- # except LookupError:
- # title = title.encode(universal_charset)
- # _charset = universal_charset
-
meta = head.find(_find_refresh, recursive=False)
if meta:
refresh = meta.get("content")
"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 1997-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 1997-2024 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['parse_html']
-try:
- from HTMLParser import HTMLParseError
-except ImportError:
- class HTMLParseError(Exception): pass
from m_lib.net.www.html import HTMLParser as _HTMLParser
+class HTMLParseError(Exception): pass
class HTMLHeadDone(Exception): pass
"""
__author__ = "Oleg Broytman <phd@phdru.name>"
-__copyright__ = "Copyright (C) 2010-2023 PhiloSoft Design"
+__copyright__ = "Copyright (C) 2010-2024 PhiloSoft Design"
__license__ = "GNU GPL"
__all__ = ['parse_html']
else:
meta_charset = False
- #if title and (charset or meta_charset):
- # title = title.encode(charset or meta_charset)
-
for m in meta:
if m.get('http-equiv', '').lower() == 'refresh':
refresh = m.get("content")
'Intended Audience :: End Users/Desktop',
'License :: OSI Approved :: GNU General Public License (GPL)',
'Operating System :: OS Independent',
- 'Programming Language :: Python :: 2',
- 'Programming Language :: Python :: 2.7',
'Programming Language :: Python :: 3',
+ 'Programming Language :: Python :: 3 :: Only',
'Programming Language :: Python :: 3.4',
'Programming Language :: Python :: 3.5',
'Programming Language :: Python :: 3.6',
'Programming Language :: Python :: 3.11',
'Programming Language :: Python :: 3.12',
],
- python_requires='>=2.7, !=3.0.*, !=3.1.*, !=3.2.*, !=3.3.*',
+ python_requires='>=3.4',
install_requires=[
'm_lib.full>=1.0',
],
'html': ['beautifulsoup4', 'lxml'],
'requests': ['requests[socks]', 'requests-ftp'],
'curl': ['pycurl', 'certifi'],
- 'aiohttp:python_version>="3.4"': ['aiohttp', 'aioftp'],
+ 'aiohttp': ['aiohttp', 'aioftp'],
},
)