#! /usr/bin/env python3
"""Check and show URLs in the bookmarks database that have redirects

This file is a part of Bookmarks database and Internet robot.
"""

import sys


__author__ = "Oleg Broytman "
__copyright__ = "Copyright (C) 2023 PhiloSoft Design"
__license__ = "GNU GPL"


def run(argv=None):
    """Print every bookmark that carries a recorded redirect.

    For each bookmark with a ``moved`` attribute three lines are written
    to stdout: the bookmark's ``href``, the value of ``moved`` and an empty
    line.  Bookmarks that have an ``error`` attribute or whose ``href``
    starts with ``place:`` are skipped.

    argv -- command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.  The only option is ``-s``, which suppresses
            the banner and the progress/statistics output.

    Exits with status 1 (via ``sys.exit``) when positional arguments are
    given.
    """
    from getopt import getopt

    if argv is None:
        argv = sys.argv[1:]
    optlist, args = getopt(argv, "s")

    # -s means "silent": no banner, no progress, no statistics.
    report_stats = not any(opt == '-s' for opt, _value in optlist)

    if report_stats:
        print("Broytman check_redirects, Copyright (C) 2023 PhiloSoft Design")

    if args:
        sys.stderr.write("check_redirects: too many arguments\n")
        sys.stderr.write("Usage: check_redirects [-s]\n")
        sys.exit(1)

    # Project modules are imported only after the command line has been
    # validated, the same way the storage backend always was.
    from bkmk_objects import make_linear
    from storage import storage as storage_factory
    store = storage_factory()

    if report_stats:
        sys.stdout.write("Loading %s: " % store.filename)
        sys.stdout.flush()

    root_folder = store.load()
    # NOTE(review): make_linear() presumably fills root_folder.linear with
    # a flat list of all objects in the tree -- confirm in bkmk_objects.
    make_linear(root_folder)
    objects = len(root_folder.linear)

    if report_stats:
        print("Ok")

    for item in root_folder.linear:
        if not item.isBookmark:
            continue

        if hasattr(item, "error") or \
                item.href.startswith('place:'):  # Firefox SmartBookmarks
            continue

        if hasattr(item, "moved"):
            print(item.href)
            print(item.moved)
            print()
            # Only the in-memory object is touched; nothing is stored back.
            del item.moved

    if report_stats:
        print(objects, "objects passed")


if __name__ == '__main__':
    run()
#! /usr/bin/env python3
"""Run through the bookmarks database and set URLs from redirects
from an external file

This file is a part of Bookmarks database and Internet robot.
"""

import sys


__author__ = "Oleg Broytman "
__copyright__ = "Copyright (C) 2023 PhiloSoft Design"
__license__ = "GNU GPL"


def _add_redirect(urls_dict, URL, redirect):
    """Validate one (URL, redirect) pair and record it in *urls_dict*.

    Raises ValueError when *redirect* is not an http(s) URL or when *URL*
    is already mapped to a different redirect.
    """
    if not redirect.startswith(('https://', 'http://')):
        raise ValueError("Redirect is not an URL: `%s'" % redirect)

    if URL in urls_dict:
        if redirect != urls_dict[URL]:
            raise ValueError(
                "Redirects are not identical for URL `%s':"
                " `%s' != `%s'" % (URL, redirect, urls_dict[URL])
            )
    else:
        urls_dict[URL] = redirect


def _read_redirects(lines):
    """Build the mapping (URL => redirect) from an iterable of text lines.

    The input consists of 3-line records: URL, redirect, empty line.
    The empty separator after the very last record is optional.

    Raises ValueError when the third line of a record is not empty or when
    a record fails validation (see _add_redirect).
    """
    urls_dict = {}
    URL = None
    redirect = None

    for line in lines:
        # Strip only the newline: the last line of a file may have none,
        # and slicing off one character would then damage the data.
        line = line.rstrip('\n')

        if URL is None:
            URL = line

        elif redirect is None:
            redirect = line

        elif line:  # the third line in every 3 lines must be empty
            raise ValueError(
                "line is not empty for URL `%s', redirect `%s': line `%s'"
                % (URL, redirect, line)
            )

        else:  # We've got 3 lines - add new entry to the mapping
            _add_redirect(urls_dict, URL, redirect)
            URL = None
            redirect = None

    if redirect is not None:
        # The input ended right after a complete pair, without the empty
        # separator line; the pair is still a valid record.
        _add_redirect(urls_dict, URL, redirect)
    elif URL:
        # A lone trailing line cannot be applied; say so instead of
        # dropping it without a trace.
        sys.stderr.write(
            "set-URLs: ignoring incomplete record at end of file: `%s'\n"
            % URL
        )

    return urls_dict


def run(argv=None):
    """Replace bookmark URLs with their redirects listed in a file.

    Reads the URLs file named on the command line, loads the bookmarks
    database, replaces ``href`` of every bookmark whose current ``href``
    is a key of the mapping, and stores the database if anything changed.

    argv -- command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.  Expected form: ``[-s] urls_file`` where
            ``-s`` suppresses the banner and progress/statistics output.

    Exits with status 1 (via ``sys.exit``) unless exactly one positional
    argument is given.  Raises ValueError for a malformed URLs file.
    """
    from getopt import getopt

    if argv is None:
        argv = sys.argv[1:]
    optlist, args = getopt(argv, "s")

    # -s means "silent": no banner, no progress, no statistics.
    report_stats = not any(opt == '-s' for opt, _value in optlist)

    if report_stats:
        print("Broytman set-URLs, Copyright (C) 2023 PhiloSoft Design")

    if len(args) != 1:
        sys.stderr.write("Usage: set-URLs [-s] urls_file\n")
        sys.exit(1)

    # Read the external file with URLs and build a mapping (URL => redirect)
    with open(args[0], 'rt') as urls_file:
        urls_dict = _read_redirects(urls_file)

    from storage import storage as storage_factory
    store = storage_factory()

    if report_stats:
        sys.stdout.write("Loading %s: " % store.filename)
        sys.stdout.flush()

    root_folder = store.load()
    from bkmk_objects import make_linear, break_tree
    # NOTE(review): make_linear() presumably fills root_folder.linear with
    # a flat list of all objects in the tree -- confirm in bkmk_objects.
    make_linear(root_folder)
    objects = len(root_folder.linear)

    if report_stats:
        print("Ok")

    # Run through the list of objects and check URLs/redirects
    changed = 0
    for item in root_folder.linear:
        if item.isBookmark and item.href in urls_dict:
            item.href = urls_dict[item.href]
            changed += 1

    if not changed:
        if report_stats:
            sys.stdout.write("No need to save data\n")
            sys.stdout.flush()
        return

    if report_stats:
        sys.stdout.write("Saving %s: " % store.filename)
        sys.stdout.flush()

    break_tree(root_folder.linear)
    store.store(root_folder)

    if report_stats:
        print("Ok")
        print(objects, "objects passed")
        print(changed, "objects changed")


if __name__ == '__main__':
    run()